Context Navigation

source: trunk/src/kmk/strcache2.c@ 1920

Last change on this file since 1920 was 1920, checked in by bird, 17 years ago
strcache2: don't hash strings that aren't small (experimental).
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 37.9 KB

Line
1	/* $Id: strcache2.c 1920 2008-10-24 00:00:27Z bird $ */
2	/** @file
3	* strcache2 - New string cache.
4	*/
5
6	/*
7	* Copyright (c) 2008 knut st. osmundsen <bird-src-spam@anduin.net>
8	*
9	* This file is part of kBuild.
10	*
11	* kBuild is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* kBuild is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with kBuild; if not, write to the Free Software
23	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24	*
25	*/
26
27
28	/*******************************************************************************
29	* Header Files *
30	*******************************************************************************/
31	#include "make.h"
32	#include "strcache2.h"
33
34	#include <assert.h>
35
36	#include "debug.h"
37
38	#ifdef _MSC_VER
39	typedef unsigned char uint8_t;
40	typedef unsigned short uint16_t;
41	typedef unsigned int uint32_t;
42	typedef signed char int8_t;
43	typedef signed short int16_t;
44	typedef signed int int32_t;
45	#else
46	# include <stdint.h>
47	#endif
48
49	#ifdef WINDOWS32
50	# include <io.h>
51	# include <process.h>
52	# include <Windows.h>
53	# define PARSE_IN_WORKER
54	#endif
55
56	#ifdef __OS2__
57	# include <sys/fmutex.h>
58	#endif
59
60	#ifdef HAVE_PTHREAD
61	# include <pthread.h>
62	#endif
63
64
65	/*******************************************************************************
66	* Defined Constants And Macros *
67	*******************************************************************************/
68	/* The default size of a memory segment (1MB). */
69	#define STRCACHE2_SEG_SIZE (1024U*1024U)
70	/* The default hash table shift (hash size give as a power of two). */
71	#define STRCACHE2_HASH_SHIFT 16
72	/** Does the modding / masking of a hash number into an index. */
73	#ifdef STRCACHE2_USE_MASK
74	# define STRCACHE2_MOD_IT(cache, hash) ((hash) & (cache)->hash_mask)
75	#else
76	# define STRCACHE2_MOD_IT(cache, hash) ((hash) % (cache)->hash_div)
77	#endif
78
79	# if defined(__amd64__) \|\| defined(__x86_64__) \|\| defined(__AMD64__) \|\| defined(_M_X64) \|\| defined(__amd64) \
80	\|\| defined(__i386__) \|\| defined(__x86__) \|\| defined(__X86__) \|\| defined(_M_IX86) \|\| defined(__i386)
81	# define strcache2_get_unaligned_16bits(ptr) ( ((const uint16_t )(ptr)))
82	# else
83	/* (endian doesn't matter) */
84	# define strcache2_get_unaligned_16bits(ptr) ( (((const uint8_t *)(ptr))[0] << 8) \
85	\| (((const uint8_t *)(ptr))[1]) )
86	# endif
87
88
89	/*******************************************************************************
90	* Global Variables *
91	*******************************************************************************/
92	/* List of initialized string caches. */
93	static struct strcache2 *strcache_head;
94
95
96	/** Finds the closest primary number for power of two value (or something else
97	* useful if not support). */
98	MY_INLINE unsigned int strcache2_find_prime(unsigned int shift)
99	{
100	switch (shift)
101	{
102	case 5: return 31;
103	case 6: return 61;
104	case 7: return 127;
105	case 8: return 251;
106	case 9: return 509;
107	case 10: return 1021;
108	case 11: return 2039;
109	case 12: return 4093;
110	case 13: return 8191;
111	case 14: return 16381;
112	case 15: return 32749;
113	case 16: return 65521;
114	case 17: return 131063;
115	case 18: return 262139;
116	case 19: return 524269;
117	case 20: return 1048573;
118	case 21: return 2097143;
119	case 22: return 4194301;
120	case 23: return 8388593;
121
122	default:
123	assert (0);
124	return (1 << shift) - 1;
125	}
126	}
127
128	/* The following is a bit experiment. It produces longer chains, i.e. worse
129	distribution of the strings in the table, however the actual make
130	performances is better (<time). The explanation is probably that the
131	collisions only really increase for entries that aren't looked up that
132	much and that it actually improoves the situation for those that is. Or
133	that we spend so much less time hashing that it makes up (and more) for
134	the pentalty we suffer from the longer chains and worse distribution.
135
136	XXX: Check how this works out with different path lengths. I suspect it
137	might depend on the length of PATH_ROOT and the depth of the files
138	in the project as well. If it does, this might make matters worse
139	for some and better for others which isn't very cool... */
140
141	#define BIG_HASH_SIZE 32
142	#define BIG_HASH_TAIL 12
143	#define BIG_HASH_HEAD 16
144
145	#ifdef BIG_HASH_SIZE
146	/* long string: hash head and tail, drop the middle. */
147	MY_INLINE unsigned int
148	strcache2_case_sensitive_hash_big (const char *str, unsigned int len)
149	{
150	uint32_t hash = len;
151	uint32_t tmp;
152	unsigned int head;
153
154	/* head BIG_HASH_HEAD bytes */
155	head = (BIG_HASH_HEAD >> 2);
156	while (head-- > 0)
157	{
158	hash += strcache2_get_unaligned_16bits (str);
159	tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
160	hash = (hash << 16) ^ tmp;
161	str += 2 * sizeof (uint16_t);
162	hash += hash >> 11;
163	}
164
165	/* tail BIG_HASH_TAIL bytes (minus the odd ones) */
166	str += (len - BIG_HASH_HEAD - BIG_HASH_TAIL) & ~3U;
167	head = (BIG_HASH_TAIL >> 2);
168	while (head-- > 0)
169	{
170	hash += strcache2_get_unaligned_16bits (str);
171	tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
172	hash = (hash << 16) ^ tmp;
173	str += 2 * sizeof (uint16_t);
174	hash += hash >> 11;
175	}
176
177	/* force "avalanching" of final 127 bits. */
178	hash ^= hash << 3;
179	hash += hash >> 5;
180	hash ^= hash << 4;
181	hash += hash >> 17;
182	hash ^= hash << 25;
183	hash += hash >> 6;
184
185	return hash;
186	}
187	#endif /* BIG_HASH_SIZE */
188
189	MY_INLINE unsigned int
190	strcache2_case_sensitive_hash (const char *str, unsigned int len)
191	{
192	#if 1
193	/* Paul Hsieh hash SuperFast function:
194	http://www.azillionmonkeys.com/qed/hash.html
195
196	This performs very good and as a sligtly better distribution than
197	STRING_N_HASH_1 on a typical kBuild run.
198
199	It is also 37% faster than return_STRING_N_HASH_1 when running the
200	two 100 times over typical kBuild strings that end up here (did a
201	fprintf here and built kBuild). Compiler was 32-bit gcc 4.0.1, darwin,
202	with -O2.
203
204	FIXME: A path for well aligned data should be added to speed up
205	execution on alignment sensitive systems. */
206	unsigned int rem;
207	uint32_t hash;
208	uint32_t tmp;
209
210	assert (sizeof (uint8_t) == sizeof (char));
211
212	# ifdef BIG_HASH_SIZE
213	/* long string? */
214	if (len >= BIG_HASH_SIZE)
215	return strcache2_case_sensitive_hash_big (str, len);
216	# endif
217
218	/* short string: main loop, walking on 2 x uint16_t */
219	hash = len;
220	rem = len & 3;
221	len >>= 2;
222	while (len > 0)
223	{
224	hash += strcache2_get_unaligned_16bits (str);
225	tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
226	hash = (hash << 16) ^ tmp;
227	str += 2 * sizeof (uint16_t);
228	hash += hash >> 11;
229	len--;
230	}
231
232	/* the remainder */
233	switch (rem)
234	{
235	case 3:
236	hash += strcache2_get_unaligned_16bits (str);
237	hash ^= hash << 16;
238	hash ^= str[sizeof (uint16_t)] << 18;
239	hash += hash >> 11;
240	break;
241	case 2:
242	hash += strcache2_get_unaligned_16bits (str);
243	hash ^= hash << 11;
244	hash += hash >> 17;
245	break;
246	case 1:
247	hash += *str;
248	hash ^= hash << 10;
249	hash += hash >> 1;
250	break;
251	}
252
253	/* force "avalanching" of final 127 bits. */
254	hash ^= hash << 3;
255	hash += hash >> 5;
256	hash ^= hash << 4;
257	hash += hash >> 17;
258	hash ^= hash << 25;
259	hash += hash >> 6;
260
261	return hash;
262
263	#elif 1
264	/* Note! This implementation is 18% faster than return_STRING_N_HASH_1
265	when running the two 100 times over typical kBuild strings that
266	end up here (did a fprintf here and built kBuild).
267	Compiler was 32-bit gcc 4.0.1, darwin, with -O2. */
268
269	unsigned int hash = 0;
270	if (MY_PREDICT_TRUE(len >= 2))
271	{
272	unsigned int ch0 = *str++;
273	hash = 0;
274	len--;
275	while (len >= 2)
276	{
277	unsigned int ch1 = *str++;
278	hash += ch0 << (ch1 & 0xf);
279
280	ch0 = *str++;
281	hash += ch1 << (ch0 & 0xf);
282
283	len -= 2;
284	}
285	if (len == 1)
286	{
287	unsigned ch1 = *str;
288	hash += ch0 << (ch1 & 0xf);
289
290	hash += ch1;
291	}
292	else
293	hash += ch0;
294	}
295	else if (len)
296	{
297	hash = *str;
298	hash += hash << (hash & 0xf);
299	}
300	else
301	hash = 0;
302	return hash;
303
304	#elif 1
305	# if 0
306	/* This is SDBM. This specific form/unroll was benchmarked to be 28% faster
307	than return_STRING_N_HASH_1. (Now the weird thing is that putting the (ch)
308	first in the assignment made it noticably slower.)
309
310	However, it is noticably slower in practice, most likely because of more
311	collisions. Hrmpf. */
312
313	# define UPDATE_HASH(ch) hash = (hash << 6) + (hash << 16) - hash + (ch)
314	unsigned int hash = 0;
315
316	# else
317	/* This is DJB2. This specific form/unroll was benchmarked to be 27%
318	fast than return_STRING_N_HASH_1.
319
320	Ditto. */
321
322	# define UPDATE_HASH(ch) hash = (hash << 5) + hash + (ch)
323	unsigned int hash = 5381;
324	# endif
325
326
327	while (len >= 4)
328	{
329	UPDATE_HASH (str[0]);
330	UPDATE_HASH (str[1]);
331	UPDATE_HASH (str[2]);
332	UPDATE_HASH (str[3]);
333	str += 4;
334	len -= 4;
335	}
336	switch (len)
337	{
338	default:
339	case 0:
340	return hash;
341	case 1:
342	UPDATE_HASH (str[0]);
343	return hash;
344	case 2:
345	UPDATE_HASH (str[0]);
346	UPDATE_HASH (str[1]);
347	return hash;
348	case 3:
349	UPDATE_HASH (str[0]);
350	UPDATE_HASH (str[1]);
351	UPDATE_HASH (str[2]);
352	return hash;
353	}
354	#endif
355	}
356
357	MY_INLINE unsigned int
358	strcache2_case_insensitive_hash (const char *str, unsigned int len)
359	{
360	unsigned int hash = 0;
361	if (MY_PREDICT_TRUE(len >= 2))
362	{
363	unsigned int ch0 = *str++;
364	ch0 = tolower (ch0);
365	hash = 0;
366	len--;
367	while (len >= 2)
368	{
369	unsigned int ch1 = *str++;
370	ch1 = tolower (ch1);
371	hash += ch0 << (ch1 & 0xf);
372
373	ch0 = *str++;
374	ch0 = tolower (ch0);
375	hash += ch1 << (ch0 & 0xf);
376
377	len -= 2;
378	}
379	if (len == 1)
380	{
381	unsigned ch1 = *str;
382	ch1 = tolower (ch1);
383	hash += ch0 << (ch1 & 0xf);
384
385	hash += ch1;
386	}
387	else
388	hash += ch0;
389	}
390	else if (len)
391	{
392	hash = *str;
393	hash += hash << (hash & 0xf);
394	}
395	else
396	hash = 0;
397	return hash;
398	}
399
400	MY_INLINE int
401	strcache2_memcmp_inline_short (const char xs, const char ys, unsigned int length)
402	{
403	if (length <= 8)
404	{
405	/* short string compare - ~50% of the kBuild calls. */
406	assert ( !((size_t)ys & 3) );
407	if (!((size_t)xs & 3))
408	{
409	/* aligned */
410	int result;
411	switch (length)
412	{
413	default: /* memcmp for longer strings */
414	return memcmp (xs, ys, length);
415	case 8:
416	result = (int32_t)(xs + 4) - (int32_t)(ys + 4);
417	result \|= (int32_t)xs - (int32_t)ys;
418	return result;
419	case 7:
420	result = xs[6] - ys[6];
421	result \|= xs[5] - ys[5];
422	result \|= xs[4] - ys[4];
423	result \|= (int32_t)xs - (int32_t)ys;
424	return result;
425	case 6:
426	result = xs[5] - ys[5];
427	result \|= xs[4] - ys[4];
428	result \|= (int32_t)xs - (int32_t)ys;
429	return result;
430	case 5:
431	result = xs[4] - ys[4];
432	result \|= (int32_t)xs - (int32_t)ys;
433	return result;
434	case 4:
435	return (int32_t)xs - (int32_t)ys;
436	case 3:
437	result = xs[2] - ys[2];
438	result \|= xs[1] - ys[1];
439	result \|= xs[0] - ys[0];
440	return result;
441	case 2:
442	result = xs[1] - ys[1];
443	result \|= xs[0] - ys[0];
444	return result;
445	case 1:
446	return xs - ys;
447	case 0:
448	return 0;
449	}
450	}
451	else
452	{
453	/* unaligned */
454	int result = 0;
455	switch (length)
456	{
457	case 8: result \|= xs[7] - ys[7];
458	case 7: result \|= xs[6] - ys[6];
459	case 6: result \|= xs[5] - ys[5];
460	case 5: result \|= xs[4] - ys[4];
461	case 4: result \|= xs[3] - ys[3];
462	case 3: result \|= xs[2] - ys[2];
463	case 2: result \|= xs[1] - ys[1];
464	case 1: result \|= xs[0] - ys[0];
465	case 0:
466	return result;
467	}
468	}
469	}
470
471	/* memcmp for longer strings */
472	return memcmp (xs, ys, length);
473	}
474
475	MY_INLINE int
476	strcache2_memcmp_inlined (const char xs, const char ys, unsigned int length)
477	{
478	#ifndef ELECTRIC_HEAP
479	assert ( !((size_t)ys & 3) );
480	#endif
481	if (!((size_t)xs & 3))
482	{
483	int result;
484	/* aligned */
485	while (length >= 8)
486	{
487	result = (int32_t)xs - (int32_t)ys;
488	result \|= (int32_t)(xs + 4) - (int32_t)(ys + 4);
489	if (MY_PREDICT_FALSE(result))
490	return result;
491	xs += 8;
492	ys += 8;
493	length -= 8;
494	}
495	switch (length)
496	{
497	case 7:
498	result = (int32_t)xs - (int32_t)ys;
499	result \|= xs[6] - ys[6];
500	result \|= xs[5] - ys[5];
501	result \|= xs[4] - ys[4];
502	return result;
503	case 6:
504	result = (int32_t)xs - (int32_t)ys;
505	result \|= xs[5] - ys[5];
506	result \|= xs[4] - ys[4];
507	return result;
508	case 5:
509	result = (int32_t)xs - (int32_t)ys;
510	result \|= xs[4] - ys[4];
511	return result;
512	case 4:
513	return (int32_t)xs - (int32_t)ys;
514	case 3:
515	result = xs[2] - ys[2];
516	result \|= xs[1] - ys[1];
517	result \|= xs[0] - ys[0];
518	return result;
519	case 2:
520	result = xs[1] - ys[1];
521	result \|= xs[0] - ys[0];
522	return result;
523	case 1:
524	return xs - ys;
525	default:
526	case 0:
527	return 0;
528	}
529	}
530	else
531	{
532	/* unaligned */
533	int result;
534	while (length >= 8)
535	{
536	#if defined(__i386__) \|\| defined(__x86_64__)
537	result = ( ((int32_t)xs[3] << 24)
538	\| ((int32_t)xs[2] << 16)
539	\| ((int32_t)xs[1] << 8)
540	\| xs[0] )
541	- (int32_t)ys;
542	result \|= ( ((int32_t)xs[7] << 24)
543	\| ((int32_t)xs[6] << 16)
544	\| ((int32_t)xs[5] << 8)
545	\| xs[4] )
546	- (int32_t)(ys + 4);
547	#else
548	result = xs[3] - ys[3];
549	result \|= xs[2] - ys[2];
550	result \|= xs[1] - ys[1];
551	result \|= xs[0] - ys[0];
552	result \|= xs[7] - ys[7];
553	result \|= xs[6] - ys[6];
554	result \|= xs[5] - ys[5];
555	result \|= xs[4] - ys[4];
556	#endif
557	if (MY_PREDICT_FALSE(result))
558	return result;
559	xs += 8;
560	ys += 8;
561	length -= 8;
562	}
563	result = 0;
564	switch (length)
565	{
566	case 7: result \|= xs[6] - ys[6];
567	case 6: result \|= xs[5] - ys[5];
568	case 5: result \|= xs[4] - ys[4];
569	case 4: result \|= xs[3] - ys[3];
570	case 3: result \|= xs[2] - ys[2];
571	case 2: result \|= xs[1] - ys[1];
572	case 1: result \|= xs[0] - ys[0];
573	return result;
574	default:
575	case 0:
576	return 0;
577	}
578	}
579	}
580
581	MY_INLINE int
582	strcache2_is_equal (struct strcache2 cache, struct strcache2_entry const entry,
583	const char *str, unsigned int length, unsigned int hash)
584	{
585	assert (!cache->case_insensitive);
586
587	/* the simple stuff first. */
588	if ( entry->hash != hash
589	\|\| entry->length != length)
590	return 0;
591
592	#if 0
593	return memcmp (str, entry + 1, length) == 0;
594	#elif 1
595	return strcache2_memcmp_inlined (str, (const char *)(entry + 1), length) == 0;
596	#else
597	return strcache2_memcmp_inline_short (str, (const char *)(entry + 1), length) == 0;
598	#endif
599	}
600
601	MY_INLINE int
602	strcache2_is_iequal (struct strcache2 cache, struct strcache2_entry const entry,
603	const char *str, unsigned int length, unsigned int hash)
604	{
605	assert (cache->case_insensitive);
606
607	/* the simple stuff first. */
608	if ( entry->hash != hash
609	\|\| entry->length != length)
610	return 0;
611
612	#if defined(_MSC_VER) \|\| defined(__OS2__)
613	return _memicmp (entry + 1, str, length) == 0;
614	#else
615	return strncasecmp ((const char *)(entry + 1), str, length) == 0;
616	#endif
617	}
618
619	static void
620	strcache2_rehash (struct strcache2 *cache)
621	{
622	unsigned int src = cache->hash_size;
623	struct strcache2_entry **src_tab = cache->hash_tab;
624	struct strcache2_entry **dst_tab;
625	#ifndef STRCACHE2_USE_MASK
626	unsigned int hash_shift;
627	#endif
628
629	/* Allocate a new hash table twice the size of the current. */
630	cache->hash_size <<= 1;
631	#ifdef STRCACHE2_USE_MASK
632	cache->hash_mask <<= 1;
633	cache->hash_mask \|= 1;
634	#else
635	for (hash_shift = 1; (1U << hash_shift) < cache->hash_size; hash_shift++)
636	/* nothing */;
637	cache->hash_div = strcache2_find_prime (hash_shift);
638	#endif
639	cache->rehash_count <<= 1;
640	cache->hash_tab = dst_tab = (struct strcache2_entry **)
641	xmalloc (cache->hash_size * sizeof (struct strcache2_entry *));
642	memset (dst_tab, '\0', cache->hash_size * sizeof (struct strcache2_entry *));
643
644	/* Copy the entries from the old to the new hash table. */
645	cache->collision_count = 0;
646	while (src-- > 0)
647	{
648	struct strcache2_entry *entry = src_tab[src];
649	while (entry)
650	{
651	struct strcache2_entry *next = entry->next;
652	unsigned int dst = STRCACHE2_MOD_IT (cache, entry->hash);
653	if ((entry->next = dst_tab[dst]) != 0)
654	cache->collision_count++;
655	dst_tab[dst] = entry;
656
657	entry = next;
658	}
659	}
660
661	/* That's it, just free the old table and we're done. */
662	free (src_tab);
663	}
664
665	static struct strcache2_seg *
666	strcache2_new_seg (struct strcache2 *cache, unsigned int minlen)
667	{
668	struct strcache2_seg *seg;
669	size_t size;
670	size_t off;
671
672	size = cache->def_seg_size;
673	if (size < (size_t)minlen + sizeof (struct strcache2_seg) + STRCACHE2_ENTRY_ALIGNMENT)
674	{
675	size = (size_t)minlen * 2;
676	size = (size + 0xfff) & ~(size_t)0xfff;
677	}
678
679	seg = xmalloc (size);
680	seg->start = (char *)(seg + 1);
681	seg->size = size - sizeof (struct strcache2_seg);
682	off = (size_t)seg->start & (STRCACHE2_ENTRY_ALIGNMENT - 1);
683	if (off)
684	{
685	seg->start += off;
686	seg->size -= off;
687	}
688	assert (seg->size > minlen);
689	seg->cursor = seg->start;
690	seg->avail = seg->size;
691
692	seg->next = cache->seg_head;
693	cache->seg_head = seg;
694
695	return seg;
696	}
697
698	/* Internal worker that enters a new string into the cache. */
699	static const char *
700	strcache2_enter_string (struct strcache2 *cache, unsigned int idx,
701	const char *str, unsigned int length,
702	unsigned int hash)
703	{
704	struct strcache2_entry *entry;
705	struct strcache2_seg *seg;
706	unsigned int size;
707	char *str_copy;
708
709	/* Allocate space for the string. */
710
711	size = length + 1 + sizeof (struct strcache2_entry);
712	size = (size + STRCACHE2_ENTRY_ALIGNMENT - 1) & ~(STRCACHE2_ENTRY_ALIGNMENT - 1U);
713
714	seg = cache->seg_head;
715	if (MY_PREDICT_FALSE(seg->avail < size))
716	{
717	do
718	seg = seg->next;
719	while (seg && seg->avail < size);
720	if (!seg)
721	seg = strcache2_new_seg (cache, size);
722	}
723
724	entry = (struct strcache2_entry *) seg->cursor;
725	assert ((size_t)entry & (STRCACHE2_ENTRY_ALIGNMENT - 1));
726	seg->cursor += size;
727	seg->avail -= size;
728
729	/* Setup the entry, copy the string and insert it into the hash table. */
730
731	entry->user = NULL;
732	entry->length = length;
733	entry->hash = hash;
734	str_copy = (char *) memcpy (entry + 1, str, length);
735	str_copy[length] = '\0';
736
737	if ((entry->next = cache->hash_tab[idx]) != 0)
738	cache->collision_count++;
739	cache->hash_tab[idx] = entry;
740	cache->count++;
741	if (cache->count >= cache->rehash_count)
742	strcache2_rehash (cache);
743
744	return str_copy;
745	}
746
747	/* The public add string interface. */
748	const char *
749	strcache2_add (struct strcache2 cache, const char str, unsigned int length)
750	{
751	struct strcache2_entry const *entry;
752	unsigned int hash = strcache2_case_sensitive_hash (str, length);
753	unsigned int idx;
754
755	assert (!cache->case_insensitive);
756
757	MAKE_STATS (cache->lookup_count++);
758
759	/* Lookup the entry in the hash table, hoping for an
760	early match. If not found, enter the string at IDX. */
761	idx = STRCACHE2_MOD_IT (cache, hash);
762	entry = cache->hash_tab[idx];
763	if (!entry)
764	return strcache2_enter_string (cache, idx, str, length, hash);
765	if (strcache2_is_equal (cache, entry, str, length, hash))
766	return (const char *)(entry + 1);
767	MAKE_STATS (cache->collision_1st_count++);
768
769	entry = entry->next;
770	if (!entry)
771	return strcache2_enter_string (cache, idx, str, length, hash);
772	if (strcache2_is_equal (cache, entry, str, length, hash))
773	return (const char *)(entry + 1);
774	MAKE_STATS (cache->collision_2nd_count++);
775
776	/* Loop the rest. */
777	for (;;)
778	{
779	entry = entry->next;
780	if (!entry)
781	return strcache2_enter_string (cache, idx, str, length, hash);
782	if (strcache2_is_equal (cache, entry, str, length, hash))
783	return (const char *)(entry + 1);
784	MAKE_STATS (cache->collision_3rd_count++);
785	}
786	/* not reached */
787	}
788
789	/* The public add string interface for prehashed strings.
790	Use strcache2_hash_str to calculate the hash of a string. */
791	const char *
792	strcache2_add_hashed (struct strcache2 cache, const char str,
793	unsigned int length, unsigned int hash)
794	{
795	struct strcache2_entry const *entry;
796	unsigned int idx;
797	#ifndef NDEBUG
798	unsigned correct_hash;
799
800	assert (!cache->case_insensitive);
801	correct_hash = strcache2_case_sensitive_hash (str, length);
802	MY_ASSERT_MSG (hash == correct_hash, ("%#x != %#x\n", hash, correct_hash));
803	#endif /* NDEBUG */
804
805	MAKE_STATS (cache->lookup_count++);
806
807	/* Lookup the entry in the hash table, hoping for an
808	early match. If not found, enter the string at IDX. */
809	idx = STRCACHE2_MOD_IT (cache, hash);
810	entry = cache->hash_tab[idx];
811	if (!entry)
812	return strcache2_enter_string (cache, idx, str, length, hash);
813	if (strcache2_is_equal (cache, entry, str, length, hash))
814	return (const char *)(entry + 1);
815	MAKE_STATS (cache->collision_1st_count++);
816
817	entry = entry->next;
818	if (!entry)
819	return strcache2_enter_string (cache, idx, str, length, hash);
820	if (strcache2_is_equal (cache, entry, str, length, hash))
821	return (const char *)(entry + 1);
822	MAKE_STATS (cache->collision_2nd_count++);
823
824	/* Loop the rest. */
825	for (;;)
826	{
827	entry = entry->next;
828	if (!entry)
829	return strcache2_enter_string (cache, idx, str, length, hash);
830	if (strcache2_is_equal (cache, entry, str, length, hash))
831	return (const char *)(entry + 1);
832	MAKE_STATS (cache->collision_3rd_count++);
833	}
834	/* not reached */
835	}
836
837	/* The public lookup (case sensitive) string interface. */
838	const char *
839	strcache2_lookup (struct strcache2 cache, const char str, unsigned int length)
840	{
841	struct strcache2_entry const *entry;
842	unsigned int hash = strcache2_case_sensitive_hash (str, length);
843	unsigned int idx;
844
845	assert (!cache->case_insensitive);
846
847	MAKE_STATS (cache->lookup_count++);
848
849	/* Lookup the entry in the hash table, hoping for an
850	early match. */
851	idx = STRCACHE2_MOD_IT (cache, hash);
852	entry = cache->hash_tab[idx];
853	if (!entry)
854	return NULL;
855	if (strcache2_is_equal (cache, entry, str, length, hash))
856	return (const char *)(entry + 1);
857	MAKE_STATS (cache->collision_1st_count++);
858
859	entry = entry->next;
860	if (!entry)
861	return NULL;
862	if (strcache2_is_equal (cache, entry, str, length, hash))
863	return (const char *)(entry + 1);
864	MAKE_STATS (cache->collision_2nd_count++);
865
866	/* Loop the rest. */
867	for (;;)
868	{
869	entry = entry->next;
870	if (!entry)
871	return NULL;
872	if (strcache2_is_equal (cache, entry, str, length, hash))
873	return (const char *)(entry + 1);
874	MAKE_STATS (cache->collision_3rd_count++);
875	}
876	/* not reached */
877	}
878
879	#if defined(HAVE_CASE_INSENSITIVE_FS)
880
881	/* The public add string interface for case insensitive strings. */
882	const char *
883	strcache2_iadd (struct strcache2 cache, const char str, unsigned int length)
884	{
885	struct strcache2_entry const *entry;
886	unsigned int hash = strcache2_case_insensitive_hash (str, length);
887	unsigned int idx;
888
889	assert (!cache->case_insensitive);
890
891	MAKE_STATS (cache->lookup_count++);
892
893	/* Lookup the entry in the hash table, hoping for an
894	early match. If not found, enter the string at IDX. */
895	idx = STRCACHE2_MOD_IT (cache, hash);
896	entry = cache->hash_tab[idx];
897	if (!entry)
898	return strcache2_enter_string (cache, idx, str, length, hash);
899	if (strcache2_is_equal (cache, entry, str, length, hash))
900	return (const char *)(entry + 1);
901	MAKE_STATS (cache->collision_1st_count++);
902
903	entry = entry->next;
904	if (!entry)
905	return strcache2_enter_string (cache, idx, str, length, hash);
906	if (strcache2_is_equal (cache, entry, str, length, hash))
907	return (const char *)(entry + 1);
908	MAKE_STATS (cache->collision_2nd_count++);
909
910	/* Loop the rest. */
911	for (;;)
912	{
913	entry = entry->next;
914	if (!entry)
915	return strcache2_enter_string (cache, idx, str, length, hash);
916	if (strcache2_is_equal (cache, entry, str, length, hash))
917	return (const char *)(entry + 1);
918	MAKE_STATS (cache->collision_3rd_count++);
919	}
920	/* not reached */
921	}
922
923	/* The public add string interface for prehashed case insensitive strings.
924	Use strcache2_hash_istr to calculate the hash of a string. */
925	const char *
926	strcache2_iadd_hashed (struct strcache2 cache, const char str,
927	unsigned int length, unsigned int hash)
928	{
929	struct strcache2_entry const *entry;
930	unsigned int idx;
931	#ifndef NDEBUG
932	unsigned correct_hash;
933
934	assert (!cache->case_insensitive);
935	correct_hash = strcache2_case_insensitive_hash (str, length);
936	MY_ASSERT_MSG (hash == correct_hash, ("%#x != %#x\n", hash, correct_hash));
937	#endif /* NDEBUG */
938
939	MAKE_STATS (cache->lookup_count++);
940
941	/* Lookup the entry in the hash table, hoping for an
942	early match. If not found, enter the string at IDX. */
943	idx = STRCACHE2_MOD_IT (cache, hash);
944	entry = cache->hash_tab[idx];
945	if (!entry)
946	return strcache2_enter_string (cache, idx, str, length, hash);
947	if (strcache2_is_equal (cache, entry, str, length, hash))
948	return (const char *)(entry + 1);
949	MAKE_STATS (cache->collision_1st_count++);
950
951	entry = entry->next;
952	if (!entry)
953	return strcache2_enter_string (cache, idx, str, length, hash);
954	if (strcache2_is_equal (cache, entry, str, length, hash))
955	return (const char *)(entry + 1);
956	MAKE_STATS (cache->collision_2nd_count++);
957
958	/* Loop the rest. */
959	for (;;)
960	{
961	entry = entry->next;
962	if (!entry)
963	return strcache2_enter_string (cache, idx, str, length, hash);
964	if (strcache2_is_equal (cache, entry, str, length, hash))
965	return (const char *)(entry + 1);
966	MAKE_STATS (cache->collision_3rd_count++);
967	}
968	/* not reached */
969	}
970
971	/* The public lookup (case insensitive) string interface. */
972	const char *
973	strcache2_ilookup (struct strcache2 cache, const char str, unsigned int length)
974	{
975	struct strcache2_entry const *entry;
976	unsigned int hash = strcache2_case_insensitive_hash (str, length);
977	unsigned int idx;
978
979	assert (!cache->case_insensitive);
980
981	MAKE_STATS (cache->lookup_count++);
982
983	/* Lookup the entry in the hash table, hoping for an
984	early match. */
985	idx = STRCACHE2_MOD_IT (cache, hash);
986	entry = cache->hash_tab[idx];
987	if (!entry)
988	return NULL;
989	if (strcache2_is_equal (cache, entry, str, length, hash))
990	return (const char *)(entry + 1);
991	MAKE_STATS (cache->collision_1st_count++);
992
993	entry = entry->next;
994	if (!entry)
995	return NULL;
996	if (strcache2_is_equal (cache, entry, str, length, hash))
997	return (const char *)(entry + 1);
998	MAKE_STATS (cache->collision_2nd_count++);
999
1000	/* Loop the rest. */
1001	for (;;)
1002	{
1003	entry = entry->next;
1004	if (!entry)
1005	return NULL;
1006	if (strcache2_is_equal (cache, entry, str, length, hash))
1007	return (const char *)(entry + 1);
1008	MAKE_STATS (cache->collision_3rd_count++);
1009	}
1010	/* not reached */
1011	}
1012
1013	#endif /* HAVE_CASE_INSENSITIVE_FS */
1014
1015	/* Is the given string cached? returns 1 if it is, 0 if it isn't. */
1016	int
1017	strcache2_is_cached (struct strcache2 cache, const char str)
1018	{
1019	/* Check mandatory alignment first. */
1020	if (!((size_t)str & (sizeof (void *) - 1)))
1021	{
1022	/* Check the segment list and consider the question answered if the
1023	string is within one of them. (Could check it more thoroughly...) */
1024	struct strcache2_seg const *seg;
1025	for (seg = cache->seg_head; seg; seg = seg->next)
1026	if ((size_t)(str - seg->start) < seg->size)
1027	return 1;
1028	}
1029
1030	return 0;
1031	}
1032
1033
1034	/* Verify the integrity of the specified string, returning 0 if OK. */
1035	int
1036	strcache2_verify_entry (struct strcache2 cache, const char str)
1037	{
1038	struct strcache2_entry const *entry;
1039	unsigned hash;
1040	const char *end;
1041
1042	if ((size_t)str & (sizeof (void *) - 1))
1043	{
1044	fprintf (stderr,
1045	"strcache2[%s]: missaligned string %p\nstring: %s\n",
1046	cache->name, (void *)str, str);
1047	return -1;
1048	}
1049
1050	entry = (struct strcache2_entry const *)str - 1;
1051	end = memchr (str, '\0', entry->length);
1052	if ((size_t)(end - str) == entry->length)
1053	{
1054	fprintf (stderr,
1055	"strcache2[%s]: corrupt entry %p, length: %lu, expected %u;\nstring: %s\n",
1056	cache->name, (void *)entry, (unsigned long)(end - str), entry->length, str);
1057	return -1;
1058	}
1059
1060	hash = cache->case_insensitive
1061	? strcache2_case_insensitive_hash (str, entry->length)
1062	: strcache2_case_sensitive_hash (str, entry->length);
1063	if (hash != entry->hash)
1064	{
1065	fprintf (stderr,
1066	"strcache2[%s]: corrupt entry %p, hash#1: %x, expected %x;\nstring: %s\n",
1067	cache->name, (void *)entry, hash, entry->hash, str);
1068	return -1;
1069	}
1070
1071	return 0;
1072	}
1073
1074
1075	/* Calculates the case sensitive hash values of the string.
1076	The first hash is returned, the other is put at HASH2P. */
1077	unsigned int strcache2_hash_str (const char str, unsigned int length, unsigned int hash2p)
1078	{
1079	*hash2p = 1;
1080	return strcache2_case_sensitive_hash (str, length);
1081	}
1082
1083	/* Calculates the case insensitive hash values of the string.
1084	The first hash is returned, the other is put at HASH2P. */
1085	unsigned int strcache2_hash_istr (const char str, unsigned int length, unsigned int hash2p)
1086	{
1087	*hash2p = 1;
1088	return strcache2_case_insensitive_hash (str, length);
1089	}
1090
1091
1092
1093	/* Initalizes a new cache. */
1094	void
1095	strcache2_init (struct strcache2 cache, const char name, unsigned int size,
1096	unsigned int def_seg_size, int case_insensitive, int thread_safe)
1097	{
1098	unsigned hash_shift;
1099	assert (!thread_safe);
1100
1101	/* calc the size as a power of two */
1102	if (!size)
1103	hash_shift = STRCACHE2_HASH_SHIFT;
1104	else
1105	{
1106	assert (size <= (~0U / 2 + 1));
1107	for (hash_shift = 8; (1U << hash_shift) < size; hash_shift++)
1108	/* nothing */;
1109	}
1110
1111	/* adjust the default segment size */
1112	if (!def_seg_size)
1113	def_seg_size = STRCACHE2_SEG_SIZE;
1114	else if (def_seg_size < sizeof (struct strcache2_seg) * 10)
1115	def_seg_size = sizeof (struct strcache2_seg) * 10;
1116	else if ((def_seg_size & 0xfff) < 0xf00)
1117	def_seg_size = ((def_seg_size + 0xfff) & ~0xfffU);
1118
1119
1120	/* init the structure. */
1121	cache->case_insensitive = case_insensitive;
1122	#ifdef STRCACHE2_USE_MASK
1123	cache->hash_mask = (1U << hash_shift) - 1U;
1124	#else
1125	cache->hash_div = strcache2_find_prime(hash_shift);
1126	#endif
1127	cache->count = 0;
1128	cache->collision_count = 0;
1129	cache->lookup_count = 0;
1130	cache->collision_1st_count = 0;
1131	cache->collision_2nd_count = 0;
1132	cache->collision_3rd_count = 0;
1133	cache->rehash_count = (1U << hash_shift) / 4 * 3; /* rehash at 75% */
1134	cache->init_size = 1U << hash_shift;
1135	cache->hash_size = 1U << hash_shift;
1136	cache->def_seg_size = def_seg_size;
1137	cache->lock = NULL;
1138	cache->name = name;
1139
1140	/* allocate the hash table and first segment. */
1141	cache->hash_tab = (struct strcache2_entry **)
1142	xmalloc (cache->init_size * sizeof (struct strcache2_entry *));
1143	memset (cache->hash_tab, '\0', cache->init_size * sizeof (struct strcache2_entry *));
1144	strcache2_new_seg (cache, 0);
1145
1146	/* link it */
1147	cache->next = strcache_head;
1148	strcache_head = cache;
1149	}
1150
1151
1152	/* Terminates a string cache, freeing all memory and other
1153	associated resources. */
1154	void
1155	strcache2_term (struct strcache2 *cache)
1156	{
1157	/* unlink it */
1158	if (strcache_head == cache)
1159	strcache_head = cache->next;
1160	else
1161	{
1162	struct strcache2 *prev = strcache_head;
1163	while (prev->next != cache)
1164	prev = prev->next;
1165	assert (prev);
1166	prev->next = cache->next;
1167	}
1168
1169	/* free the memory segments */
1170	do
1171	{
1172	void *free_it = cache->seg_head;
1173	cache->seg_head = cache->seg_head->next;
1174	free (free_it);
1175	}
1176	while (cache->seg_head);
1177
1178	/* free the hash and clear the structure. */
1179	free (cache->hash_tab);
1180	memset (cache, '\0', sizeof (struct strcache2));
1181	}
1182
1183	/* Print statistics a string cache. */
1184	void
1185	strcache2_print_stats (struct strcache2 cache, const char prefix)
1186	{
1187	unsigned int seg_count = 0;
1188	unsigned long seg_total_bytes = 0;
1189	unsigned long seg_avg_bytes;
1190	unsigned long seg_avail_bytes = 0;
1191	unsigned long seg_max_bytes = 0;
1192	struct strcache2_seg *seg;
1193	unsigned int str_count = 0;
1194	unsigned long str_total_len = 0;
1195	unsigned int str_avg_len;
1196	unsigned int str_min_len = ~0U;
1197	unsigned int str_max_len = 0;
1198	unsigned int idx;
1199	unsigned int rehashes;
1200	unsigned int chain_depths[32];
1201
1202	printf (_("\n%s strcache2: %s\n"), prefix, cache->name);
1203
1204	/* Segment statistics. */
1205	for (seg = cache->seg_head; seg; seg = seg->next)
1206	{
1207	seg_count++;
1208	seg_total_bytes += seg->size;
1209	seg_avail_bytes += seg->avail;
1210	if (seg->size > seg_max_bytes)
1211	seg_max_bytes = seg->size;
1212	}
1213	seg_avg_bytes = seg_total_bytes / seg_count;
1214	printf (_("%s %u segments: total = %lu / max = %lu / avg = %lu / def = %u avail = %lu\n"),
1215	prefix, seg_count, seg_total_bytes, seg_max_bytes, seg_avg_bytes,
1216	cache->def_seg_size, seg_avail_bytes);
1217
1218	/* String statistics. */
1219	memset (chain_depths, '\0', sizeof (chain_depths));
1220	idx = cache->hash_size;
1221	while (idx-- > 0)
1222	{
1223	struct strcache2_entry const *entry = cache->hash_tab[idx];
1224	unsigned int depth = 0;
1225	for (; entry != 0; entry = entry->next, depth++)
1226	{
1227	unsigned int length = entry->length;
1228	str_total_len += length;
1229	if (length > str_max_len)
1230	str_max_len = length;
1231	if (length < str_min_len)
1232	str_min_len = length;
1233	str_count++;
1234	}
1235	chain_depths[depth >= 32 ? 31 : depth]++;
1236	}
1237	str_avg_len = cache->count ? str_total_len / cache->count : 0;
1238	printf (_("%s %u strings: total len = %lu / max = %u / avg = %u / min = %u\n"),
1239	prefix, cache->count, str_total_len, str_max_len, str_avg_len, str_min_len);
1240	if (str_count != cache->count)
1241	printf (_("%s string count mismatch! cache->count=%u, actual count is %u\n"), prefix,
1242	cache->count, str_count);
1243
1244	/* Hash statistics. */
1245	idx = cache->init_size;
1246	rehashes = 0;
1247	while (idx < cache->hash_size)
1248	{
1249	idx *= 2;
1250	rehashes++;
1251	}
1252
1253	#ifdef STRCACHE2_USE_MASK
1254	printf (_("%s hash size = %u mask = %#x rehashed %u times lookups = %lu\n"),
1255	prefix, cache->hash_size, cache->hash_mask, rehashes, cache->lookup_count);
1256	#else
1257	printf (_("%s hash size = %u div = %#x rehashed %u times lookups = %lu\n"),
1258	prefix, cache->hash_size, cache->hash_div, rehashes, cache->lookup_count);
1259	#endif
1260	if (cache->lookup_count)
1261	printf (_("%s hash collisions 1st = %lu (%u%%) 2nd = %lu (%u%%) 3rd = %lu (%u%%)\n"),
1262	prefix,
1263	cache->collision_1st_count, (unsigned int)((100.0 * cache->collision_1st_count) / cache->lookup_count),
1264	cache->collision_2nd_count, (unsigned int)((100.0 * cache->collision_2nd_count) / cache->lookup_count),
1265	cache->collision_3rd_count, (unsigned int)((100.0 * cache->collision_3rd_count) / cache->lookup_count));
1266	printf (_("%s hash insert collisions = %u (%u%%)\n"),
1267	prefix, cache->collision_count,(unsigned int)((100.0 * cache->collision_count) / cache->count));
1268	printf (_("%s %5u (%u%%) empty hash table slots\n"),
1269	prefix, chain_depths[0], (unsigned int)((100.0 * chain_depths[0]) / cache->hash_size));
1270	printf (_("%s %5u (%u%%) occupied hash table slots\n"),
1271	prefix, chain_depths[1], (unsigned int)((100.0 * chain_depths[1]) / cache->hash_size));
1272	for (idx = 2; idx < 32; idx++)
1273	{
1274	unsigned strs_at_this_depth = chain_depths[idx];
1275	unsigned i;
1276	for (i = idx + 1; i < 32; i++)
1277	strs_at_this_depth += chain_depths[i];
1278	if (strs_at_this_depth)
1279	printf (_("%s %5u (%2u%%) with %u string%s chained on; %5u (%2u%%) strings at depth %u.\n"),
1280	prefix, chain_depths[idx], (unsigned int)((100.0 * chain_depths[idx]) / (cache->count - cache->collision_count)),
1281	idx - 1, idx == 2 ? " " : "s",
1282	strs_at_this_depth, (unsigned int)((100.0 * strs_at_this_depth) / cache->count), idx - 1);
1283	}
1284	}
1285
1286	/* Print statistics for all string caches. */
1287	void
1288	strcache2_print_stats_all (const char *prefix)
1289	{
1290	struct strcache2 *cur;
1291	for (cur = strcache_head; cur; cur = cur->next)
1292	strcache2_print_stats (cur, prefix);
1293	}
1294

Note: See TracBrowser for help on using the repository browser.

Download in other formats: