source: trunk/src/kmk/strcache2.c@ 1912

Last change on this file since 1912 was 1912, checked in by bird, 17 years ago

strcache2: collapsed the STRCACHE2_USE_CHAINING checks.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 36.2 KB
Line 
1/* $Id: strcache2.c 1912 2008-10-22 21:17:27Z bird $ */
2/** @file
3 * strcache2 - New string cache.
4 */
5
6/*
7 * Copyright (c) 2008 knut st. osmundsen <bird-src-spam@anduin.net>
8 *
9 * This file is part of kBuild.
10 *
11 * kBuild is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * kBuild is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with kBuild; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 *
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include "make.h"
32#include "strcache2.h"
33
34#include <assert.h>
35
36#include "debug.h"
37
38#ifdef _MSC_VER
39typedef unsigned char uint8_t;
40typedef unsigned short uint16_t;
41typedef unsigned int uint32_t;
42#else
43# include <stdint.h>
44#endif
45
46#ifdef WINDOWS32
47# include <io.h>
48# include <process.h>
49# include <Windows.h>
50# define PARSE_IN_WORKER
51#endif
52
53#ifdef __OS2__
54# include <sys/fmutex.h>
55#endif
56
57#ifdef HAVE_PTHREAD
58# include <pthread.h>
59#endif
60
61
62/*******************************************************************************
63* Defined Constants And Macros *
64*******************************************************************************/
65/* The default size of a memory segment (1MB). */
66#define STRCACHE2_SEG_SIZE (1024U*1024U)
67/* The default hash table shift (hash size give as a power of two). */
68#define STRCACHE2_HASH_SHIFT 16
69/** Does the modding / masking of a hash number into an index. */
70#ifdef STRCACHE2_USE_MASK
71# define STRCACHE2_MOD_IT(cache, hash) ((hash) & (cache)->hash_mask)
72#else
73# define STRCACHE2_MOD_IT(cache, hash) ((hash) % (cache)->hash_div)
74#endif
75
76
77/*******************************************************************************
78* Global Variables *
79*******************************************************************************/
80/* List of initialized string caches. */
81static struct strcache2 *strcache_head;
82
83
84/** Finds the closest primary number for power of two value (or something else
85 * useful if not support). */
86MY_INLINE unsigned int strcache2_find_prime(unsigned int shift)
87{
88 switch (shift)
89 {
90 case 5: return 31;
91 case 6: return 61;
92 case 7: return 127;
93 case 8: return 251;
94 case 9: return 509;
95 case 10: return 1021;
96 case 11: return 2039;
97 case 12: return 4093;
98 case 13: return 8191;
99 case 14: return 16381;
100 case 15: return 32749;
101 case 16: return 65521;
102 case 17: return 131063;
103 case 18: return 262139;
104 case 19: return 524269;
105 case 20: return 1048573;
106 case 21: return 2097143;
107 case 22: return 4194301;
108 case 23: return 8388593;
109
110 default:
111 assert (0);
112 return (1 << shift) - 1;
113 }
114}
115
116MY_INLINE unsigned int
117strcache2_case_sensitive_hash_1 (const char *str, unsigned int len)
118{
119#if 1
120 /* Paul Hsieh hash SuperFast function:
121 http://www.azillionmonkeys.com/qed/hash.html
122
123 This performs very good and as a sligtly better distribution than
124 STRING_N_HASH_1 on a typical kBuild run.
125
126 It is also 37% faster than return_STRING_N_HASH_1 when running the
127 two 100 times over typical kBuild strings that end up here (did a
128 fprintf here and built kBuild). Compiler was 32-bit gcc 4.0.1, darwin,
129 with -O2.
130
131 FIXME: A path for well aligned data should be added to speed up
132 execution on alignment sensitive systems. */
133
134# if defined(__amd64__) || defined(__x86_64__) || defined(__AMD64__) || defined(_M_X64) || defined(__amd64) \
135 || defined(__i386__) || defined(__x86__) || defined(__X86__) || defined(_M_IX86) || defined(__i386)
136# define strcache2_get_unaligned_16bits(ptr) ( *((const uint16_t *)(ptr)))
137# else
138 /* (endian doesn't matter) */
139# define strcache2_get_unaligned_16bits(ptr) ( (((const uint8_t *)(ptr))[0] << 8) \
140 | (((const uint8_t *)(ptr))[1]) )
141# endif
142 uint32_t hash = len;
143 uint32_t tmp;
144 int rem;
145
146 assert (sizeof (uint8_t) == sizeof (char));
147 if (len == 0)
148 return 0;
149
150 /* main loop, walking on 2 x uint16_t */
151 rem = len & 3;
152 len >>= 2;
153 while (len > 0)
154 {
155 hash += strcache2_get_unaligned_16bits (str);
156 tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
157 hash = (hash << 16) ^ tmp;
158 str += 2 * sizeof (uint16_t);
159 hash += hash >> 11;
160 len--;
161 }
162
163 /* the remainer */
164 switch (rem)
165 {
166 case 3:
167 hash += strcache2_get_unaligned_16bits (str);
168 hash ^= hash << 16;
169 hash ^= str[sizeof (uint16_t)] << 18;
170 hash += hash >> 11;
171 break;
172 case 2:
173 hash += strcache2_get_unaligned_16bits (str);
174 hash ^= hash << 11;
175 hash += hash >> 17;
176 break;
177 case 1:
178 hash += *str;
179 hash ^= hash << 10;
180 hash += hash >> 1;
181 break;
182 }
183
184 /* force "avalanching" of final 127 bits. */
185 hash ^= hash << 3;
186 hash += hash >> 5;
187 hash ^= hash << 4;
188 hash += hash >> 17;
189 hash ^= hash << 25;
190 hash += hash >> 6;
191
192 return hash;
193
194#elif 1
195 /* Note! This implementation is 18% faster than return_STRING_N_HASH_1
196 when running the two 100 times over typical kBuild strings that
197 end up here (did a fprintf here and built kBuild).
198 Compiler was 32-bit gcc 4.0.1, darwin, with -O2. */
199
200 unsigned int hash = 0;
201 if (MY_PREDICT_TRUE(len >= 2))
202 {
203 unsigned int ch0 = *str++;
204 hash = 0;
205 len--;
206 while (len >= 2)
207 {
208 unsigned int ch1 = *str++;
209 hash += ch0 << (ch1 & 0xf);
210
211 ch0 = *str++;
212 hash += ch1 << (ch0 & 0xf);
213
214 len -= 2;
215 }
216 if (len == 1)
217 {
218 unsigned ch1 = *str;
219 hash += ch0 << (ch1 & 0xf);
220
221 hash += ch1;
222 }
223 else
224 hash += ch0;
225 }
226 else if (len)
227 {
228 hash = *str;
229 hash += hash << (hash & 0xf);
230 }
231 else
232 hash = 0;
233 return hash;
234
235#elif 1
236# if 0
237 /* This is SDBM. This specific form/unroll was benchmarked to be 28% faster
238 than return_STRING_N_HASH_1. (Now the weird thing is that putting the (ch)
239 first in the assignment made it noticably slower.)
240
241 However, it is noticably slower in practice, most likely because of more
242 collisions. Hrmpf. */
243
244# define UPDATE_HASH(ch) hash = (hash << 6) + (hash << 16) - hash + (ch)
245 unsigned int hash = 0;
246
247# else
248 /* This is DJB2. This specific form/unroll was benchmarked to be 27%
249 fast than return_STRING_N_HASH_1.
250
251 Ditto. */
252
253# define UPDATE_HASH(ch) hash = (hash << 5) + hash + (ch)
254 unsigned int hash = 5381;
255# endif
256
257
258 while (len >= 4)
259 {
260 UPDATE_HASH (str[0]);
261 UPDATE_HASH (str[1]);
262 UPDATE_HASH (str[2]);
263 UPDATE_HASH (str[3]);
264 str += 4;
265 len -= 4;
266 }
267 switch (len)
268 {
269 default:
270 case 0:
271 return hash;
272 case 1:
273 UPDATE_HASH (str[0]);
274 return hash;
275 case 2:
276 UPDATE_HASH (str[0]);
277 UPDATE_HASH (str[1]);
278 return hash;
279 case 3:
280 UPDATE_HASH (str[0]);
281 UPDATE_HASH (str[1]);
282 UPDATE_HASH (str[2]);
283 return hash;
284 }
285#endif
286}
287
288MY_INLINE unsigned int
289strcache2_case_insensitive_hash_1 (const char *str, unsigned int len)
290{
291 unsigned int hash = 0;
292 if (MY_PREDICT_TRUE(len >= 2))
293 {
294 unsigned int ch0 = *str++;
295 ch0 = tolower (ch0);
296 hash = 0;
297 len--;
298 while (len >= 2)
299 {
300 unsigned int ch1 = *str++;
301 ch1 = tolower (ch1);
302 hash += ch0 << (ch1 & 0xf);
303
304 ch0 = *str++;
305 ch0 = tolower (ch0);
306 hash += ch1 << (ch0 & 0xf);
307
308 len -= 2;
309 }
310 if (len == 1)
311 {
312 unsigned ch1 = *str;
313 ch1 = tolower (ch1);
314 hash += ch0 << (ch1 & 0xf);
315
316 hash += ch1;
317 }
318 else
319 hash += ch0;
320 }
321 else if (len)
322 {
323 hash = *str;
324 hash += hash << (hash & 0xf);
325 }
326 else
327 hash = 0;
328 return hash;
329}
330
331MY_INLINE int
332strcache2_memcmp_inline_short (const char *xs, const char *ys, unsigned int length)
333{
334 if (length <= 8)
335 {
336 /* short string compare - ~50% of the kBuild calls. */
337 assert ( !((size_t)ys & 3) );
338 if (!((size_t)xs & 3))
339 {
340 /* aligned */
341 int result;
342 switch (length)
343 {
344 default: /* memcmp for longer strings */
345 return memcmp (xs, ys, length);
346 case 8:
347 result = *(int32_t*)(xs + 4) - *(int32_t*)(ys + 4);
348 result |= *(int32_t*)xs - *(int32_t*)ys;
349 return result;
350 case 7:
351 result = xs[6] - ys[6];
352 result |= xs[5] - ys[5];
353 result |= xs[4] - ys[4];
354 result |= *(int32_t*)xs - *(int32_t*)ys;
355 return result;
356 case 6:
357 result = xs[5] - ys[5];
358 result |= xs[4] - ys[4];
359 result |= *(int32_t*)xs - *(int32_t*)ys;
360 return result;
361 case 5:
362 result = xs[4] - ys[4];
363 result |= *(int32_t*)xs - *(int32_t*)ys;
364 return result;
365 case 4:
366 return *(int32_t*)xs - *(int32_t*)ys;
367 case 3:
368 result = xs[2] - ys[2];
369 result |= xs[1] - ys[1];
370 result |= xs[0] - ys[0];
371 return result;
372 case 2:
373 result = xs[1] - ys[1];
374 result |= xs[0] - ys[0];
375 return result;
376 case 1:
377 return *xs - *ys;
378 case 0:
379 return 0;
380 }
381 }
382 else
383 {
384 /* unaligned */
385 int result = 0;
386 switch (length)
387 {
388 case 8: result |= xs[7] - ys[7];
389 case 7: result |= xs[6] - ys[6];
390 case 6: result |= xs[5] - ys[5];
391 case 5: result |= xs[4] - ys[4];
392 case 4: result |= xs[3] - ys[3];
393 case 3: result |= xs[2] - ys[2];
394 case 2: result |= xs[1] - ys[1];
395 case 1: result |= xs[0] - ys[0];
396 case 0:
397 return result;
398 }
399 }
400 }
401
402 /* memcmp for longer strings */
403 return memcmp (xs, ys, length);
404}
405
406MY_INLINE int
407strcache2_memcmp_inlined (const char *xs, const char *ys, unsigned int length)
408{
409#ifndef ELECTRIC_HEAP
410 assert ( !((size_t)ys & 3) );
411#endif
412 if (!((size_t)xs & 3))
413 {
414 int result;
415 /* aligned */
416 while (length >= 8)
417 {
418 result = *(int32_t*)xs - *(int32_t*)ys;
419 result |= *(int32_t*)(xs + 4) - *(int32_t*)(ys + 4);
420 if (MY_PREDICT_FALSE(result))
421 return result;
422 xs += 8;
423 ys += 8;
424 length -= 8;
425 }
426 switch (length)
427 {
428 case 7:
429 result = *(int32_t*)xs - *(int32_t*)ys;
430 result |= xs[6] - ys[6];
431 result |= xs[5] - ys[5];
432 result |= xs[4] - ys[4];
433 return result;
434 case 6:
435 result = *(int32_t*)xs - *(int32_t*)ys;
436 result |= xs[5] - ys[5];
437 result |= xs[4] - ys[4];
438 return result;
439 case 5:
440 result = *(int32_t*)xs - *(int32_t*)ys;
441 result |= xs[4] - ys[4];
442 return result;
443 case 4:
444 return *(int32_t*)xs - *(int32_t*)ys;
445 case 3:
446 result = xs[2] - ys[2];
447 result |= xs[1] - ys[1];
448 result |= xs[0] - ys[0];
449 return result;
450 case 2:
451 result = xs[1] - ys[1];
452 result |= xs[0] - ys[0];
453 return result;
454 case 1:
455 return *xs - *ys;
456 default:
457 case 0:
458 return 0;
459 }
460 }
461 else
462 {
463 /* unaligned */
464 int result;
465 while (length >= 8)
466 {
467#if defined(__i386__) || defined(__x86_64__)
468 result = ( ((int32_t)xs[3] << 24)
469 | ((int32_t)xs[2] << 16)
470 | ((int32_t)xs[1] << 8)
471 | xs[0] )
472 - *(int32_t*)ys;
473 result |= ( ((int32_t)xs[7] << 24)
474 | ((int32_t)xs[6] << 16)
475 | ((int32_t)xs[5] << 8)
476 | xs[4] )
477 - *(int32_t*)(ys + 4);
478#else
479 result = xs[3] - ys[3];
480 result |= xs[2] - ys[2];
481 result |= xs[1] - ys[1];
482 result |= xs[0] - ys[0];
483 result |= xs[7] - ys[7];
484 result |= xs[6] - ys[6];
485 result |= xs[5] - ys[5];
486 result |= xs[4] - ys[4];
487#endif
488 if (MY_PREDICT_FALSE(result))
489 return result;
490 xs += 8;
491 ys += 8;
492 length -= 8;
493 }
494 result = 0;
495 switch (length)
496 {
497 case 7: result |= xs[6] - ys[6];
498 case 6: result |= xs[5] - ys[5];
499 case 5: result |= xs[4] - ys[4];
500 case 4: result |= xs[3] - ys[3];
501 case 3: result |= xs[2] - ys[2];
502 case 2: result |= xs[1] - ys[1];
503 case 1: result |= xs[0] - ys[0];
504 return result;
505 default:
506 case 0:
507 return 0;
508 }
509 }
510}
511
512MY_INLINE int
513strcache2_is_equal (struct strcache2 *cache, struct strcache2_entry const *entry,
514 const char *str, unsigned int length, unsigned int hash1)
515{
516 assert (!cache->case_insensitive);
517
518 /* the simple stuff first. */
519 if ( entry->hash1 != hash1
520 || entry->length != length)
521 return 0;
522
523#if 0
524 return memcmp (str, entry + 1, length) == 0;
525#elif 1
526 return strcache2_memcmp_inlined (str, (const char *)(entry + 1), length) == 0;
527#else
528 return strcache2_memcmp_inline_short (str, (const char *)(entry + 1), length) == 0;
529#endif
530}
531
532MY_INLINE int
533strcache2_is_iequal (struct strcache2 *cache, struct strcache2_entry const *entry,
534 const char *str, unsigned int length, unsigned int hash1)
535{
536 assert (cache->case_insensitive);
537
538 /* the simple stuff first. */
539 if ( entry->hash1 != hash1
540 || entry->length != length)
541 return 0;
542
543#if defined(_MSC_VER) || defined(__OS2__)
544 return _memicmp (entry + 1, str, length) == 0;
545#else
546 return strncasecmp ((const char *)(entry + 1), str, length) == 0;
547#endif
548}
549
550static void
551strcache2_rehash (struct strcache2 *cache)
552{
553 unsigned int src = cache->hash_size;
554 struct strcache2_entry **src_tab = cache->hash_tab;
555 struct strcache2_entry **dst_tab;
556#ifndef STRCACHE2_USE_MASK
557 unsigned int hash_shift;
558#endif
559
560 /* Allocate a new hash table twice the size of the current. */
561 cache->hash_size <<= 1;
562#ifdef STRCACHE2_USE_MASK
563 cache->hash_mask <<= 1;
564 cache->hash_mask |= 1;
565#else
566 for (hash_shift = 1; (1U << hash_shift) < cache->hash_size; hash_shift++)
567 /* nothing */;
568 cache->hash_div = strcache2_find_prime (hash_shift);
569#endif
570 cache->rehash_count <<= 1;
571 cache->hash_tab = dst_tab = (struct strcache2_entry **)
572 xmalloc (cache->hash_size * sizeof (struct strcache2_entry *));
573 memset (dst_tab, '\0', cache->hash_size * sizeof (struct strcache2_entry *));
574
575 /* Copy the entries from the old to the new hash table. */
576 cache->collision_count = 0;
577 while (src-- > 0)
578 {
579 struct strcache2_entry *entry = src_tab[src];
580 while (entry)
581 {
582 struct strcache2_entry *next = entry->next;
583 unsigned int dst = STRCACHE2_MOD_IT (cache, entry->hash1);
584 if ((entry->next = dst_tab[dst]) != 0)
585 cache->collision_count++;
586 dst_tab[dst] = entry;
587
588 entry = next;
589 }
590 }
591
592 /* That's it, just free the old table and we're done. */
593 free (src_tab);
594}
595
596static struct strcache2_seg *
597strcache2_new_seg (struct strcache2 *cache, unsigned int minlen)
598{
599 struct strcache2_seg *seg;
600 size_t size;
601 size_t off;
602
603 size = cache->def_seg_size;
604 if (size < (size_t)minlen + sizeof (struct strcache2_seg) + STRCACHE2_ENTRY_ALIGNMENT)
605 {
606 size = (size_t)minlen * 2;
607 size = (size + 0xfff) & ~(size_t)0xfff;
608 }
609
610 seg = xmalloc (size);
611 seg->start = (char *)(seg + 1);
612 seg->size = size - sizeof (struct strcache2_seg);
613 off = (size_t)seg->start & (STRCACHE2_ENTRY_ALIGNMENT - 1);
614 if (off)
615 {
616 seg->start += off;
617 seg->size -= off;
618 }
619 assert (seg->size > minlen);
620 seg->cursor = seg->start;
621 seg->avail = seg->size;
622
623 seg->next = cache->seg_head;
624 cache->seg_head = seg;
625
626 return seg;
627}
628
629/* Internal worker that enters a new string into the cache. */
630static const char *
631strcache2_enter_string (struct strcache2 *cache, unsigned int idx,
632 const char *str, unsigned int length,
633 unsigned int hash1, unsigned hash2)
634{
635 struct strcache2_entry *entry;
636 struct strcache2_seg *seg;
637 unsigned int size;
638 char *str_copy;
639
640 /* Allocate space for the string. */
641
642 size = length + 1 + sizeof (struct strcache2_entry);
643 size = (size + STRCACHE2_ENTRY_ALIGNMENT - 1) & ~(STRCACHE2_ENTRY_ALIGNMENT - 1U);
644
645 seg = cache->seg_head;
646 if (MY_PREDICT_FALSE(seg->avail < size))
647 {
648 do
649 seg = seg->next;
650 while (seg && seg->avail < size);
651 if (!seg)
652 seg = strcache2_new_seg (cache, size);
653 }
654
655 entry = (struct strcache2_entry *) seg->cursor;
656 assert ((size_t)entry & (STRCACHE2_ENTRY_ALIGNMENT - 1));
657 seg->cursor += size;
658 seg->avail -= size;
659
660 /* Setup the entry, copy the string and insert it into the hash table. */
661
662 entry->user = NULL;
663 entry->length = length;
664 entry->hash1 = hash1;
665 str_copy = (char *) memcpy (entry + 1, str, length);
666 str_copy[length] = '\0';
667
668 if ((entry->next = cache->hash_tab[idx]) != 0)
669 cache->collision_count++;
670 cache->hash_tab[idx] = entry;
671 cache->count++;
672 if (cache->count >= cache->rehash_count)
673 strcache2_rehash (cache);
674
675 return str_copy;
676}
677
678/* The public add string interface. */
679const char *
680strcache2_add (struct strcache2 *cache, const char *str, unsigned int length)
681{
682 struct strcache2_entry const *entry;
683 unsigned int hash1 = strcache2_case_sensitive_hash_1 (str, length);
684 unsigned int hash2 = 0;
685 unsigned int idx;
686
687 assert (!cache->case_insensitive);
688
689 cache->lookup_count++;
690
691 /* Lookup the entry in the hash table, hoping for an
692 early match. If not found, enter the string at IDX. */
693 idx = STRCACHE2_MOD_IT (cache, hash1);
694 entry = cache->hash_tab[idx];
695 if (!entry)
696 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
697 if (strcache2_is_equal (cache, entry, str, length, hash1))
698 return (const char *)(entry + 1);
699 cache->collision_1st_count++;
700
701 entry = entry->next;
702 if (!entry)
703 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
704 if (strcache2_is_equal (cache, entry, str, length, hash1))
705 return (const char *)(entry + 1);
706 cache->collision_2nd_count++;
707
708 /* (We've established hash2, so we can do a straight loop now.) */
709 for (;;)
710 {
711 entry = entry->next;
712 if (!entry)
713 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
714 if (strcache2_is_equal (cache, entry, str, length, hash1))
715 return (const char *)(entry + 1);
716 cache->collision_3rd_count++;
717 }
718 /* not reached */
719}
720
721/* The public add string interface for prehashed strings.
722 Use strcache2_hash_str to calculate the hash of a string. */
723const char *
724strcache2_add_hashed (struct strcache2 *cache, const char *str, unsigned int length,
725 unsigned int hash1, unsigned int hash2)
726{
727 struct strcache2_entry const *entry;
728 unsigned int idx;
729#ifndef NDEBUG
730 unsigned correct_hash;
731
732 assert (!cache->case_insensitive);
733 correct_hash = strcache2_case_sensitive_hash_1 (str, length);
734 MY_ASSERT_MSG (hash1 == correct_hash, ("%#x != %#x\n", hash1, correct_hash));
735 if (hash2)
736 {
737 correct_hash = strcache2_case_sensitive_hash_2 (str, length);
738 MY_ASSERT_MSG (hash2 == correct_hash, ("%#x != %#x\n", hash2, correct_hash));
739 }
740#endif /* NDEBUG */
741
742 cache->lookup_count++;
743
744 /* Lookup the entry in the hash table, hoping for an
745 early match. If not found, enter the string at IDX. */
746 idx = STRCACHE2_MOD_IT (cache, hash1);
747 entry = cache->hash_tab[idx];
748 if (!entry)
749 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
750 if (strcache2_is_equal (cache, entry, str, length, hash1))
751 return (const char *)(entry + 1);
752 cache->collision_1st_count++;
753
754 entry = entry->next;
755 if (!entry)
756 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
757 if (strcache2_is_equal (cache, entry, str, length, hash1))
758 return (const char *)(entry + 1);
759 cache->collision_2nd_count++;
760
761 /* (We've established hash2, so we can do a straight loop now.) */
762 for (;;)
763 {
764 entry = entry->next;
765 if (!entry)
766 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
767 if (strcache2_is_equal (cache, entry, str, length, hash1))
768 return (const char *)(entry + 1);
769 cache->collision_3rd_count++;
770 }
771 /* not reached */
772}
773
774/* The public lookup (case sensitive) string interface. */
775const char *
776strcache2_lookup (struct strcache2 *cache, const char *str, unsigned int length)
777{
778 struct strcache2_entry const *entry;
779 unsigned int hash1 = strcache2_case_sensitive_hash_1 (str, length);
780 unsigned int idx;
781
782 assert (!cache->case_insensitive);
783
784 cache->lookup_count++;
785
786 /* Lookup the entry in the hash table, hoping for an
787 early match. */
788 idx = STRCACHE2_MOD_IT (cache, hash1);
789 entry = cache->hash_tab[idx];
790 if (!entry)
791 return NULL;
792 if (strcache2_is_equal (cache, entry, str, length, hash1))
793 return (const char *)(entry + 1);
794 cache->collision_1st_count++;
795
796 entry = entry->next;
797 if (!entry)
798 return NULL;
799 if (strcache2_is_equal (cache, entry, str, length, hash1))
800 return (const char *)(entry + 1);
801 cache->collision_2nd_count++;
802
803 /* (We've established hash2, so we can do a straight loop now.) */
804 for (;;)
805 {
806 entry = entry->next;
807 if (!entry)
808 return NULL;
809 if (strcache2_is_equal (cache, entry, str, length, hash1))
810 return (const char *)(entry + 1);
811 cache->collision_3rd_count++;
812 }
813 /* not reached */
814}
815
816#if defined(HAVE_CASE_INSENSITIVE_FS)
817
818/* The public add string interface for case insensitive strings. */
819const char *
820strcache2_iadd (struct strcache2 *cache, const char *str, unsigned int length)
821{
822 struct strcache2_entry const *entry;
823 unsigned int hash1 = strcache2_case_insensitive_hash_1 (str, length);
824 unsigned int hash2 = 0;
825 unsigned int idx;
826
827 assert (!cache->case_insensitive);
828
829 cache->lookup_count++;
830
831 /* Lookup the entry in the hash table, hoping for an
832 early match. If not found, enter the string at IDX. */
833 idx = STRCACHE2_MOD_IT (cache, hash1);
834 entry = cache->hash_tab[idx];
835 if (!entry)
836 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
837 if (strcache2_is_equal (cache, entry, str, length, hash1))
838 return (const char *)(entry + 1);
839 cache->collision_1st_count++;
840
841 entry = entry->next;
842 if (!entry)
843 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
844 if (strcache2_is_equal (cache, entry, str, length, hash1))
845 return (const char *)(entry + 1);
846 cache->collision_2nd_count++;
847
848 /* (We've established hash2, so we can do a straight loop now.) */
849 for (;;)
850 {
851 entry = entry->next;
852 if (!entry)
853 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
854 if (strcache2_is_equal (cache, entry, str, length, hash1))
855 return (const char *)(entry + 1);
856 cache->collision_3rd_count++;
857 }
858 /* not reached */
859}
860
861/* The public add string interface for prehashed case insensitive strings.
862 Use strcache2_hash_istr to calculate the hash of a string. */
863const char *
864strcache2_iadd_hashed (struct strcache2 *cache, const char *str, unsigned int length,
865 unsigned int hash1, unsigned int hash2)
866{
867 struct strcache2_entry const *entry;
868 unsigned int idx;
869#ifndef NDEBUG
870 unsigned correct_hash;
871
872 assert (!cache->case_insensitive);
873 correct_hash = strcache2_case_insensitive_hash_1 (str, length);
874 MY_ASSERT_MSG (hash1 == correct_hash, ("%#x != %#x\n", hash1, correct_hash));
875 if (hash2)
876 {
877 correct_hash = strcache2_case_insensitive_hash_2 (str, length);
878 MY_ASSERT_MSG (hash2 == correct_hash, ("%#x != %#x\n", hash2, correct_hash));
879 }
880#endif /* NDEBUG */
881
882 cache->lookup_count++;
883
884 /* Lookup the entry in the hash table, hoping for an
885 early match. If not found, enter the string at IDX. */
886 idx = STRCACHE2_MOD_IT (cache, hash1);
887 entry = cache->hash_tab[idx];
888 if (!entry)
889 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
890 if (strcache2_is_equal (cache, entry, str, length, hash1))
891 return (const char *)(entry + 1);
892 cache->collision_1st_count++;
893
894 entry = entry->next;
895 if (!entry)
896 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
897 if (strcache2_is_equal (cache, entry, str, length, hash1))
898 return (const char *)(entry + 1);
899 cache->collision_2nd_count++;
900
901 /* (We've established hash2, so we can do a straight loop now.) */
902 for (;;)
903 {
904 entry = entry->next;
905 if (!entry)
906 return strcache2_enter_string (cache, idx, str, length, hash1, hash2);
907 if (strcache2_is_equal (cache, entry, str, length, hash1))
908 return (const char *)(entry + 1);
909 cache->collision_3rd_count++;
910 }
911 /* not reached */
912}
913
914/* The public lookup (case insensitive) string interface. */
915const char *
916strcache2_ilookup (struct strcache2 *cache, const char *str, unsigned int length)
917{
918 struct strcache2_entry const *entry;
919 unsigned int hash1 = strcache2_case_insensitive_hash_1 (str, length);
920 unsigned int idx;
921
922 assert (!cache->case_insensitive);
923
924 cache->lookup_count++;
925
926 /* Lookup the entry in the hash table, hoping for an
927 early match. */
928 idx = STRCACHE2_MOD_IT (cache, hash1);
929 entry = cache->hash_tab[idx];
930 if (!entry)
931 return NULL;
932 if (strcache2_is_equal (cache, entry, str, length, hash1))
933 return (const char *)(entry + 1);
934 cache->collision_1st_count++;
935
936 entry = entry->next;
937 if (!entry)
938 return NULL;
939 if (strcache2_is_equal (cache, entry, str, length, hash1))
940 return (const char *)(entry + 1);
941 cache->collision_2nd_count++;
942
943 /* (We've established hash2, so we can do a straight loop now.) */
944 for (;;)
945 {
946 entry = entry->next;
947 if (!entry)
948 return NULL;
949 if (strcache2_is_equal (cache, entry, str, length, hash1))
950 return (const char *)(entry + 1);
951 cache->collision_3rd_count++;
952 }
953 /* not reached */
954}
955
956#endif /* HAVE_CASE_INSENSITIVE_FS */
957
958/* Is the given string cached? returns 1 if it is, 0 if it isn't. */
959int
960strcache2_is_cached (struct strcache2 *cache, const char *str)
961{
962 /* Check mandatory alignment first. */
963 if (!((size_t)str & (sizeof (void *) - 1)))
964 {
965 /* Check the segment list and consider the question answered if the
966 string is within one of them. (Could check it more thoroughly...) */
967 struct strcache2_seg const *seg;
968 for (seg = cache->seg_head; seg; seg = seg->next)
969 if ((size_t)(str - seg->start) < seg->size)
970 return 1;
971 }
972
973 return 0;
974}
975
976
977/* Verify the integrity of the specified string, returning 0 if OK. */
978int
979strcache2_verify_entry (struct strcache2 *cache, const char *str)
980{
981 struct strcache2_entry const *entry;
982 unsigned hash;
983 const char *end;
984
985 if ((size_t)str & (sizeof (void *) - 1))
986 {
987 fprintf (stderr,
988 "strcache2[%s]: missaligned string %p\nstring: %s\n",
989 cache->name, (void *)str, str);
990 return -1;
991 }
992
993 entry = (struct strcache2_entry const *)str - 1;
994 end = memchr (str, '\0', entry->length);
995 if ((size_t)(end - str) == entry->length)
996 {
997 fprintf (stderr,
998 "strcache2[%s]: corrupt entry %p, length: %lu, expected %u;\nstring: %s\n",
999 cache->name, (void *)entry, (unsigned long)(end - str), entry->length, str);
1000 return -1;
1001 }
1002
1003 hash = cache->case_insensitive
1004 ? strcache2_case_insensitive_hash_1 (str, entry->length)
1005 : strcache2_case_sensitive_hash_1 (str, entry->length);
1006 if (hash != entry->hash1)
1007 {
1008 fprintf (stderr,
1009 "strcache2[%s]: corrupt entry %p, hash#1: %x, expected %x;\nstring: %s\n",
1010 cache->name, (void *)entry, hash, entry->hash1, str);
1011 return -1;
1012 }
1013
1014 return 0;
1015}
1016
1017
1018/* Calculates the case sensitive hash values of the string.
1019 The first hash is returned, the other is put at HASH2P. */
1020unsigned int strcache2_hash_str (const char *str, unsigned int length, unsigned int *hash2p)
1021{
1022 *hash2p = 1;
1023 return strcache2_case_sensitive_hash_1 (str, length);
1024}
1025
1026/* Calculates the case insensitive hash values of the string.
1027 The first hash is returned, the other is put at HASH2P. */
1028unsigned int strcache2_hash_istr (const char *str, unsigned int length, unsigned int *hash2p)
1029{
1030 *hash2p = 1;
1031 return strcache2_case_insensitive_hash_1 (str, length);
1032}
1033
1034
1035
1036/* Initalizes a new cache. */
1037void
1038strcache2_init (struct strcache2 *cache, const char *name, unsigned int size,
1039 unsigned int def_seg_size, int case_insensitive, int thread_safe)
1040{
1041 unsigned hash_shift;
1042 assert (!thread_safe);
1043
1044 /* calc the size as a power of two */
1045 if (!size)
1046 hash_shift = STRCACHE2_HASH_SHIFT;
1047 else
1048 {
1049 assert (size <= (~0U / 2 + 1));
1050 for (hash_shift = 8; (1U << hash_shift) < size; hash_shift++)
1051 /* nothing */;
1052 }
1053
1054 /* adjust the default segment size */
1055 if (!def_seg_size)
1056 def_seg_size = STRCACHE2_SEG_SIZE;
1057 else if (def_seg_size < sizeof (struct strcache2_seg) * 10)
1058 def_seg_size = sizeof (struct strcache2_seg) * 10;
1059 else if ((def_seg_size & 0xfff) < 0xf00)
1060 def_seg_size = ((def_seg_size + 0xfff) & ~0xfffU);
1061
1062
1063 /* init the structure. */
1064 cache->case_insensitive = case_insensitive;
1065#ifdef STRCACHE2_USE_MASK
1066 cache->hash_mask = (1U << hash_shift) - 1U;
1067#else
1068 cache->hash_div = strcache2_find_prime(hash_shift);
1069#endif
1070 cache->count = 0;
1071 cache->collision_count = 0;
1072 cache->lookup_count = 0;
1073 cache->collision_1st_count = 0;
1074 cache->collision_2nd_count = 0;
1075 cache->collision_3rd_count = 0;
1076 cache->rehash_count = (1U << hash_shift) / 4 * 3; /* rehash at 75% */
1077 cache->init_size = 1U << hash_shift;
1078 cache->hash_size = 1U << hash_shift;
1079 cache->def_seg_size = def_seg_size;
1080 cache->lock = NULL;
1081 cache->name = name;
1082
1083 /* allocate the hash table and first segment. */
1084 cache->hash_tab = (struct strcache2_entry **)
1085 xmalloc (cache->init_size * sizeof (struct strcache2_entry *));
1086 memset (cache->hash_tab, '\0', cache->init_size * sizeof (struct strcache2_entry *));
1087 strcache2_new_seg (cache, 0);
1088
1089 /* link it */
1090 cache->next = strcache_head;
1091 strcache_head = cache;
1092}
1093
1094
1095/* Terminates a string cache, freeing all memory and other
1096 associated resources. */
1097void
1098strcache2_term (struct strcache2 *cache)
1099{
1100 /* unlink it */
1101 if (strcache_head == cache)
1102 strcache_head = cache->next;
1103 else
1104 {
1105 struct strcache2 *prev = strcache_head;
1106 while (prev->next != cache)
1107 prev = prev->next;
1108 assert (prev);
1109 prev->next = cache->next;
1110 }
1111
1112 /* free the memory segments */
1113 do
1114 {
1115 void *free_it = cache->seg_head;
1116 cache->seg_head = cache->seg_head->next;
1117 free (free_it);
1118 }
1119 while (cache->seg_head);
1120
1121 /* free the hash and clear the structure. */
1122 free (cache->hash_tab);
1123 memset (cache, '\0', sizeof (struct strcache2));
1124}
1125
1126/* Print statistics a string cache. */
1127void
1128strcache2_print_stats (struct strcache2 *cache, const char *prefix)
1129{
1130 unsigned int seg_count = 0;
1131 unsigned long seg_total_bytes = 0;
1132 unsigned long seg_avg_bytes;
1133 unsigned long seg_avail_bytes = 0;
1134 unsigned long seg_max_bytes = 0;
1135 struct strcache2_seg *seg;
1136 unsigned int str_count = 0;
1137 unsigned long str_total_len = 0;
1138 unsigned int str_avg_len;
1139 unsigned int str_min_len = ~0U;
1140 unsigned int str_max_len = 0;
1141 unsigned int idx;
1142 unsigned int rehashes;
1143 unsigned int chain_depths[32];
1144
1145 printf (_("\n%s strcache2: %s\n"), prefix, cache->name);
1146
1147 /* Segment statistics. */
1148 for (seg = cache->seg_head; seg; seg = seg->next)
1149 {
1150 seg_count++;
1151 seg_total_bytes += seg->size;
1152 seg_avail_bytes += seg->avail;
1153 if (seg->size > seg_max_bytes)
1154 seg_max_bytes = seg->size;
1155 }
1156 seg_avg_bytes = seg_total_bytes / seg_count;
1157 printf (_("%s %u segments: total = %lu / max = %lu / avg = %lu / def = %u avail = %lu\n"),
1158 prefix, seg_count, seg_total_bytes, seg_max_bytes, seg_avg_bytes,
1159 cache->def_seg_size, seg_avail_bytes);
1160
1161 /* String statistics. */
1162 memset (chain_depths, '\0', sizeof (chain_depths));
1163 idx = cache->hash_size;
1164 while (idx-- > 0)
1165 {
1166 struct strcache2_entry const *entry = cache->hash_tab[idx];
1167 unsigned int depth = 0;
1168 for (; entry != 0; entry = entry->next, depth++)
1169 {
1170 unsigned int length = entry->length;
1171 str_total_len += length;
1172 if (length > str_max_len)
1173 str_max_len = length;
1174 if (length < str_min_len)
1175 str_min_len = length;
1176 str_count++;
1177 }
1178 chain_depths[depth >= 32 ? 31 : depth]++;
1179 }
1180 str_avg_len = cache->count ? str_total_len / cache->count : 0;
1181 printf (_("%s %u strings: total len = %lu / max = %u / avg = %u / min = %u\n"),
1182 prefix, cache->count, str_total_len, str_max_len, str_avg_len, str_min_len);
1183 if (str_count != cache->count)
1184 printf (_("%s string count mismatch! cache->count=%u, actual count is %u\n"), prefix,
1185 cache->count, str_count);
1186
1187 /* Hash statistics. */
1188 idx = cache->init_size;
1189 rehashes = 0;
1190 while (idx < cache->hash_size)
1191 {
1192 idx *= 2;
1193 rehashes++;
1194 }
1195
1196#ifdef STRCACHE2_USE_MASK
1197 printf (_("%s hash size = %u mask = %#x rehashed %u times lookups = %lu\n"),
1198 prefix, cache->hash_size, cache->hash_mask, rehashes, cache->lookup_count);
1199#else
1200 printf (_("%s hash size = %u div = %#x rehashed %u times lookups = %lu\n"),
1201 prefix, cache->hash_size, cache->hash_div, rehashes, cache->lookup_count);
1202#endif
1203 printf (_("%s hash collisions 1st = %lu (%u%%) 2nd = %lu (%u%%) 3rd = %lu (%u%%)\n"),
1204 prefix,
1205 cache->collision_1st_count, (unsigned int)((100.0 * cache->collision_1st_count) / cache->lookup_count),
1206 cache->collision_2nd_count, (unsigned int)((100.0 * cache->collision_2nd_count) / cache->lookup_count),
1207 cache->collision_3rd_count, (unsigned int)((100.0 * cache->collision_3rd_count) / cache->lookup_count));
1208 printf (_("%s hash insert collisions = %u (%u%%)\n"),
1209 prefix, cache->collision_count,(unsigned int)((100.0 * cache->collision_count) / cache->count));
1210 printf (_("%s %5u (%u%%) empty hash table slots\n"),
1211 prefix, chain_depths[0], (unsigned int)((100.0 * chain_depths[0]) / cache->hash_size));
1212 printf (_("%s %5u (%u%%) occupied hash table slots\n"),
1213 prefix, chain_depths[1], (unsigned int)((100.0 * chain_depths[1]) / cache->hash_size));
1214 for (idx = 2; idx < 32; idx++)
1215 {
1216 unsigned strs_at_this_depth = chain_depths[idx];
1217 unsigned i;
1218 for (i = idx + 1; i < 32; i++)
1219 strs_at_this_depth += chain_depths[i];
1220 if (strs_at_this_depth)
1221 printf (_("%s %5u (%2u%%) with %u string%s chained on; %5u (%2u%%) strings at depth %u.\n"),
1222 prefix, chain_depths[idx], (unsigned int)((100.0 * chain_depths[idx]) / (cache->count - cache->collision_count)),
1223 idx - 1, idx == 2 ? " " : "s",
1224 strs_at_this_depth, (unsigned int)((100.0 * strs_at_this_depth) / cache->count), idx - 1);
1225 }
1226}
1227
1228/* Print statistics for all string caches. */
1229void
1230strcache2_print_stats_all (const char *prefix)
1231{
1232 struct strcache2 *cur;
1233 for (cur = strcache_head; cur; cur = cur->next)
1234 strcache2_print_stats (cur, prefix);
1235}
1236
Note: See TracBrowser for help on using the repository browser.