source: trunk/src/kmk/strcache2.c@ 1920

Last change on this file since 1920 was 1920, checked in by bird, 17 years ago

strcache2: don't hash strings that aren't small (experimental).

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 37.9 KB
Line 
1/* $Id: strcache2.c 1920 2008-10-24 00:00:27Z bird $ */
2/** @file
3 * strcache2 - New string cache.
4 */
5
6/*
7 * Copyright (c) 2008 knut st. osmundsen <bird-src-spam@anduin.net>
8 *
9 * This file is part of kBuild.
10 *
11 * kBuild is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * kBuild is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with kBuild; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 *
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include "make.h"
32#include "strcache2.h"
33
34#include <assert.h>
35
36#include "debug.h"
37
38#ifdef _MSC_VER
39typedef unsigned char uint8_t;
40typedef unsigned short uint16_t;
41typedef unsigned int uint32_t;
42typedef signed char int8_t;
43typedef signed short int16_t;
44typedef signed int int32_t;
45#else
46# include <stdint.h>
47#endif
48
49#ifdef WINDOWS32
50# include <io.h>
51# include <process.h>
52# include <Windows.h>
53# define PARSE_IN_WORKER
54#endif
55
56#ifdef __OS2__
57# include <sys/fmutex.h>
58#endif
59
60#ifdef HAVE_PTHREAD
61# include <pthread.h>
62#endif
63
64
65/*******************************************************************************
66* Defined Constants And Macros *
67*******************************************************************************/
68/* The default size of a memory segment (1MB). */
69#define STRCACHE2_SEG_SIZE (1024U*1024U)
70/* The default hash table shift (hash size give as a power of two). */
71#define STRCACHE2_HASH_SHIFT 16
72/** Does the modding / masking of a hash number into an index. */
73#ifdef STRCACHE2_USE_MASK
74# define STRCACHE2_MOD_IT(cache, hash) ((hash) & (cache)->hash_mask)
75#else
76# define STRCACHE2_MOD_IT(cache, hash) ((hash) % (cache)->hash_div)
77#endif
78
79# if defined(__amd64__) || defined(__x86_64__) || defined(__AMD64__) || defined(_M_X64) || defined(__amd64) \
80 || defined(__i386__) || defined(__x86__) || defined(__X86__) || defined(_M_IX86) || defined(__i386)
81# define strcache2_get_unaligned_16bits(ptr) ( *((const uint16_t *)(ptr)))
82# else
83 /* (endian doesn't matter) */
84# define strcache2_get_unaligned_16bits(ptr) ( (((const uint8_t *)(ptr))[0] << 8) \
85 | (((const uint8_t *)(ptr))[1]) )
86# endif
87
88
89/*******************************************************************************
90* Global Variables *
91*******************************************************************************/
92/* List of initialized string caches. */
93static struct strcache2 *strcache_head;
94
95
96/** Finds the closest primary number for power of two value (or something else
97 * useful if not support). */
98MY_INLINE unsigned int strcache2_find_prime(unsigned int shift)
99{
100 switch (shift)
101 {
102 case 5: return 31;
103 case 6: return 61;
104 case 7: return 127;
105 case 8: return 251;
106 case 9: return 509;
107 case 10: return 1021;
108 case 11: return 2039;
109 case 12: return 4093;
110 case 13: return 8191;
111 case 14: return 16381;
112 case 15: return 32749;
113 case 16: return 65521;
114 case 17: return 131063;
115 case 18: return 262139;
116 case 19: return 524269;
117 case 20: return 1048573;
118 case 21: return 2097143;
119 case 22: return 4194301;
120 case 23: return 8388593;
121
122 default:
123 assert (0);
124 return (1 << shift) - 1;
125 }
126}
127
128/* The following is a bit experiment. It produces longer chains, i.e. worse
129 distribution of the strings in the table, however the actual make
130 performances is better (<time). The explanation is probably that the
131 collisions only really increase for entries that aren't looked up that
132 much and that it actually improoves the situation for those that is. Or
133 that we spend so much less time hashing that it makes up (and more) for
134 the pentalty we suffer from the longer chains and worse distribution.
135
136 XXX: Check how this works out with different path lengths. I suspect it
137 might depend on the length of PATH_ROOT and the depth of the files
138 in the project as well. If it does, this might make matters worse
139 for some and better for others which isn't very cool... */
140
141#define BIG_HASH_SIZE 32
142#define BIG_HASH_TAIL 12
143#define BIG_HASH_HEAD 16
144
145#ifdef BIG_HASH_SIZE
146/* long string: hash head and tail, drop the middle. */
147MY_INLINE unsigned int
148strcache2_case_sensitive_hash_big (const char *str, unsigned int len)
149{
150 uint32_t hash = len;
151 uint32_t tmp;
152 unsigned int head;
153
154 /* head BIG_HASH_HEAD bytes */
155 head = (BIG_HASH_HEAD >> 2);
156 while (head-- > 0)
157 {
158 hash += strcache2_get_unaligned_16bits (str);
159 tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
160 hash = (hash << 16) ^ tmp;
161 str += 2 * sizeof (uint16_t);
162 hash += hash >> 11;
163 }
164
165 /* tail BIG_HASH_TAIL bytes (minus the odd ones) */
166 str += (len - BIG_HASH_HEAD - BIG_HASH_TAIL) & ~3U;
167 head = (BIG_HASH_TAIL >> 2);
168 while (head-- > 0)
169 {
170 hash += strcache2_get_unaligned_16bits (str);
171 tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
172 hash = (hash << 16) ^ tmp;
173 str += 2 * sizeof (uint16_t);
174 hash += hash >> 11;
175 }
176
177 /* force "avalanching" of final 127 bits. */
178 hash ^= hash << 3;
179 hash += hash >> 5;
180 hash ^= hash << 4;
181 hash += hash >> 17;
182 hash ^= hash << 25;
183 hash += hash >> 6;
184
185 return hash;
186}
187#endif /* BIG_HASH_SIZE */
188
189MY_INLINE unsigned int
190strcache2_case_sensitive_hash (const char *str, unsigned int len)
191{
192#if 1
193 /* Paul Hsieh hash SuperFast function:
194 http://www.azillionmonkeys.com/qed/hash.html
195
196 This performs very good and as a sligtly better distribution than
197 STRING_N_HASH_1 on a typical kBuild run.
198
199 It is also 37% faster than return_STRING_N_HASH_1 when running the
200 two 100 times over typical kBuild strings that end up here (did a
201 fprintf here and built kBuild). Compiler was 32-bit gcc 4.0.1, darwin,
202 with -O2.
203
204 FIXME: A path for well aligned data should be added to speed up
205 execution on alignment sensitive systems. */
206 unsigned int rem;
207 uint32_t hash;
208 uint32_t tmp;
209
210 assert (sizeof (uint8_t) == sizeof (char));
211
212# ifdef BIG_HASH_SIZE
213 /* long string? */
214 if (len >= BIG_HASH_SIZE)
215 return strcache2_case_sensitive_hash_big (str, len);
216# endif
217
218 /* short string: main loop, walking on 2 x uint16_t */
219 hash = len;
220 rem = len & 3;
221 len >>= 2;
222 while (len > 0)
223 {
224 hash += strcache2_get_unaligned_16bits (str);
225 tmp = (strcache2_get_unaligned_16bits (str + 2) << 11) ^ hash;
226 hash = (hash << 16) ^ tmp;
227 str += 2 * sizeof (uint16_t);
228 hash += hash >> 11;
229 len--;
230 }
231
232 /* the remainder */
233 switch (rem)
234 {
235 case 3:
236 hash += strcache2_get_unaligned_16bits (str);
237 hash ^= hash << 16;
238 hash ^= str[sizeof (uint16_t)] << 18;
239 hash += hash >> 11;
240 break;
241 case 2:
242 hash += strcache2_get_unaligned_16bits (str);
243 hash ^= hash << 11;
244 hash += hash >> 17;
245 break;
246 case 1:
247 hash += *str;
248 hash ^= hash << 10;
249 hash += hash >> 1;
250 break;
251 }
252
253 /* force "avalanching" of final 127 bits. */
254 hash ^= hash << 3;
255 hash += hash >> 5;
256 hash ^= hash << 4;
257 hash += hash >> 17;
258 hash ^= hash << 25;
259 hash += hash >> 6;
260
261 return hash;
262
263#elif 1
264 /* Note! This implementation is 18% faster than return_STRING_N_HASH_1
265 when running the two 100 times over typical kBuild strings that
266 end up here (did a fprintf here and built kBuild).
267 Compiler was 32-bit gcc 4.0.1, darwin, with -O2. */
268
269 unsigned int hash = 0;
270 if (MY_PREDICT_TRUE(len >= 2))
271 {
272 unsigned int ch0 = *str++;
273 hash = 0;
274 len--;
275 while (len >= 2)
276 {
277 unsigned int ch1 = *str++;
278 hash += ch0 << (ch1 & 0xf);
279
280 ch0 = *str++;
281 hash += ch1 << (ch0 & 0xf);
282
283 len -= 2;
284 }
285 if (len == 1)
286 {
287 unsigned ch1 = *str;
288 hash += ch0 << (ch1 & 0xf);
289
290 hash += ch1;
291 }
292 else
293 hash += ch0;
294 }
295 else if (len)
296 {
297 hash = *str;
298 hash += hash << (hash & 0xf);
299 }
300 else
301 hash = 0;
302 return hash;
303
304#elif 1
305# if 0
306 /* This is SDBM. This specific form/unroll was benchmarked to be 28% faster
307 than return_STRING_N_HASH_1. (Now the weird thing is that putting the (ch)
308 first in the assignment made it noticably slower.)
309
310 However, it is noticably slower in practice, most likely because of more
311 collisions. Hrmpf. */
312
313# define UPDATE_HASH(ch) hash = (hash << 6) + (hash << 16) - hash + (ch)
314 unsigned int hash = 0;
315
316# else
317 /* This is DJB2. This specific form/unroll was benchmarked to be 27%
318 fast than return_STRING_N_HASH_1.
319
320 Ditto. */
321
322# define UPDATE_HASH(ch) hash = (hash << 5) + hash + (ch)
323 unsigned int hash = 5381;
324# endif
325
326
327 while (len >= 4)
328 {
329 UPDATE_HASH (str[0]);
330 UPDATE_HASH (str[1]);
331 UPDATE_HASH (str[2]);
332 UPDATE_HASH (str[3]);
333 str += 4;
334 len -= 4;
335 }
336 switch (len)
337 {
338 default:
339 case 0:
340 return hash;
341 case 1:
342 UPDATE_HASH (str[0]);
343 return hash;
344 case 2:
345 UPDATE_HASH (str[0]);
346 UPDATE_HASH (str[1]);
347 return hash;
348 case 3:
349 UPDATE_HASH (str[0]);
350 UPDATE_HASH (str[1]);
351 UPDATE_HASH (str[2]);
352 return hash;
353 }
354#endif
355}
356
357MY_INLINE unsigned int
358strcache2_case_insensitive_hash (const char *str, unsigned int len)
359{
360 unsigned int hash = 0;
361 if (MY_PREDICT_TRUE(len >= 2))
362 {
363 unsigned int ch0 = *str++;
364 ch0 = tolower (ch0);
365 hash = 0;
366 len--;
367 while (len >= 2)
368 {
369 unsigned int ch1 = *str++;
370 ch1 = tolower (ch1);
371 hash += ch0 << (ch1 & 0xf);
372
373 ch0 = *str++;
374 ch0 = tolower (ch0);
375 hash += ch1 << (ch0 & 0xf);
376
377 len -= 2;
378 }
379 if (len == 1)
380 {
381 unsigned ch1 = *str;
382 ch1 = tolower (ch1);
383 hash += ch0 << (ch1 & 0xf);
384
385 hash += ch1;
386 }
387 else
388 hash += ch0;
389 }
390 else if (len)
391 {
392 hash = *str;
393 hash += hash << (hash & 0xf);
394 }
395 else
396 hash = 0;
397 return hash;
398}
399
400MY_INLINE int
401strcache2_memcmp_inline_short (const char *xs, const char *ys, unsigned int length)
402{
403 if (length <= 8)
404 {
405 /* short string compare - ~50% of the kBuild calls. */
406 assert ( !((size_t)ys & 3) );
407 if (!((size_t)xs & 3))
408 {
409 /* aligned */
410 int result;
411 switch (length)
412 {
413 default: /* memcmp for longer strings */
414 return memcmp (xs, ys, length);
415 case 8:
416 result = *(int32_t*)(xs + 4) - *(int32_t*)(ys + 4);
417 result |= *(int32_t*)xs - *(int32_t*)ys;
418 return result;
419 case 7:
420 result = xs[6] - ys[6];
421 result |= xs[5] - ys[5];
422 result |= xs[4] - ys[4];
423 result |= *(int32_t*)xs - *(int32_t*)ys;
424 return result;
425 case 6:
426 result = xs[5] - ys[5];
427 result |= xs[4] - ys[4];
428 result |= *(int32_t*)xs - *(int32_t*)ys;
429 return result;
430 case 5:
431 result = xs[4] - ys[4];
432 result |= *(int32_t*)xs - *(int32_t*)ys;
433 return result;
434 case 4:
435 return *(int32_t*)xs - *(int32_t*)ys;
436 case 3:
437 result = xs[2] - ys[2];
438 result |= xs[1] - ys[1];
439 result |= xs[0] - ys[0];
440 return result;
441 case 2:
442 result = xs[1] - ys[1];
443 result |= xs[0] - ys[0];
444 return result;
445 case 1:
446 return *xs - *ys;
447 case 0:
448 return 0;
449 }
450 }
451 else
452 {
453 /* unaligned */
454 int result = 0;
455 switch (length)
456 {
457 case 8: result |= xs[7] - ys[7];
458 case 7: result |= xs[6] - ys[6];
459 case 6: result |= xs[5] - ys[5];
460 case 5: result |= xs[4] - ys[4];
461 case 4: result |= xs[3] - ys[3];
462 case 3: result |= xs[2] - ys[2];
463 case 2: result |= xs[1] - ys[1];
464 case 1: result |= xs[0] - ys[0];
465 case 0:
466 return result;
467 }
468 }
469 }
470
471 /* memcmp for longer strings */
472 return memcmp (xs, ys, length);
473}
474
475MY_INLINE int
476strcache2_memcmp_inlined (const char *xs, const char *ys, unsigned int length)
477{
478#ifndef ELECTRIC_HEAP
479 assert ( !((size_t)ys & 3) );
480#endif
481 if (!((size_t)xs & 3))
482 {
483 int result;
484 /* aligned */
485 while (length >= 8)
486 {
487 result = *(int32_t*)xs - *(int32_t*)ys;
488 result |= *(int32_t*)(xs + 4) - *(int32_t*)(ys + 4);
489 if (MY_PREDICT_FALSE(result))
490 return result;
491 xs += 8;
492 ys += 8;
493 length -= 8;
494 }
495 switch (length)
496 {
497 case 7:
498 result = *(int32_t*)xs - *(int32_t*)ys;
499 result |= xs[6] - ys[6];
500 result |= xs[5] - ys[5];
501 result |= xs[4] - ys[4];
502 return result;
503 case 6:
504 result = *(int32_t*)xs - *(int32_t*)ys;
505 result |= xs[5] - ys[5];
506 result |= xs[4] - ys[4];
507 return result;
508 case 5:
509 result = *(int32_t*)xs - *(int32_t*)ys;
510 result |= xs[4] - ys[4];
511 return result;
512 case 4:
513 return *(int32_t*)xs - *(int32_t*)ys;
514 case 3:
515 result = xs[2] - ys[2];
516 result |= xs[1] - ys[1];
517 result |= xs[0] - ys[0];
518 return result;
519 case 2:
520 result = xs[1] - ys[1];
521 result |= xs[0] - ys[0];
522 return result;
523 case 1:
524 return *xs - *ys;
525 default:
526 case 0:
527 return 0;
528 }
529 }
530 else
531 {
532 /* unaligned */
533 int result;
534 while (length >= 8)
535 {
536#if defined(__i386__) || defined(__x86_64__)
537 result = ( ((int32_t)xs[3] << 24)
538 | ((int32_t)xs[2] << 16)
539 | ((int32_t)xs[1] << 8)
540 | xs[0] )
541 - *(int32_t*)ys;
542 result |= ( ((int32_t)xs[7] << 24)
543 | ((int32_t)xs[6] << 16)
544 | ((int32_t)xs[5] << 8)
545 | xs[4] )
546 - *(int32_t*)(ys + 4);
547#else
548 result = xs[3] - ys[3];
549 result |= xs[2] - ys[2];
550 result |= xs[1] - ys[1];
551 result |= xs[0] - ys[0];
552 result |= xs[7] - ys[7];
553 result |= xs[6] - ys[6];
554 result |= xs[5] - ys[5];
555 result |= xs[4] - ys[4];
556#endif
557 if (MY_PREDICT_FALSE(result))
558 return result;
559 xs += 8;
560 ys += 8;
561 length -= 8;
562 }
563 result = 0;
564 switch (length)
565 {
566 case 7: result |= xs[6] - ys[6];
567 case 6: result |= xs[5] - ys[5];
568 case 5: result |= xs[4] - ys[4];
569 case 4: result |= xs[3] - ys[3];
570 case 3: result |= xs[2] - ys[2];
571 case 2: result |= xs[1] - ys[1];
572 case 1: result |= xs[0] - ys[0];
573 return result;
574 default:
575 case 0:
576 return 0;
577 }
578 }
579}
580
581MY_INLINE int
582strcache2_is_equal (struct strcache2 *cache, struct strcache2_entry const *entry,
583 const char *str, unsigned int length, unsigned int hash)
584{
585 assert (!cache->case_insensitive);
586
587 /* the simple stuff first. */
588 if ( entry->hash != hash
589 || entry->length != length)
590 return 0;
591
592#if 0
593 return memcmp (str, entry + 1, length) == 0;
594#elif 1
595 return strcache2_memcmp_inlined (str, (const char *)(entry + 1), length) == 0;
596#else
597 return strcache2_memcmp_inline_short (str, (const char *)(entry + 1), length) == 0;
598#endif
599}
600
601MY_INLINE int
602strcache2_is_iequal (struct strcache2 *cache, struct strcache2_entry const *entry,
603 const char *str, unsigned int length, unsigned int hash)
604{
605 assert (cache->case_insensitive);
606
607 /* the simple stuff first. */
608 if ( entry->hash != hash
609 || entry->length != length)
610 return 0;
611
612#if defined(_MSC_VER) || defined(__OS2__)
613 return _memicmp (entry + 1, str, length) == 0;
614#else
615 return strncasecmp ((const char *)(entry + 1), str, length) == 0;
616#endif
617}
618
619static void
620strcache2_rehash (struct strcache2 *cache)
621{
622 unsigned int src = cache->hash_size;
623 struct strcache2_entry **src_tab = cache->hash_tab;
624 struct strcache2_entry **dst_tab;
625#ifndef STRCACHE2_USE_MASK
626 unsigned int hash_shift;
627#endif
628
629 /* Allocate a new hash table twice the size of the current. */
630 cache->hash_size <<= 1;
631#ifdef STRCACHE2_USE_MASK
632 cache->hash_mask <<= 1;
633 cache->hash_mask |= 1;
634#else
635 for (hash_shift = 1; (1U << hash_shift) < cache->hash_size; hash_shift++)
636 /* nothing */;
637 cache->hash_div = strcache2_find_prime (hash_shift);
638#endif
639 cache->rehash_count <<= 1;
640 cache->hash_tab = dst_tab = (struct strcache2_entry **)
641 xmalloc (cache->hash_size * sizeof (struct strcache2_entry *));
642 memset (dst_tab, '\0', cache->hash_size * sizeof (struct strcache2_entry *));
643
644 /* Copy the entries from the old to the new hash table. */
645 cache->collision_count = 0;
646 while (src-- > 0)
647 {
648 struct strcache2_entry *entry = src_tab[src];
649 while (entry)
650 {
651 struct strcache2_entry *next = entry->next;
652 unsigned int dst = STRCACHE2_MOD_IT (cache, entry->hash);
653 if ((entry->next = dst_tab[dst]) != 0)
654 cache->collision_count++;
655 dst_tab[dst] = entry;
656
657 entry = next;
658 }
659 }
660
661 /* That's it, just free the old table and we're done. */
662 free (src_tab);
663}
664
665static struct strcache2_seg *
666strcache2_new_seg (struct strcache2 *cache, unsigned int minlen)
667{
668 struct strcache2_seg *seg;
669 size_t size;
670 size_t off;
671
672 size = cache->def_seg_size;
673 if (size < (size_t)minlen + sizeof (struct strcache2_seg) + STRCACHE2_ENTRY_ALIGNMENT)
674 {
675 size = (size_t)minlen * 2;
676 size = (size + 0xfff) & ~(size_t)0xfff;
677 }
678
679 seg = xmalloc (size);
680 seg->start = (char *)(seg + 1);
681 seg->size = size - sizeof (struct strcache2_seg);
682 off = (size_t)seg->start & (STRCACHE2_ENTRY_ALIGNMENT - 1);
683 if (off)
684 {
685 seg->start += off;
686 seg->size -= off;
687 }
688 assert (seg->size > minlen);
689 seg->cursor = seg->start;
690 seg->avail = seg->size;
691
692 seg->next = cache->seg_head;
693 cache->seg_head = seg;
694
695 return seg;
696}
697
698/* Internal worker that enters a new string into the cache. */
699static const char *
700strcache2_enter_string (struct strcache2 *cache, unsigned int idx,
701 const char *str, unsigned int length,
702 unsigned int hash)
703{
704 struct strcache2_entry *entry;
705 struct strcache2_seg *seg;
706 unsigned int size;
707 char *str_copy;
708
709 /* Allocate space for the string. */
710
711 size = length + 1 + sizeof (struct strcache2_entry);
712 size = (size + STRCACHE2_ENTRY_ALIGNMENT - 1) & ~(STRCACHE2_ENTRY_ALIGNMENT - 1U);
713
714 seg = cache->seg_head;
715 if (MY_PREDICT_FALSE(seg->avail < size))
716 {
717 do
718 seg = seg->next;
719 while (seg && seg->avail < size);
720 if (!seg)
721 seg = strcache2_new_seg (cache, size);
722 }
723
724 entry = (struct strcache2_entry *) seg->cursor;
725 assert ((size_t)entry & (STRCACHE2_ENTRY_ALIGNMENT - 1));
726 seg->cursor += size;
727 seg->avail -= size;
728
729 /* Setup the entry, copy the string and insert it into the hash table. */
730
731 entry->user = NULL;
732 entry->length = length;
733 entry->hash = hash;
734 str_copy = (char *) memcpy (entry + 1, str, length);
735 str_copy[length] = '\0';
736
737 if ((entry->next = cache->hash_tab[idx]) != 0)
738 cache->collision_count++;
739 cache->hash_tab[idx] = entry;
740 cache->count++;
741 if (cache->count >= cache->rehash_count)
742 strcache2_rehash (cache);
743
744 return str_copy;
745}
746
747/* The public add string interface. */
748const char *
749strcache2_add (struct strcache2 *cache, const char *str, unsigned int length)
750{
751 struct strcache2_entry const *entry;
752 unsigned int hash = strcache2_case_sensitive_hash (str, length);
753 unsigned int idx;
754
755 assert (!cache->case_insensitive);
756
757 MAKE_STATS (cache->lookup_count++);
758
759 /* Lookup the entry in the hash table, hoping for an
760 early match. If not found, enter the string at IDX. */
761 idx = STRCACHE2_MOD_IT (cache, hash);
762 entry = cache->hash_tab[idx];
763 if (!entry)
764 return strcache2_enter_string (cache, idx, str, length, hash);
765 if (strcache2_is_equal (cache, entry, str, length, hash))
766 return (const char *)(entry + 1);
767 MAKE_STATS (cache->collision_1st_count++);
768
769 entry = entry->next;
770 if (!entry)
771 return strcache2_enter_string (cache, idx, str, length, hash);
772 if (strcache2_is_equal (cache, entry, str, length, hash))
773 return (const char *)(entry + 1);
774 MAKE_STATS (cache->collision_2nd_count++);
775
776 /* Loop the rest. */
777 for (;;)
778 {
779 entry = entry->next;
780 if (!entry)
781 return strcache2_enter_string (cache, idx, str, length, hash);
782 if (strcache2_is_equal (cache, entry, str, length, hash))
783 return (const char *)(entry + 1);
784 MAKE_STATS (cache->collision_3rd_count++);
785 }
786 /* not reached */
787}
788
789/* The public add string interface for prehashed strings.
790 Use strcache2_hash_str to calculate the hash of a string. */
791const char *
792strcache2_add_hashed (struct strcache2 *cache, const char *str,
793 unsigned int length, unsigned int hash)
794{
795 struct strcache2_entry const *entry;
796 unsigned int idx;
797#ifndef NDEBUG
798 unsigned correct_hash;
799
800 assert (!cache->case_insensitive);
801 correct_hash = strcache2_case_sensitive_hash (str, length);
802 MY_ASSERT_MSG (hash == correct_hash, ("%#x != %#x\n", hash, correct_hash));
803#endif /* NDEBUG */
804
805 MAKE_STATS (cache->lookup_count++);
806
807 /* Lookup the entry in the hash table, hoping for an
808 early match. If not found, enter the string at IDX. */
809 idx = STRCACHE2_MOD_IT (cache, hash);
810 entry = cache->hash_tab[idx];
811 if (!entry)
812 return strcache2_enter_string (cache, idx, str, length, hash);
813 if (strcache2_is_equal (cache, entry, str, length, hash))
814 return (const char *)(entry + 1);
815 MAKE_STATS (cache->collision_1st_count++);
816
817 entry = entry->next;
818 if (!entry)
819 return strcache2_enter_string (cache, idx, str, length, hash);
820 if (strcache2_is_equal (cache, entry, str, length, hash))
821 return (const char *)(entry + 1);
822 MAKE_STATS (cache->collision_2nd_count++);
823
824 /* Loop the rest. */
825 for (;;)
826 {
827 entry = entry->next;
828 if (!entry)
829 return strcache2_enter_string (cache, idx, str, length, hash);
830 if (strcache2_is_equal (cache, entry, str, length, hash))
831 return (const char *)(entry + 1);
832 MAKE_STATS (cache->collision_3rd_count++);
833 }
834 /* not reached */
835}
836
837/* The public lookup (case sensitive) string interface. */
838const char *
839strcache2_lookup (struct strcache2 *cache, const char *str, unsigned int length)
840{
841 struct strcache2_entry const *entry;
842 unsigned int hash = strcache2_case_sensitive_hash (str, length);
843 unsigned int idx;
844
845 assert (!cache->case_insensitive);
846
847 MAKE_STATS (cache->lookup_count++);
848
849 /* Lookup the entry in the hash table, hoping for an
850 early match. */
851 idx = STRCACHE2_MOD_IT (cache, hash);
852 entry = cache->hash_tab[idx];
853 if (!entry)
854 return NULL;
855 if (strcache2_is_equal (cache, entry, str, length, hash))
856 return (const char *)(entry + 1);
857 MAKE_STATS (cache->collision_1st_count++);
858
859 entry = entry->next;
860 if (!entry)
861 return NULL;
862 if (strcache2_is_equal (cache, entry, str, length, hash))
863 return (const char *)(entry + 1);
864 MAKE_STATS (cache->collision_2nd_count++);
865
866 /* Loop the rest. */
867 for (;;)
868 {
869 entry = entry->next;
870 if (!entry)
871 return NULL;
872 if (strcache2_is_equal (cache, entry, str, length, hash))
873 return (const char *)(entry + 1);
874 MAKE_STATS (cache->collision_3rd_count++);
875 }
876 /* not reached */
877}
878
879#if defined(HAVE_CASE_INSENSITIVE_FS)
880
881/* The public add string interface for case insensitive strings. */
882const char *
883strcache2_iadd (struct strcache2 *cache, const char *str, unsigned int length)
884{
885 struct strcache2_entry const *entry;
886 unsigned int hash = strcache2_case_insensitive_hash (str, length);
887 unsigned int idx;
888
889 assert (!cache->case_insensitive);
890
891 MAKE_STATS (cache->lookup_count++);
892
893 /* Lookup the entry in the hash table, hoping for an
894 early match. If not found, enter the string at IDX. */
895 idx = STRCACHE2_MOD_IT (cache, hash);
896 entry = cache->hash_tab[idx];
897 if (!entry)
898 return strcache2_enter_string (cache, idx, str, length, hash);
899 if (strcache2_is_equal (cache, entry, str, length, hash))
900 return (const char *)(entry + 1);
901 MAKE_STATS (cache->collision_1st_count++);
902
903 entry = entry->next;
904 if (!entry)
905 return strcache2_enter_string (cache, idx, str, length, hash);
906 if (strcache2_is_equal (cache, entry, str, length, hash))
907 return (const char *)(entry + 1);
908 MAKE_STATS (cache->collision_2nd_count++);
909
910 /* Loop the rest. */
911 for (;;)
912 {
913 entry = entry->next;
914 if (!entry)
915 return strcache2_enter_string (cache, idx, str, length, hash);
916 if (strcache2_is_equal (cache, entry, str, length, hash))
917 return (const char *)(entry + 1);
918 MAKE_STATS (cache->collision_3rd_count++);
919 }
920 /* not reached */
921}
922
923/* The public add string interface for prehashed case insensitive strings.
924 Use strcache2_hash_istr to calculate the hash of a string. */
925const char *
926strcache2_iadd_hashed (struct strcache2 *cache, const char *str,
927 unsigned int length, unsigned int hash)
928{
929 struct strcache2_entry const *entry;
930 unsigned int idx;
931#ifndef NDEBUG
932 unsigned correct_hash;
933
934 assert (!cache->case_insensitive);
935 correct_hash = strcache2_case_insensitive_hash (str, length);
936 MY_ASSERT_MSG (hash == correct_hash, ("%#x != %#x\n", hash, correct_hash));
937#endif /* NDEBUG */
938
939 MAKE_STATS (cache->lookup_count++);
940
941 /* Lookup the entry in the hash table, hoping for an
942 early match. If not found, enter the string at IDX. */
943 idx = STRCACHE2_MOD_IT (cache, hash);
944 entry = cache->hash_tab[idx];
945 if (!entry)
946 return strcache2_enter_string (cache, idx, str, length, hash);
947 if (strcache2_is_equal (cache, entry, str, length, hash))
948 return (const char *)(entry + 1);
949 MAKE_STATS (cache->collision_1st_count++);
950
951 entry = entry->next;
952 if (!entry)
953 return strcache2_enter_string (cache, idx, str, length, hash);
954 if (strcache2_is_equal (cache, entry, str, length, hash))
955 return (const char *)(entry + 1);
956 MAKE_STATS (cache->collision_2nd_count++);
957
958 /* Loop the rest. */
959 for (;;)
960 {
961 entry = entry->next;
962 if (!entry)
963 return strcache2_enter_string (cache, idx, str, length, hash);
964 if (strcache2_is_equal (cache, entry, str, length, hash))
965 return (const char *)(entry + 1);
966 MAKE_STATS (cache->collision_3rd_count++);
967 }
968 /* not reached */
969}
970
971/* The public lookup (case insensitive) string interface. */
972const char *
973strcache2_ilookup (struct strcache2 *cache, const char *str, unsigned int length)
974{
975 struct strcache2_entry const *entry;
976 unsigned int hash = strcache2_case_insensitive_hash (str, length);
977 unsigned int idx;
978
979 assert (!cache->case_insensitive);
980
981 MAKE_STATS (cache->lookup_count++);
982
983 /* Lookup the entry in the hash table, hoping for an
984 early match. */
985 idx = STRCACHE2_MOD_IT (cache, hash);
986 entry = cache->hash_tab[idx];
987 if (!entry)
988 return NULL;
989 if (strcache2_is_equal (cache, entry, str, length, hash))
990 return (const char *)(entry + 1);
991 MAKE_STATS (cache->collision_1st_count++);
992
993 entry = entry->next;
994 if (!entry)
995 return NULL;
996 if (strcache2_is_equal (cache, entry, str, length, hash))
997 return (const char *)(entry + 1);
998 MAKE_STATS (cache->collision_2nd_count++);
999
1000 /* Loop the rest. */
1001 for (;;)
1002 {
1003 entry = entry->next;
1004 if (!entry)
1005 return NULL;
1006 if (strcache2_is_equal (cache, entry, str, length, hash))
1007 return (const char *)(entry + 1);
1008 MAKE_STATS (cache->collision_3rd_count++);
1009 }
1010 /* not reached */
1011}
1012
1013#endif /* HAVE_CASE_INSENSITIVE_FS */
1014
1015/* Is the given string cached? returns 1 if it is, 0 if it isn't. */
1016int
1017strcache2_is_cached (struct strcache2 *cache, const char *str)
1018{
1019 /* Check mandatory alignment first. */
1020 if (!((size_t)str & (sizeof (void *) - 1)))
1021 {
1022 /* Check the segment list and consider the question answered if the
1023 string is within one of them. (Could check it more thoroughly...) */
1024 struct strcache2_seg const *seg;
1025 for (seg = cache->seg_head; seg; seg = seg->next)
1026 if ((size_t)(str - seg->start) < seg->size)
1027 return 1;
1028 }
1029
1030 return 0;
1031}
1032
1033
1034/* Verify the integrity of the specified string, returning 0 if OK. */
1035int
1036strcache2_verify_entry (struct strcache2 *cache, const char *str)
1037{
1038 struct strcache2_entry const *entry;
1039 unsigned hash;
1040 const char *end;
1041
1042 if ((size_t)str & (sizeof (void *) - 1))
1043 {
1044 fprintf (stderr,
1045 "strcache2[%s]: missaligned string %p\nstring: %s\n",
1046 cache->name, (void *)str, str);
1047 return -1;
1048 }
1049
1050 entry = (struct strcache2_entry const *)str - 1;
1051 end = memchr (str, '\0', entry->length);
1052 if ((size_t)(end - str) == entry->length)
1053 {
1054 fprintf (stderr,
1055 "strcache2[%s]: corrupt entry %p, length: %lu, expected %u;\nstring: %s\n",
1056 cache->name, (void *)entry, (unsigned long)(end - str), entry->length, str);
1057 return -1;
1058 }
1059
1060 hash = cache->case_insensitive
1061 ? strcache2_case_insensitive_hash (str, entry->length)
1062 : strcache2_case_sensitive_hash (str, entry->length);
1063 if (hash != entry->hash)
1064 {
1065 fprintf (stderr,
1066 "strcache2[%s]: corrupt entry %p, hash#1: %x, expected %x;\nstring: %s\n",
1067 cache->name, (void *)entry, hash, entry->hash, str);
1068 return -1;
1069 }
1070
1071 return 0;
1072}
1073
1074
1075/* Calculates the case sensitive hash values of the string.
1076 The first hash is returned, the other is put at HASH2P. */
1077unsigned int strcache2_hash_str (const char *str, unsigned int length, unsigned int *hash2p)
1078{
1079 *hash2p = 1;
1080 return strcache2_case_sensitive_hash (str, length);
1081}
1082
1083/* Calculates the case insensitive hash values of the string.
1084 The first hash is returned, the other is put at HASH2P. */
1085unsigned int strcache2_hash_istr (const char *str, unsigned int length, unsigned int *hash2p)
1086{
1087 *hash2p = 1;
1088 return strcache2_case_insensitive_hash (str, length);
1089}
1090
1091
1092
1093/* Initalizes a new cache. */
1094void
1095strcache2_init (struct strcache2 *cache, const char *name, unsigned int size,
1096 unsigned int def_seg_size, int case_insensitive, int thread_safe)
1097{
1098 unsigned hash_shift;
1099 assert (!thread_safe);
1100
1101 /* calc the size as a power of two */
1102 if (!size)
1103 hash_shift = STRCACHE2_HASH_SHIFT;
1104 else
1105 {
1106 assert (size <= (~0U / 2 + 1));
1107 for (hash_shift = 8; (1U << hash_shift) < size; hash_shift++)
1108 /* nothing */;
1109 }
1110
1111 /* adjust the default segment size */
1112 if (!def_seg_size)
1113 def_seg_size = STRCACHE2_SEG_SIZE;
1114 else if (def_seg_size < sizeof (struct strcache2_seg) * 10)
1115 def_seg_size = sizeof (struct strcache2_seg) * 10;
1116 else if ((def_seg_size & 0xfff) < 0xf00)
1117 def_seg_size = ((def_seg_size + 0xfff) & ~0xfffU);
1118
1119
1120 /* init the structure. */
1121 cache->case_insensitive = case_insensitive;
1122#ifdef STRCACHE2_USE_MASK
1123 cache->hash_mask = (1U << hash_shift) - 1U;
1124#else
1125 cache->hash_div = strcache2_find_prime(hash_shift);
1126#endif
1127 cache->count = 0;
1128 cache->collision_count = 0;
1129 cache->lookup_count = 0;
1130 cache->collision_1st_count = 0;
1131 cache->collision_2nd_count = 0;
1132 cache->collision_3rd_count = 0;
1133 cache->rehash_count = (1U << hash_shift) / 4 * 3; /* rehash at 75% */
1134 cache->init_size = 1U << hash_shift;
1135 cache->hash_size = 1U << hash_shift;
1136 cache->def_seg_size = def_seg_size;
1137 cache->lock = NULL;
1138 cache->name = name;
1139
1140 /* allocate the hash table and first segment. */
1141 cache->hash_tab = (struct strcache2_entry **)
1142 xmalloc (cache->init_size * sizeof (struct strcache2_entry *));
1143 memset (cache->hash_tab, '\0', cache->init_size * sizeof (struct strcache2_entry *));
1144 strcache2_new_seg (cache, 0);
1145
1146 /* link it */
1147 cache->next = strcache_head;
1148 strcache_head = cache;
1149}
1150
1151
1152/* Terminates a string cache, freeing all memory and other
1153 associated resources. */
1154void
1155strcache2_term (struct strcache2 *cache)
1156{
1157 /* unlink it */
1158 if (strcache_head == cache)
1159 strcache_head = cache->next;
1160 else
1161 {
1162 struct strcache2 *prev = strcache_head;
1163 while (prev->next != cache)
1164 prev = prev->next;
1165 assert (prev);
1166 prev->next = cache->next;
1167 }
1168
1169 /* free the memory segments */
1170 do
1171 {
1172 void *free_it = cache->seg_head;
1173 cache->seg_head = cache->seg_head->next;
1174 free (free_it);
1175 }
1176 while (cache->seg_head);
1177
1178 /* free the hash and clear the structure. */
1179 free (cache->hash_tab);
1180 memset (cache, '\0', sizeof (struct strcache2));
1181}
1182
1183/* Print statistics a string cache. */
1184void
1185strcache2_print_stats (struct strcache2 *cache, const char *prefix)
1186{
1187 unsigned int seg_count = 0;
1188 unsigned long seg_total_bytes = 0;
1189 unsigned long seg_avg_bytes;
1190 unsigned long seg_avail_bytes = 0;
1191 unsigned long seg_max_bytes = 0;
1192 struct strcache2_seg *seg;
1193 unsigned int str_count = 0;
1194 unsigned long str_total_len = 0;
1195 unsigned int str_avg_len;
1196 unsigned int str_min_len = ~0U;
1197 unsigned int str_max_len = 0;
1198 unsigned int idx;
1199 unsigned int rehashes;
1200 unsigned int chain_depths[32];
1201
1202 printf (_("\n%s strcache2: %s\n"), prefix, cache->name);
1203
1204 /* Segment statistics. */
1205 for (seg = cache->seg_head; seg; seg = seg->next)
1206 {
1207 seg_count++;
1208 seg_total_bytes += seg->size;
1209 seg_avail_bytes += seg->avail;
1210 if (seg->size > seg_max_bytes)
1211 seg_max_bytes = seg->size;
1212 }
1213 seg_avg_bytes = seg_total_bytes / seg_count;
1214 printf (_("%s %u segments: total = %lu / max = %lu / avg = %lu / def = %u avail = %lu\n"),
1215 prefix, seg_count, seg_total_bytes, seg_max_bytes, seg_avg_bytes,
1216 cache->def_seg_size, seg_avail_bytes);
1217
1218 /* String statistics. */
1219 memset (chain_depths, '\0', sizeof (chain_depths));
1220 idx = cache->hash_size;
1221 while (idx-- > 0)
1222 {
1223 struct strcache2_entry const *entry = cache->hash_tab[idx];
1224 unsigned int depth = 0;
1225 for (; entry != 0; entry = entry->next, depth++)
1226 {
1227 unsigned int length = entry->length;
1228 str_total_len += length;
1229 if (length > str_max_len)
1230 str_max_len = length;
1231 if (length < str_min_len)
1232 str_min_len = length;
1233 str_count++;
1234 }
1235 chain_depths[depth >= 32 ? 31 : depth]++;
1236 }
1237 str_avg_len = cache->count ? str_total_len / cache->count : 0;
1238 printf (_("%s %u strings: total len = %lu / max = %u / avg = %u / min = %u\n"),
1239 prefix, cache->count, str_total_len, str_max_len, str_avg_len, str_min_len);
1240 if (str_count != cache->count)
1241 printf (_("%s string count mismatch! cache->count=%u, actual count is %u\n"), prefix,
1242 cache->count, str_count);
1243
1244 /* Hash statistics. */
1245 idx = cache->init_size;
1246 rehashes = 0;
1247 while (idx < cache->hash_size)
1248 {
1249 idx *= 2;
1250 rehashes++;
1251 }
1252
1253#ifdef STRCACHE2_USE_MASK
1254 printf (_("%s hash size = %u mask = %#x rehashed %u times lookups = %lu\n"),
1255 prefix, cache->hash_size, cache->hash_mask, rehashes, cache->lookup_count);
1256#else
1257 printf (_("%s hash size = %u div = %#x rehashed %u times lookups = %lu\n"),
1258 prefix, cache->hash_size, cache->hash_div, rehashes, cache->lookup_count);
1259#endif
1260 if (cache->lookup_count)
1261 printf (_("%s hash collisions 1st = %lu (%u%%) 2nd = %lu (%u%%) 3rd = %lu (%u%%)\n"),
1262 prefix,
1263 cache->collision_1st_count, (unsigned int)((100.0 * cache->collision_1st_count) / cache->lookup_count),
1264 cache->collision_2nd_count, (unsigned int)((100.0 * cache->collision_2nd_count) / cache->lookup_count),
1265 cache->collision_3rd_count, (unsigned int)((100.0 * cache->collision_3rd_count) / cache->lookup_count));
1266 printf (_("%s hash insert collisions = %u (%u%%)\n"),
1267 prefix, cache->collision_count,(unsigned int)((100.0 * cache->collision_count) / cache->count));
1268 printf (_("%s %5u (%u%%) empty hash table slots\n"),
1269 prefix, chain_depths[0], (unsigned int)((100.0 * chain_depths[0]) / cache->hash_size));
1270 printf (_("%s %5u (%u%%) occupied hash table slots\n"),
1271 prefix, chain_depths[1], (unsigned int)((100.0 * chain_depths[1]) / cache->hash_size));
1272 for (idx = 2; idx < 32; idx++)
1273 {
1274 unsigned strs_at_this_depth = chain_depths[idx];
1275 unsigned i;
1276 for (i = idx + 1; i < 32; i++)
1277 strs_at_this_depth += chain_depths[i];
1278 if (strs_at_this_depth)
1279 printf (_("%s %5u (%2u%%) with %u string%s chained on; %5u (%2u%%) strings at depth %u.\n"),
1280 prefix, chain_depths[idx], (unsigned int)((100.0 * chain_depths[idx]) / (cache->count - cache->collision_count)),
1281 idx - 1, idx == 2 ? " " : "s",
1282 strs_at_this_depth, (unsigned int)((100.0 * strs_at_this_depth) / cache->count), idx - 1);
1283 }
1284}
1285
1286/* Print statistics for all string caches. */
1287void
1288strcache2_print_stats_all (const char *prefix)
1289{
1290 struct strcache2 *cur;
1291 for (cur = strcache_head; cur; cur = cur->next)
1292 strcache2_print_stats (cur, prefix);
1293}
1294
Note: See TracBrowser for help on using the repository browser.