| 1 | /* Hash tables.
|
|---|
| 2 | Copyright (C) 2000, 2001 Free Software Foundation, Inc.
|
|---|
| 3 |
|
|---|
| 4 | This file is part of GNU Wget.
|
|---|
| 5 |
|
|---|
| 6 | GNU Wget is free software; you can redistribute it and/or modify
|
|---|
| 7 | it under the terms of the GNU General Public License as published by
|
|---|
| 8 | the Free Software Foundation; either version 2 of the License, or (at
|
|---|
| 9 | your option) any later version.
|
|---|
| 10 |
|
|---|
| 11 | GNU Wget is distributed in the hope that it will be useful,
|
|---|
| 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|---|
| 14 | GNU General Public License for more details.
|
|---|
| 15 |
|
|---|
| 16 | You should have received a copy of the GNU General Public License
|
|---|
| 17 | along with Wget; if not, write to the Free Software
|
|---|
| 18 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|---|
| 19 |
|
|---|
| 20 | In addition, as a special exception, the Free Software Foundation
|
|---|
| 21 | gives permission to link the code of its release of Wget with the
|
|---|
| 22 | OpenSSL project's "OpenSSL" library (or with modified versions of it
|
|---|
| 23 | that use the same license as the "OpenSSL" library), and distribute
|
|---|
| 24 | the linked executables. You must obey the GNU General Public License
|
|---|
| 25 | in all respects for all of the code used other than "OpenSSL". If you
|
|---|
| 26 | modify this file, you may extend this exception to your version of the
|
|---|
| 27 | file, but you are not obligated to do so. If you do not wish to do
|
|---|
| 28 | so, delete this exception statement from your version. */
|
|---|
| 29 |
|
|---|
| 30 | /* With -DSTANDALONE, this file can be compiled outside Wget source
|
|---|
| 31 | tree. To test, also use -DTEST. */
|
|---|
| 32 |
|
|---|
| 33 | #ifdef HAVE_CONFIG_H
|
|---|
| 34 | # include <config.h>
|
|---|
| 35 | # ifdef HAVE_STRING_H
|
|---|
| 36 | # include <string.h>
|
|---|
| 37 | # else
|
|---|
| 38 | # include <strings.h>
|
|---|
| 39 | # endif
|
|---|
| 40 | # ifdef HAVE_LIMITS_H
|
|---|
| 41 | # include <limits.h>
|
|---|
| 42 | # endif
|
|---|
| 43 | #else
|
|---|
| 44 | /* If running without Autoconf, go ahead and assume presence of
|
|---|
| 45 | standard C89 headers. */
|
|---|
| 46 | # include <string.h>
|
|---|
| 47 | # include <limits.h>
|
|---|
| 48 | #endif
|
|---|
| 49 |
|
|---|
| 50 | #include <stdio.h>
|
|---|
| 51 | #include <stdlib.h>
|
|---|
| 52 | #include <assert.h>
|
|---|
| 53 |
|
|---|
| 54 | #ifndef STANDALONE
|
|---|
| 55 | /* Get Wget's utility headers. */
|
|---|
| 56 | # include "wget.h"
|
|---|
| 57 | # include "utils.h"
|
|---|
| 58 | #else
|
|---|
| 59 | /* Make do without them. */
|
|---|
| 60 | # define xnew(x) xmalloc (sizeof (x))
|
|---|
| 61 | # define xnew_array(type, x) xmalloc (sizeof (type) * (x))
|
|---|
| 62 | # define xmalloc malloc /* or something that exits
|
|---|
| 63 | if not enough memory */
|
|---|
| 64 | # define xfree free
|
|---|
| 65 | # define countof(x) (sizeof (x) / sizeof ((x)[0]))
|
|---|
| 66 | # define TOLOWER(x) ('A' <= (x) && (x) <= 'Z' ? (x) - 32 : (x))
|
|---|
| 67 | # define PARAMS(x) x
|
|---|
| 68 | #endif
|
|---|
| 69 |
|
|---|
| 70 | #include "hash.h"
|
|---|
| 71 |
|
|---|
| 72 | /* INTERFACE:
|
|---|
| 73 |
|
|---|
| 74 | Hash tables are a technique used to implement mapping between
|
|---|
| 75 | objects with near-constant-time access and storage. The table
|
|---|
| 76 | associates keys to values, and a value can be very quickly
|
|---|
| 77 | retrieved by providing the key. Fast lookup tables are typically
|
|---|
| 78 | implemented as hash tables.
|
|---|
| 79 |
|
|---|
| 80 | The entry points are
|
|---|
| 81 | hash_table_new -- creates the table.
|
|---|
| 82 | hash_table_destroy -- destroys the table.
|
|---|
| 83 | hash_table_put -- establishes or updates key->value mapping.
|
|---|
| 84 | hash_table_get -- retrieves value of key.
|
|---|
| 85 | hash_table_get_pair -- get key/value pair for key.
|
|---|
| 86 | hash_table_contains -- test whether the table contains key.
|
|---|
| 87 | hash_table_remove -- remove the key->value mapping for key.
|
|---|
| 88 | hash_table_map -- iterate through table mappings.
|
|---|
| 89 | hash_table_clear -- clear hash table contents.
|
|---|
| 90 | hash_table_count -- return the number of entries in the table.
|
|---|
| 91 |
|
|---|
| 92 | The hash table grows internally as new entries are added and is not
|
|---|
| 93 | limited in size, except by available memory. The table doubles
|
|---|
| 94 | with each resize, which ensures that the amortized time per
|
|---|
| 95 | operation remains constant.
|
|---|
| 96 |
|
|---|
| 97 | By default, tables created by hash_table_new consider the keys to
|
|---|
| 98 | be equal if their pointer values are the same. You can use
|
|---|
| 99 | make_string_hash_table to create tables whose keys are considered
|
|---|
| 100 | equal if their string contents are the same. In the general case,
|
|---|
| 101 | the criterion of equality used to compare keys is specified at
|
|---|
| 102 | table creation time with two callback functions, "hash" and "test".
|
|---|
| 103 | The hash function transforms the key into an arbitrary number that
|
|---|
| 104 | must be the same for two equal keys. The test function accepts two
|
|---|
| 105 | keys and returns non-zero if they are to be considered equal.
|
|---|
| 106 |
|
|---|
| 107 | Note that neither keys nor values are copied when inserted into the
|
|---|
| 108 | hash table, so they must exist for the lifetime of the table. This
|
|---|
| 109 | means that e.g. the use of static strings is OK, but objects with a
|
|---|
| 110 | shorter life-time need to be copied (with strdup() or the like in
|
|---|
| 111 | the case of strings) before being inserted. */
|
|---|
| 112 |
|
|---|
| 113 | /* IMPLEMENTATION:
|
|---|
| 114 |
|
|---|
| 115 | The hash table is implemented as an open-addressed table with
|
|---|
| 116 | linear probing collision resolution.
|
|---|
| 117 |
|
|---|
| 118 | The above means that all the hash entries (pairs of pointers, key
|
|---|
| 119 | and value) are stored in a contiguous array. The position of each
|
|---|
| 120 | mapping is determined by the hash value of its key and the size of
|
|---|
| 121 | the table: location := hash(key) % size. If two different keys end
|
|---|
| 122 | up on the same position (collide), the one that came second is
|
|---|
| 123 | placed at the next empty position following the occupied place.
|
|---|
| 124 | This collision resolution technique is called "linear probing".
|
|---|
| 125 |
|
|---|
| 126 | There are more advanced collision resolution methods (quadratic
|
|---|
| 127 | probing, double hashing), but we don't use them because they incur
|
|---|
| 128 | more non-sequential access to the array, which results in worse CPU
|
|---|
| 129 | cache behavior. Linear probing works well as long as the
|
|---|
| 130 | count/size ratio (fullness) is kept below 75%. We make sure to
|
|---|
| 131 | grow and rehash the table whenever this threshold is exceeded.
|
|---|
| 132 |
|
|---|
| 133 | Collisions make deletion tricky because clearing a position
|
|---|
| 134 | followed by a colliding entry would make the position seem empty
|
|---|
| 135 | and the colliding entry not found. One solution is to leave a
|
|---|
| 136 | "tombstone" instead of clearing the entry, and another is to
|
|---|
| 137 | carefully rehash the entries immediately following the deleted one.
|
|---|
| 138 | We use the latter method because it results in less bookkeeping and
|
|---|
| 139 | faster retrieval at the (slight) expense of deletion. */
|
|---|
| 140 |
|
|---|
| 141 | /* Maximum allowed fullness: when hash table's fullness exceeds this
|
|---|
| 142 | value, the table is resized. */
|
|---|
| 143 | #define HASH_MAX_FULLNESS 0.75
|
|---|
| 144 |
|
|---|
| 145 | /* The hash table size is multiplied by this factor (and then rounded
|
|---|
| 146 | to the next prime) with each resize. This guarantees infrequent
|
|---|
| 147 | resizes. */
|
|---|
| 148 | #define HASH_RESIZE_FACTOR 2
|
|---|
| 149 |
|
|---|
| 150 | struct mapping {
|
|---|
| 151 | void *key;
|
|---|
| 152 | void *value;
|
|---|
| 153 | };
|
|---|
| 154 |
|
|---|
| 155 | typedef unsigned long (*hashfun_t) PARAMS ((const void *));
|
|---|
| 156 | typedef int (*testfun_t) PARAMS ((const void *, const void *));
|
|---|
| 157 |
|
|---|
| 158 | struct hash_table {
|
|---|
| 159 | hashfun_t hash_function;
|
|---|
| 160 | testfun_t test_function;
|
|---|
| 161 |
|
|---|
| 162 | struct mapping *mappings; /* pointer to the table entries. */
|
|---|
| 163 | int size; /* size of the array. */
|
|---|
| 164 |
|
|---|
| 165 | int count; /* number of non-empty entries. */
|
|---|
| 166 | int resize_threshold; /* after size exceeds this number of
|
|---|
| 167 | entries, resize the table. */
|
|---|
| 168 | int prime_offset; /* the offset of the current prime in
|
|---|
| 169 | the prime table. */
|
|---|
| 170 | };
|
|---|
| 171 |
|
|---|
| 172 | /* We use the all-bits-set constant (INVALID_PTR) marker to mean that
|
|---|
| 173 | a mapping is empty. It is unaligned and therefore illegal as a
|
|---|
| 174 | pointer. INVALID_PTR_BYTE (0xff) is the one-byte value used to
|
|---|
| 175 | initialize the mappings array as empty.
|
|---|
| 176 |
|
|---|
| 177 | The all-bits-set value is a better choice than NULL because it
|
|---|
| 178 | allows the use of NULL/0 keys. Since the keys are either integers
|
|---|
| 179 | or pointers, the only key that cannot be used is the integer value
|
|---|
| 180 | -1. This is acceptable because it still allows the use of
|
|---|
| 181 | nonnegative integer keys. */
|
|---|
| 182 |
|
|---|
| 183 | #define INVALID_PTR ((void *) ~(unsigned long)0)
|
|---|
| 184 | #ifndef UCHAR_MAX
|
|---|
| 185 | # define UCHAR_MAX 0xff
|
|---|
| 186 | #endif
|
|---|
| 187 | #define INVALID_PTR_BYTE UCHAR_MAX
|
|---|
| 188 |
|
|---|
| 189 | #define NON_EMPTY(mp) ((mp)->key != INVALID_PTR)
|
|---|
| 190 | #define MARK_AS_EMPTY(mp) ((mp)->key = INVALID_PTR)
|
|---|
| 191 |
|
|---|
| 192 | /* "Next" mapping is the mapping after MP, but wrapping back to
|
|---|
| 193 | MAPPINGS when MP would reach MAPPINGS+SIZE. */
|
|---|
| 194 | #define NEXT_MAPPING(mp, mappings, size) (mp != mappings + (size - 1) \
|
|---|
| 195 | ? mp + 1 : mappings)
|
|---|
| 196 |
|
|---|
| 197 | /* Loop over non-empty mappings starting at MP. */
|
|---|
| 198 | #define LOOP_NON_EMPTY(mp, mappings, size) \
|
|---|
| 199 | for (; NON_EMPTY (mp); mp = NEXT_MAPPING (mp, mappings, size))
|
|---|
| 200 |
|
|---|
| 201 | /* Return the position of KEY in hash table SIZE large, hash function
|
|---|
| 202 | being HASHFUN. */
|
|---|
| 203 | #define HASH_POSITION(key, hashfun, size) ((hashfun) (key) % size)
|
|---|
| 204 |
|
|---|
| 205 | /* Find a prime near, but greather than or equal to SIZE. The primes
|
|---|
| 206 | are looked up from a table with a selection of primes convenient
|
|---|
| 207 | for this purpose.
|
|---|
| 208 |
|
|---|
| 209 | PRIME_OFFSET is a minor optimization: it specifies start position
|
|---|
| 210 | for the search for the large enough prime. The final offset is
|
|---|
| 211 | stored in the same variable. That way the list of primes does not
|
|---|
| 212 | have to be scanned from the beginning each time around. */
|
|---|
| 213 |
|
|---|
| 214 | static int
|
|---|
| 215 | prime_size (int size, int *prime_offset)
|
|---|
| 216 | {
|
|---|
| 217 | static const int primes[] = {
|
|---|
| 218 | 13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
|
|---|
| 219 | 1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
|
|---|
| 220 | 19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
|
|---|
| 221 | 204047, 265271, 344857, 448321, 582821, 757693, 985003, 1280519,
|
|---|
| 222 | 1664681, 2164111, 2813353, 3657361, 4754591, 6180989, 8035301,
|
|---|
| 223 | 10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
|
|---|
| 224 | 50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
|
|---|
| 225 | 243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
|
|---|
| 226 | 1174703521, 1527114613, 1837299131, 2147483647
|
|---|
| 227 | };
|
|---|
| 228 | int i;
|
|---|
| 229 |
|
|---|
| 230 | for (i = *prime_offset; i < countof (primes); i++)
|
|---|
| 231 | if (primes[i] >= size)
|
|---|
| 232 | {
|
|---|
| 233 | /* Set the offset to the next prime. That is safe because,
|
|---|
| 234 | next time we are called, it will be with a larger SIZE,
|
|---|
| 235 | which means we could never return the same prime anyway.
|
|---|
| 236 | (If that is not the case, the caller can simply reset
|
|---|
| 237 | *prime_offset.) */
|
|---|
| 238 | *prime_offset = i + 1;
|
|---|
| 239 | return primes[i];
|
|---|
| 240 | }
|
|---|
| 241 |
|
|---|
| 242 | abort ();
|
|---|
| 243 | }
|
|---|
| 244 |
|
|---|
| 245 | static int cmp_pointer PARAMS ((const void *, const void *));
|
|---|
| 246 |
|
|---|
| 247 | /* Create a hash table with hash function HASH_FUNCTION and test
|
|---|
| 248 | function TEST_FUNCTION. The table is empty (its count is 0), but
|
|---|
| 249 | pre-allocated to store at least ITEMS items.
|
|---|
| 250 |
|
|---|
| 251 | ITEMS is the number of items that the table can accept without
|
|---|
| 252 | needing to resize. It is useful when creating a table that is to
|
|---|
| 253 | be immediately filled with a known number of items. In that case,
|
|---|
| 254 | the regrows are a waste of time, and specifying ITEMS correctly
|
|---|
| 255 | will avoid them altogether.
|
|---|
| 256 |
|
|---|
| 257 | Note that hash tables grow dynamically regardless of ITEMS. The
|
|---|
| 258 | only use of ITEMS is to preallocate the table and avoid unnecessary
|
|---|
| 259 | dynamic regrows. Don't bother making ITEMS prime because it's not
|
|---|
| 260 | used as size unchanged. To start with a small table that grows as
|
|---|
| 261 | needed, simply specify zero ITEMS.
|
|---|
| 262 |
|
|---|
| 263 | If hash and test callbacks are not specified, identity mapping is
|
|---|
| 264 | assumed, i.e. pointer values are used for key comparison. (Common
|
|---|
| 265 | Lisp calls such tables EQ hash tables, and Java calls them
|
|---|
| 266 | IdentityHashMaps.) If your keys require different comparison,
|
|---|
| 267 | specify hash and test functions. For easy use of C strings as hash
|
|---|
| 268 | keys, you can use the convenience functions make_string_hash_table
|
|---|
| 269 | and make_nocase_string_hash_table. */
|
|---|
| 270 |
|
|---|
| 271 | struct hash_table *
|
|---|
| 272 | hash_table_new (int items,
|
|---|
| 273 | unsigned long (*hash_function) (const void *),
|
|---|
| 274 | int (*test_function) (const void *, const void *))
|
|---|
| 275 | {
|
|---|
| 276 | int size;
|
|---|
| 277 | struct hash_table *ht = xnew (struct hash_table);
|
|---|
| 278 |
|
|---|
| 279 | ht->hash_function = hash_function ? hash_function : hash_pointer;
|
|---|
| 280 | ht->test_function = test_function ? test_function : cmp_pointer;
|
|---|
| 281 |
|
|---|
| 282 | /* If the size of struct hash_table ever becomes a concern, this
|
|---|
| 283 | field can go. (Wget doesn't create many hashes.) */
|
|---|
| 284 | ht->prime_offset = 0;
|
|---|
| 285 |
|
|---|
| 286 | /* Calculate the size that ensures that the table will store at
|
|---|
| 287 | least ITEMS keys without the need to resize. */
|
|---|
| 288 | size = 1 + items / HASH_MAX_FULLNESS;
|
|---|
| 289 | size = prime_size (size, &ht->prime_offset);
|
|---|
| 290 | ht->size = size;
|
|---|
| 291 | ht->resize_threshold = size * HASH_MAX_FULLNESS;
|
|---|
| 292 | /*assert (ht->resize_threshold >= items);*/
|
|---|
| 293 |
|
|---|
| 294 | ht->mappings = xnew_array (struct mapping, ht->size);
|
|---|
| 295 |
|
|---|
| 296 | /* Mark mappings as empty. We use 0xff rather than 0 to mark empty
|
|---|
| 297 | keys because it allows us to use NULL/0 as keys. */
|
|---|
| 298 | memset (ht->mappings, INVALID_PTR_BYTE, size * sizeof (struct mapping));
|
|---|
| 299 |
|
|---|
| 300 | ht->count = 0;
|
|---|
| 301 |
|
|---|
| 302 | return ht;
|
|---|
| 303 | }
|
|---|
| 304 |
|
|---|
| 305 | /* Free the data associated with hash table HT. */
|
|---|
| 306 |
|
|---|
| 307 | void
|
|---|
| 308 | hash_table_destroy (struct hash_table *ht)
|
|---|
| 309 | {
|
|---|
| 310 | xfree (ht->mappings);
|
|---|
| 311 | xfree (ht);
|
|---|
| 312 | }
|
|---|
| 313 |
|
|---|
| 314 | /* The heart of most functions in this file -- find the mapping whose
|
|---|
| 315 | KEY is equal to key, using linear probing. Returns the mapping
|
|---|
| 316 | that matches KEY, or the first empty mapping if none matches. */
|
|---|
| 317 |
|
|---|
| 318 | static inline struct mapping *
|
|---|
| 319 | find_mapping (const struct hash_table *ht, const void *key)
|
|---|
| 320 | {
|
|---|
| 321 | struct mapping *mappings = ht->mappings;
|
|---|
| 322 | int size = ht->size;
|
|---|
| 323 | struct mapping *mp = mappings + HASH_POSITION (key, ht->hash_function, size);
|
|---|
| 324 | testfun_t equals = ht->test_function;
|
|---|
| 325 |
|
|---|
| 326 | LOOP_NON_EMPTY (mp, mappings, size)
|
|---|
| 327 | if (equals (key, mp->key))
|
|---|
| 328 | break;
|
|---|
| 329 | return mp;
|
|---|
| 330 | }
|
|---|
| 331 |
|
|---|
| 332 | /* Get the value that corresponds to the key KEY in the hash table HT.
|
|---|
| 333 | If no value is found, return NULL. Note that NULL is a legal value
|
|---|
| 334 | for value; if you are storing NULLs in your hash table, you can use
|
|---|
| 335 | hash_table_contains to be sure that a (possibly NULL) value exists
|
|---|
| 336 | in the table. Or, you can use hash_table_get_pair instead of this
|
|---|
| 337 | function. */
|
|---|
| 338 |
|
|---|
| 339 | void *
|
|---|
| 340 | hash_table_get (const struct hash_table *ht, const void *key)
|
|---|
| 341 | {
|
|---|
| 342 | struct mapping *mp = find_mapping (ht, key);
|
|---|
| 343 | if (NON_EMPTY (mp))
|
|---|
| 344 | return mp->value;
|
|---|
| 345 | else
|
|---|
| 346 | return NULL;
|
|---|
| 347 | }
|
|---|
| 348 |
|
|---|
| 349 | /* Like hash_table_get, but writes out the pointers to both key and
|
|---|
| 350 | value. Returns non-zero on success. */
|
|---|
| 351 |
|
|---|
| 352 | int
|
|---|
| 353 | hash_table_get_pair (const struct hash_table *ht, const void *lookup_key,
|
|---|
| 354 | void *orig_key, void *value)
|
|---|
| 355 | {
|
|---|
| 356 | struct mapping *mp = find_mapping (ht, lookup_key);
|
|---|
| 357 | if (NON_EMPTY (mp))
|
|---|
| 358 | {
|
|---|
| 359 | if (orig_key)
|
|---|
| 360 | *(void **)orig_key = mp->key;
|
|---|
| 361 | if (value)
|
|---|
| 362 | *(void **)value = mp->value;
|
|---|
| 363 | return 1;
|
|---|
| 364 | }
|
|---|
| 365 | else
|
|---|
| 366 | return 0;
|
|---|
| 367 | }
|
|---|
| 368 |
|
|---|
| 369 | /* Return 1 if HT contains KEY, 0 otherwise. */
|
|---|
| 370 |
|
|---|
| 371 | int
|
|---|
| 372 | hash_table_contains (const struct hash_table *ht, const void *key)
|
|---|
| 373 | {
|
|---|
| 374 | struct mapping *mp = find_mapping (ht, key);
|
|---|
| 375 | return NON_EMPTY (mp);
|
|---|
| 376 | }
|
|---|
| 377 |
|
|---|
| 378 | /* Grow hash table HT as necessary, and rehash all the key-value
|
|---|
| 379 | mappings. */
|
|---|
| 380 |
|
|---|
| 381 | static void
|
|---|
| 382 | grow_hash_table (struct hash_table *ht)
|
|---|
| 383 | {
|
|---|
| 384 | hashfun_t hasher = ht->hash_function;
|
|---|
| 385 | struct mapping *old_mappings = ht->mappings;
|
|---|
| 386 | struct mapping *old_end = ht->mappings + ht->size;
|
|---|
| 387 | struct mapping *mp, *mappings;
|
|---|
| 388 | int newsize;
|
|---|
| 389 |
|
|---|
| 390 | newsize = prime_size (ht->size * HASH_RESIZE_FACTOR, &ht->prime_offset);
|
|---|
| 391 | #if 0
|
|---|
| 392 | printf ("growing from %d to %d; fullness %.2f%% to %.2f%%\n",
|
|---|
| 393 | ht->size, newsize,
|
|---|
| 394 | 100.0 * ht->count / ht->size,
|
|---|
| 395 | 100.0 * ht->count / newsize);
|
|---|
| 396 | #endif
|
|---|
| 397 |
|
|---|
| 398 | ht->size = newsize;
|
|---|
| 399 | ht->resize_threshold = newsize * HASH_MAX_FULLNESS;
|
|---|
| 400 |
|
|---|
| 401 | mappings = xnew_array (struct mapping, newsize);
|
|---|
| 402 | memset (mappings, INVALID_PTR_BYTE, newsize * sizeof (struct mapping));
|
|---|
| 403 | ht->mappings = mappings;
|
|---|
| 404 |
|
|---|
| 405 | for (mp = old_mappings; mp < old_end; mp++)
|
|---|
| 406 | if (NON_EMPTY (mp))
|
|---|
| 407 | {
|
|---|
| 408 | struct mapping *new_mp;
|
|---|
| 409 | /* We don't need to test for uniqueness of keys because they
|
|---|
| 410 | come from the hash table and are therefore known to be
|
|---|
| 411 | unique. */
|
|---|
| 412 | new_mp = mappings + HASH_POSITION (mp->key, hasher, newsize);
|
|---|
| 413 | LOOP_NON_EMPTY (new_mp, mappings, newsize)
|
|---|
| 414 | ;
|
|---|
| 415 | *new_mp = *mp;
|
|---|
| 416 | }
|
|---|
| 417 |
|
|---|
| 418 | xfree (old_mappings);
|
|---|
| 419 | }
|
|---|
| 420 |
|
|---|
| 421 | /* Put VALUE in the hash table HT under the key KEY. This regrows the
|
|---|
| 422 | table if necessary. */
|
|---|
| 423 |
|
|---|
| 424 | void
|
|---|
| 425 | hash_table_put (struct hash_table *ht, const void *key, void *value)
|
|---|
| 426 | {
|
|---|
| 427 | struct mapping *mp = find_mapping (ht, key);
|
|---|
| 428 | if (NON_EMPTY (mp))
|
|---|
| 429 | {
|
|---|
| 430 | /* update existing item */
|
|---|
| 431 | mp->key = (void *)key; /* const? */
|
|---|
| 432 | mp->value = value;
|
|---|
| 433 | return;
|
|---|
| 434 | }
|
|---|
| 435 |
|
|---|
| 436 | /* If adding the item would make the table exceed max. fullness,
|
|---|
| 437 | grow the table first. */
|
|---|
| 438 | if (ht->count >= ht->resize_threshold)
|
|---|
| 439 | {
|
|---|
| 440 | grow_hash_table (ht);
|
|---|
| 441 | mp = find_mapping (ht, key);
|
|---|
| 442 | }
|
|---|
| 443 |
|
|---|
| 444 | /* add new item */
|
|---|
| 445 | ++ht->count;
|
|---|
| 446 | mp->key = (void *)key; /* const? */
|
|---|
| 447 | mp->value = value;
|
|---|
| 448 | }
|
|---|
| 449 |
|
|---|
| 450 | /* Remove a mapping that matches KEY from HT. Return 0 if there was
|
|---|
| 451 | no such entry; return 1 if an entry was removed. */
|
|---|
| 452 |
|
|---|
| 453 | int
|
|---|
| 454 | hash_table_remove (struct hash_table *ht, const void *key)
|
|---|
| 455 | {
|
|---|
| 456 | struct mapping *mp = find_mapping (ht, key);
|
|---|
| 457 | if (!NON_EMPTY (mp))
|
|---|
| 458 | return 0;
|
|---|
| 459 | else
|
|---|
| 460 | {
|
|---|
| 461 | int size = ht->size;
|
|---|
| 462 | struct mapping *mappings = ht->mappings;
|
|---|
| 463 | hashfun_t hasher = ht->hash_function;
|
|---|
| 464 |
|
|---|
| 465 | MARK_AS_EMPTY (mp);
|
|---|
| 466 | --ht->count;
|
|---|
| 467 |
|
|---|
| 468 | /* Rehash all the entries following MP. The alternative
|
|---|
| 469 | approach is to mark the entry as deleted, i.e. create a
|
|---|
| 470 | "tombstone". That speeds up removal, but leaves a lot of
|
|---|
| 471 | garbage and slows down hash_table_get and hash_table_put. */
|
|---|
| 472 |
|
|---|
| 473 | mp = NEXT_MAPPING (mp, mappings, size);
|
|---|
| 474 | LOOP_NON_EMPTY (mp, mappings, size)
|
|---|
| 475 | {
|
|---|
| 476 | const void *key2 = mp->key;
|
|---|
| 477 | struct mapping *mp_new;
|
|---|
| 478 |
|
|---|
| 479 | /* Find the new location for the key. */
|
|---|
| 480 | mp_new = mappings + HASH_POSITION (key2, hasher, size);
|
|---|
| 481 | LOOP_NON_EMPTY (mp_new, mappings, size)
|
|---|
| 482 | if (key2 == mp_new->key)
|
|---|
| 483 | /* The mapping MP (key2) is already where we want it (in
|
|---|
| 484 | MP_NEW's "chain" of keys.) */
|
|---|
| 485 | goto next_rehash;
|
|---|
| 486 |
|
|---|
| 487 | *mp_new = *mp;
|
|---|
| 488 | MARK_AS_EMPTY (mp);
|
|---|
| 489 |
|
|---|
| 490 | next_rehash:
|
|---|
| 491 | ;
|
|---|
| 492 | }
|
|---|
| 493 | return 1;
|
|---|
| 494 | }
|
|---|
| 495 | }
|
|---|
| 496 |
|
|---|
| 497 | /* Clear HT of all entries. After calling this function, the count
|
|---|
| 498 | and the fullness of the hash table will be zero. The size will
|
|---|
| 499 | remain unchanged. */
|
|---|
| 500 |
|
|---|
| 501 | void
|
|---|
| 502 | hash_table_clear (struct hash_table *ht)
|
|---|
| 503 | {
|
|---|
| 504 | memset (ht->mappings, INVALID_PTR_BYTE, ht->size * sizeof (struct mapping));
|
|---|
| 505 | ht->count = 0;
|
|---|
| 506 | }
|
|---|
| 507 |
|
|---|
| 508 | /* Map MAPFUN over all the mappings in hash table HT. MAPFUN is
|
|---|
| 509 | called with three arguments: the key, the value, and MAPARG.
|
|---|
| 510 |
|
|---|
| 511 | It is undefined what happens if you add or remove entries in the
|
|---|
| 512 | hash table while hash_table_map is running. The exception is the
|
|---|
| 513 | entry you're currently mapping over; you may remove or change that
|
|---|
| 514 | entry. */
|
|---|
| 515 |
|
|---|
| 516 | void
|
|---|
| 517 | hash_table_map (struct hash_table *ht,
|
|---|
| 518 | int (*mapfun) (void *, void *, void *),
|
|---|
| 519 | void *maparg)
|
|---|
| 520 | {
|
|---|
| 521 | struct mapping *mp = ht->mappings;
|
|---|
| 522 | struct mapping *end = ht->mappings + ht->size;
|
|---|
| 523 |
|
|---|
| 524 | for (; mp < end; mp++)
|
|---|
| 525 | if (NON_EMPTY (mp))
|
|---|
| 526 | {
|
|---|
| 527 | void *key;
|
|---|
| 528 | repeat:
|
|---|
| 529 | key = mp->key;
|
|---|
| 530 | if (mapfun (key, mp->value, maparg))
|
|---|
| 531 | return;
|
|---|
| 532 | /* hash_table_remove might have moved the adjacent
|
|---|
| 533 | mappings. */
|
|---|
| 534 | if (mp->key != key && NON_EMPTY (mp))
|
|---|
| 535 | goto repeat;
|
|---|
| 536 | }
|
|---|
| 537 | }
|
|---|
| 538 |
|
|---|
| 539 | /* Return the number of elements in the hash table. This is not the
|
|---|
| 540 | same as the physical size of the hash table, which is always
|
|---|
| 541 | greater than the number of elements. */
|
|---|
| 542 |
|
|---|
| 543 | int
|
|---|
| 544 | hash_table_count (const struct hash_table *ht)
|
|---|
| 545 | {
|
|---|
| 546 | return ht->count;
|
|---|
| 547 | }
|
|---|
| 548 | |
|---|
| 549 |
|
|---|
| 550 | /* Functions from this point onward are meant for convenience and
|
|---|
| 551 | don't strictly belong to this file. However, this is as good a
|
|---|
| 552 | place for them as any. */
|
|---|
| 553 |
|
|---|
| 554 | /* Guidelines for creating custom hash and test functions:
|
|---|
| 555 |
|
|---|
| 556 | - The test function returns non-zero for keys that are considered
|
|---|
| 557 | "equal", zero otherwise.
|
|---|
| 558 |
|
|---|
| 559 | - The hash function returns a number that represents the
|
|---|
| 560 | "distinctness" of the object. In more precise terms, it means
|
|---|
| 561 | that for any two objects that test "equal" under the test
|
|---|
| 562 | function, the hash function MUST produce the same result.
|
|---|
| 563 |
|
|---|
| 564 | This does not mean that all different objects must produce
|
|---|
| 565 | different values (that would be "perfect" hashing), only that
|
|---|
| 566 | non-distinct objects must produce the same values! For instance,
|
|---|
| 567 | a hash function that returns 0 for any given object is a
|
|---|
| 568 | perfectly valid (albeit extremely bad) hash function. A hash
|
|---|
| 569 | function that hashes a string by adding up all its characters is
|
|---|
| 570 | another example of a valid (but quite bad) hash function.
|
|---|
| 571 |
|
|---|
| 572 | It is not hard to make hash and test functions agree about
|
|---|
| 573 | equality. For example, if the test function compares strings
|
|---|
| 574 | case-insensitively, the hash function can lower-case the
|
|---|
| 575 | characters when calculating the hash value. That ensures that
|
|---|
| 576 | two strings differing only in case will hash the same.
|
|---|
| 577 |
|
|---|
| 578 | - If you care about performance, choose a hash function with as
|
|---|
| 579 | good "spreading" as possible. A good hash function will use all
|
|---|
| 580 | the bits of the input when calculating the hash, and will react
|
|---|
| 581 | to even small changes in input with a completely different
|
|---|
| 582 | output. Finally, don't make the hash function itself overly
|
|---|
| 583 | slow, because you'll be incurring a non-negligible overhead to
|
|---|
| 584 | all hash table operations. */
|
|---|
| 585 |
|
|---|
| 586 | /*
|
|---|
| 587 | * Support for hash tables whose keys are strings.
|
|---|
| 588 | *
|
|---|
| 589 | */
|
|---|
| 590 |
|
|---|
| 591 | /* 31 bit hash function. Taken from Gnome's glib, modified to use
|
|---|
| 592 | standard C types.
|
|---|
| 593 |
|
|---|
| 594 | We used to use the popular hash function from the Dragon Book, but
|
|---|
| 595 | this one seems to perform much better. */
|
|---|
| 596 |
|
|---|
| 597 | static unsigned long
|
|---|
| 598 | hash_string (const void *key)
|
|---|
| 599 | {
|
|---|
| 600 | const char *p = key;
|
|---|
| 601 | unsigned int h = *p;
|
|---|
| 602 |
|
|---|
| 603 | if (h)
|
|---|
| 604 | for (p += 1; *p != '\0'; p++)
|
|---|
| 605 | h = (h << 5) - h + *p;
|
|---|
| 606 |
|
|---|
| 607 | return h;
|
|---|
| 608 | }
|
|---|
| 609 |
|
|---|
| 610 | /* Frontend for strcmp usable for hash tables. */
|
|---|
| 611 |
|
|---|
| 612 | static int
|
|---|
| 613 | cmp_string (const void *s1, const void *s2)
|
|---|
| 614 | {
|
|---|
| 615 | return !strcmp ((const char *)s1, (const char *)s2);
|
|---|
| 616 | }
|
|---|
| 617 |
|
|---|
| 618 | /* Return a hash table of preallocated to store at least ITEMS items
|
|---|
| 619 | suitable to use strings as keys. */
|
|---|
| 620 |
|
|---|
| 621 | struct hash_table *
|
|---|
| 622 | make_string_hash_table (int items)
|
|---|
| 623 | {
|
|---|
| 624 | return hash_table_new (items, hash_string, cmp_string);
|
|---|
| 625 | }
|
|---|
| 626 |
|
|---|
| 627 | /*
|
|---|
| 628 | * Support for hash tables whose keys are strings, but which are
|
|---|
| 629 | * compared case-insensitively.
|
|---|
| 630 | *
|
|---|
| 631 | */
|
|---|
| 632 |
|
|---|
| 633 | /* Like hash_string, but produce the same hash regardless of the case. */
|
|---|
| 634 |
|
|---|
| 635 | static unsigned long
|
|---|
| 636 | hash_string_nocase (const void *key)
|
|---|
| 637 | {
|
|---|
| 638 | const char *p = key;
|
|---|
| 639 | unsigned int h = TOLOWER (*p);
|
|---|
| 640 |
|
|---|
| 641 | if (h)
|
|---|
| 642 | for (p += 1; *p != '\0'; p++)
|
|---|
| 643 | h = (h << 5) - h + TOLOWER (*p);
|
|---|
| 644 |
|
|---|
| 645 | return h;
|
|---|
| 646 | }
|
|---|
| 647 |
|
|---|
| 648 | /* Like string_cmp, but doing case-insensitive compareison. */
|
|---|
| 649 |
|
|---|
| 650 | static int
|
|---|
| 651 | string_cmp_nocase (const void *s1, const void *s2)
|
|---|
| 652 | {
|
|---|
| 653 | return !strcasecmp ((const char *)s1, (const char *)s2);
|
|---|
| 654 | }
|
|---|
| 655 |
|
|---|
| 656 | /* Like make_string_hash_table, but uses string_hash_nocase and
|
|---|
| 657 | string_cmp_nocase. */
|
|---|
| 658 |
|
|---|
| 659 | struct hash_table *
|
|---|
| 660 | make_nocase_string_hash_table (int items)
|
|---|
| 661 | {
|
|---|
| 662 | return hash_table_new (items, hash_string_nocase, string_cmp_nocase);
|
|---|
| 663 | }
|
|---|
| 664 |
|
|---|
| 665 | /* Hashing of numeric values, such as pointers and integers.
|
|---|
| 666 |
|
|---|
| 667 | This implementation is the Robert Jenkins' 32 bit Mix Function,
|
|---|
| 668 | with a simple adaptation for 64-bit values. It offers excellent
|
|---|
| 669 | spreading of values and doesn't need to know the hash table size to
|
|---|
| 670 | work (unlike the very popular Knuth's multiplication hash). */
|
|---|
| 671 |
|
|---|
| 672 | unsigned long
|
|---|
| 673 | hash_pointer (const void *ptr)
|
|---|
| 674 | {
|
|---|
| 675 | unsigned long key = (unsigned long)ptr;
|
|---|
| 676 | key += (key << 12);
|
|---|
| 677 | key ^= (key >> 22);
|
|---|
| 678 | key += (key << 4);
|
|---|
| 679 | key ^= (key >> 9);
|
|---|
| 680 | key += (key << 10);
|
|---|
| 681 | key ^= (key >> 2);
|
|---|
| 682 | key += (key << 7);
|
|---|
| 683 | key ^= (key >> 12);
|
|---|
| 684 | #if SIZEOF_LONG > 4
|
|---|
| 685 | key += (key << 44);
|
|---|
| 686 | key ^= (key >> 54);
|
|---|
| 687 | key += (key << 36);
|
|---|
| 688 | key ^= (key >> 41);
|
|---|
| 689 | key += (key << 42);
|
|---|
| 690 | key ^= (key >> 34);
|
|---|
| 691 | key += (key << 39);
|
|---|
| 692 | key ^= (key >> 44);
|
|---|
| 693 | #endif
|
|---|
| 694 | return key;
|
|---|
| 695 | }
|
|---|
| 696 |
|
|---|
| 697 | static int
|
|---|
| 698 | cmp_pointer (const void *ptr1, const void *ptr2)
|
|---|
| 699 | {
|
|---|
| 700 | return ptr1 == ptr2;
|
|---|
| 701 | }
|
|---|
| 702 | |
|---|
| 703 |
|
|---|
| 704 | #ifdef TEST
|
|---|
| 705 |
|
|---|
| 706 | #include <stdio.h>
|
|---|
| 707 | #include <string.h>
|
|---|
| 708 |
|
|---|
| 709 | int
|
|---|
| 710 | print_hash_table_mapper (void *key, void *value, void *count)
|
|---|
| 711 | {
|
|---|
| 712 | ++*(int *)count;
|
|---|
| 713 | printf ("%s: %s\n", (const char *)key, (char *)value);
|
|---|
| 714 | return 0;
|
|---|
| 715 | }
|
|---|
| 716 |
|
|---|
| 717 | void
|
|---|
| 718 | print_hash (struct hash_table *sht)
|
|---|
| 719 | {
|
|---|
| 720 | int debug_count = 0;
|
|---|
| 721 | hash_table_map (sht, print_hash_table_mapper, &debug_count);
|
|---|
| 722 | assert (debug_count == sht->count);
|
|---|
| 723 | }
|
|---|
| 724 |
|
|---|
| 725 | int
|
|---|
| 726 | main (void)
|
|---|
| 727 | {
|
|---|
| 728 | struct hash_table *ht = make_string_hash_table (0);
|
|---|
| 729 | char line[80];
|
|---|
| 730 | while ((fgets (line, sizeof (line), stdin)))
|
|---|
| 731 | {
|
|---|
| 732 | int len = strlen (line);
|
|---|
| 733 | if (len <= 1)
|
|---|
| 734 | continue;
|
|---|
| 735 | line[--len] = '\0';
|
|---|
| 736 | if (!hash_table_contains (ht, line))
|
|---|
| 737 | hash_table_put (ht, strdup (line), "here I am!");
|
|---|
| 738 | #if 1
|
|---|
| 739 | if (len % 5 == 0)
|
|---|
| 740 | {
|
|---|
| 741 | char *line_copy;
|
|---|
| 742 | if (hash_table_get_pair (ht, line, &line_copy, NULL))
|
|---|
| 743 | {
|
|---|
| 744 | hash_table_remove (ht, line);
|
|---|
| 745 | xfree (line_copy);
|
|---|
| 746 | }
|
|---|
| 747 | }
|
|---|
| 748 | #endif
|
|---|
| 749 | }
|
|---|
| 750 | #if 0
|
|---|
| 751 | print_hash (ht);
|
|---|
| 752 | #endif
|
|---|
| 753 | #if 1
|
|---|
| 754 | printf ("%d %d\n", ht->count, ht->size);
|
|---|
| 755 | #endif
|
|---|
| 756 | return 0;
|
|---|
| 757 | }
|
|---|
| 758 | #endif /* TEST */
|
|---|