Rework the hash table code.

2025-12-02 13:09:22 +00:00 · 2002-12-19 12:35:17 +00:00
parent 388a431c7d
commit 1186e616cb
4 changed files with 163 additions and 77 deletions
--- a/19
+++ b/19
@@ -1,3 +1,22 @@
 2002-11-03  Bruno Haible  <bruno@clisp.org>
 	Bug fix: The hash table could fail to detect duplicates, between
 	keywords of different length, when option -n (option[NOLENGTH]) was
 	given.
 	* src/hash-table.h (Hash_Table::Hash_Table): Pass table size, not
 	vector and vector size as arguments.
 	(Hash_Table::_log_size): New field.
 	(Hash_Table::equal): New declaration.
 	* src/hash-table.cc (size_factor): New variable.
 	(Hash_Table::Hash_Table): Pass table size, not vector and vector size
 	as arguments. Allocate the vector here.
 	(Hash_Table::~Hash_Table): Deallocate the vector here.
 	(Hash_Table::equal): New function.
 	(Hash_Table::insert): Use it. Don't use item->_allchars_length for the
 	increment if _ignore_length is true.
 	* src/search.cc (TABLE_MULTIPLE): Remove variable.
 	(Search::prepare): Update.
 2002-11-02  Bruno Haible  <bruno@clisp.org>
 	Provide documentation also in PDF format.
--- a/src/hash-table.cc
+++ b/src/hash-table.cc
@@ -28,21 +28,61 @@
 #include <hash.h>
 #include "options.h"
-/* The size of the hash table is always the smallest power of 2 >= the size
+/* We use a hash table with double hashing.  This is the simplest kind of
-   indicated by the user.  This allows several optimizations, including
+   hash table, given that we always only insert and never remove entries
-   the use of double hashing and elimination of the mod instruction.
+   from the hash table.  */
   Note that the size had better be larger than the number of items
   in the hash table, else there's trouble!!!  Note that the memory
   for the hash table is allocated *outside* the intialization routine.
   This compromises information hiding somewhat, but greatly reduces
   memory fragmentation, since we can now use alloca! */
-Hash_Table::Hash_Table (KeywordExt **table_ptr, int s, bool ignore_len):
+/* To make double hashing efficient, there need to be enough spare entries.  */
-     _table (table_ptr), _size (s), _collisions (0), _ignore_length (ignore_len)
+static const int size_factor = 10;
 /* We make the size of the hash table a power of 2.  This allows for two
   optimizations: It eliminates the modulo instruction, and allows for an
   easy secondary hashing function.  */
 /* Constructor.  */
 Hash_Table::Hash_Table (unsigned int size, bool ignore_length)
  : _ignore_length (ignore_length),
    _collisions (0)
 {
  /* There need to be enough spare entries.  */
  size = size * size_factor;
  /* Find smallest power of 2 that is >= size.  */
  unsigned int shift = 0;
  if ((size >> 16) > 0)
    {
      size = size >> 16;
      shift += 16;
    }
  if ((size >> 8) > 0)
    {
      size = size >> 8;
      shift += 8;
    }
  if ((size >> 4) > 0)
    {
      size = size >> 4;
      shift += 4;
    }
  if ((size >> 2) > 0)
    {
      size = size >> 2;
      shift += 2;
    }
  if ((size >> 1) > 0)
    {
      size = size >> 1;
      shift += 1;
    }
  _log_size = shift;
  _size = 1 << shift;
  /* Allocate table.  */
  _table = new KeywordExt*[_size];
  memset (_table, 0, _size * sizeof (*_table));
 }
 /* Destructor.  */
 Hash_Table::~Hash_Table ()
 {
  if (option[DEBUG])
@@ -76,24 +116,38 @@ Hash_Table::~Hash_Table ()
      fprintf (stderr, "\nend dumping hash table\n\n");
    }
  delete[] _table;
 }
-/* If the ITEM is already in the hash table return the item found
+/* Compares two items.  */
-   in the table.  Otherwise inserts the ITEM, and returns FALSE.
+inline bool
-   Uses double hashing. */
+Hash_Table::equal (KeywordExt *item1, KeywordExt *item2)
 {
  return item1->_selchars_length == item2->_selchars_length
         && memcmp (item1->_selchars, item2->_selchars, item2->_selchars_length)
            == 0
         && (_ignore_length
             || item1->_allchars_length == item2->_allchars_length);
 }
 /* Attempts to insert ITEM in the table.  If there is already an equal
   entry in it, returns it.  Otherwise inserts ITEM and returns NULL.  */
 KeywordExt *
 Hash_Table::insert (KeywordExt *item)
 {
-  unsigned hash_val  = hashpjw (item->_selchars, item->_selchars_length);
+  unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length);
-  int      probe     = hash_val & (_size - 1);
+  unsigned int probe = hash_val & (_size - 1);
-  int      increment = ((hash_val ^ item->_allchars_length) | 1) & (_size - 1);
+  unsigned int increment =
    (((hash_val >> _log_size)
      ^ (_ignore_length ? 0 : item->_allchars_length))
     << 1) + 1;
  /* Note that because _size is a power of 2 and increment is odd,
     we have gcd(increment,_size) = 1, which guarantees that we'll find
     an empty entry during the loop.  */
-  while (_table[probe])
+  while (_table[probe] != NULL)
    {
-      if (_table[probe]->_selchars_length == item->_selchars_length
+      if (equal (_table[probe], item))
          && memcmp (_table[probe]->_selchars, item->_selchars, item->_selchars_length) == 0
          && (_ignore_length || _table[probe]->_allchars_length == item->_allchars_length))
        return _table[probe];
      _collisions++;
--- a/src/hash-table.h
+++ b/src/hash-table.h
@@ -28,18 +28,37 @@
 #include "keyword.h"
 /* Hash table of KeywordExt* entries.
   Two entries are considered equal if their _selchars are the same and
   - if !ignore_length - if their _allchars_length are the same.  */
 class Hash_Table
 {
 private:
  KeywordExt **         _table;      /* Vector of pointers to linked lists of keywords. */
  int                   _size;       /* Size of the vector. */
  int                   _collisions; /* Find out how well our double hashing is working! */
  bool                  _ignore_length;
 public:
-                        Hash_Table (KeywordExt **t, int s, bool ignore_len);
+  /* Constructor.
     size is the maximum number of entries.
     ignore_length determines a detail in the comparison function.  */
                        Hash_Table (unsigned int size, bool ignore_length);
  /* Destructor.  */
                        ~Hash_Table ();
  /* Attempts to insert ITEM in the table.  If there is already an equal
     entry in it, returns it.  Otherwise inserts ITEM and returns NULL.  */
  KeywordExt *          insert (KeywordExt *item);
 private:
  /* Vector of entries.  */
  KeywordExt **         _table;
  /* Size of the vector.  */
  unsigned int          _size;
  /* log2(_size).  */
  unsigned int          _log_size;
  /* A detail of the comparison function.  */
  bool                  _ignore_length;
  /* Statistics: Number of collisions so far.  */
  unsigned int          _collisions;
  /* Compares two items.  */
  bool                  equal (KeywordExt *item1, KeywordExt *item2);
 };
 #endif
--- a/src/search.cc
+++ b/src/search.cc
@@ -31,9 +31,6 @@
 #include "options.h"
 #include "hash-table.h"
 /* Make the hash table 8 times larger than the number of keyword entries. */
 static const int TABLE_MULTIPLE     = 10;
 /* Efficiently returns the least power of two greater than or equal to X! */
 #define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X)))
@@ -50,7 +47,6 @@ void
 Search::prepare ()
 {
  KeywordExt_List *temp;
  KeywordExt_List *trail = NULL;
  _total_keys = 0;
  for (temp = _head; temp; temp = temp->rest())
@@ -58,59 +54,57 @@ Search::prepare ()
      temp->first()->init_selchars(_occurrences);
      _total_keys++;
    }
  /* Hash table this number of times larger than keyword number. */
  int table_size = (_list_len = _total_keys) * TABLE_MULTIPLE;
  /* Table must be a power of 2 for the hash function scheme to work. */
  KeywordExt **table = new KeywordExt*[POW (table_size)];
-  /* Make large hash table for efficiency. */
+  _list_len = _total_keys;
  Hash_Table found_link (table, table_size, option[NOLENGTH]);
-  /* Test whether there are any links and also set the maximum length of
+  {
-     an identifier in the keyword list. */
+    /* Make hash table for efficiency. */
-  _total_duplicates = 0;
+    Hash_Table found_link (_list_len, option[NOLENGTH]);
  _max_key_len = INT_MIN;
  _min_key_len = INT_MAX;
  for (temp = _head; temp; temp = temp->rest())
    {
      KeywordExt *keyword = temp->first();
      KeywordExt *other_keyword = found_link.insert (keyword);
-      /* Check for links.  We deal with these by building an equivalence class
+    /* Test whether there are any links and also set the maximum length of
-         of all duplicate values (i.e., links) so that only 1 keyword is
+       an identifier in the keyword list. */
-         representative of the entire collection.  This *greatly* simplifies
+    _total_duplicates = 0;
-         processing during later stages of the program. */
+    _max_key_len = INT_MIN;
    _min_key_len = INT_MAX;
    KeywordExt_List *trail = NULL;
    for (temp = _head; temp; temp = temp->rest())
      {
        KeywordExt *keyword = temp->first();
        KeywordExt *other_keyword = found_link.insert (keyword);
-      if (other_keyword)
+        /* Check for links.  We deal with these by building an equivalence class
-        {
+           of all duplicate values (i.e., links) so that only 1 keyword is
-          _total_duplicates++;
+           representative of the entire collection.  This *greatly* simplifies
-          _list_len--;
+           processing during later stages of the program. */
          trail->rest() = temp->rest();
          temp->first()->_duplicate_link = other_keyword->_duplicate_link;
          other_keyword->_duplicate_link = temp->first();
-          /* Complain if user hasn't enabled the duplicate option. */
+        if (other_keyword)
-          if (!option[DUP] || option[DEBUG])
+          {
-            fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
+            _total_duplicates++;
-                             keyword->_allchars_length, keyword->_allchars,
+            _list_len--;
-                             other_keyword->_allchars_length, other_keyword->_allchars,
+            trail->rest() = temp->rest();
-                             keyword->_selchars_length, keyword->_selchars);
+            temp->first()->_duplicate_link = other_keyword->_duplicate_link;
-        }
+            other_keyword->_duplicate_link = temp->first();
      else
        {
          temp->first()->_duplicate_link = NULL;
          trail = temp;
        }
-      /* Update minimum and maximum keyword length, if needed. */
+            /* Complain if user hasn't enabled the duplicate option. */
-      if (_max_key_len < keyword->_allchars_length)
+            if (!option[DUP] || option[DEBUG])
-        _max_key_len = keyword->_allchars_length;
+              fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
-      if (_min_key_len > keyword->_allchars_length)
+                               keyword->_allchars_length, keyword->_allchars,
-        _min_key_len = keyword->_allchars_length;
+                               other_keyword->_allchars_length, other_keyword->_allchars,
-    }
+                               keyword->_selchars_length, keyword->_selchars);
          }
        else
          {
            temp->first()->_duplicate_link = NULL;
            trail = temp;
          }
-  delete[] table;
+        /* Update minimum and maximum keyword length, if needed. */
        if (_max_key_len < keyword->_allchars_length)
          _max_key_len = keyword->_allchars_length;
        if (_min_key_len > keyword->_allchars_length)
          _min_key_len = keyword->_allchars_length;
      }
  }
  /* Exit program if links exists and option[DUP] not set, since we can't continue */
  if (_total_duplicates)