1
0
mirror of https://git.savannah.gnu.org/git/gperf.git synced 2025-12-02 13:09:22 +00:00

Rework the hash table code.

This commit is contained in:
Bruno Haible
2002-12-19 12:35:17 +00:00
parent 388a431c7d
commit 1186e616cb
4 changed files with 163 additions and 77 deletions

View File

@@ -1,3 +1,22 @@
2002-11-03 Bruno Haible <bruno@clisp.org>
Bug fix: The hash table could fail to detect duplicates, between
keywords of different length, when option -n (option[NOLENGTH]) was
given.
* src/hash-table.h (Hash_Table::Hash_Table): Pass table size, not
vector and vector size as arguments.
(Hash_Table::_log_size): New field.
(Hash_Table::equal): New declaration.
* src/hash-table.cc (size_factor): New variable.
(Hash_Table::Hash_Table): Pass table size, not vector and vector size
as arguments. Allocate the vector here.
(Hash_Table::~Hash_Table): Deallocate the vector here.
(Hash_Table::equal): New function.
(Hash_Table::insert): Use it. Don't use item->_allchars_length for the
increment if _ignore_length is true.
* src/search.cc (TABLE_MULTIPLE): Remove variable.
(Search::prepare): Update.
2002-11-02 Bruno Haible <bruno@clisp.org>
Provide documentation also in PDF format.

View File

@@ -28,21 +28,61 @@
#include <hash.h>
#include "options.h"
/* The size of the hash table is always the smallest power of 2 >= the size
indicated by the user. This allows several optimizations, including
the use of double hashing and elimination of the mod instruction.
Note that the size had better be larger than the number of items
in the hash table, else there's trouble!!! Note that the memory
for the hash table is allocated *outside* the intialization routine.
This compromises information hiding somewhat, but greatly reduces
memory fragmentation, since we can now use alloca! */
/* We use a hash table with double hashing. This is the simplest kind of
hash table, given that we always only insert and never remove entries
from the hash table. */
Hash_Table::Hash_Table (KeywordExt **table_ptr, int s, bool ignore_len):
_table (table_ptr), _size (s), _collisions (0), _ignore_length (ignore_len)
/* To make double hashing efficient, there need to be enough spare entries. */
static const int size_factor = 10;
/* We make the size of the hash table a power of 2. This allows for two
optimizations: It eliminates the modulo instruction, and allows for an
easy secondary hashing function. */
/* Constructor. */
Hash_Table::Hash_Table (unsigned int size, bool ignore_length)
: _ignore_length (ignore_length),
_collisions (0)
{
/* There need to be enough spare entries. */
size = size * size_factor;
/* Find smallest power of 2 that is >= size. */
unsigned int shift = 0;
if ((size >> 16) > 0)
{
size = size >> 16;
shift += 16;
}
if ((size >> 8) > 0)
{
size = size >> 8;
shift += 8;
}
if ((size >> 4) > 0)
{
size = size >> 4;
shift += 4;
}
if ((size >> 2) > 0)
{
size = size >> 2;
shift += 2;
}
if ((size >> 1) > 0)
{
size = size >> 1;
shift += 1;
}
_log_size = shift;
_size = 1 << shift;
/* Allocate table. */
_table = new KeywordExt*[_size];
memset (_table, 0, _size * sizeof (*_table));
}
/* Destructor. */
Hash_Table::~Hash_Table ()
{
if (option[DEBUG])
@@ -76,24 +116,38 @@ Hash_Table::~Hash_Table ()
fprintf (stderr, "\nend dumping hash table\n\n");
}
delete[] _table;
}
/* If the ITEM is already in the hash table return the item found
in the table. Otherwise inserts the ITEM, and returns FALSE.
Uses double hashing. */
/* Compares two items. */
inline bool
Hash_Table::equal (KeywordExt *item1, KeywordExt *item2)
{
return item1->_selchars_length == item2->_selchars_length
&& memcmp (item1->_selchars, item2->_selchars, item2->_selchars_length)
== 0
&& (_ignore_length
|| item1->_allchars_length == item2->_allchars_length);
}
/* Attempts to insert ITEM in the table. If there is already an equal
entry in it, returns it. Otherwise inserts ITEM and returns NULL. */
KeywordExt *
Hash_Table::insert (KeywordExt *item)
{
unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length);
int probe = hash_val & (_size - 1);
int increment = ((hash_val ^ item->_allchars_length) | 1) & (_size - 1);
unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length);
unsigned int probe = hash_val & (_size - 1);
unsigned int increment =
(((hash_val >> _log_size)
^ (_ignore_length ? 0 : item->_allchars_length))
<< 1) + 1;
/* Note that because _size is a power of 2 and increment is odd,
we have gcd(increment,_size) = 1, which guarantees that we'll find
an empty entry during the loop. */
while (_table[probe])
while (_table[probe] != NULL)
{
if (_table[probe]->_selchars_length == item->_selchars_length
&& memcmp (_table[probe]->_selchars, item->_selchars, item->_selchars_length) == 0
&& (_ignore_length || _table[probe]->_allchars_length == item->_allchars_length))
if (equal (_table[probe], item))
return _table[probe];
_collisions++;

View File

@@ -28,18 +28,37 @@
#include "keyword.h"
/* Hash table of KeywordExt* entries.
Two entries are considered equal if their _selchars are the same and
- if !ignore_length - if their _allchars_length are the same. */
class Hash_Table
{
private:
KeywordExt ** _table; /* Vector of pointers to linked lists of keywords. */
int _size; /* Size of the vector. */
int _collisions; /* Find out how well our double hashing is working! */
bool _ignore_length;
public:
Hash_Table (KeywordExt **t, int s, bool ignore_len);
/* Constructor.
size is the maximum number of entries.
ignore_length determines a detail in the comparison function. */
Hash_Table (unsigned int size, bool ignore_length);
/* Destructor. */
~Hash_Table ();
/* Attempts to insert ITEM in the table. If there is already an equal
entry in it, returns it. Otherwise inserts ITEM and returns NULL. */
KeywordExt * insert (KeywordExt *item);
private:
/* Vector of entries. */
KeywordExt ** _table;
/* Size of the vector. */
unsigned int _size;
/* log2(_size). */
unsigned int _log_size;
/* A detail of the comparison function. */
bool _ignore_length;
/* Statistics: Number of collisions so far. */
unsigned int _collisions;
/* Compares two items. */
bool equal (KeywordExt *item1, KeywordExt *item2);
};
#endif

View File

@@ -31,9 +31,6 @@
#include "options.h"
#include "hash-table.h"
/* Make the hash table 8 times larger than the number of keyword entries. */
static const int TABLE_MULTIPLE = 10;
/* Efficiently returns the least power of two greater than or equal to X! */
#define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X)))
@@ -50,7 +47,6 @@ void
Search::prepare ()
{
KeywordExt_List *temp;
KeywordExt_List *trail = NULL;
_total_keys = 0;
for (temp = _head; temp; temp = temp->rest())
@@ -58,59 +54,57 @@ Search::prepare ()
temp->first()->init_selchars(_occurrences);
_total_keys++;
}
/* Hash table this number of times larger than keyword number. */
int table_size = (_list_len = _total_keys) * TABLE_MULTIPLE;
/* Table must be a power of 2 for the hash function scheme to work. */
KeywordExt **table = new KeywordExt*[POW (table_size)];
/* Make large hash table for efficiency. */
Hash_Table found_link (table, table_size, option[NOLENGTH]);
_list_len = _total_keys;
/* Test whether there are any links and also set the maximum length of
an identifier in the keyword list. */
_total_duplicates = 0;
_max_key_len = INT_MIN;
_min_key_len = INT_MAX;
for (temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
KeywordExt *other_keyword = found_link.insert (keyword);
{
/* Make hash table for efficiency. */
Hash_Table found_link (_list_len, option[NOLENGTH]);
/* Check for links. We deal with these by building an equivalence class
of all duplicate values (i.e., links) so that only 1 keyword is
representative of the entire collection. This *greatly* simplifies
processing during later stages of the program. */
/* Test whether there are any links and also set the maximum length of
an identifier in the keyword list. */
_total_duplicates = 0;
_max_key_len = INT_MIN;
_min_key_len = INT_MAX;
KeywordExt_List *trail = NULL;
for (temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
KeywordExt *other_keyword = found_link.insert (keyword);
if (other_keyword)
{
_total_duplicates++;
_list_len--;
trail->rest() = temp->rest();
temp->first()->_duplicate_link = other_keyword->_duplicate_link;
other_keyword->_duplicate_link = temp->first();
/* Check for links. We deal with these by building an equivalence class
of all duplicate values (i.e., links) so that only 1 keyword is
representative of the entire collection. This *greatly* simplifies
processing during later stages of the program. */
/* Complain if user hasn't enabled the duplicate option. */
if (!option[DUP] || option[DEBUG])
fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
keyword->_allchars_length, keyword->_allchars,
other_keyword->_allchars_length, other_keyword->_allchars,
keyword->_selchars_length, keyword->_selchars);
}
else
{
temp->first()->_duplicate_link = NULL;
trail = temp;
}
if (other_keyword)
{
_total_duplicates++;
_list_len--;
trail->rest() = temp->rest();
temp->first()->_duplicate_link = other_keyword->_duplicate_link;
other_keyword->_duplicate_link = temp->first();
/* Update minimum and maximum keyword length, if needed. */
if (_max_key_len < keyword->_allchars_length)
_max_key_len = keyword->_allchars_length;
if (_min_key_len > keyword->_allchars_length)
_min_key_len = keyword->_allchars_length;
}
/* Complain if user hasn't enabled the duplicate option. */
if (!option[DUP] || option[DEBUG])
fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
keyword->_allchars_length, keyword->_allchars,
other_keyword->_allchars_length, other_keyword->_allchars,
keyword->_selchars_length, keyword->_selchars);
}
else
{
temp->first()->_duplicate_link = NULL;
trail = temp;
}
delete[] table;
/* Update minimum and maximum keyword length, if needed. */
if (_max_key_len < keyword->_allchars_length)
_max_key_len = keyword->_allchars_length;
if (_min_key_len > keyword->_allchars_length)
_min_key_len = keyword->_allchars_length;
}
}
/* Exit program if links exists and option[DUP] not set, since we can't continue */
if (_total_duplicates)