mirror of
https://git.savannah.gnu.org/git/gperf.git
synced 2025-12-02 13:09:22 +00:00
Rework the hash table code.
This commit is contained in:
19
ChangeLog
19
ChangeLog
@@ -1,3 +1,22 @@
|
||||
2002-11-03 Bruno Haible <bruno@clisp.org>
|
||||
|
||||
Bug fix: The hash table could fail to detect duplicates, between
|
||||
keywords of different length, when option -n (option[NOLENGTH]) was
|
||||
given.
|
||||
* src/hash-table.h (Hash_Table::Hash_Table): Pass table size, not
|
||||
vector and vector size as arguments.
|
||||
(Hash_Table::_log_size): New field.
|
||||
(Hash_Table::equal): New declaration.
|
||||
* src/hash-table.cc (size_factor): New variable.
|
||||
(Hash_Table::Hash_Table): Pass table size, not vector and vector size
|
||||
as arguments. Allocate the vector here.
|
||||
(Hash_Table::~Hash_Table): Deallocate the vector here.
|
||||
(Hash_Table::equal): New function.
|
||||
(Hash_Table::insert): Use it. Don't use item->_allchars_length for the
|
||||
increment if _ignore_length is true.
|
||||
* src/search.cc (TABLE_MULTIPLE): Remove variable.
|
||||
(Search::prepare): Update.
|
||||
|
||||
2002-11-02 Bruno Haible <bruno@clisp.org>
|
||||
|
||||
Provide documentation also in PDF format.
|
||||
|
||||
@@ -28,21 +28,61 @@
|
||||
#include <hash.h>
|
||||
#include "options.h"
|
||||
|
||||
/* The size of the hash table is always the smallest power of 2 >= the size
|
||||
indicated by the user. This allows several optimizations, including
|
||||
the use of double hashing and elimination of the mod instruction.
|
||||
Note that the size had better be larger than the number of items
|
||||
in the hash table, else there's trouble!!! Note that the memory
|
||||
for the hash table is allocated *outside* the intialization routine.
|
||||
This compromises information hiding somewhat, but greatly reduces
|
||||
memory fragmentation, since we can now use alloca! */
|
||||
/* We use a hash table with double hashing. This is the simplest kind of
|
||||
hash table, given that we always only insert and never remove entries
|
||||
from the hash table. */
|
||||
|
||||
Hash_Table::Hash_Table (KeywordExt **table_ptr, int s, bool ignore_len):
|
||||
_table (table_ptr), _size (s), _collisions (0), _ignore_length (ignore_len)
|
||||
/* To make double hashing efficient, there need to be enough spare entries. */
|
||||
static const int size_factor = 10;
|
||||
|
||||
/* We make the size of the hash table a power of 2. This allows for two
|
||||
optimizations: It eliminates the modulo instruction, and allows for an
|
||||
easy secondary hashing function. */
|
||||
|
||||
/* Constructor. */
|
||||
Hash_Table::Hash_Table (unsigned int size, bool ignore_length)
|
||||
: _ignore_length (ignore_length),
|
||||
_collisions (0)
|
||||
{
|
||||
/* There need to be enough spare entries. */
|
||||
size = size * size_factor;
|
||||
|
||||
/* Find smallest power of 2 that is >= size. */
|
||||
unsigned int shift = 0;
|
||||
if ((size >> 16) > 0)
|
||||
{
|
||||
size = size >> 16;
|
||||
shift += 16;
|
||||
}
|
||||
if ((size >> 8) > 0)
|
||||
{
|
||||
size = size >> 8;
|
||||
shift += 8;
|
||||
}
|
||||
if ((size >> 4) > 0)
|
||||
{
|
||||
size = size >> 4;
|
||||
shift += 4;
|
||||
}
|
||||
if ((size >> 2) > 0)
|
||||
{
|
||||
size = size >> 2;
|
||||
shift += 2;
|
||||
}
|
||||
if ((size >> 1) > 0)
|
||||
{
|
||||
size = size >> 1;
|
||||
shift += 1;
|
||||
}
|
||||
_log_size = shift;
|
||||
_size = 1 << shift;
|
||||
|
||||
/* Allocate table. */
|
||||
_table = new KeywordExt*[_size];
|
||||
memset (_table, 0, _size * sizeof (*_table));
|
||||
}
|
||||
|
||||
/* Destructor. */
|
||||
Hash_Table::~Hash_Table ()
|
||||
{
|
||||
if (option[DEBUG])
|
||||
@@ -76,24 +116,38 @@ Hash_Table::~Hash_Table ()
|
||||
|
||||
fprintf (stderr, "\nend dumping hash table\n\n");
|
||||
}
|
||||
delete[] _table;
|
||||
}
|
||||
|
||||
/* If the ITEM is already in the hash table return the item found
|
||||
in the table. Otherwise inserts the ITEM, and returns FALSE.
|
||||
Uses double hashing. */
|
||||
/* Compares two items. */
|
||||
inline bool
|
||||
Hash_Table::equal (KeywordExt *item1, KeywordExt *item2)
|
||||
{
|
||||
return item1->_selchars_length == item2->_selchars_length
|
||||
&& memcmp (item1->_selchars, item2->_selchars, item2->_selchars_length)
|
||||
== 0
|
||||
&& (_ignore_length
|
||||
|| item1->_allchars_length == item2->_allchars_length);
|
||||
}
|
||||
|
||||
/* Attempts to insert ITEM in the table. If there is already an equal
|
||||
entry in it, returns it. Otherwise inserts ITEM and returns NULL. */
|
||||
KeywordExt *
|
||||
Hash_Table::insert (KeywordExt *item)
|
||||
{
|
||||
unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length);
|
||||
int probe = hash_val & (_size - 1);
|
||||
int increment = ((hash_val ^ item->_allchars_length) | 1) & (_size - 1);
|
||||
unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length);
|
||||
unsigned int probe = hash_val & (_size - 1);
|
||||
unsigned int increment =
|
||||
(((hash_val >> _log_size)
|
||||
^ (_ignore_length ? 0 : item->_allchars_length))
|
||||
<< 1) + 1;
|
||||
/* Note that because _size is a power of 2 and increment is odd,
|
||||
we have gcd(increment,_size) = 1, which guarantees that we'll find
|
||||
an empty entry during the loop. */
|
||||
|
||||
while (_table[probe])
|
||||
while (_table[probe] != NULL)
|
||||
{
|
||||
if (_table[probe]->_selchars_length == item->_selchars_length
|
||||
&& memcmp (_table[probe]->_selchars, item->_selchars, item->_selchars_length) == 0
|
||||
&& (_ignore_length || _table[probe]->_allchars_length == item->_allchars_length))
|
||||
if (equal (_table[probe], item))
|
||||
return _table[probe];
|
||||
|
||||
_collisions++;
|
||||
|
||||
@@ -28,18 +28,37 @@
|
||||
|
||||
#include "keyword.h"
|
||||
|
||||
/* Hash table of KeywordExt* entries.
|
||||
Two entries are considered equal if their _selchars are the same and
|
||||
- if !ignore_length - if their _allchars_length are the same. */
|
||||
|
||||
class Hash_Table
|
||||
{
|
||||
private:
|
||||
KeywordExt ** _table; /* Vector of pointers to linked lists of keywords. */
|
||||
int _size; /* Size of the vector. */
|
||||
int _collisions; /* Find out how well our double hashing is working! */
|
||||
bool _ignore_length;
|
||||
|
||||
public:
|
||||
Hash_Table (KeywordExt **t, int s, bool ignore_len);
|
||||
/* Constructor.
|
||||
size is the maximum number of entries.
|
||||
ignore_length determines a detail in the comparison function. */
|
||||
Hash_Table (unsigned int size, bool ignore_length);
|
||||
/* Destructor. */
|
||||
~Hash_Table ();
|
||||
/* Attempts to insert ITEM in the table. If there is already an equal
|
||||
entry in it, returns it. Otherwise inserts ITEM and returns NULL. */
|
||||
KeywordExt * insert (KeywordExt *item);
|
||||
|
||||
private:
|
||||
/* Vector of entries. */
|
||||
KeywordExt ** _table;
|
||||
/* Size of the vector. */
|
||||
unsigned int _size;
|
||||
/* log2(_size). */
|
||||
unsigned int _log_size;
|
||||
/* A detail of the comparison function. */
|
||||
bool _ignore_length;
|
||||
/* Statistics: Number of collisions so far. */
|
||||
unsigned int _collisions;
|
||||
|
||||
/* Compares two items. */
|
||||
bool equal (KeywordExt *item1, KeywordExt *item2);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -31,9 +31,6 @@
|
||||
#include "options.h"
|
||||
#include "hash-table.h"
|
||||
|
||||
/* Make the hash table 8 times larger than the number of keyword entries. */
|
||||
static const int TABLE_MULTIPLE = 10;
|
||||
|
||||
/* Efficiently returns the least power of two greater than or equal to X! */
|
||||
#define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X)))
|
||||
|
||||
@@ -50,7 +47,6 @@ void
|
||||
Search::prepare ()
|
||||
{
|
||||
KeywordExt_List *temp;
|
||||
KeywordExt_List *trail = NULL;
|
||||
|
||||
_total_keys = 0;
|
||||
for (temp = _head; temp; temp = temp->rest())
|
||||
@@ -58,59 +54,57 @@ Search::prepare ()
|
||||
temp->first()->init_selchars(_occurrences);
|
||||
_total_keys++;
|
||||
}
|
||||
|
||||
/* Hash table this number of times larger than keyword number. */
|
||||
int table_size = (_list_len = _total_keys) * TABLE_MULTIPLE;
|
||||
/* Table must be a power of 2 for the hash function scheme to work. */
|
||||
KeywordExt **table = new KeywordExt*[POW (table_size)];
|
||||
|
||||
/* Make large hash table for efficiency. */
|
||||
Hash_Table found_link (table, table_size, option[NOLENGTH]);
|
||||
_list_len = _total_keys;
|
||||
|
||||
/* Test whether there are any links and also set the maximum length of
|
||||
an identifier in the keyword list. */
|
||||
_total_duplicates = 0;
|
||||
_max_key_len = INT_MIN;
|
||||
_min_key_len = INT_MAX;
|
||||
for (temp = _head; temp; temp = temp->rest())
|
||||
{
|
||||
KeywordExt *keyword = temp->first();
|
||||
KeywordExt *other_keyword = found_link.insert (keyword);
|
||||
{
|
||||
/* Make hash table for efficiency. */
|
||||
Hash_Table found_link (_list_len, option[NOLENGTH]);
|
||||
|
||||
/* Check for links. We deal with these by building an equivalence class
|
||||
of all duplicate values (i.e., links) so that only 1 keyword is
|
||||
representative of the entire collection. This *greatly* simplifies
|
||||
processing during later stages of the program. */
|
||||
/* Test whether there are any links and also set the maximum length of
|
||||
an identifier in the keyword list. */
|
||||
_total_duplicates = 0;
|
||||
_max_key_len = INT_MIN;
|
||||
_min_key_len = INT_MAX;
|
||||
KeywordExt_List *trail = NULL;
|
||||
for (temp = _head; temp; temp = temp->rest())
|
||||
{
|
||||
KeywordExt *keyword = temp->first();
|
||||
KeywordExt *other_keyword = found_link.insert (keyword);
|
||||
|
||||
if (other_keyword)
|
||||
{
|
||||
_total_duplicates++;
|
||||
_list_len--;
|
||||
trail->rest() = temp->rest();
|
||||
temp->first()->_duplicate_link = other_keyword->_duplicate_link;
|
||||
other_keyword->_duplicate_link = temp->first();
|
||||
/* Check for links. We deal with these by building an equivalence class
|
||||
of all duplicate values (i.e., links) so that only 1 keyword is
|
||||
representative of the entire collection. This *greatly* simplifies
|
||||
processing during later stages of the program. */
|
||||
|
||||
/* Complain if user hasn't enabled the duplicate option. */
|
||||
if (!option[DUP] || option[DEBUG])
|
||||
fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
|
||||
keyword->_allchars_length, keyword->_allchars,
|
||||
other_keyword->_allchars_length, other_keyword->_allchars,
|
||||
keyword->_selchars_length, keyword->_selchars);
|
||||
}
|
||||
else
|
||||
{
|
||||
temp->first()->_duplicate_link = NULL;
|
||||
trail = temp;
|
||||
}
|
||||
if (other_keyword)
|
||||
{
|
||||
_total_duplicates++;
|
||||
_list_len--;
|
||||
trail->rest() = temp->rest();
|
||||
temp->first()->_duplicate_link = other_keyword->_duplicate_link;
|
||||
other_keyword->_duplicate_link = temp->first();
|
||||
|
||||
/* Update minimum and maximum keyword length, if needed. */
|
||||
if (_max_key_len < keyword->_allchars_length)
|
||||
_max_key_len = keyword->_allchars_length;
|
||||
if (_min_key_len > keyword->_allchars_length)
|
||||
_min_key_len = keyword->_allchars_length;
|
||||
}
|
||||
/* Complain if user hasn't enabled the duplicate option. */
|
||||
if (!option[DUP] || option[DEBUG])
|
||||
fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
|
||||
keyword->_allchars_length, keyword->_allchars,
|
||||
other_keyword->_allchars_length, other_keyword->_allchars,
|
||||
keyword->_selchars_length, keyword->_selchars);
|
||||
}
|
||||
else
|
||||
{
|
||||
temp->first()->_duplicate_link = NULL;
|
||||
trail = temp;
|
||||
}
|
||||
|
||||
delete[] table;
|
||||
/* Update minimum and maximum keyword length, if needed. */
|
||||
if (_max_key_len < keyword->_allchars_length)
|
||||
_max_key_len = keyword->_allchars_length;
|
||||
if (_min_key_len > keyword->_allchars_length)
|
||||
_min_key_len = keyword->_allchars_length;
|
||||
}
|
||||
}
|
||||
|
||||
/* Exit program if links exists and option[DUP] not set, since we can't continue */
|
||||
if (_total_duplicates)
|
||||
|
||||
Reference in New Issue
Block a user