mirror of
https://git.savannah.gnu.org/git/gperf.git
synced 2025-12-02 13:09:22 +00:00
Rework the hash table code.
This commit is contained in:
19
ChangeLog
19
ChangeLog
@@ -1,3 +1,22 @@
|
|||||||
|
2002-11-03 Bruno Haible <bruno@clisp.org>
|
||||||
|
|
||||||
|
Bug fix: The hash table could fail to detect duplicates, between
|
||||||
|
keywords of different length, when option -n (option[NOLENGTH]) was
|
||||||
|
given.
|
||||||
|
* src/hash-table.h (Hash_Table::Hash_Table): Pass table size, not
|
||||||
|
vector and vector size as arguments.
|
||||||
|
(Hash_Table::_log_size): New field.
|
||||||
|
(Hash_Table::equal): New declaration.
|
||||||
|
* src/hash-table.cc (size_factor): New variable.
|
||||||
|
(Hash_Table::Hash_Table): Pass table size, not vector and vector size
|
||||||
|
as arguments. Allocate the vector here.
|
||||||
|
(Hash_Table::~Hash_Table): Deallocate the vector here.
|
||||||
|
(Hash_Table::equal): New function.
|
||||||
|
(Hash_Table::insert): Use it. Don't use item->_allchars_length for the
|
||||||
|
increment if _ignore_length is true.
|
||||||
|
* src/search.cc (TABLE_MULTIPLE): Remove variable.
|
||||||
|
(Search::prepare): Update.
|
||||||
|
|
||||||
2002-11-02 Bruno Haible <bruno@clisp.org>
|
2002-11-02 Bruno Haible <bruno@clisp.org>
|
||||||
|
|
||||||
Provide documentation also in PDF format.
|
Provide documentation also in PDF format.
|
||||||
|
|||||||
@@ -28,21 +28,61 @@
|
|||||||
#include <hash.h>
|
#include <hash.h>
|
||||||
#include "options.h"
|
#include "options.h"
|
||||||
|
|
||||||
/* The size of the hash table is always the smallest power of 2 >= the size
|
/* We use a hash table with double hashing. This is the simplest kind of
|
||||||
indicated by the user. This allows several optimizations, including
|
hash table, given that we always only insert and never remove entries
|
||||||
the use of double hashing and elimination of the mod instruction.
|
from the hash table. */
|
||||||
Note that the size had better be larger than the number of items
|
|
||||||
in the hash table, else there's trouble!!! Note that the memory
|
|
||||||
for the hash table is allocated *outside* the initialization routine.
|
|
||||||
This compromises information hiding somewhat, but greatly reduces
|
|
||||||
memory fragmentation, since we can now use alloca! */
|
|
||||||
|
|
||||||
Hash_Table::Hash_Table (KeywordExt **table_ptr, int s, bool ignore_len):
|
/* To make double hashing efficient, there need to be enough spare entries. */
|
||||||
_table (table_ptr), _size (s), _collisions (0), _ignore_length (ignore_len)
|
static const int size_factor = 10;
|
||||||
|
|
||||||
|
/* We make the size of the hash table a power of 2. This allows for two
|
||||||
|
optimizations: It eliminates the modulo instruction, and allows for an
|
||||||
|
easy secondary hashing function. */
|
||||||
|
|
||||||
|
/* Constructor. */
|
||||||
|
Hash_Table::Hash_Table (unsigned int size, bool ignore_length)
|
||||||
|
: _ignore_length (ignore_length),
|
||||||
|
_collisions (0)
|
||||||
{
|
{
|
||||||
|
/* There need to be enough spare entries. */
|
||||||
|
size = size * size_factor;
|
||||||
|
|
||||||
|
/* Find largest power of 2 that is <= size (or 1 if size is 0). */
|
||||||
|
unsigned int shift = 0;
|
||||||
|
if ((size >> 16) > 0)
|
||||||
|
{
|
||||||
|
size = size >> 16;
|
||||||
|
shift += 16;
|
||||||
|
}
|
||||||
|
if ((size >> 8) > 0)
|
||||||
|
{
|
||||||
|
size = size >> 8;
|
||||||
|
shift += 8;
|
||||||
|
}
|
||||||
|
if ((size >> 4) > 0)
|
||||||
|
{
|
||||||
|
size = size >> 4;
|
||||||
|
shift += 4;
|
||||||
|
}
|
||||||
|
if ((size >> 2) > 0)
|
||||||
|
{
|
||||||
|
size = size >> 2;
|
||||||
|
shift += 2;
|
||||||
|
}
|
||||||
|
if ((size >> 1) > 0)
|
||||||
|
{
|
||||||
|
size = size >> 1;
|
||||||
|
shift += 1;
|
||||||
|
}
|
||||||
|
_log_size = shift;
|
||||||
|
_size = 1 << shift;
|
||||||
|
|
||||||
|
/* Allocate table. */
|
||||||
|
_table = new KeywordExt*[_size];
|
||||||
memset (_table, 0, _size * sizeof (*_table));
|
memset (_table, 0, _size * sizeof (*_table));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Destructor. */
|
||||||
Hash_Table::~Hash_Table ()
|
Hash_Table::~Hash_Table ()
|
||||||
{
|
{
|
||||||
if (option[DEBUG])
|
if (option[DEBUG])
|
||||||
@@ -76,24 +116,38 @@ Hash_Table::~Hash_Table ()
|
|||||||
|
|
||||||
fprintf (stderr, "\nend dumping hash table\n\n");
|
fprintf (stderr, "\nend dumping hash table\n\n");
|
||||||
}
|
}
|
||||||
|
delete[] _table;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If the ITEM is already in the hash table return the item found
|
/* Compares two items. */
|
||||||
in the table. Otherwise inserts the ITEM, and returns FALSE.
|
inline bool
|
||||||
Uses double hashing. */
|
Hash_Table::equal (KeywordExt *item1, KeywordExt *item2)
|
||||||
|
{
|
||||||
|
return item1->_selchars_length == item2->_selchars_length
|
||||||
|
&& memcmp (item1->_selchars, item2->_selchars, item2->_selchars_length)
|
||||||
|
== 0
|
||||||
|
&& (_ignore_length
|
||||||
|
|| item1->_allchars_length == item2->_allchars_length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Attempts to insert ITEM in the table. If there is already an equal
|
||||||
|
entry in it, returns it. Otherwise inserts ITEM and returns NULL. */
|
||||||
KeywordExt *
|
KeywordExt *
|
||||||
Hash_Table::insert (KeywordExt *item)
|
Hash_Table::insert (KeywordExt *item)
|
||||||
{
|
{
|
||||||
unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length);
|
unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length);
|
||||||
int probe = hash_val & (_size - 1);
|
unsigned int probe = hash_val & (_size - 1);
|
||||||
int increment = ((hash_val ^ item->_allchars_length) | 1) & (_size - 1);
|
unsigned int increment =
|
||||||
|
(((hash_val >> _log_size)
|
||||||
|
^ (_ignore_length ? 0 : item->_allchars_length))
|
||||||
|
<< 1) + 1;
|
||||||
|
/* Note that because _size is a power of 2 and increment is odd,
|
||||||
|
we have gcd(increment,_size) = 1, which guarantees that we'll find
|
||||||
|
an empty entry during the loop. */
|
||||||
|
|
||||||
while (_table[probe])
|
while (_table[probe] != NULL)
|
||||||
{
|
{
|
||||||
if (_table[probe]->_selchars_length == item->_selchars_length
|
if (equal (_table[probe], item))
|
||||||
&& memcmp (_table[probe]->_selchars, item->_selchars, item->_selchars_length) == 0
|
|
||||||
&& (_ignore_length || _table[probe]->_allchars_length == item->_allchars_length))
|
|
||||||
return _table[probe];
|
return _table[probe];
|
||||||
|
|
||||||
_collisions++;
|
_collisions++;
|
||||||
|
|||||||
@@ -28,18 +28,37 @@
|
|||||||
|
|
||||||
#include "keyword.h"
|
#include "keyword.h"
|
||||||
|
|
||||||
|
/* Hash table of KeywordExt* entries.
|
||||||
|
Two entries are considered equal if their _selchars are the same and
|
||||||
|
- if !ignore_length - if their _allchars_length are the same. */
|
||||||
|
|
||||||
class Hash_Table
|
class Hash_Table
|
||||||
{
|
{
|
||||||
private:
|
|
||||||
KeywordExt ** _table; /* Vector of pointers to linked lists of keywords. */
|
|
||||||
int _size; /* Size of the vector. */
|
|
||||||
int _collisions; /* Find out how well our double hashing is working! */
|
|
||||||
bool _ignore_length;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Hash_Table (KeywordExt **t, int s, bool ignore_len);
|
/* Constructor.
|
||||||
|
size is the maximum number of entries.
|
||||||
|
ignore_length determines a detail in the comparison function. */
|
||||||
|
Hash_Table (unsigned int size, bool ignore_length);
|
||||||
|
/* Destructor. */
|
||||||
~Hash_Table ();
|
~Hash_Table ();
|
||||||
|
/* Attempts to insert ITEM in the table. If there is already an equal
|
||||||
|
entry in it, returns it. Otherwise inserts ITEM and returns NULL. */
|
||||||
KeywordExt * insert (KeywordExt *item);
|
KeywordExt * insert (KeywordExt *item);
|
||||||
|
|
||||||
|
private:
|
||||||
|
/* Vector of entries. */
|
||||||
|
KeywordExt ** _table;
|
||||||
|
/* Size of the vector. */
|
||||||
|
unsigned int _size;
|
||||||
|
/* log2(_size). */
|
||||||
|
unsigned int _log_size;
|
||||||
|
/* A detail of the comparison function. */
|
||||||
|
bool _ignore_length;
|
||||||
|
/* Statistics: Number of collisions so far. */
|
||||||
|
unsigned int _collisions;
|
||||||
|
|
||||||
|
/* Compares two items. */
|
||||||
|
bool equal (KeywordExt *item1, KeywordExt *item2);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -31,9 +31,6 @@
|
|||||||
#include "options.h"
|
#include "options.h"
|
||||||
#include "hash-table.h"
|
#include "hash-table.h"
|
||||||
|
|
||||||
/* Make the hash table 10 times larger than the number of keyword entries. */
|
|
||||||
static const int TABLE_MULTIPLE = 10;
|
|
||||||
|
|
||||||
/* Efficiently returns the least power of two greater than or equal to X! */
|
/* Efficiently returns the least power of two greater than or equal to X! */
|
||||||
#define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X)))
|
#define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X)))
|
||||||
|
|
||||||
@@ -50,7 +47,6 @@ void
|
|||||||
Search::prepare ()
|
Search::prepare ()
|
||||||
{
|
{
|
||||||
KeywordExt_List *temp;
|
KeywordExt_List *temp;
|
||||||
KeywordExt_List *trail = NULL;
|
|
||||||
|
|
||||||
_total_keys = 0;
|
_total_keys = 0;
|
||||||
for (temp = _head; temp; temp = temp->rest())
|
for (temp = _head; temp; temp = temp->rest())
|
||||||
@@ -58,59 +54,57 @@ Search::prepare ()
|
|||||||
temp->first()->init_selchars(_occurrences);
|
temp->first()->init_selchars(_occurrences);
|
||||||
_total_keys++;
|
_total_keys++;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Hash table this number of times larger than keyword number. */
|
|
||||||
int table_size = (_list_len = _total_keys) * TABLE_MULTIPLE;
|
|
||||||
/* Table must be a power of 2 for the hash function scheme to work. */
|
|
||||||
KeywordExt **table = new KeywordExt*[POW (table_size)];
|
|
||||||
|
|
||||||
/* Make large hash table for efficiency. */
|
_list_len = _total_keys;
|
||||||
Hash_Table found_link (table, table_size, option[NOLENGTH]);
|
|
||||||
|
|
||||||
/* Test whether there are any links and also set the maximum length of
|
{
|
||||||
an identifier in the keyword list. */
|
/* Make hash table for efficiency. */
|
||||||
_total_duplicates = 0;
|
Hash_Table found_link (_list_len, option[NOLENGTH]);
|
||||||
_max_key_len = INT_MIN;
|
|
||||||
_min_key_len = INT_MAX;
|
|
||||||
for (temp = _head; temp; temp = temp->rest())
|
|
||||||
{
|
|
||||||
KeywordExt *keyword = temp->first();
|
|
||||||
KeywordExt *other_keyword = found_link.insert (keyword);
|
|
||||||
|
|
||||||
/* Check for links. We deal with these by building an equivalence class
|
/* Test whether there are any links and also set the maximum length of
|
||||||
of all duplicate values (i.e., links) so that only 1 keyword is
|
an identifier in the keyword list. */
|
||||||
representative of the entire collection. This *greatly* simplifies
|
_total_duplicates = 0;
|
||||||
processing during later stages of the program. */
|
_max_key_len = INT_MIN;
|
||||||
|
_min_key_len = INT_MAX;
|
||||||
|
KeywordExt_List *trail = NULL;
|
||||||
|
for (temp = _head; temp; temp = temp->rest())
|
||||||
|
{
|
||||||
|
KeywordExt *keyword = temp->first();
|
||||||
|
KeywordExt *other_keyword = found_link.insert (keyword);
|
||||||
|
|
||||||
if (other_keyword)
|
/* Check for links. We deal with these by building an equivalence class
|
||||||
{
|
of all duplicate values (i.e., links) so that only 1 keyword is
|
||||||
_total_duplicates++;
|
representative of the entire collection. This *greatly* simplifies
|
||||||
_list_len--;
|
processing during later stages of the program. */
|
||||||
trail->rest() = temp->rest();
|
|
||||||
temp->first()->_duplicate_link = other_keyword->_duplicate_link;
|
|
||||||
other_keyword->_duplicate_link = temp->first();
|
|
||||||
|
|
||||||
/* Complain if user hasn't enabled the duplicate option. */
|
if (other_keyword)
|
||||||
if (!option[DUP] || option[DEBUG])
|
{
|
||||||
fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
|
_total_duplicates++;
|
||||||
keyword->_allchars_length, keyword->_allchars,
|
_list_len--;
|
||||||
other_keyword->_allchars_length, other_keyword->_allchars,
|
trail->rest() = temp->rest();
|
||||||
keyword->_selchars_length, keyword->_selchars);
|
temp->first()->_duplicate_link = other_keyword->_duplicate_link;
|
||||||
}
|
other_keyword->_duplicate_link = temp->first();
|
||||||
else
|
|
||||||
{
|
|
||||||
temp->first()->_duplicate_link = NULL;
|
|
||||||
trail = temp;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Update minimum and maximum keyword length, if needed. */
|
/* Complain if user hasn't enabled the duplicate option. */
|
||||||
if (_max_key_len < keyword->_allchars_length)
|
if (!option[DUP] || option[DEBUG])
|
||||||
_max_key_len = keyword->_allchars_length;
|
fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
|
||||||
if (_min_key_len > keyword->_allchars_length)
|
keyword->_allchars_length, keyword->_allchars,
|
||||||
_min_key_len = keyword->_allchars_length;
|
other_keyword->_allchars_length, other_keyword->_allchars,
|
||||||
}
|
keyword->_selchars_length, keyword->_selchars);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
temp->first()->_duplicate_link = NULL;
|
||||||
|
trail = temp;
|
||||||
|
}
|
||||||
|
|
||||||
delete[] table;
|
/* Update minimum and maximum keyword length, if needed. */
|
||||||
|
if (_max_key_len < keyword->_allchars_length)
|
||||||
|
_max_key_len = keyword->_allchars_length;
|
||||||
|
if (_min_key_len > keyword->_allchars_length)
|
||||||
|
_min_key_len = keyword->_allchars_length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Exit program if links exist and option[DUP] not set, since we can't continue */
|
/* Exit program if links exist and option[DUP] not set, since we can't continue */
|
||||||
if (_total_duplicates)
|
if (_total_duplicates)
|
||||||
|
|||||||
Reference in New Issue
Block a user