1
0
mirror of https://git.savannah.gnu.org/git/gperf.git synced 2025-12-02 13:09:22 +00:00

Introduce new alpha_inc pass, to avoid artificial duplicates.

This commit is contained in:
Bruno Haible
2003-02-17 10:36:47 +00:00
parent 799d1c7534
commit ec800f65ec
11 changed files with 970 additions and 686 deletions

View File

@@ -31,15 +31,44 @@
#include "options.h"
#include "hash-table.h"
/* The most general form of the hash function is
hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
where Pos is a set of byte positions,
each alpha_inc[i] is a nonnegative integer,
each asso_values[c] is a nonnegative integer.
Theorem 1: If all keywords are different, there is a set Pos such that
all tuples (keyword[i] : i in Pos) are different.
Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
are nonnegative integers alpha_inc[i] such that all multisets
{keyword[i] + alpha_inc[i] : i in Pos} are different.
Theorem 3: If all multisets selchars[keyword] are different, where
selchars[keyword] denotes {keyword[i] + alpha_inc[i] : i in Pos}, there are
nonnegative integers asso_values[c] such that all hash values
sum (asso_values[c] : c in selchars[keyword]) are different.
Based on these three facts, we find the hash function in three steps:
Step 1 (Finding good byte positions):
Find a set Pos, as small as possible, such that all tuples
(keyword[i] : i in Pos) are different.
Step 2 (Finding good alpha increments):
Find nonnegative integers alpha_inc[i], as many of them as possible being
zero, and the others being as small as possible, such that all multisets
{keyword[i] + alpha_inc[i] : i in Pos} are different.
Step 3 (Finding good asso_values):
Find asso_values[c] such that all hash (keyword) are different.
*/
/* -------------------- Initialization and Preparation --------------------- */
/* Constructor. Records LIST as the head of the keyword list (_head).
   NOTE(review): this span is a rendered diff hunk and shows two
   member-initializer lists. Presumably the long one (which also initializes
   _key_positions, _alpha_size, _occurrences, _asso_values and _determined)
   is the removed version and the single-member one is the committed
   version - confirm against the repository. */
Search::Search (KeywordExt_List *list)
: _head (list),
_key_positions (option.get_key_positions()),
_alpha_size (option[SEVENBIT] ? 128 : 256),
_occurrences (new int[_alpha_size]),
_asso_values (new int[_alpha_size]),
_determined (new bool[_alpha_size])
: _head (list)
{
}
@@ -77,12 +106,14 @@ Search::preprepare ()
}
}
/* ---------------------- Finding good byte positions ---------------------- */
/* Initializes each keyword's _selchars array.
   Walks the keyword list starting at _head and forwards USE_ALL_CHARS and
   POSITIONS unchanged to every keyword.
   NOTE(review): this span is a rendered diff hunk; the function appears under
   both its old name (init_selchars) and its new name (init_selchars_tuple).
   Presumably only the *_tuple lines exist in the committed file - confirm. */
void
Search::init_selchars (bool use_all_chars, const Positions& positions) const
Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars(use_all_chars, positions);
temp->first()->init_selchars_tuple(use_all_chars, positions);
}
/* Deletes each keyword's _selchars array. */
@@ -95,29 +126,31 @@ Search::delete_selchars () const
/* Count the duplicate keywords that occur with a given set of positions.
   Sets up every keyword's _selchars for POSITIONS, inserts each keyword into
   a temporary Hash_Table, counts the insertions that yield a true result,
   then deletes the _selchars arrays again and returns the count.
   NOTE(review): this span is a rendered diff hunk showing both the old body
   (count_duplicates, init_selchars (false, ...), table at function scope) and
   the new body (count_duplicates_tuple, init_selchars_tuple
   (option[ALLCHARS], ...), table in a nested scope) - confirm which lines
   are live against the repository. */
unsigned int
Search::count_duplicates (const Positions& positions) const
Search::count_duplicates_tuple (const Positions& positions) const
{
init_selchars (false, positions);
init_selchars_tuple (option[ALLCHARS], positions);
unsigned int count = 0;
Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (representatives.insert (keyword))
count++;
}
/* The nested scope below ends the Hash_Table's lifetime before
   delete_selchars () runs. */
{
Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
/* presumably insert () yields a non-null/true result when an equivalent
   keyword is already in the table - verify against hash-table.h */
if (representatives.insert (keyword))
count++;
}
}
delete_selchars ();
return count;
}
/* Find good key positions. */
void
Search::find_positions ()
{
/* Determine good key positions. */
/* 1. Find positions that must occur in order to distinguish duplicates. */
Positions mandatory;
@@ -159,7 +192,7 @@ Search::find_positions ()
int imax = (_max_key_len < Positions::MAX_KEY_POS
? _max_key_len : Positions::MAX_KEY_POS);
Positions current = mandatory;
unsigned int current_duplicates_count = count_duplicates (current);
unsigned int current_duplicates_count = count_duplicates_tuple (current);
for (;;)
{
Positions best;
@@ -170,7 +203,7 @@ Search::find_positions ()
{
Positions tryal = current;
tryal.add (i);
unsigned int try_duplicates_count = count_duplicates (tryal);
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
/* We prefer 'try' to 'best' if it produces less duplicates,
or if it produces the same number of duplicates but with
@@ -203,7 +236,7 @@ Search::find_positions ()
{
Positions tryal = current;
tryal.remove (i);
unsigned int try_duplicates_count = count_duplicates (tryal);
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
/* We prefer 'try' to 'best' if it produces less duplicates,
or if it produces the same number of duplicates but with
@@ -243,7 +276,7 @@ Search::find_positions ()
tryal.remove (i2);
tryal.add (i3);
unsigned int try_duplicates_count =
count_duplicates (tryal);
count_duplicates_tuple (tryal);
/* We prefer 'try' to 'best' if it produces less duplicates,
or if it produces the same number of duplicates but with
@@ -269,18 +302,141 @@ Search::find_positions ()
_key_positions = current;
}
/* --------------------- Finding good alpha increments --------------------- */
/* Initializes each keyword's _selchars array. */
void
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc);
}
/* Count the duplicate keywords that occur with the given set of positions
and a given alpha_inc[] array. */
unsigned int
Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
{
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc);
unsigned int count = 0;
{
Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (representatives.insert (keyword))
count++;
}
}
delete_selchars ();
return count;
}
/* Find good _alpha_inc[].
   Computes an array of _max_key_len increments and stores it in _alpha_inc
   (allocated with new[]).  The search starts with all increments zero and,
   while the multiset duplicate count exceeds the tuple duplicate count for
   _key_positions, repeatedly bumps the increment of a single key position.
   All scratch arrays are heap-allocated: their sizes are only known at run
   time, and runtime-sized stack arrays are not standard C++.  */
void
Search::find_alpha_inc ()
{
  /* The goal is to choose _alpha_inc[] such that it doesn't introduce
     artificial duplicates.  */
  unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);

  /* Start with zero increments.  This is sufficient in most cases.  */
  unsigned int *current = new unsigned int [_max_key_len];
  for (int i = 0; i < _max_key_len; i++)
    current[i] = 0;
  unsigned int current_duplicates_count = count_duplicates_multiset (current);

  if (current_duplicates_count > duplicates_goal)
    {
      /* Look which _alpha_inc[i] we are free to increment.  */
      unsigned int nindices;
      if (option[ALLCHARS])
        nindices = _max_key_len;
      else
        {
          /* Ignore Positions::LASTCHAR.  Remember that since Positions are
             sorted in decreasing order, Positions::LASTCHAR comes last.  */
          nindices = ((_key_positions.get_size() == 0
                       || _key_positions[_key_positions.get_size() - 1]
                          != Positions::LASTCHAR)
                      ? _key_positions.get_size()
                      : _key_positions.get_size() - 1);
        }

      /* indices[j] is the zero-based index into the increment array of the
         j-th position that may be incremented.  */
      unsigned int *indices = new unsigned int [nindices];
      if (option[ALLCHARS])
        for (unsigned int j = 0; j < nindices; j++)
          indices[j] = j;
      else
        {
          PositionIterator iter (_key_positions);
          for (unsigned int j = 0; j < nindices; j++)
            {
              int key_pos = iter.next ();
              /* nindices was computed so that neither the end of the
                 positions nor LASTCHAR can show up here.  */
              if (key_pos == PositionIterator::EOS
                  || key_pos == Positions::LASTCHAR)
                abort ();
              /* Key positions are one-based, array indices zero-based.  */
              indices[j] = key_pos - 1;
            }
        }

      /* Perform several rounds of searching for a good alpha increment.
         Each round reduces the number of artificial collisions by adding
         an increment in a single key position.  */
      unsigned int *best = new unsigned int [_max_key_len];
      unsigned int *tryal = new unsigned int [_max_key_len];
      do
        {
          /* An increment of 1 is not always enough.  Try higher increments
             also.  */
          for (unsigned int inc = 1; ; inc++)
            {
              unsigned int best_duplicates_count = UINT_MAX;

              for (unsigned int j = 0; j < nindices; j++)
                {
                  memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
                  tryal[indices[j]] += inc;
                  unsigned int try_duplicates_count =
                    count_duplicates_multiset (tryal);

                  /* We prefer 'try' to 'best' if it produces less
                     duplicates.  */
                  if (try_duplicates_count < best_duplicates_count)
                    {
                      memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
                      best_duplicates_count = try_duplicates_count;
                    }
                }

              /* Stop this round when we got an improvement.  */
              if (best_duplicates_count < current_duplicates_count)
                {
                  memcpy (current, best, _max_key_len * sizeof (unsigned int));
                  current_duplicates_count = best_duplicates_count;
                  break;
                }
            }
        }
      while (current_duplicates_count > duplicates_goal);

      /* Release the scratch arrays; only 'current' outlives this function.  */
      delete[] tryal;
      delete[] best;
      delete[] indices;
    }

  /* Ownership of 'current' passes to the object.  */
  _alpha_inc = current;
}
/* ------------------------------------------------------------------------- */
void
Search::prepare ()
{
KeywordExt_List *temp;
preprepare ();
if (!option[POSITIONS])
find_positions ();
/* Initialize each keyword's _selchars array. */
init_selchars (option[ALLCHARS], _key_positions);
init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc);
/* Check for duplicates, i.e. keywords with the same _selchars array
(and - if !option[NOLENGTH] - also the same length).
@@ -357,7 +513,16 @@ Search::prepare ()
}
}
/* Compute _alpha_size, the upper bound on the indices passed to
asso_values[]. */
unsigned int max_alpha_inc = 0;
for (int i = 0; i < _max_key_len; i++)
if (max_alpha_inc < _alpha_inc[i])
max_alpha_inc = _alpha_inc[i];
_alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
/* Compute the occurrences of each character in the alphabet. */
_occurrences = new int[_alpha_size];
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
for (temp = _head; temp; temp = temp->rest())
{
@@ -366,6 +531,10 @@ Search::prepare ()
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
_occurrences[*ptr]++;
}
/* Memory allocation. */
_asso_values = new int[_alpha_size];
_determined = new bool[_alpha_size];
}
/* ---------------- Reordering the Keyword list (optional) ----------------- */
@@ -878,6 +1047,11 @@ void
Search::optimize ()
{
/* Preparations. */
preprepare ();
_key_positions = option.get_key_positions();
if (!option[POSITIONS])
find_positions ();
find_alpha_inc ();
prepare ();
if (option[ORDER])
reorder ();
@@ -1025,4 +1199,5 @@ Search::~Search ()
}
delete[] _asso_values;
delete[] _occurrences;
delete[] _alpha_inc;
}