1
0
mirror of https://git.savannah.gnu.org/git/gperf.git synced 2025-12-02 13:09:22 +00:00

Introduce new alpha_inc pass, to avoid artificial duplicates.

This commit is contained in:
Bruno Haible
2003-02-17 10:36:47 +00:00
parent 799d1c7534
commit ec800f65ec
11 changed files with 970 additions and 686 deletions

View File

@@ -57,8 +57,9 @@ static inline void sort_char_set (unsigned int *base, int len)
Furthermore we sort the selchars array, to ease detection of duplicates
later.
*/
void
KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
unsigned int *
KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
{
const char *k = _allchars;
unsigned int *key_set =
@@ -69,7 +70,10 @@ KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
/* Use all the character positions in the KEY. */
for (int i = _allchars_length; i > 0; k++, i--)
{
*ptr = static_cast<unsigned char>(*k);
unsigned int c = static_cast<unsigned char>(*k);
if (alpha_inc)
c += alpha_inc[k-_allchars];
*ptr = c;
ptr++;
}
else
@@ -81,24 +85,45 @@ KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
{
unsigned int c;
if (i == Positions::LASTCHAR)
/* Special notation for last KEY position, i.e. '$'. */
*ptr = static_cast<unsigned char>(_allchars[_allchars_length - 1]);
c = static_cast<unsigned char>(_allchars[_allchars_length - 1]);
else if (i <= _allchars_length)
/* Within range of KEY length, so we'll keep it. */
*ptr = static_cast<unsigned char>(_allchars[i - 1]);
{
/* Within range of KEY length, so we'll keep it. */
c = static_cast<unsigned char>(_allchars[i - 1]);
if (alpha_inc)
c += alpha_inc[i - 1];
}
else
/* Out of range of KEY length, so we'll just skip it. */
continue;
*ptr = c;
ptr++;
}
}
/* Sort the KEY_SET items alphabetically. */
sort_char_set (key_set, ptr - key_set);
_selchars = key_set;
_selchars_length = ptr - key_set;
return key_set;
}
/* Initializes selchars and selchars_length from the selected key positions,
   without reordering and without any alpha increments.  */
void
KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions)
{
  /* A null increment table makes init_selchars_low use the raw keyword
     bytes.  The array pointer it returns is not needed here.  */
  const unsigned int * const no_alpha_inc = NULL;
  (void) init_selchars_low (use_all_chars, positions, no_alpha_inc);
}
/* Initializes selchars and selchars_length from the selected key positions,
   adding alpha_inc[] to the selected bytes, and then sorts the result so
   that duplicates are easier to detect later.  */
void
KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
{
  /* Collect the (possibly incremented) characters.  This call also sets
     _selchars_length, which is read right below.  */
  unsigned int * const chars =
    init_selchars_low (use_all_chars, positions, alpha_inc);

  /* Put the collected elements into sorted order.  */
  sort_char_set (chars, _selchars_length);
}
/* Deletes selchars. */

View File

@@ -67,8 +67,10 @@ struct KeywordExt : public Keyword
KeywordExt * _duplicate_link;
/* Methods depending on the keyposition list. */
/* Initializes selchars and selchars_length. */
void init_selchars (bool use_all_chars, const Positions& positions);
/* Initializes selchars and selchars_length, without reordering. */
void init_selchars_tuple (bool use_all_chars, const Positions& positions);
/* Initializes selchars and selchars_length, with reordering. */
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
/* Deletes selchars. */
void delete_selchars ();
@@ -78,6 +80,9 @@ struct KeywordExt : public Keyword
/* Data members used by the output routines. */
int _final_index;
private:
unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
};
/* An abstract factory for creating Keyword instances.

View File

@@ -106,6 +106,7 @@ main (int argc, char *argv[])
searcher._max_key_len,
searcher._min_key_len,
searcher._key_positions,
searcher._alpha_inc,
searcher._total_duplicates,
searcher._alpha_size,
searcher._occurrences,

View File

@@ -88,8 +88,8 @@ Output::Output (KeywordExt_List *head, const char *struct_decl,
const char *verbatim_code, const char *verbatim_code_end,
unsigned int verbatim_code_lineno,
int total_keys, int max_key_len, int min_key_len,
const Positions& positions, int total_duplicates,
int alpha_size, const int *occurrences,
const Positions& positions, const unsigned int *alpha_inc,
int total_duplicates, int alpha_size, const int *occurrences,
const int *asso_values)
: _head (head), _struct_decl (struct_decl),
_struct_decl_lineno (struct_decl_lineno), _return_type (return_type),
@@ -102,7 +102,7 @@ Output::Output (KeywordExt_List *head, const char *struct_decl,
_verbatim_code_lineno (verbatim_code_lineno),
_total_keys (total_keys),
_max_key_len (max_key_len), _min_key_len (min_key_len),
_key_positions (positions),
_key_positions (positions), _alpha_inc (alpha_inc),
_total_duplicates (total_duplicates), _alpha_size (alpha_size),
_occurrences (occurrences), _asso_values (asso_values)
{
@@ -521,9 +521,14 @@ Output::output_hash_function () const
option[NOLENGTH] ? "len" : "hval");
for (int i = _max_key_len; i > 0; i--)
printf (" case %d:\n"
" hval += asso_values[%sstr[%d]];\n",
i, char_to_index, i - 1);
{
printf (" case %d:\n"
" hval += asso_values[%sstr[%d]",
i, char_to_index, i - 1);
if (_alpha_inc[i - 1])
printf ("+%u", _alpha_inc[i - 1]);
printf ("];\n");
}
printf (" break;\n"
" }\n"
@@ -560,13 +565,21 @@ Output::output_hash_function () const
&& _key_positions[0] == 1
&& _key_positions[1] == Positions::LASTCHAR)
/* Optimize special case of "-k 1,$". */
printf ("asso_values[%sstr[len - 1]] + asso_values[%sstr[0]]",
char_to_index, char_to_index);
{
printf ("asso_values[%sstr[len - 1]] + asso_values[%sstr[0]",
char_to_index, char_to_index);
if (_alpha_inc[0])
printf ("+%u", _alpha_inc[0]);
printf ("]");
}
else
{
for (; key_pos != Positions::LASTCHAR; )
{
printf ("asso_values[%sstr[%d]]", char_to_index, key_pos - 1);
printf ("asso_values[%sstr[%d]", char_to_index, key_pos - 1);
if (_alpha_inc[key_pos - 1])
printf ("+%u", _alpha_inc[key_pos - 1]);
printf ("]");
if ((key_pos = iter.next ()) != PositionIterator::EOS)
printf (" + ");
else
@@ -601,8 +614,11 @@ Output::output_hash_function () const
for ( ; i >= key_pos; i--)
printf (" case %d:\n", i);
printf (" hval += asso_values[%sstr[%d]];\n",
printf (" hval += asso_values[%sstr[%d]",
char_to_index, key_pos - 1);
if (_alpha_inc[key_pos - 1])
printf ("+%u", _alpha_inc[key_pos - 1]);
printf ("];\n");
key_pos = iter.next ();
}

View File

@@ -51,6 +51,7 @@ public:
int total_keys,
int max_key_len, int min_key_len,
const Positions& positions,
const unsigned int *alpha_inc,
int total_duplicates,
int alpha_size,
const int *occurrences,
@@ -121,6 +122,8 @@ private:
int const _min_key_len;
/* Key positions. Only to be used if !options[ALLCHARS]. */
Positions const _key_positions;
/* Adjustments to add to bytes at specific key positions. */
const unsigned int * const _alpha_inc;
/* Total number of duplicate hash values. */
int const _total_duplicates;
/* Minimum hash value for all keywords. */

View File

@@ -31,15 +31,44 @@
#include "options.h"
#include "hash-table.h"
/* The most general form of the hash function is
hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
where Pos is a set of byte positions,
each alpha_inc[i] is a nonnegative integer,
each asso_values[c] is a nonnegative integer.
Theorem 1: If all keywords are different, there is a set Pos such that
all tuples (keyword[i] : i in Pos) are different.
Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
are nonnegative integers alpha_inc[i] such that all multisets
{keyword[i] + alpha_inc[i] : i in Pos} are different.
Theorem 3: If all multisets selchars[keyword] are different, there are
nonnegative integers asso_values[c] such that all hash values
sum (asso_values[c] : c in selchars[keyword]) are different.
Based on these three facts, we find the hash function in three steps:
Step 1 (Finding good byte positions):
Find a set Pos, as small as possible, such that all tuples
(keyword[i] : i in Pos) are different.
Step 2 (Finding good alpha increments):
Find nonnegative integers alpha_inc[i], as many of them as possible being
zero, and the others being as small as possible, such that all multisets
{keyword[i] + alpha_inc[i] : i in Pos} are different.
Step 3 (Finding good asso_values):
Find asso_values[c] such that all hash (keyword) are different.
*/
/* -------------------- Initialization and Preparation --------------------- */
Search::Search (KeywordExt_List *list)
: _head (list),
_key_positions (option.get_key_positions()),
_alpha_size (option[SEVENBIT] ? 128 : 256),
_occurrences (new int[_alpha_size]),
_asso_values (new int[_alpha_size]),
_determined (new bool[_alpha_size])
: _head (list)
{
}
@@ -77,12 +106,14 @@ Search::preprepare ()
}
}
/* ---------------------- Finding good byte positions ---------------------- */
/* Initializes each keyword's _selchars array. */
void
Search::init_selchars (bool use_all_chars, const Positions& positions) const
Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars(use_all_chars, positions);
temp->first()->init_selchars_tuple(use_all_chars, positions);
}
/* Deletes each keyword's _selchars array. */
@@ -95,29 +126,31 @@ Search::delete_selchars () const
/* Count the duplicate keywords that occur with a given set of positions. */
unsigned int
Search::count_duplicates (const Positions& positions) const
Search::count_duplicates_tuple (const Positions& positions) const
{
init_selchars (false, positions);
init_selchars_tuple (option[ALLCHARS], positions);
unsigned int count = 0;
Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (representatives.insert (keyword))
count++;
}
{
Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (representatives.insert (keyword))
count++;
}
}
delete_selchars ();
return count;
}
/* Find good key positions. */
void
Search::find_positions ()
{
/* Determine good key positions. */
/* 1. Find positions that must occur in order to distinguish duplicates. */
Positions mandatory;
@@ -159,7 +192,7 @@ Search::find_positions ()
int imax = (_max_key_len < Positions::MAX_KEY_POS
? _max_key_len : Positions::MAX_KEY_POS);
Positions current = mandatory;
unsigned int current_duplicates_count = count_duplicates (current);
unsigned int current_duplicates_count = count_duplicates_tuple (current);
for (;;)
{
Positions best;
@@ -170,7 +203,7 @@ Search::find_positions ()
{
Positions tryal = current;
tryal.add (i);
unsigned int try_duplicates_count = count_duplicates (tryal);
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
/* We prefer 'try' to 'best' if it produces less duplicates,
or if it produces the same number of duplicates but with
@@ -203,7 +236,7 @@ Search::find_positions ()
{
Positions tryal = current;
tryal.remove (i);
unsigned int try_duplicates_count = count_duplicates (tryal);
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
/* We prefer 'try' to 'best' if it produces less duplicates,
or if it produces the same number of duplicates but with
@@ -243,7 +276,7 @@ Search::find_positions ()
tryal.remove (i2);
tryal.add (i3);
unsigned int try_duplicates_count =
count_duplicates (tryal);
count_duplicates_tuple (tryal);
/* We prefer 'try' to 'best' if it produces less duplicates,
or if it produces the same number of duplicates but with
@@ -269,18 +302,141 @@ Search::find_positions ()
_key_positions = current;
}
/* --------------------- Finding good alpha increments --------------------- */
/* Initializes each keyword's _selchars array. */
void
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc);
}
/* Count the duplicate keywords that occur with the given set of positions
and a given alpha_inc[] array. */
unsigned int
Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
{
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc);
unsigned int count = 0;
{
Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (representatives.insert (keyword))
count++;
}
}
delete_selchars ();
return count;
}
/* Find good _alpha_inc[].
   Computes an array of _max_key_len nonnegative increments such that the
   multisets {keyword[i] + alpha_inc[i]} produce no more duplicates than the
   tuples (keyword[i]) do for the current _key_positions, and stores the
   newly allocated array in _alpha_inc.  */
void
Search::find_alpha_inc ()
{
  /* The goal is to choose _alpha_inc[] such that it doesn't introduce
     artificial duplicates.  */
  unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);

  /* Start with zero increments.  This is sufficient in most cases.  */
  unsigned int *current = new unsigned int [_max_key_len];
  for (int i = 0; i < _max_key_len; i++)
    current[i] = 0;
  unsigned int current_duplicates_count = count_duplicates_multiset (current);

  if (current_duplicates_count > duplicates_goal)
    {
      /* Look which _alpha_inc[i] we are free to increment.  */
      unsigned int nindices;
      if (option[ALLCHARS])
        nindices = _max_key_len;
      else
        {
          /* Ignore Positions::LASTCHAR.  Remember that since Positions are
             sorted in decreasing order, Positions::LASTCHAR comes last.  */
          nindices = (_key_positions.get_size() == 0
                      || _key_positions[_key_positions.get_size() - 1]
                         != Positions::LASTCHAR
                      ? _key_positions.get_size()
                      : _key_positions.get_size() - 1);
        }

      /* The work arrays are allocated on the heap: arrays whose size is
         only known at run time are not valid ISO C++ as automatic
         variables.  */
      unsigned int *indices = new unsigned int [nindices];
      if (option[ALLCHARS])
        for (unsigned int j = 0; j < nindices; j++)
          indices[j] = j;
      else
        {
          PositionIterator iter (_key_positions);
          for (unsigned int j = 0; j < nindices; j++)
            {
              int key_pos = iter.next ();
              if (key_pos == PositionIterator::EOS
                  || key_pos == Positions::LASTCHAR)
                abort ();
              indices[j] = key_pos - 1;
            }
        }

      unsigned int *best = new unsigned int [_max_key_len];
      unsigned int *tryal = new unsigned int [_max_key_len];

      /* Without any position to increment, no candidate could ever be
         tried and the search below would never terminate; keep the zero
         increments in that case.  */
      if (nindices > 0)
        {
          /* Perform several rounds of searching for a good alpha increment.
             Each round reduces the number of artificial collisions by adding
             an increment in a single key position.  */
          do
            {
              /* An increment of 1 is not always enough.  Try higher
                 increments also.  */
              for (unsigned int inc = 1; ; inc++)
                {
                  unsigned int best_duplicates_count = UINT_MAX;

                  for (unsigned int j = 0; j < nindices; j++)
                    {
                      memcpy (tryal, current,
                              _max_key_len * sizeof (unsigned int));
                      tryal[indices[j]] += inc;
                      unsigned int try_duplicates_count =
                        count_duplicates_multiset (tryal);

                      /* We prefer 'try' to 'best' if it produces fewer
                         duplicates.  */
                      if (try_duplicates_count < best_duplicates_count)
                        {
                          memcpy (best, tryal,
                                  _max_key_len * sizeof (unsigned int));
                          best_duplicates_count = try_duplicates_count;
                        }
                    }

                  /* Stop this round when we got an improvement.  */
                  if (best_duplicates_count < current_duplicates_count)
                    {
                      memcpy (current, best,
                              _max_key_len * sizeof (unsigned int));
                      current_duplicates_count = best_duplicates_count;
                      break;
                    }
                }
            }
          while (current_duplicates_count > duplicates_goal);
        }

      delete[] tryal;
      delete[] best;
      delete[] indices;
    }

  /* Ownership of the array passes to the _alpha_inc member.  */
  _alpha_inc = current;
}
/* ------------------------------------------------------------------------- */
void
Search::prepare ()
{
KeywordExt_List *temp;
preprepare ();
if (!option[POSITIONS])
find_positions ();
/* Initialize each keyword's _selchars array. */
init_selchars (option[ALLCHARS], _key_positions);
init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc);
/* Check for duplicates, i.e. keywords with the same _selchars array
(and - if !option[NOLENGTH] - also the same length).
@@ -357,7 +513,16 @@ Search::prepare ()
}
}
/* Compute _alpha_size, the upper bound on the indices passed to
asso_values[]. */
unsigned int max_alpha_inc = 0;
for (int i = 0; i < _max_key_len; i++)
if (max_alpha_inc < _alpha_inc[i])
max_alpha_inc = _alpha_inc[i];
_alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
/* Compute the occurrences of each character in the alphabet. */
_occurrences = new int[_alpha_size];
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
for (temp = _head; temp; temp = temp->rest())
{
@@ -366,6 +531,10 @@ Search::prepare ()
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
_occurrences[*ptr]++;
}
/* Memory allocation. */
_asso_values = new int[_alpha_size];
_determined = new bool[_alpha_size];
}
/* ---------------- Reordering the Keyword list (optional) ----------------- */
@@ -878,6 +1047,11 @@ void
Search::optimize ()
{
/* Preparations. */
preprepare ();
_key_positions = option.get_key_positions();
if (!option[POSITIONS])
find_positions ();
find_alpha_inc ();
prepare ();
if (option[ORDER])
reorder ();
@@ -1025,4 +1199,5 @@ Search::~Search ()
}
delete[] _asso_values;
delete[] _occurrences;
delete[] _alpha_inc;
}

View File

@@ -40,15 +40,26 @@ private:
void preprepare ();
/* Initializes each keyword's _selchars array. */
void init_selchars (bool use_all_chars, const Positions& positions) const;
void init_selchars_tuple (bool use_all_chars, const Positions& positions) const;
/* Deletes each keyword's _selchars array. */
void delete_selchars () const;
/* Count the duplicate keywords that occur with a given set of positions. */
unsigned int count_duplicates (const Positions& positions) const;
unsigned int count_duplicates_tuple (const Positions& positions) const;
/* Find good key positions. */
void find_positions ();
/* Initializes each keyword's _selchars array. */
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const;
/* Count the duplicate keywords that occur with the given set of positions
and a given alpha_inc[] array. */
unsigned int count_duplicates_multiset (const unsigned int *alpha_inc) const;
/* Find good _alpha_inc[]. */
void find_alpha_inc ();
void prepare ();
/* Computes the sum of occurrences of the _selchars of a keyword. */
@@ -112,19 +123,22 @@ public:
/* User-specified or computed key positions. */
Positions _key_positions;
/* Adjustments to add to bytes at specific key positions. */
unsigned int * _alpha_inc;
/* Total number of duplicates that have been moved to _duplicate_link lists
(not counting their representatives which stay on the main list). */
int _total_duplicates;
/* Size of alphabet. */
int const _alpha_size;
int _alpha_size;
/* Counts occurrences of each key set character.
_occurrences[c] is the number of times that c occurs among the _selchars
of a keyword. */
int * const _occurrences;
int * _occurrences;
/* Value associated with each character. */
int * const _asso_values;
int * _asso_values;
private:
@@ -132,7 +146,7 @@ private:
int _list_len;
/* Vector used during Search::reorder(). */
bool * const _determined;
bool * _determined;
/* Exclusive upper bound for every _asso_values[c]. A power of 2. */
int _asso_value_max;