1
0
mirror of https://git.savannah.gnu.org/git/gperf.git synced 2025-12-02 13:09:22 +00:00

Introduce new alpha_inc pass, to avoid artificial duplicates.

This commit is contained in:
Bruno Haible
2003-02-17 10:36:47 +00:00
parent 799d1c7534
commit ec800f65ec
11 changed files with 970 additions and 686 deletions

View File

@@ -1,5 +1,44 @@
2002-11-17 Bruno Haible <bruno@clisp.org> 2002-11-17 Bruno Haible <bruno@clisp.org>
Avoid artificial duplicates.
* src/keyword.h (KeywordExt::init_selchars_tuple): New declaration.
(KeywordExt::init_selchars_multiset): Renamed from
KeywordExt::init_selchars.
(KeywordExt::init_selchars_low): New declaration.
* src/keyword.cc (KeywordExt::init_selchars_low): Renamed from
KeywordExt::init_selchars. Add alpha_inc argument. Remove sorting.
(KeywordExt::init_selchars_tuple): New method.
(KeywordExt::init_selchars_multiset): New method, replaces
KeywordExt::init_selchars.
* src/search.h (Search::init_selchars_tuple): Renamed from
Search::init_selchars.
(Search::count_duplicates_tuple): Renamed from Search::count_duplicates.
(Search::init_selchars_multiset, Search::count_duplicates_multiset,
Search::find_alpha_inc): New declarations.
(Search::_alpha_inc): New field.
(Search::_alpha_size, Search::_occurrences, Search::_asso_values,
Search::_determined): Make non-const.
* src/search.cc (Search::Search): Don't initialize _key_positions,
_alpha_size, _occurrences, _asso_values, _determined here.
(Search::init_selchars_tuple): Renamed from Search::init_selchars.
(Search::count_duplicates_tuple): Renamed from Search::count_duplicates.
(Search::find_positions): Update.
(Search::init_selchars_multiset, Search::count_duplicates_multiset,
Search::find_alpha_inc): New methods.
(Search::prepare): Move preprepare, find_positions calls away.
Initialize _alpha_size, _occurrences, _asso_values, _determined here.
(Search::optimize): Call preprepare, find_positions here. Initialize
_key_positions here.
(Search::~Search): Deallocate _alpha_inc.
* src/output.cc (Output::Output): Add alpha_inc argument.
(Output::output_hash_function): Use _alpha_inc.
* src/output.h (Output::Output): Add alpha_inc argument.
(Output::_alpha_inc): New field.
* src/main.cc (main): Pass _alpha_inc from Search to Output.
* tests/chill.exp: Update.
* doc/gperf.texi (Algorithmic Details): Remove description of
artificial duplicates.
* src/keyword.h (KeywordExt::_selchars): Change type to * src/keyword.h (KeywordExt::_selchars): Change type to
'const unsigned int *'. 'const unsigned int *'.
* src/keyword.cc (sort_char_set): Change argument type to * src/keyword.cc (sort_char_set): Change argument type to

3
NEWS
View File

@@ -31,6 +31,9 @@ New in 2.8:
computed depending on the set of keywords. computed depending on the set of keywords.
* If the input file is given by name, the output file will now contain * If the input file is given by name, the output file will now contain
#line directives referring to the input file. #line directives referring to the input file.
* Some keyword sets containing permutations, like { "xy", "yx", "xz", "zx" }
or { "abc", "acb", "bca", "cab" }, are now handled by gperf without
requiring the option -D.
* Bug fixes. * Bug fixes.
New in 2.7.2: New in 2.7.2:

View File

@@ -993,7 +993,7 @@ through a search that minimizes the number of byte positions.
@itemx --duplicates @itemx --duplicates
@cindex Duplicates @cindex Duplicates
Handle keywords whose selected byte sets hash to duplicate values. Handle keywords whose selected byte sets hash to duplicate values.
Duplicate hash values can occur for three reasons: Duplicate hash values can occur for two reasons:
@itemize @bullet @itemize @bullet
@item @item
@@ -1003,15 +1003,6 @@ However, frequently only a very small number of duplicates occur, and
the majority of keywords still require one probe into the table. To the majority of keywords still require one probe into the table. To
overcome this problem, the option @samp{-m 50} should be used. overcome this problem, the option @samp{-m 50} should be used.
@item
Since the @code{gperf} generated hash function treats the bytes at
different byte positions with equal weight, keywords that are permutations
of each other can lead to the same hash function value if they are not
disambiguated by the set of selected byte positions. Sometimes even this
is not possible; for example, the keyword set @{"xy", "yx", "xz", "zx"@}
will always lead to duplicates, regardless how the selected byte positions
are chosen. You can use the option @samp{-D} to handle this rare case.
@item @item
Sometimes a set of keywords may have the same names, but possess different Sometimes a set of keywords may have the same names, but possess different
attributes. With the -D option @code{gperf} treats all these keywords as attributes. With the -D option @code{gperf} treats all these keywords as

View File

@@ -57,8 +57,9 @@ static inline void sort_char_set (unsigned int *base, int len)
Furthermore we sort the selchars array, to ease detection of duplicates Furthermore we sort the selchars array, to ease detection of duplicates
later. later.
*/ */
void
KeywordExt::init_selchars (bool use_all_chars, const Positions& positions) unsigned int *
KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
{ {
const char *k = _allchars; const char *k = _allchars;
unsigned int *key_set = unsigned int *key_set =
@@ -69,7 +70,10 @@ KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
/* Use all the character positions in the KEY. */ /* Use all the character positions in the KEY. */
for (int i = _allchars_length; i > 0; k++, i--) for (int i = _allchars_length; i > 0; k++, i--)
{ {
*ptr = static_cast<unsigned char>(*k); unsigned int c = static_cast<unsigned char>(*k);
if (alpha_inc)
c += alpha_inc[k-_allchars];
*ptr = c;
ptr++; ptr++;
} }
else else
@@ -81,24 +85,45 @@ KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
for (int i; (i = iter.next ()) != PositionIterator::EOS; ) for (int i; (i = iter.next ()) != PositionIterator::EOS; )
{ {
unsigned int c;
if (i == Positions::LASTCHAR) if (i == Positions::LASTCHAR)
/* Special notation for last KEY position, i.e. '$'. */ /* Special notation for last KEY position, i.e. '$'. */
*ptr = static_cast<unsigned char>(_allchars[_allchars_length - 1]); c = static_cast<unsigned char>(_allchars[_allchars_length - 1]);
else if (i <= _allchars_length) else if (i <= _allchars_length)
{
/* Within range of KEY length, so we'll keep it. */ /* Within range of KEY length, so we'll keep it. */
*ptr = static_cast<unsigned char>(_allchars[i - 1]); c = static_cast<unsigned char>(_allchars[i - 1]);
if (alpha_inc)
c += alpha_inc[i - 1];
}
else else
/* Out of range of KEY length, so we'll just skip it. */ /* Out of range of KEY length, so we'll just skip it. */
continue; continue;
*ptr = c;
ptr++; ptr++;
} }
} }
/* Sort the KEY_SET items alphabetically. */
sort_char_set (key_set, ptr - key_set);
_selchars = key_set; _selchars = key_set;
_selchars_length = ptr - key_set; _selchars_length = ptr - key_set;
return key_set;
}
/* Initializes selchars and selchars_length as a tuple: the selected bytes
   are stored without sorting and without any alpha increments.  */
void
KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions)
{
  /* No increments are wanted for the tuple form, hence the NULL array.
     The pointer returned by init_selchars_low is not needed here.  */
  const unsigned int * const no_alpha_inc = NULL;
  init_selchars_low (use_all_chars, positions, no_alpha_inc);
}
void
KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
{
unsigned int *selchars =
init_selchars_low (use_all_chars, positions, alpha_inc);
/* Sort the selchars elements alphabetically. */
sort_char_set (selchars, _selchars_length);
} }
/* Deletes selchars. */ /* Deletes selchars. */

View File

@@ -67,8 +67,10 @@ struct KeywordExt : public Keyword
KeywordExt * _duplicate_link; KeywordExt * _duplicate_link;
/* Methods depending on the keyposition list. */ /* Methods depending on the keyposition list. */
/* Initializes selchars and selchars_length. */ /* Initializes selchars and selchars_length, without reordering. */
void init_selchars (bool use_all_chars, const Positions& positions); void init_selchars_tuple (bool use_all_chars, const Positions& positions);
/* Initializes selchars and selchars_length, with reordering. */
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
/* Deletes selchars. */ /* Deletes selchars. */
void delete_selchars (); void delete_selchars ();
@@ -78,6 +80,9 @@ struct KeywordExt : public Keyword
/* Data members used by the output routines. */ /* Data members used by the output routines. */
int _final_index; int _final_index;
private:
unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
}; };
/* An abstract factory for creating Keyword instances. /* An abstract factory for creating Keyword instances.

View File

@@ -106,6 +106,7 @@ main (int argc, char *argv[])
searcher._max_key_len, searcher._max_key_len,
searcher._min_key_len, searcher._min_key_len,
searcher._key_positions, searcher._key_positions,
searcher._alpha_inc,
searcher._total_duplicates, searcher._total_duplicates,
searcher._alpha_size, searcher._alpha_size,
searcher._occurrences, searcher._occurrences,

View File

@@ -88,8 +88,8 @@ Output::Output (KeywordExt_List *head, const char *struct_decl,
const char *verbatim_code, const char *verbatim_code_end, const char *verbatim_code, const char *verbatim_code_end,
unsigned int verbatim_code_lineno, unsigned int verbatim_code_lineno,
int total_keys, int max_key_len, int min_key_len, int total_keys, int max_key_len, int min_key_len,
const Positions& positions, int total_duplicates, const Positions& positions, const unsigned int *alpha_inc,
int alpha_size, const int *occurrences, int total_duplicates, int alpha_size, const int *occurrences,
const int *asso_values) const int *asso_values)
: _head (head), _struct_decl (struct_decl), : _head (head), _struct_decl (struct_decl),
_struct_decl_lineno (struct_decl_lineno), _return_type (return_type), _struct_decl_lineno (struct_decl_lineno), _return_type (return_type),
@@ -102,7 +102,7 @@ Output::Output (KeywordExt_List *head, const char *struct_decl,
_verbatim_code_lineno (verbatim_code_lineno), _verbatim_code_lineno (verbatim_code_lineno),
_total_keys (total_keys), _total_keys (total_keys),
_max_key_len (max_key_len), _min_key_len (min_key_len), _max_key_len (max_key_len), _min_key_len (min_key_len),
_key_positions (positions), _key_positions (positions), _alpha_inc (alpha_inc),
_total_duplicates (total_duplicates), _alpha_size (alpha_size), _total_duplicates (total_duplicates), _alpha_size (alpha_size),
_occurrences (occurrences), _asso_values (asso_values) _occurrences (occurrences), _asso_values (asso_values)
{ {
@@ -521,9 +521,14 @@ Output::output_hash_function () const
option[NOLENGTH] ? "len" : "hval"); option[NOLENGTH] ? "len" : "hval");
for (int i = _max_key_len; i > 0; i--) for (int i = _max_key_len; i > 0; i--)
{
printf (" case %d:\n" printf (" case %d:\n"
" hval += asso_values[%sstr[%d]];\n", " hval += asso_values[%sstr[%d]",
i, char_to_index, i - 1); i, char_to_index, i - 1);
if (_alpha_inc[i - 1])
printf ("+%u", _alpha_inc[i - 1]);
printf ("];\n");
}
printf (" break;\n" printf (" break;\n"
" }\n" " }\n"
@@ -560,13 +565,21 @@ Output::output_hash_function () const
&& _key_positions[0] == 1 && _key_positions[0] == 1
&& _key_positions[1] == Positions::LASTCHAR) && _key_positions[1] == Positions::LASTCHAR)
/* Optimize special case of "-k 1,$". */ /* Optimize special case of "-k 1,$". */
printf ("asso_values[%sstr[len - 1]] + asso_values[%sstr[0]]", {
printf ("asso_values[%sstr[len - 1]] + asso_values[%sstr[0]",
char_to_index, char_to_index); char_to_index, char_to_index);
if (_alpha_inc[0])
printf ("+%u", _alpha_inc[0]);
printf ("]");
}
else else
{ {
for (; key_pos != Positions::LASTCHAR; ) for (; key_pos != Positions::LASTCHAR; )
{ {
printf ("asso_values[%sstr[%d]]", char_to_index, key_pos - 1); printf ("asso_values[%sstr[%d]", char_to_index, key_pos - 1);
if (_alpha_inc[key_pos - 1])
printf ("+%u", _alpha_inc[key_pos - 1]);
printf ("]");
if ((key_pos = iter.next ()) != PositionIterator::EOS) if ((key_pos = iter.next ()) != PositionIterator::EOS)
printf (" + "); printf (" + ");
else else
@@ -601,8 +614,11 @@ Output::output_hash_function () const
for ( ; i >= key_pos; i--) for ( ; i >= key_pos; i--)
printf (" case %d:\n", i); printf (" case %d:\n", i);
printf (" hval += asso_values[%sstr[%d]];\n", printf (" hval += asso_values[%sstr[%d]",
char_to_index, key_pos - 1); char_to_index, key_pos - 1);
if (_alpha_inc[key_pos - 1])
printf ("+%u", _alpha_inc[key_pos - 1]);
printf ("];\n");
key_pos = iter.next (); key_pos = iter.next ();
} }

View File

@@ -51,6 +51,7 @@ public:
int total_keys, int total_keys,
int max_key_len, int min_key_len, int max_key_len, int min_key_len,
const Positions& positions, const Positions& positions,
const unsigned int *alpha_inc,
int total_duplicates, int total_duplicates,
int alpha_size, int alpha_size,
const int *occurrences, const int *occurrences,
@@ -121,6 +122,8 @@ private:
int const _min_key_len; int const _min_key_len;
/* Key positions. Only to be used if !options[ALLCHARS]. */ /* Key positions. Only to be used if !options[ALLCHARS]. */
Positions const _key_positions; Positions const _key_positions;
/* Adjustments to add to bytes at specific key positions. */
const unsigned int * const _alpha_inc;
/* Total number of duplicate hash values. */ /* Total number of duplicate hash values. */
int const _total_duplicates; int const _total_duplicates;
/* Minimum hash value for all keywords. */ /* Minimum hash value for all keywords. */

View File

@@ -31,15 +31,44 @@
#include "options.h" #include "options.h"
#include "hash-table.h" #include "hash-table.h"
/* The most general form of the hash function is
hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
where Pos is a set of byte positions,
each alpha_inc[i] is a nonnegative integer,
each asso_values[c] is a nonnegative integer.
Theorem 1: If all keywords are different, there is a set Pos such that
all tuples (keyword[i] : i in Pos) are different.
Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
are nonnegative integers alpha_inc[i] such that all multisets
{keyword[i] + alpha_inc[i] : i in Pos} are different.
Theorem 3: If all multisets selchars[keyword] are different, there are
nonnegative integers asso_values[c] such that all hash values
sum (asso_values[c] : c in selchars[keyword]) are different.
Based on these three facts, we find the hash function in three steps:
Step 1 (Finding good byte positions):
Find a set Pos, as small as possible, such that all tuples
(keyword[i] : i in Pos) are different.
Step 2 (Finding good alpha increments):
Find nonnegative integers alpha_inc[i], as many of them as possible being
zero, and the others being as small as possible, such that all multisets
{keyword[i] + alpha_inc[i] : i in Pos} are different.
Step 3 (Finding good asso_values):
Find asso_values[c] such that all hash (keyword) are different.
*/
/* -------------------- Initialization and Preparation --------------------- */ /* -------------------- Initialization and Preparation --------------------- */
Search::Search (KeywordExt_List *list) Search::Search (KeywordExt_List *list)
: _head (list), : _head (list)
_key_positions (option.get_key_positions()),
_alpha_size (option[SEVENBIT] ? 128 : 256),
_occurrences (new int[_alpha_size]),
_asso_values (new int[_alpha_size]),
_determined (new bool[_alpha_size])
{ {
} }
@@ -77,12 +106,14 @@ Search::preprepare ()
} }
} }
/* ---------------------- Finding good byte positions ---------------------- */
/* Initializes each keyword's _selchars array. */ /* Initializes each keyword's _selchars array. */
void void
Search::init_selchars (bool use_all_chars, const Positions& positions) const Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const
{ {
for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars(use_all_chars, positions); temp->first()->init_selchars_tuple(use_all_chars, positions);
} }
/* Deletes each keyword's _selchars array. */ /* Deletes each keyword's _selchars array. */
@@ -95,11 +126,12 @@ Search::delete_selchars () const
/* Count the duplicate keywords that occur with a given set of positions. */ /* Count the duplicate keywords that occur with a given set of positions. */
unsigned int unsigned int
Search::count_duplicates (const Positions& positions) const Search::count_duplicates_tuple (const Positions& positions) const
{ {
init_selchars (false, positions); init_selchars_tuple (option[ALLCHARS], positions);
unsigned int count = 0; unsigned int count = 0;
{
Hash_Table representatives (_total_keys, option[NOLENGTH]); Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{ {
@@ -107,17 +139,18 @@ Search::count_duplicates (const Positions& positions) const
if (representatives.insert (keyword)) if (representatives.insert (keyword))
count++; count++;
} }
}
delete_selchars (); delete_selchars ();
return count; return count;
} }
/* Find good key positions. */
void void
Search::find_positions () Search::find_positions ()
{ {
/* Determine good key positions. */
/* 1. Find positions that must occur in order to distinguish duplicates. */ /* 1. Find positions that must occur in order to distinguish duplicates. */
Positions mandatory; Positions mandatory;
@@ -159,7 +192,7 @@ Search::find_positions ()
int imax = (_max_key_len < Positions::MAX_KEY_POS int imax = (_max_key_len < Positions::MAX_KEY_POS
? _max_key_len : Positions::MAX_KEY_POS); ? _max_key_len : Positions::MAX_KEY_POS);
Positions current = mandatory; Positions current = mandatory;
unsigned int current_duplicates_count = count_duplicates (current); unsigned int current_duplicates_count = count_duplicates_tuple (current);
for (;;) for (;;)
{ {
Positions best; Positions best;
@@ -170,7 +203,7 @@ Search::find_positions ()
{ {
Positions tryal = current; Positions tryal = current;
tryal.add (i); tryal.add (i);
unsigned int try_duplicates_count = count_duplicates (tryal); unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
/* We prefer 'try' to 'best' if it produces less duplicates, /* We prefer 'try' to 'best' if it produces less duplicates,
or if it produces the same number of duplicates but with or if it produces the same number of duplicates but with
@@ -203,7 +236,7 @@ Search::find_positions ()
{ {
Positions tryal = current; Positions tryal = current;
tryal.remove (i); tryal.remove (i);
unsigned int try_duplicates_count = count_duplicates (tryal); unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
/* We prefer 'try' to 'best' if it produces less duplicates, /* We prefer 'try' to 'best' if it produces less duplicates,
or if it produces the same number of duplicates but with or if it produces the same number of duplicates but with
@@ -243,7 +276,7 @@ Search::find_positions ()
tryal.remove (i2); tryal.remove (i2);
tryal.add (i3); tryal.add (i3);
unsigned int try_duplicates_count = unsigned int try_duplicates_count =
count_duplicates (tryal); count_duplicates_tuple (tryal);
/* We prefer 'try' to 'best' if it produces less duplicates, /* We prefer 'try' to 'best' if it produces less duplicates,
or if it produces the same number of duplicates but with or if it produces the same number of duplicates but with
@@ -269,18 +302,141 @@ Search::find_positions ()
_key_positions = current; _key_positions = current;
} }
/* --------------------- Finding good alpha increments --------------------- */
/* Initializes each keyword's _selchars array. */
void
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc);
}
/* Count the duplicate keywords that occur with the given set of positions
and a given alpha_inc[] array. */
unsigned int
Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
{
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc);
unsigned int count = 0;
{
Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (representatives.insert (keyword))
count++;
}
}
delete_selchars ();
return count;
}
/* Find good _alpha_inc[].
   Searches for per-position increments such that the sorted multisets
   {keyword[i] + alpha_inc[i]} produce no more duplicates than the unsorted
   tuples (keyword[i]) do for the current _key_positions.
   The result is stored in _alpha_inc, a freshly allocated array of
   _max_key_len elements.  */
void
Search::find_alpha_inc ()
{
  /* The goal is to choose _alpha_inc[] such that it doesn't introduce
     artificial duplicates.  */
  unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);

  /* Start with zero increments.  This is sufficient in most cases.  */
  unsigned int *current = new unsigned int [_max_key_len];
  for (int i = 0; i < _max_key_len; i++)
    current[i] = 0;
  unsigned int current_duplicates_count = count_duplicates_multiset (current);

  if (current_duplicates_count > duplicates_goal)
    {
      /* Look which _alpha_inc[i] we are free to increment.  */
      unsigned int nindices_max;
      if (option[ALLCHARS])
        nindices_max = _max_key_len;
      else
        {
          /* Ignore Positions::LASTCHAR.  Remember that since Positions are
             sorted in decreasing order, Positions::LASTCHAR comes last.  */
          nindices_max = (_key_positions.get_size() == 0
                          || _key_positions[_key_positions.get_size() - 1]
                             != Positions::LASTCHAR
                          ? _key_positions.get_size()
                          : _key_positions.get_size() - 1);
        }

      /* Heap allocation instead of a variable-length array, which is not
         part of ISO C++.  */
      unsigned int *indices = new unsigned int [nindices_max];
      unsigned int nindices = 0;
      if (option[ALLCHARS])
        for (; nindices < nindices_max; nindices++)
          indices[nindices] = nindices;
      else
        {
          PositionIterator iter (_key_positions);
          for (unsigned int j = 0; j < nindices_max; j++)
            {
              int key_pos = iter.next ();
              if (key_pos == PositionIterator::EOS
                  || key_pos == Positions::LASTCHAR)
                abort ();
              /* A position beyond the longest keyword selects no byte in
                 any keyword, so an increment there could not change
                 anything; it would only index past the end of the
                 _max_key_len sized arrays below.  */
              if (key_pos > _max_key_len)
                continue;
              indices[nindices] = key_pos - 1;
              nindices++;
            }
        }

      /* Without any position to increment, no round can ever improve the
         duplicates count, so the search below would not terminate.  */
      if (nindices > 0)
        {
          /* Perform several rounds of searching for a good alpha increment.
             Each round reduces the number of artificial collisions by adding
             an increment in a single key position.  */
          unsigned int *best = new unsigned int [_max_key_len];
          unsigned int *tryal = new unsigned int [_max_key_len];
          do
            {
              /* An increment of 1 is not always enough.  Try higher
                 increments also.  */
              for (unsigned int inc = 1; ; inc++)
                {
                  unsigned int best_duplicates_count = UINT_MAX;

                  for (unsigned int j = 0; j < nindices; j++)
                    {
                      memcpy (tryal, current,
                              _max_key_len * sizeof (unsigned int));
                      tryal[indices[j]] += inc;
                      unsigned int try_duplicates_count =
                        count_duplicates_multiset (tryal);

                      /* We prefer 'try' to 'best' if it produces less
                         duplicates.  */
                      if (try_duplicates_count < best_duplicates_count)
                        {
                          memcpy (best, tryal,
                                  _max_key_len * sizeof (unsigned int));
                          best_duplicates_count = try_duplicates_count;
                        }
                    }

                  /* Stop this round when we got an improvement.  */
                  if (best_duplicates_count < current_duplicates_count)
                    {
                      memcpy (current, best,
                              _max_key_len * sizeof (unsigned int));
                      current_duplicates_count = best_duplicates_count;
                      break;
                    }
                }
            }
          while (current_duplicates_count > duplicates_goal);

          delete[] tryal;
          delete[] best;
        }

      delete[] indices;
    }

  _alpha_inc = current;
}
/* ------------------------------------------------------------------------- */
void void
Search::prepare () Search::prepare ()
{ {
KeywordExt_List *temp; KeywordExt_List *temp;
preprepare ();
if (!option[POSITIONS])
find_positions ();
/* Initialize each keyword's _selchars array. */ /* Initialize each keyword's _selchars array. */
init_selchars (option[ALLCHARS], _key_positions); init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc);
/* Check for duplicates, i.e. keywords with the same _selchars array /* Check for duplicates, i.e. keywords with the same _selchars array
(and - if !option[NOLENGTH] - also the same length). (and - if !option[NOLENGTH] - also the same length).
@@ -357,7 +513,16 @@ Search::prepare ()
} }
} }
/* Compute _alpha_size, the upper bound on the indices passed to
asso_values[]. */
unsigned int max_alpha_inc = 0;
for (int i = 0; i < _max_key_len; i++)
if (max_alpha_inc < _alpha_inc[i])
max_alpha_inc = _alpha_inc[i];
_alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
/* Compute the occurrences of each character in the alphabet. */ /* Compute the occurrences of each character in the alphabet. */
_occurrences = new int[_alpha_size];
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0])); memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
for (temp = _head; temp; temp = temp->rest()) for (temp = _head; temp; temp = temp->rest())
{ {
@@ -366,6 +531,10 @@ Search::prepare ()
for (int count = keyword->_selchars_length; count > 0; ptr++, count--) for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
_occurrences[*ptr]++; _occurrences[*ptr]++;
} }
/* Memory allocation. */
_asso_values = new int[_alpha_size];
_determined = new bool[_alpha_size];
} }
/* ---------------- Reordering the Keyword list (optional) ----------------- */ /* ---------------- Reordering the Keyword list (optional) ----------------- */
@@ -878,6 +1047,11 @@ void
Search::optimize () Search::optimize ()
{ {
/* Preparations. */ /* Preparations. */
preprepare ();
_key_positions = option.get_key_positions();
if (!option[POSITIONS])
find_positions ();
find_alpha_inc ();
prepare (); prepare ();
if (option[ORDER]) if (option[ORDER])
reorder (); reorder ();
@@ -1025,4 +1199,5 @@ Search::~Search ()
} }
delete[] _asso_values; delete[] _asso_values;
delete[] _occurrences; delete[] _occurrences;
delete[] _alpha_inc;
} }

View File

@@ -40,15 +40,26 @@ private:
void preprepare (); void preprepare ();
/* Initializes each keyword's _selchars array. */ /* Initializes each keyword's _selchars array. */
void init_selchars (bool use_all_chars, const Positions& positions) const; void init_selchars_tuple (bool use_all_chars, const Positions& positions) const;
/* Deletes each keyword's _selchars array. */ /* Deletes each keyword's _selchars array. */
void delete_selchars () const; void delete_selchars () const;
/* Count the duplicate keywords that occur with a given set of positions. */ /* Count the duplicate keywords that occur with a given set of positions. */
unsigned int count_duplicates (const Positions& positions) const; unsigned int count_duplicates_tuple (const Positions& positions) const;
/* Find good key positions. */
void find_positions (); void find_positions ();
/* Initializes each keyword's _selchars array. */
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const;
/* Count the duplicate keywords that occur with the given set of positions
and a given alpha_inc[] array. */
unsigned int count_duplicates_multiset (const unsigned int *alpha_inc) const;
/* Find good _alpha_inc[]. */
void find_alpha_inc ();
void prepare (); void prepare ();
/* Computes the sum of occurrences of the _selchars of a keyword. */ /* Computes the sum of occurrences of the _selchars of a keyword. */
@@ -112,19 +123,22 @@ public:
/* User-specified or computed key positions. */ /* User-specified or computed key positions. */
Positions _key_positions; Positions _key_positions;
/* Adjustments to add to bytes at specific key positions. */
unsigned int * _alpha_inc;
/* Total number of duplicates that have been moved to _duplicate_link lists /* Total number of duplicates that have been moved to _duplicate_link lists
(not counting their representatives which stay on the main list). */ (not counting their representatives which stay on the main list). */
int _total_duplicates; int _total_duplicates;
/* Size of alphabet. */ /* Size of alphabet. */
int const _alpha_size; int _alpha_size;
/* Counts occurrences of each key set character. /* Counts occurrences of each key set character.
_occurrences[c] is the number of times that c occurs among the _selchars _occurrences[c] is the number of times that c occurs among the _selchars
of a keyword. */ of a keyword. */
int * const _occurrences; int * _occurrences;
/* Value associated with each character. */ /* Value associated with each character. */
int * const _asso_values; int * _asso_values;
private: private:
@@ -132,7 +146,7 @@ private:
int _list_len; int _list_len;
/* Vector used during Search::reorder(). */ /* Vector used during Search::reorder(). */
bool * const _determined; bool * _determined;
/* Exclusive upper bound for every _asso_values[c]. A power of 2. */ /* Exclusive upper bound for every _asso_values[c]. A power of 2. */
int _asso_value_max; int _asso_value_max;

File diff suppressed because it is too large Load Diff