mirror of
https://git.savannah.gnu.org/git/gperf.git
synced 2025-12-02 13:09:22 +00:00
Introduce new alpha_inc pass, to avoid artificial duplicates.
This commit is contained in:
39
ChangeLog
39
ChangeLog
@@ -1,5 +1,44 @@
|
||||
2002-11-17 Bruno Haible <bruno@clisp.org>
|
||||
|
||||
Avoid artificial duplicates.
|
||||
* src/keyword.h (KeywordExt::init_selchars_tuple): New declaration.
|
||||
(KeywordExt::init_selchars_multiset): Renamed from
|
||||
KeywordExt::init_selchars.
|
||||
(KeywordExt::init_selchars_low): New declaration.
|
||||
* src/keyword.cc (KeywordExt::init_selchars_low): Renamed from
|
||||
KeywordExt::init_selchars. Add alpha_inc argument. Remove sorting.
|
||||
(KeywordExt::init_selchars_tuple): New method.
|
||||
(KeywordExt::init_selchars_multiset): New method, replaces
|
||||
KeywordExt::init_selchars.
|
||||
* src/search.h (Search::init_selchars_tuple): Renamed from
|
||||
Search::init_selchars.
|
||||
(Search::count_duplicates_tuple): Renamed from Search::count_duplicates.
|
||||
(Search::init_selchars_multiset, Search::count_duplicates_multiset,
|
||||
Search::find_alpha_inc): New declarations.
|
||||
(Search::_alpha_inc): New field.
|
||||
(Search::_alpha_size, Search::_occurrences, Search::_asso_values,
|
||||
Search::_determined): Make non-const.
|
||||
* src/search.cc (Search::Search): Don't initialize _key_positions,
|
||||
_alpha_size, _occurrences, _asso_values, _determined here.
|
||||
(Search::init_selchars_tuple): Renamed from Search::init_selchars.
|
||||
(Search::count_duplicates_tuple): Renamed from Search::count_duplicates.
|
||||
(Search::find_positions): Update.
|
||||
(Search::init_selchars_multiset, Search::count_duplicates_multiset,
|
||||
Search::find_alpha_inc): New methods.
|
||||
(Search::prepare): Move preprepare, find_positions calls away.
|
||||
Initialize _alpha_size, _occurrences, _asso_values, _determined here.
|
||||
(Search::optimize): Call preprepare, find_positions here. Initialize
|
||||
_key_positions here.
|
||||
(Search::~Search): Deallocate _alpha_inc.
|
||||
* src/output.cc (Output::Output): Add alpha_inc argument.
|
||||
(Output::output_hash_function): Use _alpha_inc.
|
||||
* src/output.h (Output::Output): Add alpha_inc argument.
|
||||
(Output::_alpha_inc): New field.
|
||||
* src/main.cc (main): Pass _alpha_inc from Search to Output.
|
||||
* tests/chill.exp: Update.
|
||||
* doc/gperf.texi (Algorithmic Details): Remove description of
|
||||
artificial duplicates.
|
||||
|
||||
* src/keyword.h (KeywordExt::_selchars): Change type to
|
||||
'const unsigned int *'.
|
||||
* src/keyword.cc (sort_char_set): Change argument type to
|
||||
|
||||
3
NEWS
3
NEWS
@@ -31,6 +31,9 @@ New in 2.8:
|
||||
computed depending on the set of keywords.
|
||||
* If the input file is given by name, the output file will now contain
|
||||
#line directives referring to the input file.
|
||||
* Some keyword sets containing permutations, like { "xy", "yx", "xz", "zx" }
|
||||
or { "abc", "acb", "bca", "cab" }, are now handled by gperf without
|
||||
requiring the option -D.
|
||||
* Bug fixes.
|
||||
|
||||
New in 2.7.2:
|
||||
|
||||
@@ -993,7 +993,7 @@ through a search that minimizes the number of byte positions.
|
||||
@itemx --duplicates
|
||||
@cindex Duplicates
|
||||
Handle keywords whose selected byte sets hash to duplicate values.
|
||||
Duplicate hash values can occur for three reasons:
|
||||
Duplicate hash values can occur for two reasons:
|
||||
|
||||
@itemize @bullet
|
||||
@item
|
||||
@@ -1003,15 +1003,6 @@ However, frequently only a very small number of duplicates occur, and
|
||||
the majority of keywords still require one probe into the table. To
|
||||
overcome this problem, the option @samp{-m 50} should be used.
|
||||
|
||||
@item
|
||||
Since the @code{gperf} generated hash function treats the bytes at
|
||||
different byte positions with equal weight, keywords that are permutations
|
||||
of each other can lead to the same hash function value if they are not
|
||||
disambiguated by the set of selected byte positions. Sometimes even this
|
||||
is not possible; for example, the keyword set @{"xy", "yx", "xz", "zx"@}
|
||||
will always lead to duplicates, regardless how the selected byte positions
|
||||
are chosen. You can use the option @samp{-D} to handle this rare case.
|
||||
|
||||
@item
|
||||
Sometimes a set of keywords may have the same names, but possess different
|
||||
attributes. With the -D option @code{gperf} treats all these keywords as
|
||||
|
||||
@@ -57,8 +57,9 @@ static inline void sort_char_set (unsigned int *base, int len)
|
||||
Furthermore we sort the selchars array, to ease detection of duplicates
|
||||
later.
|
||||
*/
|
||||
void
|
||||
KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
|
||||
|
||||
unsigned int *
|
||||
KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
|
||||
{
|
||||
const char *k = _allchars;
|
||||
unsigned int *key_set =
|
||||
@@ -69,7 +70,10 @@ KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
|
||||
/* Use all the character positions in the KEY. */
|
||||
for (int i = _allchars_length; i > 0; k++, i--)
|
||||
{
|
||||
*ptr = static_cast<unsigned char>(*k);
|
||||
unsigned int c = static_cast<unsigned char>(*k);
|
||||
if (alpha_inc)
|
||||
c += alpha_inc[k-_allchars];
|
||||
*ptr = c;
|
||||
ptr++;
|
||||
}
|
||||
else
|
||||
@@ -81,24 +85,45 @@ KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
|
||||
|
||||
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
|
||||
{
|
||||
unsigned int c;
|
||||
if (i == Positions::LASTCHAR)
|
||||
/* Special notation for last KEY position, i.e. '$'. */
|
||||
*ptr = static_cast<unsigned char>(_allchars[_allchars_length - 1]);
|
||||
c = static_cast<unsigned char>(_allchars[_allchars_length - 1]);
|
||||
else if (i <= _allchars_length)
|
||||
/* Within range of KEY length, so we'll keep it. */
|
||||
*ptr = static_cast<unsigned char>(_allchars[i - 1]);
|
||||
{
|
||||
/* Within range of KEY length, so we'll keep it. */
|
||||
c = static_cast<unsigned char>(_allchars[i - 1]);
|
||||
if (alpha_inc)
|
||||
c += alpha_inc[i - 1];
|
||||
}
|
||||
else
|
||||
/* Out of range of KEY length, so we'll just skip it. */
|
||||
continue;
|
||||
*ptr = c;
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Sort the KEY_SET items alphabetically. */
|
||||
sort_char_set (key_set, ptr - key_set);
|
||||
|
||||
_selchars = key_set;
|
||||
_selchars_length = ptr - key_set;
|
||||
|
||||
return key_set;
|
||||
}
|
||||
|
||||
void
|
||||
KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions)
|
||||
{
|
||||
init_selchars_low (use_all_chars, positions, NULL);
|
||||
}
|
||||
|
||||
void
|
||||
KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
|
||||
{
|
||||
unsigned int *selchars =
|
||||
init_selchars_low (use_all_chars, positions, alpha_inc);
|
||||
|
||||
/* Sort the selchars elements alphabetically. */
|
||||
sort_char_set (selchars, _selchars_length);
|
||||
}
|
||||
|
||||
/* Deletes selchars. */
|
||||
|
||||
@@ -67,8 +67,10 @@ struct KeywordExt : public Keyword
|
||||
KeywordExt * _duplicate_link;
|
||||
|
||||
/* Methods depending on the keyposition list. */
|
||||
/* Initializes selchars and selchars_length. */
|
||||
void init_selchars (bool use_all_chars, const Positions& positions);
|
||||
/* Initializes selchars and selchars_length, without reordering. */
|
||||
void init_selchars_tuple (bool use_all_chars, const Positions& positions);
|
||||
/* Initializes selchars and selchars_length, with reordering. */
|
||||
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
|
||||
/* Deletes selchars. */
|
||||
void delete_selchars ();
|
||||
|
||||
@@ -78,6 +80,9 @@ struct KeywordExt : public Keyword
|
||||
|
||||
/* Data members used by the output routines. */
|
||||
int _final_index;
|
||||
|
||||
private:
|
||||
unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
|
||||
};
|
||||
|
||||
/* An abstract factory for creating Keyword instances.
|
||||
|
||||
@@ -106,6 +106,7 @@ main (int argc, char *argv[])
|
||||
searcher._max_key_len,
|
||||
searcher._min_key_len,
|
||||
searcher._key_positions,
|
||||
searcher._alpha_inc,
|
||||
searcher._total_duplicates,
|
||||
searcher._alpha_size,
|
||||
searcher._occurrences,
|
||||
|
||||
@@ -88,8 +88,8 @@ Output::Output (KeywordExt_List *head, const char *struct_decl,
|
||||
const char *verbatim_code, const char *verbatim_code_end,
|
||||
unsigned int verbatim_code_lineno,
|
||||
int total_keys, int max_key_len, int min_key_len,
|
||||
const Positions& positions, int total_duplicates,
|
||||
int alpha_size, const int *occurrences,
|
||||
const Positions& positions, const unsigned int *alpha_inc,
|
||||
int total_duplicates, int alpha_size, const int *occurrences,
|
||||
const int *asso_values)
|
||||
: _head (head), _struct_decl (struct_decl),
|
||||
_struct_decl_lineno (struct_decl_lineno), _return_type (return_type),
|
||||
@@ -102,7 +102,7 @@ Output::Output (KeywordExt_List *head, const char *struct_decl,
|
||||
_verbatim_code_lineno (verbatim_code_lineno),
|
||||
_total_keys (total_keys),
|
||||
_max_key_len (max_key_len), _min_key_len (min_key_len),
|
||||
_key_positions (positions),
|
||||
_key_positions (positions), _alpha_inc (alpha_inc),
|
||||
_total_duplicates (total_duplicates), _alpha_size (alpha_size),
|
||||
_occurrences (occurrences), _asso_values (asso_values)
|
||||
{
|
||||
@@ -521,9 +521,14 @@ Output::output_hash_function () const
|
||||
option[NOLENGTH] ? "len" : "hval");
|
||||
|
||||
for (int i = _max_key_len; i > 0; i--)
|
||||
printf (" case %d:\n"
|
||||
" hval += asso_values[%sstr[%d]];\n",
|
||||
i, char_to_index, i - 1);
|
||||
{
|
||||
printf (" case %d:\n"
|
||||
" hval += asso_values[%sstr[%d]",
|
||||
i, char_to_index, i - 1);
|
||||
if (_alpha_inc[i - 1])
|
||||
printf ("+%u", _alpha_inc[i - 1]);
|
||||
printf ("];\n");
|
||||
}
|
||||
|
||||
printf (" break;\n"
|
||||
" }\n"
|
||||
@@ -560,13 +565,21 @@ Output::output_hash_function () const
|
||||
&& _key_positions[0] == 1
|
||||
&& _key_positions[1] == Positions::LASTCHAR)
|
||||
/* Optimize special case of "-k 1,$". */
|
||||
printf ("asso_values[%sstr[len - 1]] + asso_values[%sstr[0]]",
|
||||
char_to_index, char_to_index);
|
||||
{
|
||||
printf ("asso_values[%sstr[len - 1]] + asso_values[%sstr[0]",
|
||||
char_to_index, char_to_index);
|
||||
if (_alpha_inc[0])
|
||||
printf ("+%u", _alpha_inc[0]);
|
||||
printf ("]");
|
||||
}
|
||||
else
|
||||
{
|
||||
for (; key_pos != Positions::LASTCHAR; )
|
||||
{
|
||||
printf ("asso_values[%sstr[%d]]", char_to_index, key_pos - 1);
|
||||
printf ("asso_values[%sstr[%d]", char_to_index, key_pos - 1);
|
||||
if (_alpha_inc[key_pos - 1])
|
||||
printf ("+%u", _alpha_inc[key_pos - 1]);
|
||||
printf ("]");
|
||||
if ((key_pos = iter.next ()) != PositionIterator::EOS)
|
||||
printf (" + ");
|
||||
else
|
||||
@@ -601,8 +614,11 @@ Output::output_hash_function () const
|
||||
for ( ; i >= key_pos; i--)
|
||||
printf (" case %d:\n", i);
|
||||
|
||||
printf (" hval += asso_values[%sstr[%d]];\n",
|
||||
printf (" hval += asso_values[%sstr[%d]",
|
||||
char_to_index, key_pos - 1);
|
||||
if (_alpha_inc[key_pos - 1])
|
||||
printf ("+%u", _alpha_inc[key_pos - 1]);
|
||||
printf ("];\n");
|
||||
|
||||
key_pos = iter.next ();
|
||||
}
|
||||
|
||||
@@ -51,6 +51,7 @@ public:
|
||||
int total_keys,
|
||||
int max_key_len, int min_key_len,
|
||||
const Positions& positions,
|
||||
const unsigned int *alpha_inc,
|
||||
int total_duplicates,
|
||||
int alpha_size,
|
||||
const int *occurrences,
|
||||
@@ -121,6 +122,8 @@ private:
|
||||
int const _min_key_len;
|
||||
/* Key positions. Only to be used if !options[ALLCHARS]. */
|
||||
Positions const _key_positions;
|
||||
/* Adjustments to add to bytes at specific key positions. */
|
||||
const unsigned int * const _alpha_inc;
|
||||
/* Total number of duplicate hash values. */
|
||||
int const _total_duplicates;
|
||||
/* Minimum hash value for all keywords. */
|
||||
|
||||
233
src/search.cc
233
src/search.cc
@@ -31,15 +31,44 @@
|
||||
#include "options.h"
|
||||
#include "hash-table.h"
|
||||
|
||||
/* The most general form of the hash function is
|
||||
|
||||
hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
|
||||
|
||||
where Pos is a set of byte positions,
|
||||
each alpha_inc[i] is a nonnegative integer,
|
||||
each asso_values[c] is a nonnegative integer.
|
||||
|
||||
Theorem 1: If all keywords are different, there is a set Pos such that
|
||||
all tuples (keyword[i] : i in Pos) are different.
|
||||
|
||||
Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
|
||||
are nonnegative integers alpha_inc[i] such that all multisets
|
||||
{keyword[i] + alpha_inc[i] : i in Pos} are different.
|
||||
|
||||
Theorem 3: If all multisets selchars[keyword] are different, there are
|
||||
nonnegative integers asso_values[c] such that all hash values
|
||||
sum (asso_values[c] : c in selchars[keyword]) are different.
|
||||
|
||||
Based on these three facts, we find the hash function in three steps:
|
||||
|
||||
Step 1 (Finding good byte positions):
|
||||
Find a set Pos, as small as possible, such that all tuples
|
||||
(keyword[i] : i in Pos) are different.
|
||||
|
||||
Step 2 (Finding good alpha increments):
|
||||
Find nonnegative integers alpha_inc[i], as many of them as possible being
|
||||
zero, and the others being as small as possible, such that all multisets
|
||||
{keyword[i] + alpha_inc[i] : i in Pos} are different.
|
||||
|
||||
Step 3 (Finding good asso_values):
|
||||
Find asso_values[c] such that all hash (keyword) are different.
|
||||
*/
|
||||
|
||||
/* -------------------- Initialization and Preparation --------------------- */
|
||||
|
||||
Search::Search (KeywordExt_List *list)
|
||||
: _head (list),
|
||||
_key_positions (option.get_key_positions()),
|
||||
_alpha_size (option[SEVENBIT] ? 128 : 256),
|
||||
_occurrences (new int[_alpha_size]),
|
||||
_asso_values (new int[_alpha_size]),
|
||||
_determined (new bool[_alpha_size])
|
||||
: _head (list)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -77,12 +106,14 @@ Search::preprepare ()
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------- Finding good byte positions ---------------------- */
|
||||
|
||||
/* Initializes each keyword's _selchars array. */
|
||||
void
|
||||
Search::init_selchars (bool use_all_chars, const Positions& positions) const
|
||||
Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const
|
||||
{
|
||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||
temp->first()->init_selchars(use_all_chars, positions);
|
||||
temp->first()->init_selchars_tuple(use_all_chars, positions);
|
||||
}
|
||||
|
||||
/* Deletes each keyword's _selchars array. */
|
||||
@@ -95,29 +126,31 @@ Search::delete_selchars () const
|
||||
|
||||
/* Count the duplicate keywords that occur with a given set of positions. */
|
||||
unsigned int
|
||||
Search::count_duplicates (const Positions& positions) const
|
||||
Search::count_duplicates_tuple (const Positions& positions) const
|
||||
{
|
||||
init_selchars (false, positions);
|
||||
init_selchars_tuple (option[ALLCHARS], positions);
|
||||
|
||||
unsigned int count = 0;
|
||||
Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||
{
|
||||
KeywordExt *keyword = temp->first();
|
||||
if (representatives.insert (keyword))
|
||||
count++;
|
||||
}
|
||||
{
|
||||
Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||
{
|
||||
KeywordExt *keyword = temp->first();
|
||||
if (representatives.insert (keyword))
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
delete_selchars ();
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/* Find good key positions. */
|
||||
|
||||
void
|
||||
Search::find_positions ()
|
||||
{
|
||||
/* Determine good key positions. */
|
||||
|
||||
/* 1. Find positions that must occur in order to distinguish duplicates. */
|
||||
Positions mandatory;
|
||||
|
||||
@@ -159,7 +192,7 @@ Search::find_positions ()
|
||||
int imax = (_max_key_len < Positions::MAX_KEY_POS
|
||||
? _max_key_len : Positions::MAX_KEY_POS);
|
||||
Positions current = mandatory;
|
||||
unsigned int current_duplicates_count = count_duplicates (current);
|
||||
unsigned int current_duplicates_count = count_duplicates_tuple (current);
|
||||
for (;;)
|
||||
{
|
||||
Positions best;
|
||||
@@ -170,7 +203,7 @@ Search::find_positions ()
|
||||
{
|
||||
Positions tryal = current;
|
||||
tryal.add (i);
|
||||
unsigned int try_duplicates_count = count_duplicates (tryal);
|
||||
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
|
||||
|
||||
/* We prefer 'try' to 'best' if it produces fewer duplicates,
|
||||
or if it produces the same number of duplicates but with
|
||||
@@ -203,7 +236,7 @@ Search::find_positions ()
|
||||
{
|
||||
Positions tryal = current;
|
||||
tryal.remove (i);
|
||||
unsigned int try_duplicates_count = count_duplicates (tryal);
|
||||
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
|
||||
|
||||
/* We prefer 'try' to 'best' if it produces fewer duplicates,
|
||||
or if it produces the same number of duplicates but with
|
||||
@@ -243,7 +276,7 @@ Search::find_positions ()
|
||||
tryal.remove (i2);
|
||||
tryal.add (i3);
|
||||
unsigned int try_duplicates_count =
|
||||
count_duplicates (tryal);
|
||||
count_duplicates_tuple (tryal);
|
||||
|
||||
/* We prefer 'try' to 'best' if it produces fewer duplicates,
|
||||
or if it produces the same number of duplicates but with
|
||||
@@ -269,18 +302,141 @@ Search::find_positions ()
|
||||
_key_positions = current;
|
||||
}
|
||||
|
||||
/* --------------------- Finding good alpha increments --------------------- */
|
||||
|
||||
/* Initializes each keyword's _selchars array as a sorted multiset, with the given alpha_inc[] increments applied. */
|
||||
void
|
||||
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const
|
||||
{
|
||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc);
|
||||
}
|
||||
|
||||
/* Count the duplicate keywords that occur with the given set of positions
|
||||
and a given alpha_inc[] array. */
|
||||
unsigned int
|
||||
Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
|
||||
{
|
||||
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc);
|
||||
|
||||
unsigned int count = 0;
|
||||
{
|
||||
Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||
{
|
||||
KeywordExt *keyword = temp->first();
|
||||
if (representatives.insert (keyword))
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
delete_selchars ();
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/* Find good _alpha_inc[]. */
|
||||
|
||||
void
|
||||
Search::find_alpha_inc ()
|
||||
{
|
||||
/* The goal is to choose _alpha_inc[] such that it doesn't introduce
|
||||
artificial duplicates. */
|
||||
unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);
|
||||
|
||||
/* Start with zero increments. This is sufficient in most cases. */
|
||||
unsigned int *current = new unsigned int [_max_key_len];
|
||||
for (int i = 0; i < _max_key_len; i++)
|
||||
current[i] = 0;
|
||||
unsigned int current_duplicates_count = count_duplicates_multiset (current);
|
||||
|
||||
if (current_duplicates_count > duplicates_goal)
|
||||
{
|
||||
/* Look which _alpha_inc[i] we are free to increment. */
|
||||
unsigned int nindices;
|
||||
if (option[ALLCHARS])
|
||||
nindices = _max_key_len;
|
||||
else
|
||||
{
|
||||
/* Ignore Positions::LASTCHAR. Remember that since Positions are
|
||||
sorted in decreasing order, Positions::LASTCHAR comes last. */
|
||||
nindices = (_key_positions.get_size() == 0
|
||||
|| _key_positions[_key_positions.get_size() - 1]
|
||||
!= Positions::LASTCHAR
|
||||
? _key_positions.get_size()
|
||||
: _key_positions.get_size() - 1);
|
||||
}
|
||||
|
||||
unsigned int indices[nindices];
|
||||
if (option[ALLCHARS])
|
||||
for (unsigned int j = 0; j < nindices; j++)
|
||||
indices[j] = j;
|
||||
else
|
||||
{
|
||||
PositionIterator iter (_key_positions);
|
||||
for (unsigned int j = 0; j < nindices; j++)
|
||||
{
|
||||
int key_pos = iter.next ();
|
||||
if (key_pos == PositionIterator::EOS
|
||||
|| key_pos == Positions::LASTCHAR)
|
||||
abort ();
|
||||
indices[j] = key_pos - 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform several rounds of searching for a good alpha increment.
|
||||
Each round reduces the number of artificial collisions by adding
|
||||
an increment in a single key position. */
|
||||
unsigned int best[_max_key_len];
|
||||
unsigned int tryal[_max_key_len];
|
||||
do
|
||||
{
|
||||
/* An increment of 1 is not always enough. Try higher increments
|
||||
also. */
|
||||
for (unsigned int inc = 1; ; inc++)
|
||||
{
|
||||
unsigned int best_duplicates_count = UINT_MAX;
|
||||
|
||||
for (unsigned int j = 0; j < nindices; j++)
|
||||
{
|
||||
memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
|
||||
tryal[indices[j]] += inc;
|
||||
unsigned int try_duplicates_count =
|
||||
count_duplicates_multiset (tryal);
|
||||
|
||||
/* We prefer 'try' to 'best' if it produces fewer
|
||||
duplicates. */
|
||||
if (try_duplicates_count < best_duplicates_count)
|
||||
{
|
||||
memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
|
||||
best_duplicates_count = try_duplicates_count;
|
||||
}
|
||||
}
|
||||
|
||||
/* Stop this round when we got an improvement. */
|
||||
if (best_duplicates_count < current_duplicates_count)
|
||||
{
|
||||
memcpy (current, best, _max_key_len * sizeof (unsigned int));
|
||||
current_duplicates_count = best_duplicates_count;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (current_duplicates_count > duplicates_goal);
|
||||
}
|
||||
|
||||
_alpha_inc = current;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
void
|
||||
Search::prepare ()
|
||||
{
|
||||
KeywordExt_List *temp;
|
||||
|
||||
preprepare ();
|
||||
|
||||
if (!option[POSITIONS])
|
||||
find_positions ();
|
||||
|
||||
/* Initialize each keyword's _selchars array. */
|
||||
init_selchars (option[ALLCHARS], _key_positions);
|
||||
init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc);
|
||||
|
||||
/* Check for duplicates, i.e. keywords with the same _selchars array
|
||||
(and - if !option[NOLENGTH] - also the same length).
|
||||
@@ -357,7 +513,16 @@ Search::prepare ()
|
||||
}
|
||||
}
|
||||
|
||||
/* Compute _alpha_size, the upper bound on the indices passed to
|
||||
asso_values[]. */
|
||||
unsigned int max_alpha_inc = 0;
|
||||
for (int i = 0; i < _max_key_len; i++)
|
||||
if (max_alpha_inc < _alpha_inc[i])
|
||||
max_alpha_inc = _alpha_inc[i];
|
||||
_alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
|
||||
|
||||
/* Compute the occurrences of each character in the alphabet. */
|
||||
_occurrences = new int[_alpha_size];
|
||||
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
|
||||
for (temp = _head; temp; temp = temp->rest())
|
||||
{
|
||||
@@ -366,6 +531,10 @@ Search::prepare ()
|
||||
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
|
||||
_occurrences[*ptr]++;
|
||||
}
|
||||
|
||||
/* Memory allocation. */
|
||||
_asso_values = new int[_alpha_size];
|
||||
_determined = new bool[_alpha_size];
|
||||
}
|
||||
|
||||
/* ---------------- Reordering the Keyword list (optional) ----------------- */
|
||||
@@ -878,6 +1047,11 @@ void
|
||||
Search::optimize ()
|
||||
{
|
||||
/* Preparations. */
|
||||
preprepare ();
|
||||
_key_positions = option.get_key_positions();
|
||||
if (!option[POSITIONS])
|
||||
find_positions ();
|
||||
find_alpha_inc ();
|
||||
prepare ();
|
||||
if (option[ORDER])
|
||||
reorder ();
|
||||
@@ -1025,4 +1199,5 @@ Search::~Search ()
|
||||
}
|
||||
delete[] _asso_values;
|
||||
delete[] _occurrences;
|
||||
delete[] _alpha_inc;
|
||||
}
|
||||
|
||||
26
src/search.h
26
src/search.h
@@ -40,15 +40,26 @@ private:
|
||||
void preprepare ();
|
||||
|
||||
/* Initializes each keyword's _selchars array. */
|
||||
void init_selchars (bool use_all_chars, const Positions& positions) const;
|
||||
void init_selchars_tuple (bool use_all_chars, const Positions& positions) const;
|
||||
/* Deletes each keyword's _selchars array. */
|
||||
void delete_selchars () const;
|
||||
|
||||
/* Count the duplicate keywords that occur with a given set of positions. */
|
||||
unsigned int count_duplicates (const Positions& positions) const;
|
||||
unsigned int count_duplicates_tuple (const Positions& positions) const;
|
||||
|
||||
/* Find good key positions. */
|
||||
void find_positions ();
|
||||
|
||||
/* Initializes each keyword's _selchars array as a sorted multiset, with the given alpha_inc[] increments applied. */
|
||||
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const;
|
||||
|
||||
/* Count the duplicate keywords that occur with the given set of positions
|
||||
and a given alpha_inc[] array. */
|
||||
unsigned int count_duplicates_multiset (const unsigned int *alpha_inc) const;
|
||||
|
||||
/* Find good _alpha_inc[]. */
|
||||
void find_alpha_inc ();
|
||||
|
||||
void prepare ();
|
||||
|
||||
/* Computes the sum of occurrences of the _selchars of a keyword. */
|
||||
@@ -112,19 +123,22 @@ public:
|
||||
/* User-specified or computed key positions. */
|
||||
Positions _key_positions;
|
||||
|
||||
/* Adjustments to add to bytes at specific key positions. */
|
||||
unsigned int * _alpha_inc;
|
||||
|
||||
/* Total number of duplicates that have been moved to _duplicate_link lists
|
||||
(not counting their representatives which stay on the main list). */
|
||||
int _total_duplicates;
|
||||
|
||||
/* Size of alphabet. */
|
||||
int const _alpha_size;
|
||||
int _alpha_size;
|
||||
|
||||
/* Counts occurrences of each key set character.
|
||||
_occurrences[c] is the number of times that c occurs among the _selchars
|
||||
of a keyword. */
|
||||
int * const _occurrences;
|
||||
int * _occurrences;
|
||||
/* Value associated with each character. */
|
||||
int * const _asso_values;
|
||||
int * _asso_values;
|
||||
|
||||
private:
|
||||
|
||||
@@ -132,7 +146,7 @@ private:
|
||||
int _list_len;
|
||||
|
||||
/* Vector used during Search::reorder(). */
|
||||
bool * const _determined;
|
||||
bool * _determined;
|
||||
|
||||
/* Exclusive upper bound for every _asso_values[c]. A power of 2. */
|
||||
int _asso_value_max;
|
||||
|
||||
1252
tests/chill.exp
1252
tests/chill.exp
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user