mirror of
https://git.savannah.gnu.org/git/gperf.git
synced 2025-12-02 13:09:22 +00:00
Introduce new alpha_inc pass, to avoid artificial duplicates.
This commit is contained in:
39
ChangeLog
39
ChangeLog
@@ -1,5 +1,44 @@
|
|||||||
2002-11-17 Bruno Haible <bruno@clisp.org>
|
2002-11-17 Bruno Haible <bruno@clisp.org>
|
||||||
|
|
||||||
|
Avoid artificial duplicates.
|
||||||
|
* src/keyword.h (KeywordExt::init_selchars_tuple): New declaration.
|
||||||
|
(KeywordExt::init_selchars_multiset): Renamed from
|
||||||
|
KeywordExt::init_selchars.
|
||||||
|
(KeywordExt::init_selchars_low): New declaration.
|
||||||
|
* src/keyword.cc (KeywordExt::init_selchars_low): Renamed from
|
||||||
|
KeywordExt::init_selchars. Add alpha_inc argument. Remove sorting.
|
||||||
|
(KeywordExt::init_selchars_tuple): New method.
|
||||||
|
(KeywordExt::init_selchars_multiset): New method, replaces
|
||||||
|
KeywordExt::init_selchars.
|
||||||
|
* src/search.h (Search::init_selchars_tuple): Renamed from
|
||||||
|
Search::init_selchars.
|
||||||
|
(Search::count_duplicates_tuple): Renamed from Search::count_duplicates.
|
||||||
|
(Search::init_selchars_multiset, Search::count_duplicates_multiset,
|
||||||
|
Search::find_alpha_inc): New declarations.
|
||||||
|
(Search::_alpha_inc): New field.
|
||||||
|
(Search::_alpha_size, Search::_occurrences, Search::_asso_values,
|
||||||
|
Search::_determined): Make non-const.
|
||||||
|
* src/search.cc (Search::Search): Don't initialize _key_positions,
|
||||||
|
_alpha_size, _occurrences, _asso_values, _determined here.
|
||||||
|
(Search::init_selchars_tuple): Renamed from Search::init_selchars.
|
||||||
|
(Search::count_duplicates_tuple): Renamed from Search::count_duplicates.
|
||||||
|
(Search::find_positions): Update.
|
||||||
|
(Search::init_selchars_multiset, Search::count_duplicates_multiset,
|
||||||
|
Search::find_alpha_inc): New methods.
|
||||||
|
(Search::prepare): Move preprepare, find_positions calls away.
|
||||||
|
Initialize _alpha_size, _occurrences, _asso_values, _determined here.
|
||||||
|
(Search::optimize): Call preprepare, find_positions here. Initialize
|
||||||
|
_key_positions here.
|
||||||
|
(Search::~Search): Deallocate _alpha_inc.
|
||||||
|
* src/output.cc (Output::Output): Add alpha_inc argument.
|
||||||
|
(Output::output_hash_function): Use _alpha_inc.
|
||||||
|
* src/output.h (Output::Output): Add alpha_inc argument.
|
||||||
|
(Output::_alpha_inc): New field.
|
||||||
|
* src/main.cc (main): Pass _alpha_inc from Search to Output.
|
||||||
|
* tests/chill.exp: Update.
|
||||||
|
* doc/gperf.texi (Algorithmic Details): Remove description of
|
||||||
|
artificial duplicates.
|
||||||
|
|
||||||
* src/keyword.h (KeywordExt::_selchars): Change type to
|
* src/keyword.h (KeywordExt::_selchars): Change type to
|
||||||
'const unsigned int *'.
|
'const unsigned int *'.
|
||||||
* src/keyword.cc (sort_char_set): Change argument type to
|
* src/keyword.cc (sort_char_set): Change argument type to
|
||||||
|
|||||||
3
NEWS
3
NEWS
@@ -31,6 +31,9 @@ New in 2.8:
|
|||||||
computed depending on the set of keywords.
|
computed depending on the set of keywords.
|
||||||
* If the input file is given by name, the output file will now contain
|
* If the input file is given by name, the output file will now contain
|
||||||
#line directives referring to the input file.
|
#line directives referring to the input file.
|
||||||
|
* Some keyword sets containing permutations, like { "xy", "yx", "xz", "zx" }
|
||||||
|
or { "abc", "acb", "bca", "cab" }, are now handled by gperf without
|
||||||
|
requiring the option -D.
|
||||||
* Bug fixes.
|
* Bug fixes.
|
||||||
|
|
||||||
New in 2.7.2:
|
New in 2.7.2:
|
||||||
|
|||||||
@@ -993,7 +993,7 @@ through a search that minimizes the number of byte positions.
|
|||||||
@itemx --duplicates
|
@itemx --duplicates
|
||||||
@cindex Duplicates
|
@cindex Duplicates
|
||||||
Handle keywords whose selected byte sets hash to duplicate values.
|
Handle keywords whose selected byte sets hash to duplicate values.
|
||||||
Duplicate hash values can occur for three reasons:
|
Duplicate hash values can occur for two reasons:
|
||||||
|
|
||||||
@itemize @bullet
|
@itemize @bullet
|
||||||
@item
|
@item
|
||||||
@@ -1003,15 +1003,6 @@ However, frequently only a very small number of duplicates occur, and
|
|||||||
the majority of keywords still require one probe into the table. To
|
the majority of keywords still require one probe into the table. To
|
||||||
overcome this problem, the option @samp{-m 50} should be used.
|
overcome this problem, the option @samp{-m 50} should be used.
|
||||||
|
|
||||||
@item
|
|
||||||
Since the @code{gperf} generated hash function treats the bytes at
|
|
||||||
different byte positions with equal weight, keywords that are permutations
|
|
||||||
of each other can lead to the same hash function value if they are not
|
|
||||||
disambiguated by the set of selected byte positions. Sometimes even this
|
|
||||||
is not possible; for example, the keyword set @{"xy", "yx", "xz", "zx"@}
|
|
||||||
will always lead to duplicates, regardless how the selected byte positions
|
|
||||||
are chosen. You can use the option @samp{-D} to handle this rare case.
|
|
||||||
|
|
||||||
@item
|
@item
|
||||||
Sometimes a set of keywords may have the same names, but possess different
|
Sometimes a set of keywords may have the same names, but possess different
|
||||||
attributes. With the -D option @code{gperf} treats all these keywords as
|
attributes. With the -D option @code{gperf} treats all these keywords as
|
||||||
|
|||||||
@@ -57,8 +57,9 @@ static inline void sort_char_set (unsigned int *base, int len)
|
|||||||
Furthermore we sort the selchars array, to ease detection of duplicates
|
Furthermore we sort the selchars array, to ease detection of duplicates
|
||||||
later.
|
later.
|
||||||
*/
|
*/
|
||||||
void
|
|
||||||
KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
|
unsigned int *
|
||||||
|
KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
|
||||||
{
|
{
|
||||||
const char *k = _allchars;
|
const char *k = _allchars;
|
||||||
unsigned int *key_set =
|
unsigned int *key_set =
|
||||||
@@ -69,7 +70,10 @@ KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
|
|||||||
/* Use all the character positions in the KEY. */
|
/* Use all the character positions in the KEY. */
|
||||||
for (int i = _allchars_length; i > 0; k++, i--)
|
for (int i = _allchars_length; i > 0; k++, i--)
|
||||||
{
|
{
|
||||||
*ptr = static_cast<unsigned char>(*k);
|
unsigned int c = static_cast<unsigned char>(*k);
|
||||||
|
if (alpha_inc)
|
||||||
|
c += alpha_inc[k-_allchars];
|
||||||
|
*ptr = c;
|
||||||
ptr++;
|
ptr++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -81,24 +85,45 @@ KeywordExt::init_selchars (bool use_all_chars, const Positions& positions)
|
|||||||
|
|
||||||
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
|
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
|
||||||
{
|
{
|
||||||
|
unsigned int c;
|
||||||
if (i == Positions::LASTCHAR)
|
if (i == Positions::LASTCHAR)
|
||||||
/* Special notation for last KEY position, i.e. '$'. */
|
/* Special notation for last KEY position, i.e. '$'. */
|
||||||
*ptr = static_cast<unsigned char>(_allchars[_allchars_length - 1]);
|
c = static_cast<unsigned char>(_allchars[_allchars_length - 1]);
|
||||||
else if (i <= _allchars_length)
|
else if (i <= _allchars_length)
|
||||||
/* Within range of KEY length, so we'll keep it. */
|
{
|
||||||
*ptr = static_cast<unsigned char>(_allchars[i - 1]);
|
/* Within range of KEY length, so we'll keep it. */
|
||||||
|
c = static_cast<unsigned char>(_allchars[i - 1]);
|
||||||
|
if (alpha_inc)
|
||||||
|
c += alpha_inc[i - 1];
|
||||||
|
}
|
||||||
else
|
else
|
||||||
/* Out of range of KEY length, so we'll just skip it. */
|
/* Out of range of KEY length, so we'll just skip it. */
|
||||||
continue;
|
continue;
|
||||||
|
*ptr = c;
|
||||||
ptr++;
|
ptr++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Sort the KEY_SET items alphabetically. */
|
|
||||||
sort_char_set (key_set, ptr - key_set);
|
|
||||||
|
|
||||||
_selchars = key_set;
|
_selchars = key_set;
|
||||||
_selchars_length = ptr - key_set;
|
_selchars_length = ptr - key_set;
|
||||||
|
|
||||||
|
return key_set;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions)
|
||||||
|
{
|
||||||
|
init_selchars_low (use_all_chars, positions, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
|
||||||
|
{
|
||||||
|
unsigned int *selchars =
|
||||||
|
init_selchars_low (use_all_chars, positions, alpha_inc);
|
||||||
|
|
||||||
|
/* Sort the selchars elements alphabetically. */
|
||||||
|
sort_char_set (selchars, _selchars_length);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Deletes selchars. */
|
/* Deletes selchars. */
|
||||||
|
|||||||
@@ -67,8 +67,10 @@ struct KeywordExt : public Keyword
|
|||||||
KeywordExt * _duplicate_link;
|
KeywordExt * _duplicate_link;
|
||||||
|
|
||||||
/* Methods depending on the keyposition list. */
|
/* Methods depending on the keyposition list. */
|
||||||
/* Initializes selchars and selchars_length. */
|
/* Initializes selchars and selchars_length, without reordering. */
|
||||||
void init_selchars (bool use_all_chars, const Positions& positions);
|
void init_selchars_tuple (bool use_all_chars, const Positions& positions);
|
||||||
|
/* Initializes selchars and selchars_length, with reordering. */
|
||||||
|
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
|
||||||
/* Deletes selchars. */
|
/* Deletes selchars. */
|
||||||
void delete_selchars ();
|
void delete_selchars ();
|
||||||
|
|
||||||
@@ -78,6 +80,9 @@ struct KeywordExt : public Keyword
|
|||||||
|
|
||||||
/* Data members used by the output routines. */
|
/* Data members used by the output routines. */
|
||||||
int _final_index;
|
int _final_index;
|
||||||
|
|
||||||
|
private:
|
||||||
|
unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
|
||||||
};
|
};
|
||||||
|
|
||||||
/* An abstract factory for creating Keyword instances.
|
/* An abstract factory for creating Keyword instances.
|
||||||
|
|||||||
@@ -106,6 +106,7 @@ main (int argc, char *argv[])
|
|||||||
searcher._max_key_len,
|
searcher._max_key_len,
|
||||||
searcher._min_key_len,
|
searcher._min_key_len,
|
||||||
searcher._key_positions,
|
searcher._key_positions,
|
||||||
|
searcher._alpha_inc,
|
||||||
searcher._total_duplicates,
|
searcher._total_duplicates,
|
||||||
searcher._alpha_size,
|
searcher._alpha_size,
|
||||||
searcher._occurrences,
|
searcher._occurrences,
|
||||||
|
|||||||
@@ -88,8 +88,8 @@ Output::Output (KeywordExt_List *head, const char *struct_decl,
|
|||||||
const char *verbatim_code, const char *verbatim_code_end,
|
const char *verbatim_code, const char *verbatim_code_end,
|
||||||
unsigned int verbatim_code_lineno,
|
unsigned int verbatim_code_lineno,
|
||||||
int total_keys, int max_key_len, int min_key_len,
|
int total_keys, int max_key_len, int min_key_len,
|
||||||
const Positions& positions, int total_duplicates,
|
const Positions& positions, const unsigned int *alpha_inc,
|
||||||
int alpha_size, const int *occurrences,
|
int total_duplicates, int alpha_size, const int *occurrences,
|
||||||
const int *asso_values)
|
const int *asso_values)
|
||||||
: _head (head), _struct_decl (struct_decl),
|
: _head (head), _struct_decl (struct_decl),
|
||||||
_struct_decl_lineno (struct_decl_lineno), _return_type (return_type),
|
_struct_decl_lineno (struct_decl_lineno), _return_type (return_type),
|
||||||
@@ -102,7 +102,7 @@ Output::Output (KeywordExt_List *head, const char *struct_decl,
|
|||||||
_verbatim_code_lineno (verbatim_code_lineno),
|
_verbatim_code_lineno (verbatim_code_lineno),
|
||||||
_total_keys (total_keys),
|
_total_keys (total_keys),
|
||||||
_max_key_len (max_key_len), _min_key_len (min_key_len),
|
_max_key_len (max_key_len), _min_key_len (min_key_len),
|
||||||
_key_positions (positions),
|
_key_positions (positions), _alpha_inc (alpha_inc),
|
||||||
_total_duplicates (total_duplicates), _alpha_size (alpha_size),
|
_total_duplicates (total_duplicates), _alpha_size (alpha_size),
|
||||||
_occurrences (occurrences), _asso_values (asso_values)
|
_occurrences (occurrences), _asso_values (asso_values)
|
||||||
{
|
{
|
||||||
@@ -521,9 +521,14 @@ Output::output_hash_function () const
|
|||||||
option[NOLENGTH] ? "len" : "hval");
|
option[NOLENGTH] ? "len" : "hval");
|
||||||
|
|
||||||
for (int i = _max_key_len; i > 0; i--)
|
for (int i = _max_key_len; i > 0; i--)
|
||||||
printf (" case %d:\n"
|
{
|
||||||
" hval += asso_values[%sstr[%d]];\n",
|
printf (" case %d:\n"
|
||||||
i, char_to_index, i - 1);
|
" hval += asso_values[%sstr[%d]",
|
||||||
|
i, char_to_index, i - 1);
|
||||||
|
if (_alpha_inc[i - 1])
|
||||||
|
printf ("+%u", _alpha_inc[i - 1]);
|
||||||
|
printf ("];\n");
|
||||||
|
}
|
||||||
|
|
||||||
printf (" break;\n"
|
printf (" break;\n"
|
||||||
" }\n"
|
" }\n"
|
||||||
@@ -560,13 +565,21 @@ Output::output_hash_function () const
|
|||||||
&& _key_positions[0] == 1
|
&& _key_positions[0] == 1
|
||||||
&& _key_positions[1] == Positions::LASTCHAR)
|
&& _key_positions[1] == Positions::LASTCHAR)
|
||||||
/* Optimize special case of "-k 1,$". */
|
/* Optimize special case of "-k 1,$". */
|
||||||
printf ("asso_values[%sstr[len - 1]] + asso_values[%sstr[0]]",
|
{
|
||||||
char_to_index, char_to_index);
|
printf ("asso_values[%sstr[len - 1]] + asso_values[%sstr[0]",
|
||||||
|
char_to_index, char_to_index);
|
||||||
|
if (_alpha_inc[0])
|
||||||
|
printf ("+%u", _alpha_inc[0]);
|
||||||
|
printf ("]");
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
for (; key_pos != Positions::LASTCHAR; )
|
for (; key_pos != Positions::LASTCHAR; )
|
||||||
{
|
{
|
||||||
printf ("asso_values[%sstr[%d]]", char_to_index, key_pos - 1);
|
printf ("asso_values[%sstr[%d]", char_to_index, key_pos - 1);
|
||||||
|
if (_alpha_inc[key_pos - 1])
|
||||||
|
printf ("+%u", _alpha_inc[key_pos - 1]);
|
||||||
|
printf ("]");
|
||||||
if ((key_pos = iter.next ()) != PositionIterator::EOS)
|
if ((key_pos = iter.next ()) != PositionIterator::EOS)
|
||||||
printf (" + ");
|
printf (" + ");
|
||||||
else
|
else
|
||||||
@@ -601,8 +614,11 @@ Output::output_hash_function () const
|
|||||||
for ( ; i >= key_pos; i--)
|
for ( ; i >= key_pos; i--)
|
||||||
printf (" case %d:\n", i);
|
printf (" case %d:\n", i);
|
||||||
|
|
||||||
printf (" hval += asso_values[%sstr[%d]];\n",
|
printf (" hval += asso_values[%sstr[%d]",
|
||||||
char_to_index, key_pos - 1);
|
char_to_index, key_pos - 1);
|
||||||
|
if (_alpha_inc[key_pos - 1])
|
||||||
|
printf ("+%u", _alpha_inc[key_pos - 1]);
|
||||||
|
printf ("];\n");
|
||||||
|
|
||||||
key_pos = iter.next ();
|
key_pos = iter.next ();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ public:
|
|||||||
int total_keys,
|
int total_keys,
|
||||||
int max_key_len, int min_key_len,
|
int max_key_len, int min_key_len,
|
||||||
const Positions& positions,
|
const Positions& positions,
|
||||||
|
const unsigned int *alpha_inc,
|
||||||
int total_duplicates,
|
int total_duplicates,
|
||||||
int alpha_size,
|
int alpha_size,
|
||||||
const int *occurrences,
|
const int *occurrences,
|
||||||
@@ -121,6 +122,8 @@ private:
|
|||||||
int const _min_key_len;
|
int const _min_key_len;
|
||||||
/* Key positions. Only to be used if !options[ALLCHARS]. */
|
/* Key positions. Only to be used if !options[ALLCHARS]. */
|
||||||
Positions const _key_positions;
|
Positions const _key_positions;
|
||||||
|
/* Adjustments to add to bytes at specific key positions. */
|
||||||
|
const unsigned int * const _alpha_inc;
|
||||||
/* Total number of duplicate hash values. */
|
/* Total number of duplicate hash values. */
|
||||||
int const _total_duplicates;
|
int const _total_duplicates;
|
||||||
/* Minimum hash value for all keywords. */
|
/* Minimum hash value for all keywords. */
|
||||||
|
|||||||
233
src/search.cc
233
src/search.cc
@@ -31,15 +31,44 @@
|
|||||||
#include "options.h"
|
#include "options.h"
|
||||||
#include "hash-table.h"
|
#include "hash-table.h"
|
||||||
|
|
||||||
|
/* The most general form of the hash function is
|
||||||
|
|
||||||
|
hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
|
||||||
|
|
||||||
|
where Pos is a set of byte positions,
|
||||||
|
each alpha_inc[i] is a nonnegative integer,
|
||||||
|
each asso_values[c] is a nonnegative integer.
|
||||||
|
|
||||||
|
Theorem 1: If all keywords are different, there is a set Pos such that
|
||||||
|
all tuples (keyword[i] : i in Pos) are different.
|
||||||
|
|
||||||
|
Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
|
||||||
|
are nonnegative integers alpha_inc[i] such that all multisets
|
||||||
|
{keyword[i] + alpha_inc[i] : i in Pos} are different.
|
||||||
|
|
||||||
|
Theorem 3: If all multisets selchars[keyword] are different, there are
|
||||||
|
nonnegative integers asso_values[c] such that all hash values
|
||||||
|
sum (asso_values[c] : c in selchars[keyword]) are different.
|
||||||
|
|
||||||
|
Based on these three facts, we find the hash function in three steps:
|
||||||
|
|
||||||
|
Step 1 (Finding good byte positions):
|
||||||
|
Find a set Pos, as small as possible, such that all tuples
|
||||||
|
(keyword[i] : i in Pos) are different.
|
||||||
|
|
||||||
|
Step 2 (Finding good alpha increments):
|
||||||
|
Find nonnegative integers alpha_inc[i], as many of them as possible being
|
||||||
|
zero, and the others being as small as possible, such that all multisets
|
||||||
|
{keyword[i] + alpha_inc[i] : i in Pos} are different.
|
||||||
|
|
||||||
|
Step 3 (Finding good asso_values):
|
||||||
|
Find asso_values[c] such that all hash (keyword) are different.
|
||||||
|
*/
|
||||||
|
|
||||||
/* -------------------- Initialization and Preparation --------------------- */
|
/* -------------------- Initialization and Preparation --------------------- */
|
||||||
|
|
||||||
Search::Search (KeywordExt_List *list)
|
Search::Search (KeywordExt_List *list)
|
||||||
: _head (list),
|
: _head (list)
|
||||||
_key_positions (option.get_key_positions()),
|
|
||||||
_alpha_size (option[SEVENBIT] ? 128 : 256),
|
|
||||||
_occurrences (new int[_alpha_size]),
|
|
||||||
_asso_values (new int[_alpha_size]),
|
|
||||||
_determined (new bool[_alpha_size])
|
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -77,12 +106,14 @@ Search::preprepare ()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---------------------- Finding good byte positions ---------------------- */
|
||||||
|
|
||||||
/* Initializes each keyword's _selchars array. */
|
/* Initializes each keyword's _selchars array. */
|
||||||
void
|
void
|
||||||
Search::init_selchars (bool use_all_chars, const Positions& positions) const
|
Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const
|
||||||
{
|
{
|
||||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||||
temp->first()->init_selchars(use_all_chars, positions);
|
temp->first()->init_selchars_tuple(use_all_chars, positions);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Deletes each keyword's _selchars array. */
|
/* Deletes each keyword's _selchars array. */
|
||||||
@@ -95,29 +126,31 @@ Search::delete_selchars () const
|
|||||||
|
|
||||||
/* Count the duplicate keywords that occur with a given set of positions. */
|
/* Count the duplicate keywords that occur with a given set of positions. */
|
||||||
unsigned int
|
unsigned int
|
||||||
Search::count_duplicates (const Positions& positions) const
|
Search::count_duplicates_tuple (const Positions& positions) const
|
||||||
{
|
{
|
||||||
init_selchars (false, positions);
|
init_selchars_tuple (option[ALLCHARS], positions);
|
||||||
|
|
||||||
unsigned int count = 0;
|
unsigned int count = 0;
|
||||||
Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
{
|
||||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
||||||
{
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||||
KeywordExt *keyword = temp->first();
|
{
|
||||||
if (representatives.insert (keyword))
|
KeywordExt *keyword = temp->first();
|
||||||
count++;
|
if (representatives.insert (keyword))
|
||||||
}
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
delete_selchars ();
|
delete_selchars ();
|
||||||
|
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Find good key positions. */
|
||||||
|
|
||||||
void
|
void
|
||||||
Search::find_positions ()
|
Search::find_positions ()
|
||||||
{
|
{
|
||||||
/* Determine good key positions. */
|
|
||||||
|
|
||||||
/* 1. Find positions that must occur in order to distinguish duplicates. */
|
/* 1. Find positions that must occur in order to distinguish duplicates. */
|
||||||
Positions mandatory;
|
Positions mandatory;
|
||||||
|
|
||||||
@@ -159,7 +192,7 @@ Search::find_positions ()
|
|||||||
int imax = (_max_key_len < Positions::MAX_KEY_POS
|
int imax = (_max_key_len < Positions::MAX_KEY_POS
|
||||||
? _max_key_len : Positions::MAX_KEY_POS);
|
? _max_key_len : Positions::MAX_KEY_POS);
|
||||||
Positions current = mandatory;
|
Positions current = mandatory;
|
||||||
unsigned int current_duplicates_count = count_duplicates (current);
|
unsigned int current_duplicates_count = count_duplicates_tuple (current);
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
Positions best;
|
Positions best;
|
||||||
@@ -170,7 +203,7 @@ Search::find_positions ()
|
|||||||
{
|
{
|
||||||
Positions tryal = current;
|
Positions tryal = current;
|
||||||
tryal.add (i);
|
tryal.add (i);
|
||||||
unsigned int try_duplicates_count = count_duplicates (tryal);
|
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
|
||||||
|
|
||||||
/* We prefer 'try' to 'best' if it produces less duplicates,
|
/* We prefer 'try' to 'best' if it produces less duplicates,
|
||||||
or if it produces the same number of duplicates but with
|
or if it produces the same number of duplicates but with
|
||||||
@@ -203,7 +236,7 @@ Search::find_positions ()
|
|||||||
{
|
{
|
||||||
Positions tryal = current;
|
Positions tryal = current;
|
||||||
tryal.remove (i);
|
tryal.remove (i);
|
||||||
unsigned int try_duplicates_count = count_duplicates (tryal);
|
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
|
||||||
|
|
||||||
/* We prefer 'try' to 'best' if it produces less duplicates,
|
/* We prefer 'try' to 'best' if it produces less duplicates,
|
||||||
or if it produces the same number of duplicates but with
|
or if it produces the same number of duplicates but with
|
||||||
@@ -243,7 +276,7 @@ Search::find_positions ()
|
|||||||
tryal.remove (i2);
|
tryal.remove (i2);
|
||||||
tryal.add (i3);
|
tryal.add (i3);
|
||||||
unsigned int try_duplicates_count =
|
unsigned int try_duplicates_count =
|
||||||
count_duplicates (tryal);
|
count_duplicates_tuple (tryal);
|
||||||
|
|
||||||
/* We prefer 'try' to 'best' if it produces less duplicates,
|
/* We prefer 'try' to 'best' if it produces less duplicates,
|
||||||
or if it produces the same number of duplicates but with
|
or if it produces the same number of duplicates but with
|
||||||
@@ -269,18 +302,141 @@ Search::find_positions ()
|
|||||||
_key_positions = current;
|
_key_positions = current;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* --------------------- Finding good alpha increments --------------------- */
|
||||||
|
|
||||||
|
/* Initializes each keyword's _selchars array. */
|
||||||
|
void
|
||||||
|
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const
|
||||||
|
{
|
||||||
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||||
|
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Count the duplicate keywords that occur with the given set of positions
|
||||||
|
and a given alpha_inc[] array. */
|
||||||
|
unsigned int
|
||||||
|
Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
|
||||||
|
{
|
||||||
|
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc);
|
||||||
|
|
||||||
|
unsigned int count = 0;
|
||||||
|
{
|
||||||
|
Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
||||||
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||||
|
{
|
||||||
|
KeywordExt *keyword = temp->first();
|
||||||
|
if (representatives.insert (keyword))
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
delete_selchars ();
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Find good _alpha_inc[]. */
|
||||||
|
|
||||||
|
void
|
||||||
|
Search::find_alpha_inc ()
|
||||||
|
{
|
||||||
|
/* The goal is to choose _alpha_inc[] such that it doesn't introduce
|
||||||
|
artificial duplicates. */
|
||||||
|
unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);
|
||||||
|
|
||||||
|
/* Start with zero increments. This is sufficient in most cases. */
|
||||||
|
unsigned int *current = new unsigned int [_max_key_len];
|
||||||
|
for (int i = 0; i < _max_key_len; i++)
|
||||||
|
current[i] = 0;
|
||||||
|
unsigned int current_duplicates_count = count_duplicates_multiset (current);
|
||||||
|
|
||||||
|
if (current_duplicates_count > duplicates_goal)
|
||||||
|
{
|
||||||
|
/* Look which _alpha_inc[i] we are free to increment. */
|
||||||
|
unsigned int nindices;
|
||||||
|
if (option[ALLCHARS])
|
||||||
|
nindices = _max_key_len;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Ignore Positions::LASTCHAR. Remember that since Positions are
|
||||||
|
sorted in decreasing order, Positions::LASTCHAR comes last. */
|
||||||
|
nindices = (_key_positions.get_size() == 0
|
||||||
|
|| _key_positions[_key_positions.get_size() - 1]
|
||||||
|
!= Positions::LASTCHAR
|
||||||
|
? _key_positions.get_size()
|
||||||
|
: _key_positions.get_size() - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int indices[nindices];
|
||||||
|
if (option[ALLCHARS])
|
||||||
|
for (unsigned int j = 0; j < nindices; j++)
|
||||||
|
indices[j] = j;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PositionIterator iter (_key_positions);
|
||||||
|
for (unsigned int j = 0; j < nindices; j++)
|
||||||
|
{
|
||||||
|
int key_pos = iter.next ();
|
||||||
|
if (key_pos == PositionIterator::EOS
|
||||||
|
|| key_pos == Positions::LASTCHAR)
|
||||||
|
abort ();
|
||||||
|
indices[j] = key_pos - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Perform several rounds of searching for a good alpha increment.
|
||||||
|
Each round reduces the number of artificial collisions by adding
|
||||||
|
an increment in a single key position. */
|
||||||
|
unsigned int best[_max_key_len];
|
||||||
|
unsigned int tryal[_max_key_len];
|
||||||
|
do
|
||||||
|
{
|
||||||
|
/* An increment of 1 is not always enough. Try higher increments
|
||||||
|
also. */
|
||||||
|
for (unsigned int inc = 1; ; inc++)
|
||||||
|
{
|
||||||
|
unsigned int best_duplicates_count = UINT_MAX;
|
||||||
|
|
||||||
|
for (unsigned int j = 0; j < nindices; j++)
|
||||||
|
{
|
||||||
|
memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
|
||||||
|
tryal[indices[j]] += inc;
|
||||||
|
unsigned int try_duplicates_count =
|
||||||
|
count_duplicates_multiset (tryal);
|
||||||
|
|
||||||
|
/* We prefer 'try' to 'best' if it produces less
|
||||||
|
duplicates. */
|
||||||
|
if (try_duplicates_count < best_duplicates_count)
|
||||||
|
{
|
||||||
|
memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
|
||||||
|
best_duplicates_count = try_duplicates_count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Stop this round when we got an improvement. */
|
||||||
|
if (best_duplicates_count < current_duplicates_count)
|
||||||
|
{
|
||||||
|
memcpy (current, best, _max_key_len * sizeof (unsigned int));
|
||||||
|
current_duplicates_count = best_duplicates_count;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (current_duplicates_count > duplicates_goal);
|
||||||
|
}
|
||||||
|
|
||||||
|
_alpha_inc = current;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ------------------------------------------------------------------------- */
|
||||||
|
|
||||||
void
|
void
|
||||||
Search::prepare ()
|
Search::prepare ()
|
||||||
{
|
{
|
||||||
KeywordExt_List *temp;
|
KeywordExt_List *temp;
|
||||||
|
|
||||||
preprepare ();
|
|
||||||
|
|
||||||
if (!option[POSITIONS])
|
|
||||||
find_positions ();
|
|
||||||
|
|
||||||
/* Initialize each keyword's _selchars array. */
|
/* Initialize each keyword's _selchars array. */
|
||||||
init_selchars (option[ALLCHARS], _key_positions);
|
init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc);
|
||||||
|
|
||||||
/* Check for duplicates, i.e. keywords with the same _selchars array
|
/* Check for duplicates, i.e. keywords with the same _selchars array
|
||||||
(and - if !option[NOLENGTH] - also the same length).
|
(and - if !option[NOLENGTH] - also the same length).
|
||||||
@@ -357,7 +513,16 @@ Search::prepare ()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Compute _alpha_size, the upper bound on the indices passed to
|
||||||
|
asso_values[]. */
|
||||||
|
unsigned int max_alpha_inc = 0;
|
||||||
|
for (int i = 0; i < _max_key_len; i++)
|
||||||
|
if (max_alpha_inc < _alpha_inc[i])
|
||||||
|
max_alpha_inc = _alpha_inc[i];
|
||||||
|
_alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
|
||||||
|
|
||||||
/* Compute the occurrences of each character in the alphabet. */
|
/* Compute the occurrences of each character in the alphabet. */
|
||||||
|
_occurrences = new int[_alpha_size];
|
||||||
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
|
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
|
||||||
for (temp = _head; temp; temp = temp->rest())
|
for (temp = _head; temp; temp = temp->rest())
|
||||||
{
|
{
|
||||||
@@ -366,6 +531,10 @@ Search::prepare ()
|
|||||||
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
|
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
|
||||||
_occurrences[*ptr]++;
|
_occurrences[*ptr]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Memory allocation. */
|
||||||
|
_asso_values = new int[_alpha_size];
|
||||||
|
_determined = new bool[_alpha_size];
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------------- Reordering the Keyword list (optional) ----------------- */
|
/* ---------------- Reordering the Keyword list (optional) ----------------- */
|
||||||
@@ -878,6 +1047,11 @@ void
|
|||||||
Search::optimize ()
|
Search::optimize ()
|
||||||
{
|
{
|
||||||
/* Preparations. */
|
/* Preparations. */
|
||||||
|
preprepare ();
|
||||||
|
_key_positions = option.get_key_positions();
|
||||||
|
if (!option[POSITIONS])
|
||||||
|
find_positions ();
|
||||||
|
find_alpha_inc ();
|
||||||
prepare ();
|
prepare ();
|
||||||
if (option[ORDER])
|
if (option[ORDER])
|
||||||
reorder ();
|
reorder ();
|
||||||
@@ -1025,4 +1199,5 @@ Search::~Search ()
|
|||||||
}
|
}
|
||||||
delete[] _asso_values;
|
delete[] _asso_values;
|
||||||
delete[] _occurrences;
|
delete[] _occurrences;
|
||||||
|
delete[] _alpha_inc;
|
||||||
}
|
}
|
||||||
|
|||||||
26
src/search.h
26
src/search.h
@@ -40,15 +40,26 @@ private:
|
|||||||
void preprepare ();
|
void preprepare ();
|
||||||
|
|
||||||
/* Initializes each keyword's _selchars array. */
|
/* Initializes each keyword's _selchars array. */
|
||||||
void init_selchars (bool use_all_chars, const Positions& positions) const;
|
void init_selchars_tuple (bool use_all_chars, const Positions& positions) const;
|
||||||
/* Deletes each keyword's _selchars array. */
|
/* Deletes each keyword's _selchars array. */
|
||||||
void delete_selchars () const;
|
void delete_selchars () const;
|
||||||
|
|
||||||
/* Count the duplicate keywords that occur with a given set of positions. */
|
/* Count the duplicate keywords that occur with a given set of positions. */
|
||||||
unsigned int count_duplicates (const Positions& positions) const;
|
unsigned int count_duplicates_tuple (const Positions& positions) const;
|
||||||
|
|
||||||
|
/* Find good key positions. */
|
||||||
void find_positions ();
|
void find_positions ();
|
||||||
|
|
||||||
|
/* Initializes each keyword's _selchars array. */
|
||||||
|
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const;
|
||||||
|
|
||||||
|
/* Count the duplicate keywords that occur with the given set of positions
|
||||||
|
and a given alpha_inc[] array. */
|
||||||
|
unsigned int count_duplicates_multiset (const unsigned int *alpha_inc) const;
|
||||||
|
|
||||||
|
/* Find good _alpha_inc[]. */
|
||||||
|
void find_alpha_inc ();
|
||||||
|
|
||||||
void prepare ();
|
void prepare ();
|
||||||
|
|
||||||
/* Computes the sum of occurrences of the _selchars of a keyword. */
|
/* Computes the sum of occurrences of the _selchars of a keyword. */
|
||||||
@@ -112,19 +123,22 @@ public:
|
|||||||
/* User-specified or computed key positions. */
|
/* User-specified or computed key positions. */
|
||||||
Positions _key_positions;
|
Positions _key_positions;
|
||||||
|
|
||||||
|
/* Adjustments to add to bytes at specific key positions. */
|
||||||
|
unsigned int * _alpha_inc;
|
||||||
|
|
||||||
/* Total number of duplicates that have been moved to _duplicate_link lists
|
/* Total number of duplicates that have been moved to _duplicate_link lists
|
||||||
(not counting their representatives which stay on the main list). */
|
(not counting their representatives which stay on the main list). */
|
||||||
int _total_duplicates;
|
int _total_duplicates;
|
||||||
|
|
||||||
/* Size of alphabet. */
|
/* Size of alphabet. */
|
||||||
int const _alpha_size;
|
int _alpha_size;
|
||||||
|
|
||||||
/* Counts occurrences of each key set character.
|
/* Counts occurrences of each key set character.
|
||||||
_occurrences[c] is the number of times that c occurs among the _selchars
|
_occurrences[c] is the number of times that c occurs among the _selchars
|
||||||
of a keyword. */
|
of a keyword. */
|
||||||
int * const _occurrences;
|
int * _occurrences;
|
||||||
/* Value associated with each character. */
|
/* Value associated with each character. */
|
||||||
int * const _asso_values;
|
int * _asso_values;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@@ -132,7 +146,7 @@ private:
|
|||||||
int _list_len;
|
int _list_len;
|
||||||
|
|
||||||
/* Vector used during Search::reorder(). */
|
/* Vector used during Search::reorder(). */
|
||||||
bool * const _determined;
|
bool * _determined;
|
||||||
|
|
||||||
/* Exclusive upper bound for every _asso_values[c]. A power of 2. */
|
/* Exclusive upper bound for every _asso_values[c]. A power of 2. */
|
||||||
int _asso_value_max;
|
int _asso_value_max;
|
||||||
|
|||||||
1252
tests/chill.exp
1252
tests/chill.exp
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user