mirror of
https://git.savannah.gnu.org/git/gperf.git
synced 2025-12-02 13:09:22 +00:00
Implement backtracking.
This commit is contained in:
22
ChangeLog
22
ChangeLog
@@ -1,3 +1,25 @@
|
|||||||
|
2002-11-20 Bruno Haible <bruno@clisp.org>
|
||||||
|
|
||||||
|
Implement backtracking.
|
||||||
|
* src/search.h (Search::has_collisions): Renamed from
|
||||||
|
Search::less_collisions. Return a boolean.
|
||||||
|
* src/search.cc (Search::has_collisions): Renamed from
|
||||||
|
Search::less_collisions. Return a boolean.
|
||||||
|
(StackEntry): Remove field _collisions_so_far.
|
||||||
|
(Search::find_asso_values): Backtrack when encountering an unresolved
|
||||||
|
collision. Assume collisions_so_far is always zero.
|
||||||
|
(Search::optimize): Exit if there are accidental duplicates at the end.
|
||||||
|
* src/output.cc (Output::num_hash_values): Simply return the list
|
||||||
|
length.
|
||||||
|
(Output::output_keylength_table): Remove handling of accidental
|
||||||
|
duplicates.
|
||||||
|
(Output::output_keyword_table, Output::output_lookup_array): Likewise.
|
||||||
|
(output_switch_case, output_switches): Likewise.
|
||||||
|
* doc/gperf.texi (Algorithmic Details): Adjust description of options
|
||||||
|
-D, -f, -o, -r.
|
||||||
|
(Bugs): Remove note about missing backtracking.
|
||||||
|
(Projects): Likewise.
|
||||||
|
|
||||||
2002-11-19 Bruno Haible <bruno@clisp.org>
|
2002-11-19 Bruno Haible <bruno@clisp.org>
|
||||||
|
|
||||||
Prepare for backtracking.
|
Prepare for backtracking.
|
||||||
|
|||||||
3
NEWS
3
NEWS
@@ -34,6 +34,9 @@ New in 2.8:
|
|||||||
* Some keyword sets containing permutations, like { "xy", "yx", "xz", "zx" }
|
* Some keyword sets containing permutations, like { "xy", "yx", "xz", "zx" }
|
||||||
or { "abc", "acb", "bca", "cab" }, are now handled by gperf without
|
or { "abc", "acb", "bca", "cab" }, are now handled by gperf without
|
||||||
requiring the option -D.
|
requiring the option -D.
|
||||||
|
* When the search for a good hash function is not immediately successful,
|
||||||
|
backtracking is now used to continue the search. Earlier versions of gperf
|
||||||
|
bailed out with an "Internal error, duplicate hash code value".
|
||||||
* Bug fixes.
|
* Bug fixes.
|
||||||
|
|
||||||
New in 2.7.2:
|
New in 2.7.2:
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
@c some day we should @include version.texi instead of defining
|
@c some day we should @include version.texi instead of defining
|
||||||
@c these values at hand.
|
@c these values at hand.
|
||||||
@set UPDATED 16 November 2002
|
@set UPDATED 20 November 2002
|
||||||
@set EDITION 2.7.2
|
@set EDITION 2.7.2
|
||||||
@set VERSION 2.7.2
|
@set VERSION 2.7.2
|
||||||
@c ---------------------
|
@c ---------------------
|
||||||
@@ -993,27 +993,14 @@ through a search that minimizes the number of byte positions.
|
|||||||
@itemx --duplicates
|
@itemx --duplicates
|
||||||
@cindex Duplicates
|
@cindex Duplicates
|
||||||
Handle keywords whose selected byte sets hash to duplicate values.
|
Handle keywords whose selected byte sets hash to duplicate values.
|
||||||
Duplicate hash values can occur for two reasons:
|
Duplicate hash values can occur if a set of keywords has the same names, but
|
||||||
|
possesses different attributes, or if the selected byte positions are not well
|
||||||
@itemize @bullet
|
chosen. With the -D option @code{gperf} treats all these keywords as
|
||||||
@item
|
|
||||||
Since @code{gperf} does not backtrack it is possible for it to process
|
|
||||||
all your input keywords without finding a unique mapping for each word.
|
|
||||||
However, frequently only a very small number of duplicates occur, and
|
|
||||||
the majority of keywords still require one probe into the table. To
|
|
||||||
overcome this problem, the option @samp{-m 50} should be used.
|
|
||||||
|
|
||||||
@item
|
|
||||||
Sometimes a set of keywords may have the same names, but possess different
|
|
||||||
attributes. With the -D option @code{gperf} treats all these keywords as
|
|
||||||
part of an equivalence class and generates a perfect hash function with
|
part of an equivalence class and generates a perfect hash function with
|
||||||
multiple comparisons for duplicate keywords. It is up to you to completely
|
multiple comparisons for duplicate keywords. It is up to you to completely
|
||||||
disambiguate the keywords by modifying the generated C code. However,
|
disambiguate the keywords by modifying the generated C code. However,
|
||||||
@code{gperf} helps you out by organizing the output.
|
@code{gperf} helps you out by organizing the output.
|
||||||
@end itemize
|
|
||||||
|
|
||||||
Option @samp{-D} is extremely useful for certain large or highly
|
|
||||||
redundant keyword sets, e.g., assembler instruction opcodes.
|
|
||||||
Using this option usually means that the generated hash function is no
|
Using this option usually means that the generated hash function is no
|
||||||
longer perfect. On the other hand, it permits @code{gperf} to work on
|
longer perfect. On the other hand, it permits @code{gperf} to work on
|
||||||
keyword sets that it otherwise could not handle.
|
keyword sets that it otherwise could not handle.
|
||||||
@@ -1025,7 +1012,7 @@ Generate the perfect hash function ``fast''. This decreases
|
|||||||
table-size. The iteration amount represents the number of times to
|
table-size. The iteration amount represents the number of times to
|
||||||
iterate when resolving a collision. `0' means iterate by the number of
|
iterate when resolving a collision. `0' means iterate by the number of
|
||||||
keywords. This option is probably most useful when used in conjunction
|
keywords. This option is probably most useful when used in conjunction
|
||||||
with options @samp{-D} and/or @samp{-S} for @emph{large} keyword sets.
|
with option @samp{-o} for @emph{large} keyword sets.
|
||||||
|
|
||||||
@item -m @var{iterations}
|
@item -m @var{iterations}
|
||||||
@itemx --multiple-iterations=@var{iterations}
|
@itemx --multiple-iterations=@var{iterations}
|
||||||
@@ -1067,7 +1054,7 @@ produce more minimal perfect hash functions. The reason for this is
|
|||||||
that the reordering helps prune the search time by handling inevitable
|
that the reordering helps prune the search time by handling inevitable
|
||||||
collisions early in the search process. On the other hand, in practice,
|
collisions early in the search process. On the other hand, in practice,
|
||||||
a decreased search time also means a less minimal hash function, and a
|
a decreased search time also means a less minimal hash function, and a
|
||||||
higher probability of duplicate hash values. Furthermore, if the
|
higher frequency of backtracking. Furthermore, if the
|
||||||
number of keywords is @emph{very} large using @samp{-o} may
|
number of keywords is @emph{very} large using @samp{-o} may
|
||||||
@emph{increase} @code{gperf}'s execution time, since collisions will
|
@emph{increase} @code{gperf}'s execution time, since collisions will
|
||||||
begin earlier and continue throughout the remainder of keyword
|
begin earlier and continue throughout the remainder of keyword
|
||||||
@@ -1080,8 +1067,7 @@ Utilizes randomness to initialize the associated values table. This
|
|||||||
frequently generates solutions faster than using deterministic
|
frequently generates solutions faster than using deterministic
|
||||||
initialization (which starts all associated values at 0). Furthermore,
|
initialization (which starts all associated values at 0). Furthermore,
|
||||||
using the randomization option generally increases the size of the
|
using the randomization option generally increases the size of the
|
||||||
table. If @code{gperf} has difficultly with a certain keyword set try using
|
table.
|
||||||
@samp{-r} or @samp{-D}.
|
|
||||||
|
|
||||||
@item -s @var{size-multiple}
|
@item -s @var{size-multiple}
|
||||||
@itemx --size-multiple=@var{size-multiple}
|
@itemx --size-multiple=@var{size-multiple}
|
||||||
@@ -1154,16 +1140,6 @@ work efficiently on much larger keyword sets (over 15,000 keywords).
|
|||||||
When processing large keyword sets it helps greatly to have over 8 megs
|
When processing large keyword sets it helps greatly to have over 8 megs
|
||||||
of RAM.
|
of RAM.
|
||||||
|
|
||||||
However, since @code{gperf} does not backtrack no guaranteed solution
|
|
||||||
occurs on every run. On the other hand, it is usually easy to obtain a
|
|
||||||
solution by varying the option parameters. In particular, try the
|
|
||||||
@samp{-r} option, and also try changing the default arguments to the
|
|
||||||
@samp{-s} and @samp{-j} options. To @emph{guarantee} a solution, use
|
|
||||||
the @samp{-D} and @samp{-S} options, although the final results are not
|
|
||||||
likely to be a @emph{perfect} hash function anymore! Finally, use the
|
|
||||||
@samp{-f} option if you want @code{gperf} to generate the perfect hash
|
|
||||||
function @emph{fast}, with less emphasis on making it minimal.
|
|
||||||
|
|
||||||
@item
|
@item
|
||||||
The size of the generated static keyword array can get @emph{extremely}
|
The size of the generated static keyword array can get @emph{extremely}
|
||||||
large if the input keyword file is large or if the keywords are quite
|
large if the input keyword file is large or if the keywords are quite
|
||||||
@@ -1171,7 +1147,7 @@ similar. This tends to slow down the compilation of the generated C
|
|||||||
code, and @emph{greatly} inflates the object code size. If this
|
code, and @emph{greatly} inflates the object code size. If this
|
||||||
situation occurs, consider using the @samp{-S} option to reduce data
|
situation occurs, consider using the @samp{-S} option to reduce data
|
||||||
size, potentially increasing keyword recognition time a negligible
|
size, potentially increasing keyword recognition time a negligible
|
||||||
amount. Since many C compilers cannot correctly generated code for
|
amount. Since many C compilers cannot correctly generate code for
|
||||||
large switch statements it is important to qualify the @var{-S} option
|
large switch statements it is important to qualify the @var{-S} option
|
||||||
with an appropriate numerical argument that controls the number of
|
with an appropriate numerical argument that controls the number of
|
||||||
switch statements generated.
|
switch statements generated.
|
||||||
@@ -1192,19 +1168,11 @@ module is essential independent from other program modules. Additional
|
|||||||
worthwhile improvements include:
|
worthwhile improvements include:
|
||||||
|
|
||||||
@itemize @bullet
|
@itemize @bullet
|
||||||
@item
|
|
||||||
Make the algorithm more robust. At present, the program halts with an
|
|
||||||
error diagnostic if it can't find a direct solution and the @samp{-D}
|
|
||||||
option is not enabled. A more comprehensive, albeit computationally
|
|
||||||
expensive, approach would employ backtracking or enable alternative
|
|
||||||
options and retry. It's not clear how helpful this would be, in
|
|
||||||
general, since most search sets are rather small in practice.
|
|
||||||
|
|
||||||
@item
|
@item
|
||||||
Another useful extension involves modifying the program to generate
|
Another useful extension involves modifying the program to generate
|
||||||
``minimal'' perfect hash functions (under certain circumstances, the
|
``minimal'' perfect hash functions (under certain circumstances, the
|
||||||
current version can be rather extravagant in the generated table size).
|
current version can be rather extravagant in the generated table size).
|
||||||
Again, this is mostly of theoretical interest, since a sparse table
|
This is mostly of theoretical interest, since a sparse table
|
||||||
often produces faster lookups, and use of the @samp{-S} @code{switch}
|
often produces faster lookups, and use of the @samp{-S} @code{switch}
|
||||||
option can minimize the data size, at the expense of slightly longer
|
option can minimize the data size, at the expense of slightly longer
|
||||||
lookups (note that the gcc compiler generally produces good code for
|
lookups (note that the gcc compiler generally produces good code for
|
||||||
|
|||||||
@@ -75,10 +75,9 @@ static const char *char_to_index;
|
|||||||
- Duplicates, i.e. keywords with the same _selchars set, are chained
|
- Duplicates, i.e. keywords with the same _selchars set, are chained
|
||||||
through the _duplicate_link pointer. Only one representative per
|
through the _duplicate_link pointer. Only one representative per
|
||||||
duplicate equivalence class remains on the linear keyword list.
|
duplicate equivalence class remains on the linear keyword list.
|
||||||
- Still, accidental duplicates, i.e. keywords for which the _asso_values[]
|
- Accidental duplicates, i.e. keywords for which the _asso_values[] search
|
||||||
search couldn't achieve different hash values, can occur on the linear
|
couldn't achieve different hash values, cannot occur on the linear
|
||||||
keyword list. After Search::sort(), we know that they form blocks of
|
keyword list. Search::optimize would catch this mistake.
|
||||||
consecutive list elements.
|
|
||||||
*/
|
*/
|
||||||
Output::Output (KeywordExt_List *head, const char *struct_decl,
|
Output::Output (KeywordExt_List *head, const char *struct_decl,
|
||||||
unsigned int struct_decl_lineno, const char *return_type,
|
unsigned int struct_decl_lineno, const char *return_type,
|
||||||
@@ -134,20 +133,11 @@ Output::compute_min_max ()
|
|||||||
int
|
int
|
||||||
Output::num_hash_values () const
|
Output::num_hash_values () const
|
||||||
{
|
{
|
||||||
/* Since the list is already sorted by hash value we can count the
|
/* Since the list is already sorted by hash value and doesn't contain
|
||||||
different hash values in a single pass through the list. */
|
duplicates, we can simply count the number of keywords on the list. */
|
||||||
int count = 1;
|
int count = 0;
|
||||||
KeywordExt_List *temp;
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||||
int value;
|
count++;
|
||||||
|
|
||||||
for (temp = _head, value = temp->first()->_hash_value; (temp = temp->rest()) != NULL; )
|
|
||||||
{
|
|
||||||
if (value != temp->first()->_hash_value)
|
|
||||||
{
|
|
||||||
value = temp->first()->_hash_value;
|
|
||||||
count++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -667,9 +657,7 @@ Output::output_keylength_table () const
|
|||||||
/* If generating a switch statement, and there is no user defined type,
|
/* If generating a switch statement, and there is no user defined type,
|
||||||
we generate non-duplicates directly in the code. Only duplicates go
|
we generate non-duplicates directly in the code. Only duplicates go
|
||||||
into the table. */
|
into the table. */
|
||||||
if (option[SWITCH] && !option[TYPE]
|
if (option[SWITCH] && !option[TYPE] && !temp->first()->_duplicate_link)
|
||||||
&& !(temp->first()->_duplicate_link
|
|
||||||
|| (temp->rest() && temp->first()->_hash_value == temp->rest()->first()->_hash_value)))
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (index < temp->first()->_hash_value && !option[SWITCH] && !option[DUP])
|
if (index < temp->first()->_hash_value && !option[SWITCH] && !option[DUP])
|
||||||
@@ -789,9 +777,7 @@ Output::output_keyword_table () const
|
|||||||
|
|
||||||
for (temp = _head, index = 0; temp; temp = temp->rest())
|
for (temp = _head, index = 0; temp; temp = temp->rest())
|
||||||
{
|
{
|
||||||
if (option[SWITCH] && !option[TYPE]
|
if (option[SWITCH] && !option[TYPE] && !temp->first()->_duplicate_link)
|
||||||
&& !(temp->first()->_duplicate_link
|
|
||||||
|| (temp->rest() && temp->first()->_hash_value == temp->rest()->first()->_hash_value)))
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (index > 0)
|
if (index > 0)
|
||||||
@@ -865,34 +851,20 @@ Output::output_lookup_array () const
|
|||||||
if (option[DEBUG])
|
if (option[DEBUG])
|
||||||
fprintf (stderr, "keyword = %.*s, index = %d\n",
|
fprintf (stderr, "keyword = %.*s, index = %d\n",
|
||||||
temp->first()->_allchars_length, temp->first()->_allchars, temp->first()->_final_index);
|
temp->first()->_allchars_length, temp->first()->_allchars, temp->first()->_final_index);
|
||||||
if (temp->first()->_duplicate_link
|
if (temp->first()->_duplicate_link)
|
||||||
|| (temp->rest() && hash_value == temp->rest()->first()->_hash_value))
|
|
||||||
{
|
{
|
||||||
/* Start a duplicate entry. */
|
/* Start a duplicate entry. */
|
||||||
dup_ptr->hash_value = hash_value;
|
dup_ptr->hash_value = hash_value;
|
||||||
dup_ptr->index = temp->first()->_final_index;
|
dup_ptr->index = temp->first()->_final_index;
|
||||||
dup_ptr->count = 1;
|
dup_ptr->count = 1;
|
||||||
|
|
||||||
for (;;)
|
for (KeywordExt *ptr = temp->first()->_duplicate_link; ptr; ptr = ptr->_duplicate_link)
|
||||||
{
|
{
|
||||||
for (KeywordExt *ptr = temp->first()->_duplicate_link; ptr; ptr = ptr->_duplicate_link)
|
|
||||||
{
|
|
||||||
dup_ptr->count++;
|
|
||||||
if (option[DEBUG])
|
|
||||||
fprintf (stderr,
|
|
||||||
"static linked keyword = %.*s, index = %d\n",
|
|
||||||
ptr->_allchars_length, ptr->_allchars, ptr->_final_index);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!(temp->rest() && hash_value == temp->rest()->first()->_hash_value))
|
|
||||||
break;
|
|
||||||
|
|
||||||
temp = temp->rest();
|
|
||||||
|
|
||||||
dup_ptr->count++;
|
dup_ptr->count++;
|
||||||
if (option[DEBUG])
|
if (option[DEBUG])
|
||||||
fprintf (stderr, "dynamic linked keyword = %.*s, index = %d\n",
|
fprintf (stderr,
|
||||||
temp->first()->_allchars_length, temp->first()->_allchars, temp->first()->_final_index);
|
"static linked keyword = %.*s, index = %d\n",
|
||||||
|
ptr->_allchars_length, ptr->_allchars, ptr->_final_index);
|
||||||
}
|
}
|
||||||
assert (dup_ptr->count >= 2);
|
assert (dup_ptr->count >= 2);
|
||||||
dup_ptr++;
|
dup_ptr++;
|
||||||
@@ -1026,9 +998,7 @@ output_switch_case (KeywordExt_List *list, int indent, int *jumps_away)
|
|||||||
printf ("%*s/* hash value = %4d, keyword = \"%.*s\" */\n",
|
printf ("%*s/* hash value = %4d, keyword = \"%.*s\" */\n",
|
||||||
indent, "", list->first()->_hash_value, list->first()->_allchars_length, list->first()->_allchars);
|
indent, "", list->first()->_hash_value, list->first()->_allchars_length, list->first()->_allchars);
|
||||||
|
|
||||||
if (option[DUP]
|
if (option[DUP] && list->first()->_duplicate_link)
|
||||||
&& (list->first()->_duplicate_link
|
|
||||||
|| (list->rest() && list->first()->_hash_value == list->rest()->first()->_hash_value)))
|
|
||||||
{
|
{
|
||||||
if (option[LENTABLE])
|
if (option[LENTABLE])
|
||||||
printf ("%*slengthptr = &lengthtable[%d];\n",
|
printf ("%*slengthptr = &lengthtable[%d];\n",
|
||||||
@@ -1037,13 +1007,8 @@ output_switch_case (KeywordExt_List *list, int indent, int *jumps_away)
|
|||||||
indent, "", option.get_wordlist_name (), list->first()->_final_index);
|
indent, "", option.get_wordlist_name (), list->first()->_final_index);
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (KeywordExt_List *temp = list; ; temp = temp->rest())
|
for (KeywordExt *links = list->first(); links; links = links->_duplicate_link)
|
||||||
{
|
count++;
|
||||||
for (KeywordExt *links = temp->first(); links; links = links->_duplicate_link)
|
|
||||||
count++;
|
|
||||||
if (!(temp->rest() && temp->first()->_hash_value == temp->rest()->first()->_hash_value))
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
printf ("%*swordendptr = wordptr + %d;\n"
|
printf ("%*swordendptr = wordptr + %d;\n"
|
||||||
"%*sgoto multicompare;\n",
|
"%*sgoto multicompare;\n",
|
||||||
@@ -1080,10 +1045,7 @@ output_switch_case (KeywordExt_List *list, int indent, int *jumps_away)
|
|||||||
*jumps_away = 1;
|
*jumps_away = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while (list->rest() && list->first()->_hash_value == list->rest()->first()->_hash_value)
|
return list->rest();
|
||||||
list = list->rest();
|
|
||||||
list = list->rest();
|
|
||||||
return list;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Output a total of size cases, grouped into num_switches switch statements,
|
/* Output a total of size cases, grouped into num_switches switch statements,
|
||||||
@@ -1105,11 +1067,7 @@ output_switches (KeywordExt_List *list, int num_switches, int size, int min_hash
|
|||||||
|
|
||||||
KeywordExt_List *temp = list;
|
KeywordExt_List *temp = list;
|
||||||
for (int count = size1; count > 0; count--)
|
for (int count = size1; count > 0; count--)
|
||||||
{
|
temp = temp->rest();
|
||||||
while (temp->first()->_hash_value == temp->rest()->first()->_hash_value)
|
|
||||||
temp = temp->rest();
|
|
||||||
temp = temp->rest();
|
|
||||||
}
|
|
||||||
|
|
||||||
printf ("%*sif (key < %d)\n"
|
printf ("%*sif (key < %d)\n"
|
||||||
"%*s {\n",
|
"%*s {\n",
|
||||||
|
|||||||
117
src/search.cc
117
src/search.cc
@@ -891,16 +891,12 @@ Search::sort_by_occurrence (unsigned int *set, int len) const
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If the recomputed hash values for the keywords from _head->first() to
|
/* Returns true if the recomputed hash values for the keywords from
|
||||||
curr - inclusive - give fewer than collision_bound collisions, this
|
_head->first() to curr - inclusive - give at least one collision.
|
||||||
collision count is returned. Otherwise some value >= collision_bound
|
|
||||||
is returned.
|
|
||||||
This is called very frequently, and needs to be fast! */
|
This is called very frequently, and needs to be fast! */
|
||||||
unsigned int
|
bool
|
||||||
Search::less_collisions (KeywordExt *curr, unsigned int collision_bound)
|
Search::has_collisions (KeywordExt *curr)
|
||||||
{
|
{
|
||||||
unsigned int collisions = 0;
|
|
||||||
|
|
||||||
/* Iteration Number array is a win, O(1) initialization time! */
|
/* Iteration Number array is a win, O(1) initialization time! */
|
||||||
_collision_detector->clear ();
|
_collision_detector->clear ();
|
||||||
|
|
||||||
@@ -911,12 +907,11 @@ Search::less_collisions (KeywordExt *curr, unsigned int collision_bound)
|
|||||||
/* Compute new hash code for the keyword, and see whether it
|
/* Compute new hash code for the keyword, and see whether it
|
||||||
collides with another keyword's hash code. If we have too
|
collides with another keyword's hash code. If we have too
|
||||||
many collisions, we can safely abort the fruitless loop. */
|
many collisions, we can safely abort the fruitless loop. */
|
||||||
if (_collision_detector->set_bit (compute_hash (keyword))
|
if (_collision_detector->set_bit (compute_hash (keyword)))
|
||||||
&& ++collisions >= collision_bound)
|
return true;
|
||||||
return collision_bound; /* >= collision_bound */
|
|
||||||
|
|
||||||
if (keyword == curr)
|
if (keyword == curr)
|
||||||
return collisions; /* < collision_bound */
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -945,9 +940,6 @@ Search::collision_prior_to (KeywordExt *curr)
|
|||||||
we perform the processing without recursion, and simulate the stack. */
|
we perform the processing without recursion, and simulate the stack. */
|
||||||
struct StackEntry
|
struct StackEntry
|
||||||
{
|
{
|
||||||
/* The number of collisions so far. */
|
|
||||||
unsigned int _collisions_so_far;
|
|
||||||
|
|
||||||
/* The current keyword. */
|
/* The current keyword. */
|
||||||
KeywordExt * _curr;
|
KeywordExt * _curr;
|
||||||
|
|
||||||
@@ -1006,8 +998,6 @@ Search::find_asso_values ()
|
|||||||
StackEntry *sp = &stack[0];
|
StackEntry *sp = &stack[0];
|
||||||
|
|
||||||
/* Local variables corresponding to *sp. */
|
/* Local variables corresponding to *sp. */
|
||||||
/* The number of collisions so far. */
|
|
||||||
unsigned int collisions_so_far;
|
|
||||||
/* The current keyword. */
|
/* The current keyword. */
|
||||||
KeywordExt *curr;
|
KeywordExt *curr;
|
||||||
/* The prior keyword, with which curr collides. */
|
/* The prior keyword, with which curr collides. */
|
||||||
@@ -1024,8 +1014,6 @@ Search::find_asso_values ()
|
|||||||
/* Remaining number of iterations. */
|
/* Remaining number of iterations. */
|
||||||
int iter;
|
int iter;
|
||||||
|
|
||||||
collisions_so_far = 0;
|
|
||||||
|
|
||||||
STARTOUTERLOOP:
|
STARTOUTERLOOP:
|
||||||
|
|
||||||
/* Next keyword from the list. */
|
/* Next keyword from the list. */
|
||||||
@@ -1039,8 +1027,6 @@ Search::find_asso_values ()
|
|||||||
|
|
||||||
if (prior != NULL)
|
if (prior != NULL)
|
||||||
{
|
{
|
||||||
collisions_so_far++;
|
|
||||||
|
|
||||||
/* Handle collision: Attempt to change an _asso_value[], in order to
|
/* Handle collision: Attempt to change an _asso_value[], in order to
|
||||||
resolve a hash value collision between the two given keywords. */
|
resolve a hash value collision between the two given keywords. */
|
||||||
|
|
||||||
@@ -1075,11 +1061,10 @@ Search::find_asso_values ()
|
|||||||
|
|
||||||
/* Try various other values for _asso_values[c]. A value is
|
/* Try various other values for _asso_values[c]. A value is
|
||||||
successful if, with it, the recomputed hash values for the
|
successful if, with it, the recomputed hash values for the
|
||||||
keywords from _head->first() to curr - inclusive - give fewer
|
keywords from _head->first() to curr - inclusive - give no
|
||||||
than collisions_so_far collisions. Up to the given number of
|
collisions. Up to the given number of iterations are performed.
|
||||||
iterations are performed. If successful, _asso_values[c] is
|
If successful, _asso_values[c] is changed, and the recursion
|
||||||
changed, collisions_so_far is decreased, and the recursion
|
continues. If all iterations are unsuccessful, _asso_values[c]
|
||||||
continued. If all iterations are unsuccessful, _asso_values[c]
|
|
||||||
is restored and we backtrack, trying the next union_index. */
|
is restored and we backtrack, trying the next union_index. */
|
||||||
|
|
||||||
original_asso_value = _asso_values[c];
|
original_asso_value = _asso_values[c];
|
||||||
@@ -1092,11 +1077,8 @@ Search::find_asso_values ()
|
|||||||
(_asso_values[c] + (_jump != 0 ? _jump : rand ()))
|
(_asso_values[c] + (_jump != 0 ? _jump : rand ()))
|
||||||
& (_asso_value_max - 1);
|
& (_asso_value_max - 1);
|
||||||
|
|
||||||
unsigned int collisions =
|
if (!has_collisions (curr))
|
||||||
less_collisions (curr, collisions_so_far);
|
|
||||||
if (collisions < collisions_so_far)
|
|
||||||
{
|
{
|
||||||
collisions_so_far = collisions;
|
|
||||||
/* Good, this _asso_values[] modification reduces the
|
/* Good, this _asso_values[] modification reduces the
|
||||||
number of collisions so far.
|
number of collisions so far.
|
||||||
All keyword->_hash_value up to curr - inclusive -
|
All keyword->_hash_value up to curr - inclusive -
|
||||||
@@ -1110,6 +1092,16 @@ Search::find_asso_values ()
|
|||||||
fflush (stderr);
|
fflush (stderr);
|
||||||
}
|
}
|
||||||
goto RECURSE;
|
goto RECURSE;
|
||||||
|
BACKTRACK_COLLISION: ;
|
||||||
|
if (option[DEBUG])
|
||||||
|
{
|
||||||
|
fprintf (stderr, "back to collision on keyword #%d, prior = \"%.*s\", curr = \"%.*s\" hash = %d\n",
|
||||||
|
sp - stack + 1,
|
||||||
|
prior->_allchars_length, prior->_allchars,
|
||||||
|
curr->_allchars_length, curr->_allchars,
|
||||||
|
curr->_hash_value);
|
||||||
|
fflush (stderr);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1119,24 +1111,52 @@ Search::find_asso_values ()
|
|||||||
|
|
||||||
/* Failed to resolve a collision. */
|
/* Failed to resolve a collision. */
|
||||||
|
|
||||||
/* Recompute all keyword->_hash_value up to curr - inclusive -. */
|
/* Recompute all keyword->_hash_value up to curr - exclusive -. */
|
||||||
for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
|
for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
|
||||||
{
|
{
|
||||||
KeywordExt* keyword = ptr->first();
|
KeywordExt* keyword = ptr->first();
|
||||||
compute_hash (keyword);
|
|
||||||
if (keyword == curr)
|
if (keyword == curr)
|
||||||
break;
|
break;
|
||||||
|
compute_hash (keyword);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (option[DEBUG])
|
if (option[DEBUG])
|
||||||
{
|
{
|
||||||
fprintf (stderr, "** collision not resolved after %d iterations, %d duplicates remain, continuing...\n",
|
fprintf (stderr, "** collision not resolved after %d iterations, backtracking...\n",
|
||||||
iterations, collisions_so_far + _total_duplicates);
|
iterations);
|
||||||
fflush (stderr);
|
fflush (stderr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BACKTRACK_NO_COLLISION:
|
||||||
|
if (sp != stack)
|
||||||
|
{
|
||||||
|
sp--;
|
||||||
|
curr = sp->_curr;
|
||||||
|
prior = sp->_prior;
|
||||||
|
union_set = sp->_union_set;
|
||||||
|
union_set_length = sp->_union_set_length;
|
||||||
|
union_index = sp->_union_index;
|
||||||
|
c = sp->_c;
|
||||||
|
original_asso_value = sp->_original_asso_value;
|
||||||
|
iter = sp->_iter;
|
||||||
|
if (prior != NULL)
|
||||||
|
goto BACKTRACK_COLLISION;
|
||||||
|
else
|
||||||
|
goto BACKTRACK_NO_COLLISION;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* No solution found after an exhaustive search!
|
||||||
|
We should ideally turn off option[FAST] and, if that doesn't help,
|
||||||
|
multiply _asso_value_max by 2. */
|
||||||
|
fprintf (stderr,
|
||||||
|
"\nBig failure, always got duplicate hash code values.\n");
|
||||||
|
if (option[POSITIONS])
|
||||||
|
fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
|
||||||
|
else
|
||||||
|
fprintf (stderr, "try options -m or -r.\n\n");
|
||||||
|
exit (1);
|
||||||
}
|
}
|
||||||
RECURSE:
|
RECURSE:
|
||||||
sp->_collisions_so_far = collisions_so_far;
|
|
||||||
/*sp->_curr = curr;*/ // redundant
|
/*sp->_curr = curr;*/ // redundant
|
||||||
sp->_prior = prior;
|
sp->_prior = prior;
|
||||||
/*sp->_union_set = union_set;*/ // redundant
|
/*sp->_union_set = union_set;*/ // redundant
|
||||||
@@ -1147,10 +1167,7 @@ Search::find_asso_values ()
|
|||||||
sp->_iter = iter;
|
sp->_iter = iter;
|
||||||
sp++;
|
sp++;
|
||||||
if (sp - stack < _list_len)
|
if (sp - stack < _list_len)
|
||||||
{
|
goto STARTOUTERLOOP;
|
||||||
/*collisions_so_far = sp[-1]._collisions_so_far;*/ // redundant
|
|
||||||
goto STARTOUTERLOOP;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Deallocate stack. */
|
/* Deallocate stack. */
|
||||||
@@ -1285,19 +1302,15 @@ Search::optimize ()
|
|||||||
unsigned int hashcode = compute_hash (curr);
|
unsigned int hashcode = compute_hash (curr);
|
||||||
if (_collision_detector->set_bit (hashcode))
|
if (_collision_detector->set_bit (hashcode))
|
||||||
{
|
{
|
||||||
if (option[DUP]) /* Keep track of this number... */
|
/* This shouldn't happen. proj1, proj2, proj3 must have been
|
||||||
_total_duplicates++;
|
computed to be injective on the given keyword set. */
|
||||||
else /* Yow, big problems. we're outta here! */
|
fprintf (stderr,
|
||||||
{
|
"\nInternal error, unexpected duplicate hash code\n");
|
||||||
fprintf (stderr,
|
if (option[POSITIONS])
|
||||||
"\nInternal error, duplicate hash code value %d:\n",
|
fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
|
||||||
hashcode);
|
else
|
||||||
if (option[POSITIONS])
|
fprintf (stderr, "try options -m or -r.\n\n");
|
||||||
fprintf (stderr, "try options -m or -D or -r, or use new key positions.\n\n");
|
exit (1);
|
||||||
else
|
|
||||||
fprintf (stderr, "try options -m or -D or -r.\n\n");
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ private:
|
|||||||
/* Sorts the given set in increasing frequency of _occurrences[]. */
|
/* Sorts the given set in increasing frequency of _occurrences[]. */
|
||||||
void sort_by_occurrence (unsigned int *set, int len) const;
|
void sort_by_occurrence (unsigned int *set, int len) const;
|
||||||
|
|
||||||
unsigned int less_collisions (KeywordExt *curr, unsigned int collision_bound);
|
bool has_collisions (KeywordExt *curr);
|
||||||
|
|
||||||
KeywordExt * collision_prior_to (KeywordExt *curr);
|
KeywordExt * collision_prior_to (KeywordExt *curr);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user