mirror of
https://git.savannah.gnu.org/git/gperf.git
synced 2025-12-02 13:09:22 +00:00
Restructure the asso_values[] searching code.
This commit is contained in:
16
ChangeLog
16
ChangeLog
@@ -1,5 +1,21 @@
|
||||
2002-11-03 Bruno Haible <bruno@clisp.org>
|
||||
|
||||
* src/search.h (Search::init_asso_values, Search::find_asso_values):
|
||||
New declarations.
|
||||
(Search::try_asso_value): Renamed from Search::affects_prev.
|
||||
(Search::change_some_asso_value): Renamed from Search::change.
|
||||
(Search::set_asso_max, Search::get_asso_max): Remove methods.
|
||||
(Search::_union_set): New field.
|
||||
* src/search.cc (Search::init_asso_values): New method, extracted
|
||||
from Search::optimize.
|
||||
(Search::try_asso_value): Renamed from Search::affects_prev. Take the
|
||||
iteration count as argument.
|
||||
(Search::change_some_asso_value): Renamed from Search::change. Don't
|
||||
make union_set static. Don't increment _fewest_collisions here.
|
||||
(Search::find_asso_values): New method, extracted from
|
||||
Search::optimize.
|
||||
(Search::optimize); Update.
|
||||
|
||||
* src/search.h (Search::compute_hash): Renamed from Search::hash.
|
||||
(Search::compute_disjoint_union): Remove declaration.
|
||||
(Search::sort_by_occurrence): Renamed from Search::sort_set.
|
||||
|
||||
269
src/search.cc
269
src/search.cc
@@ -31,8 +31,7 @@
|
||||
#include "options.h"
|
||||
#include "hash-table.h"
|
||||
|
||||
/* Efficiently returns the least power of two greater than or equal to X! */
|
||||
#define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X)))
|
||||
/* -------------------- Initialization and Preparation --------------------- */
|
||||
|
||||
Search::Search (KeywordExt_List *list)
|
||||
: _head (list),
|
||||
@@ -41,7 +40,6 @@ Search::Search (KeywordExt_List *list)
|
||||
_asso_values (new int[_alpha_size]),
|
||||
_determined (new bool[_alpha_size])
|
||||
{
|
||||
memset (_asso_values, 0, _alpha_size * sizeof (_asso_values[0]));
|
||||
}
|
||||
|
||||
void
|
||||
@@ -154,6 +152,8 @@ Search::prepare ()
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------- Sorting the Keyword list ------------------------ */
|
||||
|
||||
/* Merges two sorted lists together to form one sorted list.
|
||||
The sorting criterion depends on which of _occurrence_sort and _hash_sort
|
||||
is set to true. This is a kludge, but permits nice sharing of almost
|
||||
@@ -230,6 +230,8 @@ Search::merge_sort (KeywordExt_List *head)
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------- Reordering the Keyword list (optional) ----------------- */
|
||||
|
||||
/* Computes the sum of occurrences of the _selchars of a keyword.
|
||||
This is a kind of correlation measure: Keywords which have many
|
||||
selected characters in common with other keywords have a high
|
||||
@@ -356,6 +358,8 @@ Search::reorder ()
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
/* Returns the length of keyword list. */
|
||||
|
||||
int
|
||||
@@ -380,6 +384,71 @@ Search::get_max_keysig_size ()
|
||||
return option[ALLCHARS] ? _max_key_len : option.get_max_keysig_size ();
|
||||
}
|
||||
|
||||
/* ---------------------- Finding good asso_values[] ----------------------- */
|
||||
|
||||
/* Initializes the asso_values[] related parameters and put a first guess
|
||||
into asso_values[]. */
|
||||
|
||||
void
|
||||
Search::init_asso_values ()
|
||||
{
|
||||
int size_multiple = option.get_size_multiple ();
|
||||
int non_linked_length = keyword_list_length ();
|
||||
int asso_value_max;
|
||||
|
||||
if (size_multiple == 0)
|
||||
asso_value_max = non_linked_length;
|
||||
else if (size_multiple > 0)
|
||||
asso_value_max = non_linked_length * size_multiple;
|
||||
else /* if (size_multiple < 0) */
|
||||
asso_value_max = non_linked_length / -size_multiple;
|
||||
/* Round up to the next power of two. This makes it easy to ensure
|
||||
an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value
|
||||
being odd, it guarantees that Search::try_asso_value() will iterate
|
||||
through different values for _asso_value[c]. */
|
||||
if (asso_value_max == 0)
|
||||
asso_value_max = 1;
|
||||
asso_value_max |= asso_value_max >> 1;
|
||||
asso_value_max |= asso_value_max >> 2;
|
||||
asso_value_max |= asso_value_max >> 4;
|
||||
asso_value_max |= asso_value_max >> 8;
|
||||
asso_value_max |= asso_value_max >> 16;
|
||||
asso_value_max++;
|
||||
_asso_value_max = asso_value_max;
|
||||
|
||||
if (option[RANDOM])
|
||||
{
|
||||
srand (reinterpret_cast<long>(time (0)));
|
||||
|
||||
for (int i = 0; i < _alpha_size; i++)
|
||||
_asso_values[i] = rand () & (asso_value_max - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
int asso_value = option.get_initial_asso_value ();
|
||||
|
||||
asso_value = asso_value & (_asso_value_max - 1);
|
||||
for (int i = 0; i < _alpha_size; i++)
|
||||
_asso_values[i] = asso_value;
|
||||
}
|
||||
|
||||
/* Given the bound for _asso_values[c], we have a bound for the possible
|
||||
hash values, as computed in compute_hash(). */
|
||||
_max_hash_value = (option[NOLENGTH] ? 0 : max_key_length ())
|
||||
+ (_asso_value_max - 1) * get_max_keysig_size ();
|
||||
/* Allocate a sparse bit vector for detection of collisions of hash
|
||||
values. */
|
||||
_collision_detector = new Bool_Array (_max_hash_value + 1);
|
||||
|
||||
/* Allocate scratch set. */
|
||||
_union_set = new unsigned char [2 * get_max_keysig_size ()];
|
||||
|
||||
if (option[DEBUG])
|
||||
fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
|
||||
"\nmaximum size of generated hash table is %d\n",
|
||||
non_linked_length, asso_value_max, _max_hash_value);
|
||||
}
|
||||
|
||||
/* Computes a keyword's hash value, relative to the current _asso_values[],
|
||||
and stores it in keyword->_hash_value.
|
||||
This is called very frequently, and needs to be fast! */
|
||||
@@ -467,68 +536,67 @@ Search::sort_by_occurrence (unsigned char *set, int len)
|
||||
}
|
||||
}
|
||||
|
||||
/* Find out how character value change affects successfully hashed items.
|
||||
Returns FALSE if no other hash values are affected, else returns TRUE.
|
||||
Note that because option.get_asso_max() is a power of two we can guarantee
|
||||
that all valid asso_values are visited without repetition since
|
||||
Option.Get_Jump was forced to be an odd value! */
|
||||
/* Tries various other values for _asso_values[c]. A value is successful
|
||||
if, with it, the recomputed hash values for the keywords from
|
||||
_head->first() to curr - inclusive - give fewer than _fewest_collisions
|
||||
collisions. Up to the given number of iterations are performed.
|
||||
If successful, _asso_values[c] is changed, _fewest_collisions is decreased,
|
||||
and false is returned.
|
||||
If all iterations are unsuccessful, _asso_values[c] is restored and
|
||||
true is returned.
|
||||
This is called very frequently, and needs to be fast! */
|
||||
|
||||
inline bool
|
||||
Search::affects_prev (unsigned char c, KeywordExt *curr)
|
||||
Search::try_asso_value (unsigned char c, KeywordExt *curr, int iterations)
|
||||
{
|
||||
int original_char = _asso_values[c];
|
||||
int total_iterations = !option[FAST]
|
||||
? get_asso_max () : option.get_iterations () ? option.get_iterations () : keyword_list_length ();
|
||||
int original_value = _asso_values[c];
|
||||
|
||||
/* Try all valid associated values. */
|
||||
|
||||
for (int i = total_iterations - 1; i >= 0; i--)
|
||||
/* Try many valid associated values. */
|
||||
for (int i = iterations - 1; i >= 0; i--)
|
||||
{
|
||||
int collisions = 0;
|
||||
|
||||
/* Try next value. Wrap around mod _asso_value_max. */
|
||||
_asso_values[c] =
|
||||
(_asso_values[c] + (option.get_jump () ? option.get_jump () : rand ()))
|
||||
& (get_asso_max () - 1);
|
||||
& (_asso_value_max - 1);
|
||||
|
||||
/* Iteration Number array is a win, O(1) intialization time! */
|
||||
_collision_detector->clear ();
|
||||
|
||||
/* See how this asso_value change affects previous keywords. If
|
||||
it does better than before we'll take it! */
|
||||
|
||||
for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
|
||||
{
|
||||
KeywordExt *keyword = ptr->first();
|
||||
|
||||
/* Compute new hash code for the keyword, and see whether it
|
||||
collides with another keyword's hash code. If we have too
|
||||
many collisions, we can safely abort the fruitless loop. */
|
||||
if (_collision_detector->set_bit (compute_hash (keyword))
|
||||
&& ++collisions >= _fewest_collisions)
|
||||
break;
|
||||
|
||||
if (keyword == curr)
|
||||
{
|
||||
_fewest_collisions = collisions;
|
||||
if (option[DEBUG])
|
||||
fprintf (stderr, "- resolved after %d iterations", total_iterations - i);
|
||||
fprintf (stderr, "- resolved after %d iterations",
|
||||
iterations - i);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Restore original values, no more tries. */
|
||||
_asso_values[c] = original_char;
|
||||
/* If we're this far it's time to try the next character.... */
|
||||
_asso_values[c] = original_value;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Change a character value, try least-used characters first. */
|
||||
/* Attempts to change an _asso_value[], in order to resolve a hash value
|
||||
collision between the two given keywords. */
|
||||
|
||||
void
|
||||
Search::change (KeywordExt *prior, KeywordExt *curr)
|
||||
Search::change_some_asso_value (KeywordExt *prior, KeywordExt *curr)
|
||||
{
|
||||
static unsigned char *union_set;
|
||||
int union_set_length;
|
||||
|
||||
if (!union_set)
|
||||
union_set = new unsigned char [2 * get_max_keysig_size ()];
|
||||
|
||||
if (option[DEBUG])
|
||||
{
|
||||
fprintf (stderr, "collision on keyword #%d, prior = \"%.*s\", curr = \"%.*s\" hash = %d\n",
|
||||
@@ -538,26 +606,50 @@ Search::change (KeywordExt *prior, KeywordExt *curr)
|
||||
curr->_hash_value);
|
||||
fflush (stderr);
|
||||
}
|
||||
union_set_length = compute_disjoint_union (prior->_selchars, prior->_selchars_length, curr->_selchars, curr->_selchars_length, union_set);
|
||||
|
||||
/* To achieve that the two hash values become different, we have to
|
||||
change an _asso_values[c] for a character c that contributes to the
|
||||
hash functions of prior and curr with different multiplicity.
|
||||
So we compute the set of such c. */
|
||||
unsigned char *union_set = _union_set;
|
||||
int union_set_length =
|
||||
compute_disjoint_union (prior->_selchars, prior->_selchars_length,
|
||||
curr->_selchars, curr->_selchars_length,
|
||||
union_set);
|
||||
|
||||
/* Sort by decreasing occurrence: Try least-used characters c first.
|
||||
The idea is that this reduces the number of freshly introduced
|
||||
collisions. */
|
||||
sort_by_occurrence (union_set, union_set_length);
|
||||
|
||||
/* Try changing some values, if change doesn't alter other values continue normal action. */
|
||||
_fewest_collisions++;
|
||||
int iterations =
|
||||
!option[FAST]
|
||||
? _asso_value_max /* Try all possible values of _asso_values[c]. */
|
||||
: option.get_iterations ()
|
||||
? option.get_iterations ()
|
||||
: keyword_list_length ();
|
||||
|
||||
const unsigned char *p = union_set;
|
||||
int i = union_set_length;
|
||||
for (; i > 0; p++, i--)
|
||||
if (!affects_prev (*p, curr))
|
||||
if (!try_asso_value (*p, curr, iterations))
|
||||
{
|
||||
/* Good, this _asso_values[] modification reduces the number of
|
||||
collisions so far.
|
||||
All keyword->_hash_value up to curr - inclusive - and
|
||||
_fewest_collisions have been updated. */
|
||||
if (option[DEBUG])
|
||||
{
|
||||
fprintf (stderr, " by changing asso_value['%c'] (char #%d) to %d\n",
|
||||
*p, p - union_set + 1, _asso_values[*p]);
|
||||
fflush (stderr);
|
||||
}
|
||||
return; /* Good, doesn't affect previous hash values, we'll take it. */
|
||||
return;
|
||||
}
|
||||
|
||||
/* Failed to resolve a collision. */
|
||||
|
||||
/* Recompute all keyword->_hash_value up to curr - inclusive -. */
|
||||
for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
|
||||
{
|
||||
KeywordExt* keyword = ptr->first();
|
||||
@@ -569,12 +661,51 @@ Search::change (KeywordExt *prior, KeywordExt *curr)
|
||||
if (option[DEBUG])
|
||||
{
|
||||
fprintf (stderr, "** collision not resolved after %d iterations, %d duplicates remain, continuing...\n",
|
||||
!option[FAST] ? get_asso_max () : option.get_iterations () ? option.get_iterations () : keyword_list_length (),
|
||||
_fewest_collisions + _total_duplicates);
|
||||
iterations, _fewest_collisions + _total_duplicates);
|
||||
fflush (stderr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Finds good _asso_values[]. */
|
||||
|
||||
void
|
||||
Search::find_asso_values ()
|
||||
{
|
||||
_fewest_collisions = 0;
|
||||
init_asso_values ();
|
||||
|
||||
/* Add one keyword after the other and see whether its hash value collides
|
||||
with one of the previous hash values. */
|
||||
_num_done = 1;
|
||||
for (KeywordExt_List *curr_ptr = _head;
|
||||
curr_ptr != NULL;
|
||||
curr_ptr = curr_ptr->rest(), _num_done++)
|
||||
{
|
||||
KeywordExt *curr = curr_ptr->first();
|
||||
|
||||
/* Compute this keyword's hash value. */
|
||||
compute_hash (curr);
|
||||
|
||||
/* See if it collides with a prior keyword. */
|
||||
for (KeywordExt_List *prior_ptr = _head;
|
||||
prior_ptr != curr_ptr;
|
||||
prior_ptr = prior_ptr->rest())
|
||||
{
|
||||
KeywordExt *prior = prior_ptr->first();
|
||||
|
||||
if (prior->_hash_value == curr->_hash_value)
|
||||
{
|
||||
_fewest_collisions++;
|
||||
/* Handle collision. */
|
||||
change_some_asso_value (prior, curr);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
/* Sorts the keys by hash value. */
|
||||
|
||||
void
|
||||
@@ -582,78 +713,26 @@ Search::sort ()
|
||||
{
|
||||
_hash_sort = true;
|
||||
_occurrence_sort = false;
|
||||
|
||||
_head = merge_sort (_head);
|
||||
}
|
||||
|
||||
void
|
||||
Search::optimize ()
|
||||
{
|
||||
/* Preparations. */
|
||||
prepare ();
|
||||
if (option[ORDER])
|
||||
reorder ();
|
||||
_num_done = 1;
|
||||
_fewest_collisions = 0;
|
||||
int asso_value_max = option.get_size_multiple ();
|
||||
int non_linked_length = keyword_list_length ();
|
||||
if (asso_value_max == 0)
|
||||
asso_value_max = non_linked_length;
|
||||
else if (asso_value_max > 0)
|
||||
asso_value_max *= non_linked_length;
|
||||
else /* if (asso_value_max < 0) */
|
||||
asso_value_max = non_linked_length / -asso_value_max;
|
||||
set_asso_max (POW (asso_value_max));
|
||||
|
||||
if (option[RANDOM])
|
||||
{
|
||||
srand (reinterpret_cast<long>(time (0)));
|
||||
|
||||
for (int i = 0; i < _alpha_size; i++)
|
||||
_asso_values[i] = rand () & (asso_value_max - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
int asso_value = option.get_initial_asso_value ();
|
||||
|
||||
if (asso_value) /* Initialize array if user requests non-zero default. */
|
||||
for (int i = _alpha_size - 1; i >= 0; i--)
|
||||
_asso_values[i] = asso_value & get_asso_max () - 1;
|
||||
}
|
||||
_max_hash_value = max_key_length () + get_asso_max () * get_max_keysig_size ();
|
||||
_collision_detector = new Bool_Array (_max_hash_value + 1);
|
||||
|
||||
if (option[DEBUG])
|
||||
fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
|
||||
"\nmaximum size of generated hash table is %d\n",
|
||||
non_linked_length, asso_value_max, _max_hash_value);
|
||||
|
||||
KeywordExt_List *curr;
|
||||
for (curr = _head; curr != NULL; curr = curr->rest())
|
||||
{
|
||||
KeywordExt *currkw = curr->first();
|
||||
|
||||
compute_hash (currkw);
|
||||
|
||||
for (KeywordExt_List *ptr = _head; ptr != curr; ptr = ptr->rest())
|
||||
{
|
||||
KeywordExt *ptrkw = ptr->first();
|
||||
|
||||
if (ptrkw->_hash_value == currkw->_hash_value)
|
||||
{
|
||||
change (ptrkw, currkw);
|
||||
break;
|
||||
}
|
||||
}
|
||||
_num_done++;
|
||||
}
|
||||
/* Search for good _asso_values[]. */
|
||||
find_asso_values ();
|
||||
|
||||
/* Make one final check, just to make sure nothing weird happened.... */
|
||||
|
||||
_collision_detector->clear ();
|
||||
|
||||
for (curr = _head; curr; curr = curr->rest())
|
||||
for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
|
||||
{
|
||||
unsigned int hashcode = compute_hash (curr->first());
|
||||
KeywordExt *curr = curr_ptr->first();
|
||||
unsigned int hashcode = compute_hash (curr);
|
||||
if (_collision_detector->set_bit (hashcode))
|
||||
{
|
||||
if (option[DUP]) /* Keep track of this number... */
|
||||
|
||||
40
src/search.h
40
src/search.h
@@ -62,15 +62,27 @@ private:
|
||||
/* Returns the number of key positions. */
|
||||
int get_max_keysig_size ();
|
||||
|
||||
/* Initializes the asso_values[] related parameters and put a first guess
|
||||
into asso_values[]. */
|
||||
void init_asso_values ();
|
||||
|
||||
/* Computes a keyword's hash value, relative to the current _asso_values[],
|
||||
and stores it in keyword->_hash_value. */
|
||||
int compute_hash (KeywordExt *key_node);
|
||||
int compute_hash (KeywordExt *keyword);
|
||||
|
||||
/* Sorts the given set in increasing frequency of _occurrences[]. */
|
||||
void sort_by_occurrence (unsigned char *set, int len);
|
||||
|
||||
bool affects_prev (unsigned char c, KeywordExt *curr);
|
||||
void change (KeywordExt *prior, KeywordExt *curr);
|
||||
/* Tries various other values for _asso_values[c]. */
|
||||
bool try_asso_value (unsigned char c, KeywordExt *curr, int iterations);
|
||||
|
||||
/* Attempts to change an _asso_value[], in order to resolve a hash value
|
||||
collision between the two given keywords. */
|
||||
void change_some_asso_value (KeywordExt *prior, KeywordExt *curr);
|
||||
|
||||
/* Finds good _asso_values[]. */
|
||||
void find_asso_values ();
|
||||
|
||||
void sort ();
|
||||
public:
|
||||
|
||||
@@ -114,13 +126,23 @@ private:
|
||||
/* Vector used during Search::reorder(). */
|
||||
bool * const _determined;
|
||||
|
||||
int _num_done; /* Number of keywords processed without a collision. */
|
||||
int _fewest_collisions; /* Records fewest # of collisions for asso value. */
|
||||
int _max_hash_value; /* Maximum possible hash value. */
|
||||
/* Exclusive upper bound for every _asso_values[c]. A power of 2. */
|
||||
int _asso_value_max;
|
||||
|
||||
/* Maximal possible hash value. */
|
||||
int _max_hash_value;
|
||||
|
||||
/* Sparse bit vector for collision detection. */
|
||||
Bool_Array * _collision_detector;
|
||||
int _size; /* Range of the hash table. */
|
||||
void set_asso_max (int r) { _size = r; }
|
||||
int get_asso_max () { return _size; }
|
||||
|
||||
/* Minimal number of collisions found so far. */
|
||||
int _fewest_collisions;
|
||||
|
||||
/* Scratch set, used during Search::change_some_asso_value. */
|
||||
unsigned char * _union_set;
|
||||
|
||||
/* Number of keyword being handled during Search::find_asso_values. */
|
||||
int _num_done;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user