mirror of
https://git.savannah.gnu.org/git/gperf.git
synced 2025-12-02 13:09:22 +00:00
1478 lines
49 KiB
C++
1478 lines
49 KiB
C++
/* Search algorithm.
|
|
Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
|
|
Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
|
|
and Bruno Haible <bruno@clisp.org>.
|
|
|
|
This file is part of GNU GPERF.
|
|
|
|
GNU GPERF is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2, or (at your option)
|
|
any later version.
|
|
|
|
GNU GPERF is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; see the file COPYING.
|
|
If not, write to the Free Software Foundation, Inc.,
|
|
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
|
|
|
|
/* Specification. */
|
|
#include "search.h"
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h> /* declares exit(), rand(), srand() */
|
|
#include <string.h> /* declares memset(), memcmp() */
|
|
#include <time.h> /* declares time() */
|
|
#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
|
|
#include "options.h"
|
|
#include "hash-table.h"
|
|
|
|
/* The most general form of the hash function is
|
|
|
|
hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
|
|
+ len (keyword)
|
|
|
|
where Pos is a set of byte positions,
|
|
each alpha_inc[i] is a nonnegative integer,
|
|
each asso_values[c] is a nonnegative integer,
|
|
len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
|
|
|
|
Theorem 1: If all keywords are different, there is a set Pos such that
|
|
all tuples (keyword[i] : i in Pos) are different.
|
|
|
|
Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
|
|
are nonnegative integers alpha_inc[i] such that all multisets
|
|
{keyword[i] + alpha_inc[i] : i in Pos} are different.
|
|
|
|
Theorem 3: If all multisets selchars[keyword] are different, there are
|
|
nonnegative integers asso_values[c] such that all hash values
|
|
sum (asso_values[c] : c in selchars[keyword]) are different.
|
|
|
|
Based on these three facts, we find the hash function in three steps:
|
|
|
|
Step 1 (Finding good byte positions):
|
|
Find a set Pos, as small as possible, such that all tuples
|
|
(keyword[i] : i in Pos) are different.
|
|
|
|
Step 2 (Finding good alpha increments):
|
|
Find nonnegative integers alpha_inc[i], as many of them as possible being
|
|
zero, and the others being as small as possible, such that all multisets
|
|
{keyword[i] + alpha_inc[i] : i in Pos} are different.
|
|
|
|
Step 3 (Finding good asso_values):
|
|
Find asso_values[c] such that all hash (keyword) are different.
|
|
|
|
In other words, each step finds a projection that is injective on the
|
|
given finite set:
|
|
proj1 : String --> Map (Pos --> N)
|
|
proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
|
|
proj3 : Map (Pos --> N) / S(Pos) --> N
|
|
where N denotes the set of nonnegative integers, and S(Pos) is the
|
|
symmetric group over Pos.
|
|
|
|
This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
|
|
modifications apply:
|
|
proj1 : String --> Map (Pos --> N) x N
|
|
proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
|
|
proj3 : Map (Pos --> N) / S(Pos) x N --> N
|
|
*/
|
|
|
|
/* ==================== Initialization and Preparation ===================== */
|
|
|
|
/* Creates a searcher for the given keyword list.  Only the list head is
   recorded here; all derived quantities are computed by later calls.  */
Search::Search (KeywordExt_List *list) : _head (list) {}
|
|
|
|
void
|
|
Search::preprepare ()
|
|
{
|
|
KeywordExt_List *temp;
|
|
|
|
/* Compute the total number of keywords. */
|
|
_total_keys = 0;
|
|
for (temp = _head; temp; temp = temp->rest())
|
|
_total_keys++;
|
|
|
|
/* Compute the minimum and maximum keyword length. */
|
|
_max_key_len = INT_MIN;
|
|
_min_key_len = INT_MAX;
|
|
for (temp = _head; temp; temp = temp->rest())
|
|
{
|
|
KeywordExt *keyword = temp->first();
|
|
|
|
if (_max_key_len < keyword->_allchars_length)
|
|
_max_key_len = keyword->_allchars_length;
|
|
if (_min_key_len > keyword->_allchars_length)
|
|
_min_key_len = keyword->_allchars_length;
|
|
}
|
|
|
|
/* Exit program if an empty string is used as key, since the comparison
|
|
expressions don't work correctly for looking up an empty string. */
|
|
if (_min_key_len == 0)
|
|
{
|
|
fprintf (stderr, "Empty input key is not allowed.\n"
|
|
"To recognize an empty input key, your code should check for\n"
|
|
"len == 0 before calling the gperf generated lookup function.\n");
|
|
exit (1);
|
|
}
|
|
}
|
|
|
|
/* ====================== Finding good byte positions ====================== */
|
|
|
|
/* Initializes each keyword's _selchars array. */
|
|
void
|
|
Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const
|
|
{
|
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
|
temp->first()->init_selchars_tuple(use_all_chars, positions);
|
|
}
|
|
|
|
/* Deletes each keyword's _selchars array. */
|
|
void
|
|
Search::delete_selchars () const
|
|
{
|
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
|
temp->first()->delete_selchars();
|
|
}
|
|
|
|
/* Count the duplicate keywords that occur with a given set of positions.
|
|
In other words, it returns the difference
|
|
# K - # proj1 (K)
|
|
where K is the multiset of given keywords. */
|
|
unsigned int
|
|
Search::count_duplicates_tuple (const Positions& positions) const
|
|
{
|
|
/* Run through the keyword list and count the duplicates incrementally.
|
|
The result does not depend on the order of the keyword list, thanks to
|
|
the formula above. */
|
|
init_selchars_tuple (option[ALLCHARS], positions);
|
|
|
|
unsigned int count = 0;
|
|
{
|
|
Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
|
{
|
|
KeywordExt *keyword = temp->first();
|
|
if (representatives.insert (keyword))
|
|
count++;
|
|
}
|
|
}
|
|
|
|
delete_selchars ();
|
|
|
|
return count;
|
|
}
|
|
|
|
/* Find good key positions. */
|
|
|
|
void
|
|
Search::find_positions ()
|
|
{
|
|
/* If the user gave the key positions, we use them. */
|
|
if (option[POSITIONS])
|
|
{
|
|
_key_positions = option.get_key_positions();
|
|
return;
|
|
}
|
|
|
|
/* 1. Find positions that must occur in order to distinguish duplicates. */
|
|
Positions mandatory;
|
|
|
|
if (!option[DUP])
|
|
{
|
|
for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
|
|
{
|
|
KeywordExt *keyword1 = l1->first();
|
|
for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
|
|
{
|
|
KeywordExt *keyword2 = l2->first();
|
|
|
|
/* If keyword1 and keyword2 have the same length and differ
|
|
in just one position, and it is not the last character,
|
|
this position is mandatory. */
|
|
if (keyword1->_allchars_length == keyword2->_allchars_length)
|
|
{
|
|
int n = keyword1->_allchars_length;
|
|
int i;
|
|
for (i = 1; i < n; i++)
|
|
if (keyword1->_allchars[i-1] != keyword2->_allchars[i-1])
|
|
break;
|
|
if (i < n
|
|
&& memcmp (&keyword1->_allchars[i],
|
|
&keyword2->_allchars[i],
|
|
n - i)
|
|
== 0)
|
|
{
|
|
/* Position i is mandatory. */
|
|
if (!mandatory.contains (i))
|
|
mandatory.add (i);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* 2. Add positions, as long as this decreases the duplicates count. */
|
|
int imax = (_max_key_len < Positions::MAX_KEY_POS
|
|
? _max_key_len : Positions::MAX_KEY_POS);
|
|
Positions current = mandatory;
|
|
unsigned int current_duplicates_count = count_duplicates_tuple (current);
|
|
for (;;)
|
|
{
|
|
Positions best;
|
|
unsigned int best_duplicates_count = UINT_MAX;
|
|
|
|
for (int i = imax; i >= 0; i--)
|
|
if (!current.contains (i))
|
|
{
|
|
Positions tryal = current;
|
|
tryal.add (i);
|
|
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
|
|
|
|
/* We prefer 'try' to 'best' if it produces less duplicates,
|
|
or if it produces the same number of duplicates but with
|
|
a more efficient hash function. */
|
|
if (try_duplicates_count < best_duplicates_count
|
|
|| (try_duplicates_count == best_duplicates_count && i > 0))
|
|
{
|
|
best = tryal;
|
|
best_duplicates_count = try_duplicates_count;
|
|
}
|
|
}
|
|
|
|
/* Stop adding positions when it gives no improvement. */
|
|
if (best_duplicates_count >= current_duplicates_count)
|
|
break;
|
|
|
|
current = best;
|
|
current_duplicates_count = best_duplicates_count;
|
|
}
|
|
|
|
/* 3. Remove positions, as long as this doesn't increase the duplicates
|
|
count. */
|
|
for (;;)
|
|
{
|
|
Positions best;
|
|
unsigned int best_duplicates_count = UINT_MAX;
|
|
|
|
for (int i = imax; i >= 0; i--)
|
|
if (current.contains (i) && !mandatory.contains (i))
|
|
{
|
|
Positions tryal = current;
|
|
tryal.remove (i);
|
|
unsigned int try_duplicates_count = count_duplicates_tuple (tryal);
|
|
|
|
/* We prefer 'try' to 'best' if it produces less duplicates,
|
|
or if it produces the same number of duplicates but with
|
|
a more efficient hash function. */
|
|
if (try_duplicates_count < best_duplicates_count
|
|
|| (try_duplicates_count == best_duplicates_count && i == 0))
|
|
{
|
|
best = tryal;
|
|
best_duplicates_count = try_duplicates_count;
|
|
}
|
|
}
|
|
|
|
/* Stop removing positions when it gives no improvement. */
|
|
if (best_duplicates_count > current_duplicates_count)
|
|
break;
|
|
|
|
current = best;
|
|
current_duplicates_count = best_duplicates_count;
|
|
}
|
|
|
|
/* 4. Replace two positions by one, as long as this doesn't increase the
|
|
duplicates count. */
|
|
for (;;)
|
|
{
|
|
Positions best;
|
|
unsigned int best_duplicates_count = UINT_MAX;
|
|
|
|
for (int i1 = imax; i1 >= 0; i1--)
|
|
if (current.contains (i1) && !mandatory.contains (i1))
|
|
for (int i2 = imax; i2 >= 0; i2--)
|
|
if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
|
|
for (int i3 = imax; i3 >= 0; i3--)
|
|
if (!current.contains (i3))
|
|
{
|
|
Positions tryal = current;
|
|
tryal.remove (i1);
|
|
tryal.remove (i2);
|
|
tryal.add (i3);
|
|
unsigned int try_duplicates_count =
|
|
count_duplicates_tuple (tryal);
|
|
|
|
/* We prefer 'try' to 'best' if it produces less duplicates,
|
|
or if it produces the same number of duplicates but with
|
|
a more efficient hash function. */
|
|
if (try_duplicates_count < best_duplicates_count
|
|
|| (try_duplicates_count == best_duplicates_count
|
|
&& (i1 == 0 || i2 == 0 || i3 > 0)))
|
|
{
|
|
best = tryal;
|
|
best_duplicates_count = try_duplicates_count;
|
|
}
|
|
}
|
|
|
|
/* Stop removing positions when it gives no improvement. */
|
|
if (best_duplicates_count > current_duplicates_count)
|
|
break;
|
|
|
|
current = best;
|
|
current_duplicates_count = best_duplicates_count;
|
|
}
|
|
|
|
/* That's it. Hope it's good enough. */
|
|
_key_positions = current;
|
|
|
|
if (option[DEBUG])
|
|
{
|
|
/* Print the result. */
|
|
fprintf (stderr, "\nComputed positions: ");
|
|
PositionReverseIterator iter (_key_positions);
|
|
bool seen_lastchar = false;
|
|
bool first = true;
|
|
for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
|
|
{
|
|
if (!first)
|
|
fprintf (stderr, ", ");
|
|
if (i == Positions::LASTCHAR)
|
|
seen_lastchar = true;
|
|
else
|
|
{
|
|
fprintf (stderr, "%d", i);
|
|
first = false;
|
|
}
|
|
}
|
|
if (seen_lastchar)
|
|
{
|
|
if (!first)
|
|
fprintf (stderr, ", ");
|
|
fprintf (stderr, "$");
|
|
}
|
|
fprintf (stderr, "\n");
|
|
}
|
|
}
|
|
|
|
/* ===================== Finding good alpha increments ===================== */
|
|
|
|
/* Initializes each keyword's _selchars array. */
|
|
void
|
|
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const
|
|
{
|
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
|
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc);
|
|
}
|
|
|
|
/* Count the duplicate keywords that occur with the given set of positions
|
|
and a given alpha_inc[] array.
|
|
In other words, it returns the difference
|
|
# K - # proj2 (proj1 (K))
|
|
where K is the multiset of given keywords. */
|
|
unsigned int
|
|
Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
|
|
{
|
|
/* Run through the keyword list and count the duplicates incrementally.
|
|
The result does not depend on the order of the keyword list, thanks to
|
|
the formula above. */
|
|
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc);
|
|
|
|
unsigned int count = 0;
|
|
{
|
|
Hash_Table representatives (_total_keys, option[NOLENGTH]);
|
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
|
{
|
|
KeywordExt *keyword = temp->first();
|
|
if (representatives.insert (keyword))
|
|
count++;
|
|
}
|
|
}
|
|
|
|
delete_selchars ();
|
|
|
|
return count;
|
|
}
|
|
|
|
/* Find good _alpha_inc[]. */
|
|
|
|
void
|
|
Search::find_alpha_inc ()
|
|
{
|
|
/* The goal is to choose _alpha_inc[] such that it doesn't introduce
|
|
artificial duplicates.
|
|
In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */
|
|
unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);
|
|
|
|
/* Start with zero increments. This is sufficient in most cases. */
|
|
unsigned int *current = new unsigned int [_max_key_len];
|
|
for (int i = 0; i < _max_key_len; i++)
|
|
current[i] = 0;
|
|
unsigned int current_duplicates_count = count_duplicates_multiset (current);
|
|
|
|
if (current_duplicates_count > duplicates_goal)
|
|
{
|
|
/* Look which _alpha_inc[i] we are free to increment. */
|
|
unsigned int nindices;
|
|
if (option[ALLCHARS])
|
|
nindices = _max_key_len;
|
|
else
|
|
{
|
|
/* Ignore Positions::LASTCHAR. Remember that since Positions are
|
|
sorted in decreasing order, Positions::LASTCHAR comes last. */
|
|
nindices = (_key_positions.get_size() == 0
|
|
|| _key_positions[_key_positions.get_size() - 1]
|
|
!= Positions::LASTCHAR
|
|
? _key_positions.get_size()
|
|
: _key_positions.get_size() - 1);
|
|
}
|
|
|
|
unsigned int indices[nindices];
|
|
if (option[ALLCHARS])
|
|
for (unsigned int j = 0; j < nindices; j++)
|
|
indices[j] = j;
|
|
else
|
|
{
|
|
PositionIterator iter (_key_positions);
|
|
for (unsigned int j = 0; j < nindices; j++)
|
|
{
|
|
int key_pos = iter.next ();
|
|
if (key_pos == PositionIterator::EOS
|
|
|| key_pos == Positions::LASTCHAR)
|
|
abort ();
|
|
indices[j] = key_pos - 1;
|
|
}
|
|
}
|
|
|
|
/* Perform several rounds of searching for a good alpha increment.
|
|
Each round reduces the number of artificial collisions by adding
|
|
an increment in a single key position. */
|
|
unsigned int best[_max_key_len];
|
|
unsigned int tryal[_max_key_len];
|
|
do
|
|
{
|
|
/* An increment of 1 is not always enough. Try higher increments
|
|
also. */
|
|
for (unsigned int inc = 1; ; inc++)
|
|
{
|
|
unsigned int best_duplicates_count = UINT_MAX;
|
|
|
|
for (unsigned int j = 0; j < nindices; j++)
|
|
{
|
|
memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
|
|
tryal[indices[j]] += inc;
|
|
unsigned int try_duplicates_count =
|
|
count_duplicates_multiset (tryal);
|
|
|
|
/* We prefer 'try' to 'best' if it produces less
|
|
duplicates. */
|
|
if (try_duplicates_count < best_duplicates_count)
|
|
{
|
|
memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
|
|
best_duplicates_count = try_duplicates_count;
|
|
}
|
|
}
|
|
|
|
/* Stop this round when we got an improvement. */
|
|
if (best_duplicates_count < current_duplicates_count)
|
|
{
|
|
memcpy (current, best, _max_key_len * sizeof (unsigned int));
|
|
current_duplicates_count = best_duplicates_count;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
while (current_duplicates_count > duplicates_goal);
|
|
|
|
if (option[DEBUG])
|
|
{
|
|
/* Print the result. */
|
|
fprintf (stderr, "\nComputed alpha increments: ");
|
|
if (option[ALLCHARS])
|
|
{
|
|
bool first = true;
|
|
for (unsigned int j = 0; j < nindices; j++)
|
|
if (current[indices[j]] != 0)
|
|
{
|
|
if (!first)
|
|
fprintf (stderr, ", ");
|
|
fprintf (stderr, "%u:+%u",
|
|
indices[j] + 1, current[indices[j]]);
|
|
first = false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
bool first = true;
|
|
for (unsigned int j = nindices; j-- > 0; )
|
|
if (current[indices[j]] != 0)
|
|
{
|
|
if (!first)
|
|
fprintf (stderr, ", ");
|
|
fprintf (stderr, "%u:+%u",
|
|
indices[j] + 1, current[indices[j]]);
|
|
first = false;
|
|
}
|
|
}
|
|
fprintf (stderr, "\n");
|
|
}
|
|
}
|
|
|
|
_alpha_inc = current;
|
|
}
|
|
|
|
/* ======================= Finding good asso_values ======================== */
|
|
|
|
void
|
|
Search::prepare ()
|
|
{
|
|
KeywordExt_List *temp;
|
|
|
|
/* Initialize each keyword's _selchars array. */
|
|
init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc);
|
|
|
|
/* Check for duplicates, i.e. keywords with the same _selchars array
|
|
(and - if !option[NOLENGTH] - also the same length).
|
|
We deal with these by building an equivalence class, so that only
|
|
1 keyword is representative of the entire collection. Only this
|
|
representative remains in the keyword list; the others are accessible
|
|
through the _duplicate_link chain, starting at the representative.
|
|
This *greatly* simplifies processing during later stages of the program.
|
|
Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */
|
|
{
|
|
_list_len = _total_keys;
|
|
_total_duplicates = 0;
|
|
/* Make hash table for efficiency. */
|
|
Hash_Table representatives (_list_len, option[NOLENGTH]);
|
|
|
|
KeywordExt_List *prev = NULL; /* list node before temp */
|
|
for (temp = _head; temp; )
|
|
{
|
|
KeywordExt *keyword = temp->first();
|
|
KeywordExt *other_keyword = representatives.insert (keyword);
|
|
KeywordExt_List *garbage = NULL;
|
|
|
|
if (other_keyword)
|
|
{
|
|
_total_duplicates++;
|
|
_list_len--;
|
|
/* Remove keyword from the main list. */
|
|
prev->rest() = temp->rest();
|
|
garbage = temp;
|
|
/* And insert it on other_keyword's duplicate list. */
|
|
keyword->_duplicate_link = other_keyword->_duplicate_link;
|
|
other_keyword->_duplicate_link = keyword;
|
|
|
|
/* Complain if user hasn't enabled the duplicate option. */
|
|
if (!option[DUP] || option[DEBUG])
|
|
{
|
|
fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
|
|
keyword->_allchars_length, keyword->_allchars,
|
|
other_keyword->_allchars_length, other_keyword->_allchars);
|
|
for (int j = 0; j < keyword->_selchars_length; j++)
|
|
putc (keyword->_selchars[j], stderr);
|
|
fprintf (stderr, "\".\n");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
keyword->_duplicate_link = NULL;
|
|
prev = temp;
|
|
}
|
|
temp = temp->rest();
|
|
if (garbage)
|
|
delete garbage;
|
|
}
|
|
if (option[DEBUG])
|
|
representatives.dump();
|
|
}
|
|
|
|
/* Exit program if duplicates exists and option[DUP] not set, since we
|
|
don't want to continue in this case. (We don't want to turn on
|
|
option[DUP] implicitly, because the generated code is usually much
|
|
slower. */
|
|
if (_total_duplicates)
|
|
{
|
|
if (option[DUP])
|
|
fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
|
|
_total_duplicates);
|
|
else
|
|
{
|
|
fprintf (stderr, "%d input keys have identical hash values,\n",
|
|
_total_duplicates);
|
|
if (option[POSITIONS])
|
|
fprintf (stderr, "try different key positions or use option -D.\n");
|
|
else
|
|
fprintf (stderr, "use option -D.\n");
|
|
exit (1);
|
|
}
|
|
}
|
|
|
|
/* Compute _alpha_size, the upper bound on the indices passed to
|
|
asso_values[]. */
|
|
unsigned int max_alpha_inc = 0;
|
|
for (int i = 0; i < _max_key_len; i++)
|
|
if (max_alpha_inc < _alpha_inc[i])
|
|
max_alpha_inc = _alpha_inc[i];
|
|
_alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
|
|
|
|
/* Compute the occurrences of each character in the alphabet. */
|
|
_occurrences = new int[_alpha_size];
|
|
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
|
|
for (temp = _head; temp; temp = temp->rest())
|
|
{
|
|
KeywordExt *keyword = temp->first();
|
|
const unsigned int *ptr = keyword->_selchars;
|
|
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
|
|
_occurrences[*ptr]++;
|
|
}
|
|
|
|
/* Memory allocation. */
|
|
_asso_values = new int[_alpha_size];
|
|
_determined = new bool[_alpha_size];
|
|
}
|
|
|
|
/* ---------------- Reordering the Keyword list (optional) ----------------- */
|
|
|
|
/* Computes the sum of occurrences of the _selchars of a keyword.
|
|
This is a kind of correlation measure: Keywords which have many
|
|
selected characters in common with other keywords have a high
|
|
occurrence sum. Keywords whose selected characters don't occur
|
|
in other keywords have a low occurrence sum. */
|
|
|
|
inline int
|
|
Search::compute_occurrence (KeywordExt *ptr) const
|
|
{
|
|
int value = 0;
|
|
|
|
const unsigned int *p = ptr->_selchars;
|
|
unsigned int i = ptr->_selchars_length;
|
|
for (; i > 0; p++, i--)
|
|
value += _occurrences[*p];
|
|
|
|
return value;
|
|
}
|
|
|
|
/* Comparison function for sorting by decreasing _occurrence valuation. */
|
|
static bool
|
|
greater_by_occurrence (KeywordExt *keyword1, KeywordExt *keyword2)
|
|
{
|
|
return keyword1->_occurrence > keyword2->_occurrence;
|
|
}
|
|
|
|
/* Auxiliary function for reorder():
|
|
Sets all alphabet characters as undetermined. */
|
|
|
|
inline void
|
|
Search::clear_determined ()
|
|
{
|
|
memset (_determined, 0, _alpha_size * sizeof (_determined[0]));
|
|
}
|
|
|
|
/* Auxiliary function for reorder():
|
|
Sets all selected characters of the keyword as determined. */
|
|
|
|
inline void
|
|
Search::set_determined (KeywordExt *keyword)
|
|
{
|
|
const unsigned int *p = keyword->_selchars;
|
|
unsigned int i = keyword->_selchars_length;
|
|
for (; i > 0; p++, i--)
|
|
_determined[*p] = true;
|
|
}
|
|
|
|
/* Auxiliary function for reorder():
|
|
Returns true if the keyword's selected characters are all determined. */
|
|
|
|
inline bool
|
|
Search::already_determined (KeywordExt *keyword) const
|
|
{
|
|
const unsigned int *p = keyword->_selchars;
|
|
unsigned int i = keyword->_selchars_length;
|
|
for (; i > 0; p++, i--)
|
|
if (!_determined[*p])
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Reorders the keyword list so as to minimize search times.
|
|
First the list is reordered so that frequently occuring keys appear first.
|
|
Then the list is reordered so that keys whose values are already determined
|
|
will be placed towards the front of the list. This helps prune the search
|
|
time by handling inevitable collisions early in the search process. See
|
|
Cichelli's paper from Jan 1980 JACM for details.... */
|
|
|
|
void
|
|
Search::reorder ()
|
|
{
|
|
KeywordExt_List *ptr;
|
|
|
|
/* Compute the _occurrence valuation of every keyword on the list. */
|
|
for (ptr = _head; ptr; ptr = ptr->rest())
|
|
{
|
|
KeywordExt *keyword = ptr->first();
|
|
|
|
keyword->_occurrence = compute_occurrence (keyword);
|
|
}
|
|
|
|
/* Sort the list by decreasing _occurrence valuation. */
|
|
_head = mergesort_list (_head, greater_by_occurrence);
|
|
|
|
/* Reorder the list to maximize the efficiency of the search. */
|
|
|
|
/* At the beginning, consider that no asso_values[c] is fixed. */
|
|
clear_determined ();
|
|
for (ptr = _head; ptr != NULL && ptr->rest() != NULL; ptr = ptr->rest())
|
|
{
|
|
KeywordExt *keyword = ptr->first();
|
|
|
|
/* Then we'll fix asso_values[c] for all c occurring in this keyword. */
|
|
set_determined (keyword);
|
|
|
|
/* Then we wish to test for hash value collisions the remaining keywords
|
|
whose hash value is completely determined, as quickly as possible.
|
|
For this purpose, move all the completely determined keywords in the
|
|
remaining list immediately past this keyword. */
|
|
KeywordExt_List *curr_ptr;
|
|
KeywordExt_List *next_ptr; /* = curr_ptr->rest() */
|
|
for (curr_ptr = ptr, next_ptr = curr_ptr->rest();
|
|
next_ptr != NULL;
|
|
next_ptr = curr_ptr->rest())
|
|
{
|
|
KeywordExt *next_keyword = next_ptr->first();
|
|
|
|
if (already_determined (next_keyword))
|
|
{
|
|
if (curr_ptr == ptr)
|
|
/* Keep next_ptr where it is. */
|
|
curr_ptr = next_ptr;
|
|
else
|
|
{
|
|
/* Remove next_ptr from its current list position... */
|
|
curr_ptr->rest() = next_ptr->rest();
|
|
/* ... and insert it right after ptr. */
|
|
next_ptr->rest() = ptr->rest();
|
|
ptr->rest() = next_ptr;
|
|
}
|
|
|
|
/* Advance ptr. */
|
|
ptr = ptr->rest();
|
|
}
|
|
else
|
|
curr_ptr = next_ptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ------------------------------------------------------------------------- */
|
|
|
|
/* Returns the length of keyword list. */
|
|
|
|
int
|
|
Search::keyword_list_length () const
|
|
{
|
|
return _list_len;
|
|
}
|
|
|
|
/* Returns the maximum length of keywords. */
|
|
|
|
int
|
|
Search::max_key_length () const
|
|
{
|
|
return _max_key_len;
|
|
}
|
|
|
|
/* Returns the number of key positions. */
|
|
|
|
int
|
|
Search::get_max_keysig_size () const
|
|
{
|
|
return option[ALLCHARS] ? _max_key_len : _key_positions.get_size();
|
|
}
|
|
|
|
/* ---------------------- Finding good asso_values[] ----------------------- */
|
|
|
|
/* Initializes the asso_values[] related parameters. */
|
|
|
|
void
|
|
Search::prepare_asso_values ()
|
|
{
|
|
int non_linked_length = keyword_list_length ();
|
|
int asso_value_max;
|
|
|
|
asso_value_max =
|
|
static_cast<int>(non_linked_length * option.get_size_multiple());
|
|
/* Round up to the next power of two. This makes it easy to ensure
|
|
an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value
|
|
being odd, it guarantees that Search::try_asso_value() will iterate
|
|
through different values for _asso_value[c]. */
|
|
if (asso_value_max == 0)
|
|
asso_value_max = 1;
|
|
asso_value_max |= asso_value_max >> 1;
|
|
asso_value_max |= asso_value_max >> 2;
|
|
asso_value_max |= asso_value_max >> 4;
|
|
asso_value_max |= asso_value_max >> 8;
|
|
asso_value_max |= asso_value_max >> 16;
|
|
asso_value_max++;
|
|
_asso_value_max = asso_value_max;
|
|
|
|
/* Given the bound for _asso_values[c], we have a bound for the possible
|
|
hash values, as computed in compute_hash(). */
|
|
_max_hash_value = (option[NOLENGTH] ? 0 : max_key_length ())
|
|
+ (_asso_value_max - 1) * get_max_keysig_size ();
|
|
/* Allocate a sparse bit vector for detection of collisions of hash
|
|
values. */
|
|
_collision_detector = new Bool_Array (_max_hash_value + 1);
|
|
|
|
if (option[DEBUG])
|
|
{
|
|
fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
|
|
"\nmaximum size of generated hash table is %d\n",
|
|
non_linked_length, asso_value_max, _max_hash_value);
|
|
|
|
int field_width;
|
|
|
|
field_width = 0;
|
|
{
|
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
|
{
|
|
KeywordExt *keyword = temp->first();
|
|
if (field_width < keyword->_selchars_length)
|
|
field_width = keyword->_selchars_length;
|
|
}
|
|
}
|
|
|
|
fprintf (stderr, "\ndumping the keyword list without duplicates\n");
|
|
fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
|
|
int i = 0;
|
|
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
|
{
|
|
KeywordExt *keyword = temp->first();
|
|
fprintf (stderr, "%9d, ", ++i);
|
|
if (field_width > keyword->_selchars_length)
|
|
fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
|
|
for (int j = 0; j < keyword->_selchars_length; j++)
|
|
putc (keyword->_selchars[j], stderr);
|
|
fprintf (stderr, ", %.*s\n",
|
|
keyword->_allchars_length, keyword->_allchars);
|
|
}
|
|
fprintf (stderr, "\nend of keyword list\n\n");
|
|
}
|
|
|
|
if (option[RANDOM] || option.get_jump () == 0)
|
|
/* We will use rand(), so initialize the random number generator. */
|
|
srand (static_cast<long>(time (0)));
|
|
|
|
_initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
|
|
_jump = option.get_jump ();
|
|
}
|
|
|
|
/* Puts a first guess into asso_values[]. */
|
|
|
|
void
|
|
Search::init_asso_values ()
|
|
{
|
|
if (_initial_asso_value < 0)
|
|
{
|
|
for (int i = 0; i < _alpha_size; i++)
|
|
_asso_values[i] = rand () & (_asso_value_max - 1);
|
|
}
|
|
else
|
|
{
|
|
int asso_value = _initial_asso_value;
|
|
|
|
asso_value = asso_value & (_asso_value_max - 1);
|
|
for (int i = 0; i < _alpha_size; i++)
|
|
_asso_values[i] = asso_value;
|
|
}
|
|
}
|
|
|
|
/* Computes a keyword's hash value, relative to the current _asso_values[],
|
|
and stores it in keyword->_hash_value.
|
|
This is called very frequently, and needs to be fast! */
|
|
|
|
inline int
|
|
Search::compute_hash (KeywordExt *keyword) const
|
|
{
|
|
int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
|
|
|
|
const unsigned int *p = keyword->_selchars;
|
|
int i = keyword->_selchars_length;
|
|
for (; i > 0; p++, i--)
|
|
sum += _asso_values[*p];
|
|
|
|
return keyword->_hash_value = sum;
|
|
}
|
|
|
|
/* Computes the disjoint union of two multisets of characters, i.e. the set
   of characters whose multiplicity differs between set_1 and set_2.  This
   includes the characters that are contained in only one of the two sets.
   Both set_1[0..size_1-1] and set_2[0..size_2-1] must be given in
   increasing order.
   The result, an ordered set (each character at most once), is written to
   set_3[0...]; at most size_1 + size_2 elements are written.
   Returns the number of elements stored in set_3.  */

inline int
compute_disjoint_union (const unsigned int *set_1, int size_1,
                        const unsigned int *set_2, int size_2,
                        unsigned int *set_3)
{
  int pos_1 = 0;
  int pos_2 = 0;
  int filled = 0;

  /* Merge the two ordered inputs.  */
  while (pos_1 < size_1 || pos_2 < size_2)
    {
      unsigned int next;

      if (pos_1 < size_1 && pos_2 < size_2)
        {
          if (set_1[pos_1] == set_2[pos_2])
            {
              /* One occurrence on each side: they cancel out.  */
              pos_1++;
              pos_2++;
              continue;
            }
          /* Take the smaller of the two front elements.  */
          if (set_1[pos_1] < set_2[pos_2])
            next = set_1[pos_1++];
          else
            next = set_2[pos_2++];
        }
      else if (pos_1 < size_1)
        /* Only set_1 has elements left.  */
        next = set_1[pos_1++];
      else
        /* Only set_2 has elements left.  */
        next = set_2[pos_2++];

      /* Store it, unless it repeats the element stored last.  */
      if (filled == 0 || set_3[filled - 1] != next)
        set_3[filled++] = next;
    }

  return filled;
}
|
|
|
|
/* Sorts the given set in increasing frequency of _occurrences[]. */
|
|
|
|
inline void
|
|
Search::sort_by_occurrence (unsigned int *set, int len) const
|
|
{
|
|
/* Use bubble sort, since the set is typically short. */
|
|
for (int i = 1; i < len; i++)
|
|
{
|
|
int curr;
|
|
unsigned int tmp;
|
|
|
|
for (curr = i, tmp = set[curr];
|
|
curr > 0 && _occurrences[tmp] < _occurrences[set[curr-1]];
|
|
curr--)
|
|
set[curr] = set[curr - 1];
|
|
|
|
set[curr] = tmp;
|
|
}
|
|
}
|
|
|
|
/* Returns true if the recomputed hash values for the keywords from
|
|
_head->first() to curr - inclusive - give at least one collision.
|
|
This is called very frequently, and needs to be fast! */
|
|
bool
|
|
Search::has_collisions (KeywordExt *curr)
|
|
{
|
|
/* Iteration Number array is a win, O(1) initialization time! */
|
|
_collision_detector->clear ();
|
|
|
|
for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
|
|
{
|
|
KeywordExt *keyword = ptr->first();
|
|
|
|
/* Compute new hash code for the keyword, and see whether it
|
|
collides with another keyword's hash code. If we have too
|
|
many collisions, we can safely abort the fruitless loop. */
|
|
if (_collision_detector->set_bit (compute_hash (keyword)))
|
|
return true;
|
|
|
|
if (keyword == curr)
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* Tests whether the given keyword has the same hash value as another one
|
|
earlier in the list. If yes, this earlier keyword is returned (more
|
|
precisely, the first one of them, but it doesn't really matter which one).
|
|
If no collision is present, NULL is returned. */
|
|
KeywordExt *
|
|
Search::collision_prior_to (KeywordExt *curr)
|
|
{
|
|
for (KeywordExt_List *prior_ptr = _head;
|
|
prior_ptr->first() != curr;
|
|
prior_ptr = prior_ptr->rest())
|
|
{
|
|
KeywordExt *prior = prior_ptr->first();
|
|
|
|
if (prior->_hash_value == curr->_hash_value)
|
|
return prior;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/* Finding good asso_values is normally straightforwards, but needs
|
|
backtracking in some cases. The recurse/backtrack depth can be at most
|
|
_list_len. Since we cannot assume that the C stack is large enough,
|
|
we perform the processing without recursion, and simulate the stack. */
|
|
struct StackEntry
|
|
{
|
|
/* The current keyword. */
|
|
KeywordExt * _curr;
|
|
|
|
/* The prior keyword, with which curr collides. */
|
|
KeywordExt * _prior;
|
|
|
|
/* Scratch set. */
|
|
unsigned int * _union_set;
|
|
unsigned int _union_set_length;
|
|
|
|
/* Current index into the scratch set. */
|
|
unsigned int _union_index;
|
|
|
|
/* Trying a different value for _asso_values[_c]. */
|
|
unsigned int _c;
|
|
|
|
/* The original value of _asso_values[_c]. */
|
|
unsigned int _original_asso_value;
|
|
|
|
/* Remaining number of iterations. */
|
|
int _iter;
|
|
};
|
|
|
|
/* Finds some _asso_values[] that fit. */
|
|
|
|
void
|
|
Search::find_asso_values ()
|
|
{
|
|
/* Add one keyword after the other and see whether its hash value collides
|
|
with one of the previous hash values. If so, change some asso_values[]
|
|
entry until the number of collisions so far is reduced. Then continue
|
|
with the next keyword. */
|
|
|
|
init_asso_values ();
|
|
|
|
int iterations =
|
|
!option[FAST]
|
|
? _asso_value_max /* Try all possible values of _asso_values[c]. */
|
|
: option.get_iterations ()
|
|
? option.get_iterations ()
|
|
: keyword_list_length ();
|
|
|
|
/* Allocate stack. */
|
|
StackEntry *stack = new StackEntry[_list_len];
|
|
{
|
|
KeywordExt_List *ptr = _head;
|
|
for (int i = 0; i < _list_len; i++, ptr = ptr->rest())
|
|
{
|
|
stack[i]._curr = ptr->first();
|
|
stack[i]._union_set = new unsigned int [2 * get_max_keysig_size ()];
|
|
}
|
|
}
|
|
|
|
/* Backtracking according to the standard Prolog call pattern:
|
|
|
|
+------------------+
|
|
-------CALL------>| |-------RETURN------>
|
|
| |
|
|
<------FAIL-------| |<------REDO---------
|
|
+------------------+
|
|
|
|
A CALL and RETURN increase the stack pointer, FAIL and REDO decrease it.
|
|
*/
|
|
{
|
|
/* Current stack pointer. */
|
|
StackEntry *sp = &stack[0];
|
|
|
|
/* Local variables corresponding to *sp. */
|
|
/* The current keyword. */
|
|
KeywordExt *curr;
|
|
/* The prior keyword, with which curr collides. */
|
|
KeywordExt *prior;
|
|
/* Scratch set. */
|
|
unsigned int *union_set;
|
|
unsigned int union_set_length;
|
|
/* Current index into the scratch set. */
|
|
unsigned int union_index;
|
|
/* Trying a different value for _asso_values[c]. */
|
|
unsigned int c;
|
|
/* The original value of _asso_values[c]. */
|
|
unsigned int original_asso_value;
|
|
/* Remaining number of iterations. */
|
|
int iter;
|
|
|
|
/* ==== CALL ==== */
|
|
CALL:
|
|
|
|
/* Next keyword from the list. */
|
|
curr = sp->_curr;
|
|
|
|
/* Compute this keyword's hash value. */
|
|
compute_hash (curr);
|
|
|
|
/* See if it collides with a prior keyword. */
|
|
prior = collision_prior_to (curr);
|
|
|
|
if (prior != NULL)
|
|
{
|
|
/* Handle collision: Attempt to change an _asso_value[], in order to
|
|
resolve a hash value collision between the two given keywords. */
|
|
|
|
if (option[DEBUG])
|
|
{
|
|
fprintf (stderr, "collision on keyword #%d, prior = \"%.*s\", curr = \"%.*s\" hash = %d\n",
|
|
sp - stack + 1,
|
|
prior->_allchars_length, prior->_allchars,
|
|
curr->_allchars_length, curr->_allchars,
|
|
curr->_hash_value);
|
|
fflush (stderr);
|
|
}
|
|
|
|
/* To achieve that the two hash values become different, we have to
|
|
change an _asso_values[c] for a character c that contributes to the
|
|
hash functions of prior and curr with different multiplicity.
|
|
So we compute the set of such c. */
|
|
union_set = sp->_union_set;
|
|
union_set_length =
|
|
compute_disjoint_union (prior->_selchars, prior->_selchars_length,
|
|
curr->_selchars, curr->_selchars_length,
|
|
union_set);
|
|
|
|
/* Sort by decreasing occurrence: Try least-used characters c first.
|
|
The idea is that this reduces the number of freshly introduced
|
|
collisions. */
|
|
sort_by_occurrence (union_set, union_set_length);
|
|
|
|
for (union_index = 0; union_index < union_set_length; union_index++)
|
|
{
|
|
c = union_set[union_index];
|
|
|
|
/* Try various other values for _asso_values[c]. A value is
|
|
successful if, with it, the recomputed hash values for the
|
|
keywords from _head->first() to curr - inclusive - give no
|
|
collisions. Up to the given number of iterations are performed.
|
|
If successful, _asso_values[c] is changed, and the recursion
|
|
continues. If all iterations are unsuccessful, _asso_values[c]
|
|
is restored and we backtrack, trying the next union_index. */
|
|
|
|
original_asso_value = _asso_values[c];
|
|
|
|
/* Try many valid associated values. */
|
|
for (iter = iterations; iter > 0; iter--)
|
|
{
|
|
/* Try next value. Wrap around mod _asso_value_max. */
|
|
_asso_values[c] =
|
|
(_asso_values[c] + (_jump != 0 ? _jump : rand ()))
|
|
& (_asso_value_max - 1);
|
|
|
|
if (!has_collisions (curr))
|
|
{
|
|
/* Good, this _asso_values[] modification reduces the
|
|
number of collisions so far.
|
|
All keyword->_hash_value up to curr - inclusive -
|
|
have been updated. */
|
|
if (option[DEBUG])
|
|
{
|
|
fprintf (stderr, "- resolved after %d iterations by "
|
|
"changing asso_value['%c'] (char #%d) to %d\n",
|
|
iterations - iter + 1, c,
|
|
union_index + 1, _asso_values[c]);
|
|
fflush (stderr);
|
|
}
|
|
goto RECURSE_COLLISION;
|
|
BACKTRACK_COLLISION:
|
|
if (option[DEBUG])
|
|
{
|
|
fprintf (stderr, "back to collision on keyword #%d, prior = \"%.*s\", curr = \"%.*s\" hash = %d\n",
|
|
sp - stack + 1,
|
|
prior->_allchars_length, prior->_allchars,
|
|
curr->_allchars_length, curr->_allchars,
|
|
curr->_hash_value);
|
|
fflush (stderr);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Restore original values, no more tries. */
|
|
_asso_values[c] = original_asso_value;
|
|
}
|
|
|
|
/* Failed to resolve a collision. */
|
|
|
|
/* Recompute all keyword->_hash_value up to curr - exclusive -. */
|
|
for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
|
|
{
|
|
KeywordExt* keyword = ptr->first();
|
|
if (keyword == curr)
|
|
break;
|
|
compute_hash (keyword);
|
|
}
|
|
|
|
if (option[DEBUG])
|
|
{
|
|
fprintf (stderr, "** collision not resolved after %d iterations of asso_value[",
|
|
iterations);
|
|
for (union_index = 0; union_index < union_set_length; union_index++)
|
|
{
|
|
if (union_index > 0)
|
|
fprintf (stderr, ",");
|
|
fprintf(stderr, "'%c'", union_set[union_index]);
|
|
}
|
|
fprintf (stderr, "], backtracking...\n");
|
|
fflush (stderr);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Nothing to do, just recurse. */
|
|
goto RECURSE_NO_COLLISION;
|
|
BACKTRACK_NO_COLLISION: ;
|
|
}
|
|
|
|
/* ==== FAIL ==== */
|
|
if (sp != stack)
|
|
{
|
|
sp--;
|
|
/* ==== REDO ==== */
|
|
curr = sp->_curr;
|
|
prior = sp->_prior;
|
|
if (prior == NULL)
|
|
goto BACKTRACK_NO_COLLISION;
|
|
union_set = sp->_union_set;
|
|
union_set_length = sp->_union_set_length;
|
|
union_index = sp->_union_index;
|
|
c = sp->_c;
|
|
original_asso_value = sp->_original_asso_value;
|
|
iter = sp->_iter;
|
|
goto BACKTRACK_COLLISION;
|
|
}
|
|
|
|
/* No solution found after an exhaustive search!
|
|
We should ideally turn off option[FAST] and, if that doesn't help,
|
|
multiply _asso_value_max by 2. */
|
|
fprintf (stderr,
|
|
"\nBig failure, always got duplicate hash code values.\n");
|
|
if (option[POSITIONS])
|
|
fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
|
|
else
|
|
fprintf (stderr, "try options -m or -r.\n\n");
|
|
exit (1);
|
|
|
|
RECURSE_COLLISION:
|
|
/*sp->_union_set = union_set;*/ // redundant
|
|
sp->_union_set_length = union_set_length;
|
|
sp->_union_index = union_index;
|
|
sp->_c = c;
|
|
sp->_original_asso_value = original_asso_value;
|
|
sp->_iter = iter;
|
|
RECURSE_NO_COLLISION:
|
|
/*sp->_curr = curr;*/ // redundant
|
|
sp->_prior = prior;
|
|
/* ==== RETURN ==== */
|
|
sp++;
|
|
if (sp - stack < _list_len)
|
|
goto CALL;
|
|
}
|
|
|
|
/* Deallocate stack. */
|
|
{
|
|
for (int i = 0; i < _list_len; i++)
|
|
delete[] stack[i]._union_set;
|
|
}
|
|
delete[] stack;
|
|
}
|
|
|
|
/* Finds good _asso_values[]. */
|
|
|
|
void
|
|
Search::find_good_asso_values ()
|
|
{
|
|
prepare ();
|
|
if (option[ORDER])
|
|
reorder ();
|
|
prepare_asso_values ();
|
|
|
|
/* Search for good _asso_values[]. */
|
|
int asso_iteration;
|
|
if ((asso_iteration = option.get_asso_iterations ()) == 0)
|
|
/* Try only the given _initial_asso_value and _jump. */
|
|
find_asso_values ();
|
|
else
|
|
{
|
|
/* Try different pairs of _initial_asso_value and _jump, in the
|
|
following order:
|
|
(0, 1)
|
|
(1, 1)
|
|
(2, 1) (0, 3)
|
|
(3, 1) (1, 3)
|
|
(4, 1) (2, 3) (0, 5)
|
|
(5, 1) (3, 3) (1, 5)
|
|
..... */
|
|
KeywordExt_List *saved_head = _head;
|
|
int best_initial_asso_value = 0;
|
|
int best_jump = 1;
|
|
int *best_asso_values = new int[_alpha_size];
|
|
int best_collisions = INT_MAX;
|
|
int best_max_hash_value = INT_MAX;
|
|
|
|
_initial_asso_value = 0; _jump = 1;
|
|
for (;;)
|
|
{
|
|
/* Restore the keyword list in its original order. */
|
|
_head = copy_list (saved_head);
|
|
/* Find good _asso_values[]. */
|
|
find_asso_values ();
|
|
/* Test whether it is the best solution so far. */
|
|
int collisions = 0;
|
|
int max_hash_value = INT_MIN;
|
|
_collision_detector->clear ();
|
|
for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
|
|
{
|
|
KeywordExt *keyword = ptr->first();
|
|
int hashcode = compute_hash (keyword);
|
|
if (max_hash_value < hashcode)
|
|
max_hash_value = hashcode;
|
|
if (_collision_detector->set_bit (hashcode))
|
|
collisions++;
|
|
}
|
|
if (collisions < best_collisions
|
|
|| (collisions == best_collisions
|
|
&& max_hash_value < best_max_hash_value))
|
|
{
|
|
memcpy (best_asso_values, _asso_values,
|
|
_alpha_size * sizeof (_asso_values[0]));
|
|
best_collisions = collisions;
|
|
best_max_hash_value = max_hash_value;
|
|
}
|
|
/* Delete the copied keyword list. */
|
|
delete_list (_head);
|
|
|
|
if (--asso_iteration == 0)
|
|
break;
|
|
/* Prepare for next iteration. */
|
|
if (_initial_asso_value >= 2)
|
|
_initial_asso_value -= 2, _jump += 2;
|
|
else
|
|
_initial_asso_value += _jump, _jump = 1;
|
|
}
|
|
_head = saved_head;
|
|
/* Install the best found asso_values. */
|
|
_initial_asso_value = best_initial_asso_value;
|
|
_jump = best_jump;
|
|
memcpy (_asso_values, best_asso_values,
|
|
_alpha_size * sizeof (_asso_values[0]));
|
|
delete[] best_asso_values;
|
|
/* The keywords' _hash_value fields are recomputed below. */
|
|
}
|
|
}
|
|
|
|
/* ========================================================================= */
|
|
|
|
/* Comparison function for sorting by increasing _hash_value. */
|
|
static bool
|
|
less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
|
|
{
|
|
return keyword1->_hash_value < keyword2->_hash_value;
|
|
}
|
|
|
|
/* Sorts the keyword list by hash value. */
|
|
|
|
void
|
|
Search::sort ()
|
|
{
|
|
_head = mergesort_list (_head, less_by_hash_value);
|
|
}
|
|
|
|
void
|
|
Search::optimize ()
|
|
{
|
|
/* Preparations. */
|
|
preprepare ();
|
|
|
|
/* Step 1: Finding good byte positions. */
|
|
find_positions ();
|
|
|
|
/* Step 2: Finding good alpha increments. */
|
|
find_alpha_inc ();
|
|
|
|
/* Step 3: Finding good asso_values. */
|
|
find_good_asso_values ();
|
|
|
|
/* Make one final check, just to make sure nothing weird happened.... */
|
|
_collision_detector->clear ();
|
|
for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
|
|
{
|
|
KeywordExt *curr = curr_ptr->first();
|
|
unsigned int hashcode = compute_hash (curr);
|
|
if (_collision_detector->set_bit (hashcode))
|
|
{
|
|
/* This shouldn't happen. proj1, proj2, proj3 must have been
|
|
computed to be injective on the given keyword set. */
|
|
fprintf (stderr,
|
|
"\nInternal error, unexpected duplicate hash code\n");
|
|
if (option[POSITIONS])
|
|
fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
|
|
else
|
|
fprintf (stderr, "try options -m or -r.\n\n");
|
|
exit (1);
|
|
}
|
|
}
|
|
|
|
/* Sorts the keyword list by hash value. */
|
|
sort ();
|
|
}
|
|
|
|
/* Prints out some diagnostics upon completion. */
|
|
|
|
Search::~Search ()
|
|
{
|
|
delete _collision_detector;
|
|
delete[] _determined;
|
|
if (option[DEBUG])
|
|
{
|
|
fprintf (stderr, "\ndumping occurrence and associated values tables\n");
|
|
|
|
for (int i = 0; i < _alpha_size; i++)
|
|
if (_occurrences[i])
|
|
fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
|
|
i, _asso_values[i], i, _occurrences[i]);
|
|
|
|
fprintf (stderr, "end table dumping\n");
|
|
|
|
fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
|
|
"\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
|
|
_list_len, _total_keys, _total_duplicates, _max_key_len);
|
|
|
|
int field_width = get_max_keysig_size ();
|
|
fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
|
|
field_width, "selchars");
|
|
for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
|
|
{
|
|
fprintf (stderr, "%11d,%11d,%6d, ",
|
|
ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
|
|
if (field_width > ptr->first()->_selchars_length)
|
|
fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
|
|
for (int j = 0; j < ptr->first()->_selchars_length; j++)
|
|
putc (ptr->first()->_selchars[j], stderr);
|
|
fprintf (stderr, ", %.*s\n",
|
|
ptr->first()->_allchars_length, ptr->first()->_allchars);
|
|
}
|
|
|
|
fprintf (stderr, "End dumping list.\n\n");
|
|
}
|
|
delete[] _asso_values;
|
|
delete[] _occurrences;
|
|
delete[] _alpha_inc;
|
|
}
|