1
0
mirror of https://git.savannah.gnu.org/git/gperf.git synced 2025-12-02 13:09:22 +00:00

Compute the occurrences after removal of duplicates, not before.

This commit is contained in:
Bruno Haible
2002-12-20 12:22:27 +00:00
parent 1186e616cb
commit 1f70ea5dfd
6 changed files with 863 additions and 786 deletions

View File

@@ -1,5 +1,17 @@
2002-11-03 Bruno Haible <bruno@clisp.org>
Compute the occurrences after removal of duplicates, not before.
* src/keyword.h (KeywordExt::init_selchars): Remove occurrences
argument.
* src/keyword.cc (KeywordExt::init_selchars): Likewise.
* src/search.cc (Search::prepare): Reorder the code. Compute the
occurrences after removal of duplicates.
(Search::merge_sort): Optimize the loop.
(Search::compute_occurrence): Renamed from Search::get_occurrence.
* src/search.h (Search::compute_occurrence): Renamed from
Search::get_occurrence.
* tests/chill.exp: Regenerated.
Bug fix: The hash table could fail to detect duplicates, between
keywords of different length, when option -n (option[NOLENGTH]) was
given.

View File

@@ -47,7 +47,7 @@ static inline void sort_char_set (unsigned char *base, int len)
}
}
/* Initialize selchars and selchars_length, and update occurrences.
/* Initialize selchars and selchars_length.
The hash function will be computed as
asso_values[allchars[key_pos[0]]] + asso_values[allchars[key_pos[1]]] + ...
We compute selchars as the multiset
@@ -57,7 +57,7 @@ static inline void sort_char_set (unsigned char *base, int len)
Furthermore we sort the selchars array, to ease detection of duplicates
later.
*/
void KeywordExt::init_selchars (int *occurrences)
void KeywordExt::init_selchars ()
{
const char *k = _allchars;
unsigned char *key_set =
@@ -69,14 +69,13 @@ void KeywordExt::init_selchars (int *occurrences)
for (int i = _allchars_length; i > 0; k++, i--)
{
*ptr = static_cast<unsigned char>(*k);
occurrences[*ptr]++;
ptr++;
}
else
/* Only use those character positions specified by the user. */
{
/* Iterate through the list of key_positions, initializing occurrences
table and selchars (via ptr). */
/* Iterate through the list of key_positions, initializing selchars
(via ptr). */
PositionIterator iter (option.get_key_positions ());
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
@@ -90,7 +89,6 @@ void KeywordExt::init_selchars (int *occurrences)
else
/* Out of range of KEY length, so we'll just skip it. */
continue;
occurrences[*ptr]++;
ptr++;
}

View File

@@ -56,12 +56,14 @@ struct KeywordExt : public Keyword
multiset. */
const unsigned char * _selchars;
int _selchars_length;
/* Chained list of keywords having the same selchars. */
/* Chained list of keywords having the same _selchars and
- if !option[NOLENGTH] - also the same _allchars_length.
Note that these duplicates are not members of the main keyword list. */
KeywordExt * _duplicate_link;
/* Methods depending on the keyposition list. */
/* Initialize selchars and selchars_length, and update occurrences. */
void init_selchars (int *occurrences);
/* Initialize selchars and selchars_length. */
void init_selchars ();
/* Data members used by the algorithm. */
int _occurrence; /* Frequency of key set occurrences. */

View File

@@ -41,6 +41,8 @@ Search::Search (KeywordExt_List *list)
_asso_values (new int[_alpha_size]),
_determined (new bool[_alpha_size])
{
memset (_asso_values, 0, _alpha_size * sizeof (_asso_values[0]));
memset (_determined, 0, _alpha_size * sizeof (_determined[0]));
}
void
@@ -48,42 +50,67 @@ Search::prepare ()
{
KeywordExt_List *temp;
/* Compute the total number of keywords. */
_total_keys = 0;
for (temp = _head; temp; temp = temp->rest())
_total_keys++;
/* Initialize each keyword's _selchars array. */
for (temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars();
/* Compute the minimum and maximum keyword length. */
_max_key_len = INT_MIN;
_min_key_len = INT_MAX;
for (temp = _head; temp; temp = temp->rest())
{
temp->first()->init_selchars(_occurrences);
_total_keys++;
KeywordExt *keyword = temp->first();
if (_max_key_len < keyword->_allchars_length)
_max_key_len = keyword->_allchars_length;
if (_min_key_len > keyword->_allchars_length)
_min_key_len = keyword->_allchars_length;
}
_list_len = _total_keys;
/* Exit program if an empty string is used as key, since the comparison
expressions don't work correctly for looking up an empty string. */
if (_min_key_len == 0)
{
fprintf (stderr, "Empty input key is not allowed.\n"
"To recognize an empty input key, your code should check for\n"
"len == 0 before calling the gperf generated lookup function.\n");
exit (1);
}
/* Check for duplicates, i.e. keywords with the same _selchars array
(and - if !option[NOLENGTH] - also the same length).
We deal with these by building an equivalence class, so that only
1 keyword is representative of the entire collection. Only this
representative remains in the keyword list; the others are accessible
through the _duplicate_link chain, starting at the representative.
This *greatly* simplifies processing during later stages of the program.
Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */
{
/* Make hash table for efficiency. */
Hash_Table found_link (_list_len, option[NOLENGTH]);
/* Test whether there are any links and also set the maximum length of
an identifier in the keyword list. */
_list_len = _total_keys;
_total_duplicates = 0;
_max_key_len = INT_MIN;
_min_key_len = INT_MAX;
KeywordExt_List *trail = NULL;
/* Make hash table for efficiency. */
Hash_Table representatives (_list_len, option[NOLENGTH]);
KeywordExt_List *prev = NULL; /* list node before temp */
for (temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
KeywordExt *other_keyword = found_link.insert (keyword);
/* Check for links. We deal with these by building an equivalence class
of all duplicate values (i.e., links) so that only 1 keyword is
representative of the entire collection. This *greatly* simplifies
processing during later stages of the program. */
KeywordExt *other_keyword = representatives.insert (keyword);
if (other_keyword)
{
_total_duplicates++;
_list_len--;
trail->rest() = temp->rest();
temp->first()->_duplicate_link = other_keyword->_duplicate_link;
other_keyword->_duplicate_link = temp->first();
/* Remove keyword from the main list. */
prev->rest() = temp->rest();
/* And insert it on other_keyword's duplicate list. */
keyword->_duplicate_link = other_keyword->_duplicate_link;
other_keyword->_duplicate_link = keyword;
/* Complain if user hasn't enabled the duplicate option. */
if (!option[DUP] || option[DEBUG])
@@ -94,19 +121,16 @@ Search::prepare ()
}
else
{
temp->first()->_duplicate_link = NULL;
trail = temp;
keyword->_duplicate_link = NULL;
prev = temp;
}
/* Update minimum and maximum keyword length, if needed. */
if (_max_key_len < keyword->_allchars_length)
_max_key_len = keyword->_allchars_length;
if (_min_key_len > keyword->_allchars_length)
_min_key_len = keyword->_allchars_length;
}
}
/* Exit program if links exists and option[DUP] not set, since we can't continue */
/* Exit program if duplicates exist and option[DUP] is not set, since we
don't want to continue in this case. (We don't want to turn on
option[DUP] implicitly, because the generated code is usually much
slower.) */
if (_total_duplicates)
{
if (option[DUP])
@@ -119,20 +143,23 @@ Search::prepare ()
exit (1);
}
}
/* Exit program if an empty string is used as key, since the comparison
expressions don't work correctly for looking up an empty string. */
if (_min_key_len == 0)
/* Compute the occurrences of each character in the alphabet. */
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
for (temp = _head; temp; temp = temp->rest())
{
fprintf (stderr, "Empty input key is not allowed.\nTo recognize an empty input key, your code should check for\nlen == 0 before calling the gperf generated lookup function.\n");
exit (1);
KeywordExt *keyword = temp->first();
const unsigned char *ptr = keyword->_selchars;
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
_occurrences[*ptr]++;
}
}
/* Recursively merges two sorted lists together to form one sorted list. The
ordering criteria is by frequency of occurrence of elements in the key set
or by the hash value. This is a kludge, but permits nice sharing of
almost identical code without incurring the overhead of a function
call comparison. */
/* Merges two sorted lists together to form one sorted list.
The sorting criterion depends on which of _occurrence_sort and _hash_sort
is set to true. This is a kludge, but permits nice sharing of almost
identical code without incurring the overhead of a function call for
every comparison. */
KeywordExt_List *
Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
@@ -151,8 +178,10 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
*resultp = list1;
break;
}
if (_occurrence_sort && list1->first()->_occurrence < list2->first()->_occurrence
|| _hash_sort && list1->first()->_hash_value > list2->first()->_hash_value)
if ((_occurrence_sort
&& list1->first()->_occurrence < list2->first()->_occurrence)
|| (_hash_sort
&& list1->first()->_hash_value > list2->first()->_hash_value))
{
*resultp = list2;
resultp = &list2->rest(); list2 = list1; list1 = *resultp;
@@ -166,37 +195,46 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
return result;
}
/* Applies the merge sort algorithm to recursively sort the key list by
frequency of occurrence of elements in the key set. */
/* Sorts a list using the recursive merge sort algorithm.
The sorting criterion depends on which of _occurrence_sort and _hash_sort
is set to true. */
KeywordExt_List *
Search::merge_sort (KeywordExt_List *head)
{
if (!head || !head->rest())
/* List of length 0 or 1. Nothing to do. */
return head;
else
{
/* Determine a list node in the middle. */
KeywordExt_List *middle = head;
KeywordExt_List *temp = head->rest()->rest();
while (temp)
for (KeywordExt_List *temp = head->rest();;)
{
temp = temp->rest();
temp = temp->rest();
if (temp == NULL)
break;
temp = temp->rest();
middle = middle->rest();
if (temp)
temp = temp->rest();
if (temp == NULL)
break;
}
temp = middle->rest();
middle->rest() = 0;
return merge (merge_sort (head), merge_sort (temp));
/* Cut the list into two halves.
If the list has n elements, the left half has ceiling(n/2) elements
and the right half has floor(n/2) elements. */
KeywordExt_List *right_half = middle->rest();
middle->rest() = NULL;
/* Sort the two halves, then merge them. */
return merge (merge_sort (head), merge_sort (right_half));
}
}
/* Returns the frequency of occurrence of elements in the key set. */
inline int
Search::get_occurrence (KeywordExt *ptr)
Search::compute_occurrence (KeywordExt *ptr)
{
int value = 0;
@@ -249,7 +287,7 @@ Search::reorder ()
{
KeywordExt *keyword = ptr->first();
keyword->_occurrence = get_occurrence (keyword);
keyword->_occurrence = compute_occurrence (keyword);
}
_hash_sort = false;

View File

@@ -37,9 +37,13 @@ public:
void optimize ();
private:
void prepare ();
/* Merges two sorted lists together to form one sorted list. */
KeywordExt_List * merge (KeywordExt_List *list1, KeywordExt_List *list2);
/* Sorts a list using the recursive merge sort algorithm. */
KeywordExt_List * merge_sort (KeywordExt_List *head);
int get_occurrence (KeywordExt *ptr);
int compute_occurrence (KeywordExt *ptr);
void set_determined (KeywordExt *ptr);
bool already_determined (KeywordExt *ptr);
void reorder ();
@@ -53,21 +57,44 @@ private:
void change (KeywordExt *prior, KeywordExt *curr);
void sort ();
public:
KeywordExt_List * _head; /* Points to the head of the linked list. */
int _total_keys; /* Total number of keys, counting duplicates. */
int _total_duplicates; /* Total number of duplicate hash values. */
int _max_key_len; /* Maximum length of the longest keyword. */
int _min_key_len; /* Minimum length of the shortest keyword. */
/* Size of alphabet. */
/* Linked list of keywords. */
KeywordExt_List * _head;
/* Total number of keywords, counting duplicates. */
int _total_keys;
/* Total number of duplicates that have been moved to _duplicate_link lists
(not counting their representatives which stay on the main list). */
int _total_duplicates;
/* Length of the longest keyword. */
int _max_key_len;
/* Length of the shortest keyword. */
int _min_key_len;
/* Size of alphabet. */
int const _alpha_size;
/* Counts occurrences of each key set character. */
/* Counts occurrences of each key set character.
_occurrences[c] is the total number of times that c occurs in the
_selchars of the keywords remaining on the main list, i.e. after
duplicates have been removed. */
int * const _occurrences;
/* Value associated with each character. */
int * const _asso_values;
private:
int _list_len; /* Length of head's Key_List, not counting duplicates. */
bool _occurrence_sort; /* True if sorting by occurrence. */
bool _hash_sort; /* True if sorting by hash value. */
/* Length of _head list. Number of keywords, not counting duplicates. */
int _list_len;
/* Choice of sorting criterion during Search::merge_sort. */
/* True if sorting by occurrence. */
bool _occurrence_sort;
/* True if sorting by hash value. */
bool _hash_sort;
bool * const _determined; /* Used in function reorder, below. */
int _num_done; /* Number of keywords processed without a collision. */
int _fewest_collisions; /* Records fewest # of collisions for asso value. */

File diff suppressed because it is too large Load Diff