1
0
mirror of https://git.savannah.gnu.org/git/gperf.git synced 2025-12-02 21:19:24 +00:00

Compute the occurrences after removal of duplicates, not before.

This commit is contained in:
Bruno Haible
2002-12-20 12:22:27 +00:00
parent 1186e616cb
commit 1f70ea5dfd
6 changed files with 863 additions and 786 deletions

View File

@@ -1,5 +1,17 @@
2002-11-03 Bruno Haible <bruno@clisp.org> 2002-11-03 Bruno Haible <bruno@clisp.org>
Compute the occurrences after removal of duplicates, not before.
* src/keyword.h (KeywordExt::init_selchars): Remove occurrences
argument.
* src/keyword.cc (KeywordExt::init_selchars): Likewise.
* src/search.cc (Search::prepare): Reorder the code. Compute the
occurrences after removal of duplicates.
(Search::merge_sort): Optimize the loop.
(Search::compute_occurrence): Renamed from Search::get_occurrence.
* src/search.h (Search::compute_occurrence): Renamed from
Search::get_occurrence.
* tests/chill.exp: Regenerated.
Bug fix: The hash table could fail to detect duplicates, between Bug fix: The hash table could fail to detect duplicates, between
keywords of different length, when option -n (option[NOLENGTH]) was keywords of different length, when option -n (option[NOLENGTH]) was
given. given.

View File

@@ -47,7 +47,7 @@ static inline void sort_char_set (unsigned char *base, int len)
} }
} }
/* Initialize selchars and selchars_length, and update occurrences. /* Initialize selchars and selchars_length.
The hash function will be computed as The hash function will be computed as
asso_values[allchars[key_pos[0]]] + asso_values[allchars[key_pos[1]]] + ... asso_values[allchars[key_pos[0]]] + asso_values[allchars[key_pos[1]]] + ...
We compute selchars as the multiset We compute selchars as the multiset
@@ -57,7 +57,7 @@ static inline void sort_char_set (unsigned char *base, int len)
Furthermore we sort the selchars array, to ease detection of duplicates Furthermore we sort the selchars array, to ease detection of duplicates
later. later.
*/ */
void KeywordExt::init_selchars (int *occurrences) void KeywordExt::init_selchars ()
{ {
const char *k = _allchars; const char *k = _allchars;
unsigned char *key_set = unsigned char *key_set =
@@ -69,14 +69,13 @@ void KeywordExt::init_selchars (int *occurrences)
for (int i = _allchars_length; i > 0; k++, i--) for (int i = _allchars_length; i > 0; k++, i--)
{ {
*ptr = static_cast<unsigned char>(*k); *ptr = static_cast<unsigned char>(*k);
occurrences[*ptr]++;
ptr++; ptr++;
} }
else else
/* Only use those character positions specified by the user. */ /* Only use those character positions specified by the user. */
{ {
/* Iterate through the list of key_positions, initializing occurrences /* Iterate through the list of key_positions, initializing selchars
table and selchars (via ptr). */ (via ptr). */
PositionIterator iter (option.get_key_positions ()); PositionIterator iter (option.get_key_positions ());
for (int i; (i = iter.next ()) != PositionIterator::EOS; ) for (int i; (i = iter.next ()) != PositionIterator::EOS; )
@@ -90,7 +89,6 @@ void KeywordExt::init_selchars (int *occurrences)
else else
/* Out of range of KEY length, so we'll just skip it. */ /* Out of range of KEY length, so we'll just skip it. */
continue; continue;
occurrences[*ptr]++;
ptr++; ptr++;
} }

View File

@@ -56,12 +56,14 @@ struct KeywordExt : public Keyword
multiset. */ multiset. */
const unsigned char * _selchars; const unsigned char * _selchars;
int _selchars_length; int _selchars_length;
/* Chained list of keywords having the same selchars. */ /* Chained list of keywords having the same _selchars and
- if !option[NOLENGTH] - also the same _allchars_length.
Note that these duplicates are not members of the main keyword list. */
KeywordExt * _duplicate_link; KeywordExt * _duplicate_link;
/* Methods depending on the keyposition list. */ /* Methods depending on the keyposition list. */
/* Initialize selchars and selchars_length, and update occurrences. */ /* Initialize selchars and selchars_length. */
void init_selchars (int *occurrences); void init_selchars ();
/* Data members used by the algorithm. */ /* Data members used by the algorithm. */
int _occurrence; /* Frequency of key set occurrences. */ int _occurrence; /* Frequency of key set occurrences. */

View File

@@ -41,6 +41,8 @@ Search::Search (KeywordExt_List *list)
_asso_values (new int[_alpha_size]), _asso_values (new int[_alpha_size]),
_determined (new bool[_alpha_size]) _determined (new bool[_alpha_size])
{ {
memset (_asso_values, 0, _alpha_size * sizeof (_asso_values[0]));
memset (_determined, 0, _alpha_size * sizeof (_determined[0]));
} }
void void
@@ -48,42 +50,67 @@ Search::prepare ()
{ {
KeywordExt_List *temp; KeywordExt_List *temp;
/* Compute the total number of keywords. */
_total_keys = 0; _total_keys = 0;
for (temp = _head; temp; temp = temp->rest()) for (temp = _head; temp; temp = temp->rest())
{
temp->first()->init_selchars(_occurrences);
_total_keys++; _total_keys++;
}
_list_len = _total_keys; /* Initialize each keyword's _selchars array. */
for (temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars();
{ /* Compute the minimum and maximum keyword length. */
/* Make hash table for efficiency. */
Hash_Table found_link (_list_len, option[NOLENGTH]);
/* Test whether there are any links and also set the maximum length of
an identifier in the keyword list. */
_total_duplicates = 0;
_max_key_len = INT_MIN; _max_key_len = INT_MIN;
_min_key_len = INT_MAX; _min_key_len = INT_MAX;
KeywordExt_List *trail = NULL;
for (temp = _head; temp; temp = temp->rest()) for (temp = _head; temp; temp = temp->rest())
{ {
KeywordExt *keyword = temp->first(); KeywordExt *keyword = temp->first();
KeywordExt *other_keyword = found_link.insert (keyword);
/* Check for links. We deal with these by building an equivalence class if (_max_key_len < keyword->_allchars_length)
of all duplicate values (i.e., links) so that only 1 keyword is _max_key_len = keyword->_allchars_length;
representative of the entire collection. This *greatly* simplifies if (_min_key_len > keyword->_allchars_length)
processing during later stages of the program. */ _min_key_len = keyword->_allchars_length;
}
/* Exit program if an empty string is used as key, since the comparison
expressions don't work correctly for looking up an empty string. */
if (_min_key_len == 0)
{
fprintf (stderr, "Empty input key is not allowed.\n"
"To recognize an empty input key, your code should check for\n"
"len == 0 before calling the gperf generated lookup function.\n");
exit (1);
}
/* Check for duplicates, i.e. keywords with the same _selchars array
(and - if !option[NOLENGTH] - also the same length).
We deal with these by building an equivalence class, so that only
1 keyword is representative of the entire collection. Only this
representative remains in the keyword list; the others are accessible
through the _duplicate_link chain, starting at the representative.
This *greatly* simplifies processing during later stages of the program.
Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */
{
_list_len = _total_keys;
_total_duplicates = 0;
/* Make hash table for efficiency. */
Hash_Table representatives (_list_len, option[NOLENGTH]);
KeywordExt_List *prev = NULL; /* list node before temp */
for (temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
KeywordExt *other_keyword = representatives.insert (keyword);
if (other_keyword) if (other_keyword)
{ {
_total_duplicates++; _total_duplicates++;
_list_len--; _list_len--;
trail->rest() = temp->rest(); /* Remove keyword from the main list. */
temp->first()->_duplicate_link = other_keyword->_duplicate_link; prev->rest() = temp->rest();
other_keyword->_duplicate_link = temp->first(); /* And insert it on other_keyword's duplicate list. */
keyword->_duplicate_link = other_keyword->_duplicate_link;
other_keyword->_duplicate_link = keyword;
/* Complain if user hasn't enabled the duplicate option. */ /* Complain if user hasn't enabled the duplicate option. */
if (!option[DUP] || option[DEBUG]) if (!option[DUP] || option[DEBUG])
@@ -94,19 +121,16 @@ Search::prepare ()
} }
else else
{ {
temp->first()->_duplicate_link = NULL; keyword->_duplicate_link = NULL;
trail = temp; prev = temp;
} }
/* Update minimum and maximum keyword length, if needed. */
if (_max_key_len < keyword->_allchars_length)
_max_key_len = keyword->_allchars_length;
if (_min_key_len > keyword->_allchars_length)
_min_key_len = keyword->_allchars_length;
} }
} }
/* Exit program if links exist and option[DUP] not set, since we can't continue */ /* Exit program if duplicates exist and option[DUP] not set, since we
don't want to continue in this case. (We don't want to turn on
option[DUP] implicitly, because the generated code is usually much
slower.) */
if (_total_duplicates) if (_total_duplicates)
{ {
if (option[DUP]) if (option[DUP])
@@ -119,20 +143,23 @@ Search::prepare ()
exit (1); exit (1);
} }
} }
/* Exit program if an empty string is used as key, since the comparison
expressions don't work correctly for looking up an empty string. */ /* Compute the occurrences of each character in the alphabet. */
if (_min_key_len == 0) memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
for (temp = _head; temp; temp = temp->rest())
{ {
fprintf (stderr, "Empty input key is not allowed.\nTo recognize an empty input key, your code should check for\nlen == 0 before calling the gperf generated lookup function.\n"); KeywordExt *keyword = temp->first();
exit (1); const unsigned char *ptr = keyword->_selchars;
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
_occurrences[*ptr]++;
} }
} }
/* Recursively merges two sorted lists together to form one sorted list. The /* Merges two sorted lists together to form one sorted list.
ordering criteria is by frequency of occurrence of elements in the key set The sorting criterion depends on which of _occurrence_sort and _hash_sort
or by the hash value. This is a kludge, but permits nice sharing of is set to true. This is a kludge, but permits nice sharing of almost
almost identical code without incurring the overhead of a function identical code without incurring the overhead of a function call for
call comparison. */ every comparison. */
KeywordExt_List * KeywordExt_List *
Search::merge (KeywordExt_List *list1, KeywordExt_List *list2) Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
@@ -151,8 +178,10 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
*resultp = list1; *resultp = list1;
break; break;
} }
if (_occurrence_sort && list1->first()->_occurrence < list2->first()->_occurrence if ((_occurrence_sort
|| _hash_sort && list1->first()->_hash_value > list2->first()->_hash_value) && list1->first()->_occurrence < list2->first()->_occurrence)
|| (_hash_sort
&& list1->first()->_hash_value > list2->first()->_hash_value))
{ {
*resultp = list2; *resultp = list2;
resultp = &list2->rest(); list2 = list1; list1 = *resultp; resultp = &list2->rest(); list2 = list1; list1 = *resultp;
@@ -166,37 +195,46 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
return result; return result;
} }
/* Applies the merge sort algorithm to recursively sort the key list by /* Sorts a list using the recursive merge sort algorithm.
frequency of occurrence of elements in the key set. */ The sorting criterion depends on which of _occurrence_sort and _hash_sort
is set to true. */
KeywordExt_List * KeywordExt_List *
Search::merge_sort (KeywordExt_List *head) Search::merge_sort (KeywordExt_List *head)
{ {
if (!head || !head->rest()) if (!head || !head->rest())
/* List of length 0 or 1. Nothing to do. */
return head; return head;
else else
{ {
/* Determine a list node in the middle. */
KeywordExt_List *middle = head; KeywordExt_List *middle = head;
KeywordExt_List *temp = head->rest()->rest(); for (KeywordExt_List *temp = head->rest();;)
while (temp)
{ {
temp = temp->rest(); temp = temp->rest();
middle = middle->rest(); if (temp == NULL)
if (temp) break;
temp = temp->rest(); temp = temp->rest();
middle = middle->rest();
if (temp == NULL)
break;
} }
temp = middle->rest(); /* Cut the list into two halves.
middle->rest() = 0; If the list has n elements, the left half has ceiling(n/2) elements
return merge (merge_sort (head), merge_sort (temp)); and the right half has floor(n/2) elements. */
KeywordExt_List *right_half = middle->rest();
middle->rest() = NULL;
/* Sort the two halves, then merge them. */
return merge (merge_sort (head), merge_sort (right_half));
} }
} }
/* Returns the frequency of occurrence of elements in the key set. */ /* Returns the frequency of occurrence of elements in the key set. */
inline int inline int
Search::get_occurrence (KeywordExt *ptr) Search::compute_occurrence (KeywordExt *ptr)
{ {
int value = 0; int value = 0;
@@ -249,7 +287,7 @@ Search::reorder ()
{ {
KeywordExt *keyword = ptr->first(); KeywordExt *keyword = ptr->first();
keyword->_occurrence = get_occurrence (keyword); keyword->_occurrence = compute_occurrence (keyword);
} }
_hash_sort = false; _hash_sort = false;

View File

@@ -37,9 +37,13 @@ public:
void optimize (); void optimize ();
private: private:
void prepare (); void prepare ();
/* Merges two sorted lists together to form one sorted list. */
KeywordExt_List * merge (KeywordExt_List *list1, KeywordExt_List *list2); KeywordExt_List * merge (KeywordExt_List *list1, KeywordExt_List *list2);
/* Sorts a list using the recursive merge sort algorithm. */
KeywordExt_List * merge_sort (KeywordExt_List *head); KeywordExt_List * merge_sort (KeywordExt_List *head);
int get_occurrence (KeywordExt *ptr);
int compute_occurrence (KeywordExt *ptr);
void set_determined (KeywordExt *ptr); void set_determined (KeywordExt *ptr);
bool already_determined (KeywordExt *ptr); bool already_determined (KeywordExt *ptr);
void reorder (); void reorder ();
@@ -53,21 +57,44 @@ private:
void change (KeywordExt *prior, KeywordExt *curr); void change (KeywordExt *prior, KeywordExt *curr);
void sort (); void sort ();
public: public:
KeywordExt_List * _head; /* Points to the head of the linked list. */
int _total_keys; /* Total number of keys, counting duplicates. */ /* Linked list of keywords. */
int _total_duplicates; /* Total number of duplicate hash values. */ KeywordExt_List * _head;
int _max_key_len; /* Maximum length of the longest keyword. */
int _min_key_len; /* Minimum length of the shortest keyword. */ /* Total number of keywords, counting duplicates. */
int _total_keys;
/* Total number of duplicates that have been moved to _duplicate_link lists
(not counting their representatives which stay on the main list). */
int _total_duplicates;
/* Maximum length of the longest keyword. */
int _max_key_len;
/* Minimum length of the shortest keyword. */
int _min_key_len;
/* Size of alphabet. */ /* Size of alphabet. */
int const _alpha_size; int const _alpha_size;
/* Counts occurrences of each key set character. */
/* Counts occurrences of each key set character.
_occurrences[c] is the number of times that c occurs among the _selchars
of a keyword. */
int * const _occurrences; int * const _occurrences;
/* Value associated with each character. */ /* Value associated with each character. */
int * const _asso_values; int * const _asso_values;
private: private:
int _list_len; /* Length of head's Key_List, not counting duplicates. */
bool _occurrence_sort; /* True if sorting by occurrence. */ /* Length of _head list. Number of keywords, not counting duplicates. */
bool _hash_sort; /* True if sorting by hash value. */ int _list_len;
/* Choice of sorting criterion during Search::merge_sort. */
/* True if sorting by occurrence. */
bool _occurrence_sort;
/* True if sorting by hash value. */
bool _hash_sort;
bool * const _determined; /* Used in function reorder, below. */ bool * const _determined; /* Used in function reorder, below. */
int _num_done; /* Number of keywords processed without a collision. */ int _num_done; /* Number of keywords processed without a collision. */
int _fewest_collisions; /* Records fewest # of collisions for asso value. */ int _fewest_collisions; /* Records fewest # of collisions for asso value. */

File diff suppressed because it is too large Load Diff