mirror of
https://git.savannah.gnu.org/git/gperf.git
synced 2025-12-02 21:19:24 +00:00
Compute the occurrences after removal of duplicates, not before.
This commit is contained in:
12
ChangeLog
12
ChangeLog
@@ -1,5 +1,17 @@
|
|||||||
2002-11-03 Bruno Haible <bruno@clisp.org>
|
2002-11-03 Bruno Haible <bruno@clisp.org>
|
||||||
|
|
||||||
|
Compute the occurrences after removal of duplicates, not before.
|
||||||
|
* src/keyword.h (KeywordExt::init_selchars): Remove occurrences
|
||||||
|
argument.
|
||||||
|
* src/keyword.cc (KeywordExt::init_selchars): Likewise.
|
||||||
|
* src/search.cc (Search::prepare): Reorder the code. Compute the
|
||||||
|
occurrences after removal of duplicates.
|
||||||
|
(Search::merge_sort): Optimize the loop.
|
||||||
|
(Search::compute_occurrence): Renamed from Search::get_occurrence.
|
||||||
|
* src/search.h (Search::compute_occurrence): Renamed from
|
||||||
|
Search::get_occurrence.
|
||||||
|
* tests/chill.exp: Regenerated.
|
||||||
|
|
||||||
Bug fix: The hash table could fail to detect duplicates, between
|
Bug fix: The hash table could fail to detect duplicates, between
|
||||||
keywords of different length, when option -n (option[NOLENGTH]) was
|
keywords of different length, when option -n (option[NOLENGTH]) was
|
||||||
given.
|
given.
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ static inline void sort_char_set (unsigned char *base, int len)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Initialize selchars and selchars_length, and update occurrences.
|
/* Initialize selchars and selchars_length.
|
||||||
The hash function will be computed as
|
The hash function will be computed as
|
||||||
asso_values[allchars[key_pos[0]]] + asso_values[allchars[key_pos[1]]] + ...
|
asso_values[allchars[key_pos[0]]] + asso_values[allchars[key_pos[1]]] + ...
|
||||||
We compute selchars as the multiset
|
We compute selchars as the multiset
|
||||||
@@ -57,7 +57,7 @@ static inline void sort_char_set (unsigned char *base, int len)
|
|||||||
Furthermore we sort the selchars array, to ease detection of duplicates
|
Furthermore we sort the selchars array, to ease detection of duplicates
|
||||||
later.
|
later.
|
||||||
*/
|
*/
|
||||||
void KeywordExt::init_selchars (int *occurrences)
|
void KeywordExt::init_selchars ()
|
||||||
{
|
{
|
||||||
const char *k = _allchars;
|
const char *k = _allchars;
|
||||||
unsigned char *key_set =
|
unsigned char *key_set =
|
||||||
@@ -69,14 +69,13 @@ void KeywordExt::init_selchars (int *occurrences)
|
|||||||
for (int i = _allchars_length; i > 0; k++, i--)
|
for (int i = _allchars_length; i > 0; k++, i--)
|
||||||
{
|
{
|
||||||
*ptr = static_cast<unsigned char>(*k);
|
*ptr = static_cast<unsigned char>(*k);
|
||||||
occurrences[*ptr]++;
|
|
||||||
ptr++;
|
ptr++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
/* Only use those character positions specified by the user. */
|
/* Only use those character positions specified by the user. */
|
||||||
{
|
{
|
||||||
/* Iterate through the list of key_positions, initializing occurrences
|
/* Iterate through the list of key_positions, initializing selchars
|
||||||
table and selchars (via ptr). */
|
(via ptr). */
|
||||||
PositionIterator iter (option.get_key_positions ());
|
PositionIterator iter (option.get_key_positions ());
|
||||||
|
|
||||||
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
|
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
|
||||||
@@ -90,7 +89,6 @@ void KeywordExt::init_selchars (int *occurrences)
|
|||||||
else
|
else
|
||||||
/* Out of range of KEY length, so we'll just skip it. */
|
/* Out of range of KEY length, so we'll just skip it. */
|
||||||
continue;
|
continue;
|
||||||
occurrences[*ptr]++;
|
|
||||||
ptr++;
|
ptr++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -56,12 +56,14 @@ struct KeywordExt : public Keyword
|
|||||||
multiset. */
|
multiset. */
|
||||||
const unsigned char * _selchars;
|
const unsigned char * _selchars;
|
||||||
int _selchars_length;
|
int _selchars_length;
|
||||||
/* Chained list of keywords having the same selchars. */
|
/* Chained list of keywords having the same _selchars and
|
||||||
|
- if !option[NOLENGTH] - also the same _allchars_length.
|
||||||
|
Note that these duplicates are not members of the main keyword list. */
|
||||||
KeywordExt * _duplicate_link;
|
KeywordExt * _duplicate_link;
|
||||||
|
|
||||||
/* Methods depending on the keyposition list. */
|
/* Methods depending on the keyposition list. */
|
||||||
/* Initialize selchars and selchars_length, and update occurrences. */
|
/* Initialize selchars and selchars_length. */
|
||||||
void init_selchars (int *occurrences);
|
void init_selchars ();
|
||||||
|
|
||||||
/* Data members used by the algorithm. */
|
/* Data members used by the algorithm. */
|
||||||
int _occurrence; /* Frequency of key set occurrences. */
|
int _occurrence; /* Frequency of key set occurrences. */
|
||||||
|
|||||||
144
src/search.cc
144
src/search.cc
@@ -41,6 +41,8 @@ Search::Search (KeywordExt_List *list)
|
|||||||
_asso_values (new int[_alpha_size]),
|
_asso_values (new int[_alpha_size]),
|
||||||
_determined (new bool[_alpha_size])
|
_determined (new bool[_alpha_size])
|
||||||
{
|
{
|
||||||
|
memset (_asso_values, 0, _alpha_size * sizeof (_asso_values[0]));
|
||||||
|
memset (_determined, 0, _alpha_size * sizeof (_determined[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -48,42 +50,67 @@ Search::prepare ()
|
|||||||
{
|
{
|
||||||
KeywordExt_List *temp;
|
KeywordExt_List *temp;
|
||||||
|
|
||||||
|
/* Compute the total number of keywords. */
|
||||||
_total_keys = 0;
|
_total_keys = 0;
|
||||||
for (temp = _head; temp; temp = temp->rest())
|
for (temp = _head; temp; temp = temp->rest())
|
||||||
{
|
|
||||||
temp->first()->init_selchars(_occurrences);
|
|
||||||
_total_keys++;
|
_total_keys++;
|
||||||
}
|
|
||||||
|
|
||||||
_list_len = _total_keys;
|
/* Initialize each keyword's _selchars array. */
|
||||||
|
for (temp = _head; temp; temp = temp->rest())
|
||||||
|
temp->first()->init_selchars();
|
||||||
|
|
||||||
{
|
/* Compute the minimum and maximum keyword length. */
|
||||||
/* Make hash table for efficiency. */
|
|
||||||
Hash_Table found_link (_list_len, option[NOLENGTH]);
|
|
||||||
|
|
||||||
/* Test whether there are any links and also set the maximum length of
|
|
||||||
an identifier in the keyword list. */
|
|
||||||
_total_duplicates = 0;
|
|
||||||
_max_key_len = INT_MIN;
|
_max_key_len = INT_MIN;
|
||||||
_min_key_len = INT_MAX;
|
_min_key_len = INT_MAX;
|
||||||
KeywordExt_List *trail = NULL;
|
|
||||||
for (temp = _head; temp; temp = temp->rest())
|
for (temp = _head; temp; temp = temp->rest())
|
||||||
{
|
{
|
||||||
KeywordExt *keyword = temp->first();
|
KeywordExt *keyword = temp->first();
|
||||||
KeywordExt *other_keyword = found_link.insert (keyword);
|
|
||||||
|
|
||||||
/* Check for links. We deal with these by building an equivalence class
|
if (_max_key_len < keyword->_allchars_length)
|
||||||
of all duplicate values (i.e., links) so that only 1 keyword is
|
_max_key_len = keyword->_allchars_length;
|
||||||
representative of the entire collection. This *greatly* simplifies
|
if (_min_key_len > keyword->_allchars_length)
|
||||||
processing during later stages of the program. */
|
_min_key_len = keyword->_allchars_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Exit program if an empty string is used as key, since the comparison
|
||||||
|
expressions don't work correctly for looking up an empty string. */
|
||||||
|
if (_min_key_len == 0)
|
||||||
|
{
|
||||||
|
fprintf (stderr, "Empty input key is not allowed.\n"
|
||||||
|
"To recognize an empty input key, your code should check for\n"
|
||||||
|
"len == 0 before calling the gperf generated lookup function.\n");
|
||||||
|
exit (1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check for duplicates, i.e. keywords with the same _selchars array
|
||||||
|
(and - if !option[NOLENGTH] - also the same length).
|
||||||
|
We deal with these by building an equivalence class, so that only
|
||||||
|
1 keyword is representative of the entire collection. Only this
|
||||||
|
representative remains in the keyword list; the others are accessible
|
||||||
|
through the _duplicate_link chain, starting at the representative.
|
||||||
|
This *greatly* simplifies processing during later stages of the program.
|
||||||
|
Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */
|
||||||
|
{
|
||||||
|
_list_len = _total_keys;
|
||||||
|
_total_duplicates = 0;
|
||||||
|
/* Make hash table for efficiency. */
|
||||||
|
Hash_Table representatives (_list_len, option[NOLENGTH]);
|
||||||
|
|
||||||
|
KeywordExt_List *prev = NULL; /* list node before temp */
|
||||||
|
for (temp = _head; temp; temp = temp->rest())
|
||||||
|
{
|
||||||
|
KeywordExt *keyword = temp->first();
|
||||||
|
KeywordExt *other_keyword = representatives.insert (keyword);
|
||||||
|
|
||||||
if (other_keyword)
|
if (other_keyword)
|
||||||
{
|
{
|
||||||
_total_duplicates++;
|
_total_duplicates++;
|
||||||
_list_len--;
|
_list_len--;
|
||||||
trail->rest() = temp->rest();
|
/* Remove keyword from the main list. */
|
||||||
temp->first()->_duplicate_link = other_keyword->_duplicate_link;
|
prev->rest() = temp->rest();
|
||||||
other_keyword->_duplicate_link = temp->first();
|
/* And insert it on other_keyword's duplicate list. */
|
||||||
|
keyword->_duplicate_link = other_keyword->_duplicate_link;
|
||||||
|
other_keyword->_duplicate_link = keyword;
|
||||||
|
|
||||||
/* Complain if user hasn't enabled the duplicate option. */
|
/* Complain if user hasn't enabled the duplicate option. */
|
||||||
if (!option[DUP] || option[DEBUG])
|
if (!option[DUP] || option[DEBUG])
|
||||||
@@ -94,19 +121,16 @@ Search::prepare ()
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
temp->first()->_duplicate_link = NULL;
|
keyword->_duplicate_link = NULL;
|
||||||
trail = temp;
|
prev = temp;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Update minimum and maximum keyword length, if needed. */
|
|
||||||
if (_max_key_len < keyword->_allchars_length)
|
|
||||||
_max_key_len = keyword->_allchars_length;
|
|
||||||
if (_min_key_len > keyword->_allchars_length)
|
|
||||||
_min_key_len = keyword->_allchars_length;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Exit program if links exists and option[DUP] not set, since we can't continue */
|
/* Exit program if duplicates exist and option[DUP] not set, since we
|
||||||
|
don't want to continue in this case. (We don't want to turn on
|
||||||
|
option[DUP] implicitly, because the generated code is usually much
|
||||||
|
slower.) */
|
||||||
if (_total_duplicates)
|
if (_total_duplicates)
|
||||||
{
|
{
|
||||||
if (option[DUP])
|
if (option[DUP])
|
||||||
@@ -119,20 +143,23 @@ Search::prepare ()
|
|||||||
exit (1);
|
exit (1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Exit program if an empty string is used as key, since the comparison
|
|
||||||
expressions don't work correctly for looking up an empty string. */
|
/* Compute the occurrences of each character in the alphabet. */
|
||||||
if (_min_key_len == 0)
|
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
|
||||||
|
for (temp = _head; temp; temp = temp->rest())
|
||||||
{
|
{
|
||||||
fprintf (stderr, "Empty input key is not allowed.\nTo recognize an empty input key, your code should check for\nlen == 0 before calling the gperf generated lookup function.\n");
|
KeywordExt *keyword = temp->first();
|
||||||
exit (1);
|
const unsigned char *ptr = keyword->_selchars;
|
||||||
|
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
|
||||||
|
_occurrences[*ptr]++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Recursively merges two sorted lists together to form one sorted list. The
|
/* Merges two sorted lists together to form one sorted list.
|
||||||
ordering criteria is by frequency of occurrence of elements in the key set
|
The sorting criterion depends on which of _occurrence_sort and _hash_sort
|
||||||
or by the hash value. This is a kludge, but permits nice sharing of
|
is set to true. This is a kludge, but permits nice sharing of almost
|
||||||
almost identical code without incurring the overhead of a function
|
identical code without incurring the overhead of a function call for
|
||||||
call comparison. */
|
every comparison. */
|
||||||
|
|
||||||
KeywordExt_List *
|
KeywordExt_List *
|
||||||
Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
|
Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
|
||||||
@@ -151,8 +178,10 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
|
|||||||
*resultp = list1;
|
*resultp = list1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (_occurrence_sort && list1->first()->_occurrence < list2->first()->_occurrence
|
if ((_occurrence_sort
|
||||||
|| _hash_sort && list1->first()->_hash_value > list2->first()->_hash_value)
|
&& list1->first()->_occurrence < list2->first()->_occurrence)
|
||||||
|
|| (_hash_sort
|
||||||
|
&& list1->first()->_hash_value > list2->first()->_hash_value))
|
||||||
{
|
{
|
||||||
*resultp = list2;
|
*resultp = list2;
|
||||||
resultp = &list2->rest(); list2 = list1; list1 = *resultp;
|
resultp = &list2->rest(); list2 = list1; list1 = *resultp;
|
||||||
@@ -166,37 +195,46 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Applies the merge sort algorithm to recursively sort the key list by
|
/* Sorts a list using the recursive merge sort algorithm.
|
||||||
frequency of occurrence of elements in the key set. */
|
The sorting criterion depends on which of _occurrence_sort and _hash_sort
|
||||||
|
is set to true. */
|
||||||
|
|
||||||
KeywordExt_List *
|
KeywordExt_List *
|
||||||
Search::merge_sort (KeywordExt_List *head)
|
Search::merge_sort (KeywordExt_List *head)
|
||||||
{
|
{
|
||||||
if (!head || !head->rest())
|
if (!head || !head->rest())
|
||||||
|
/* List of length 0 or 1. Nothing to do. */
|
||||||
return head;
|
return head;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
/* Determine a list node in the middle. */
|
||||||
KeywordExt_List *middle = head;
|
KeywordExt_List *middle = head;
|
||||||
KeywordExt_List *temp = head->rest()->rest();
|
for (KeywordExt_List *temp = head->rest();;)
|
||||||
|
|
||||||
while (temp)
|
|
||||||
{
|
{
|
||||||
temp = temp->rest();
|
temp = temp->rest();
|
||||||
middle = middle->rest();
|
if (temp == NULL)
|
||||||
if (temp)
|
break;
|
||||||
temp = temp->rest();
|
temp = temp->rest();
|
||||||
|
middle = middle->rest();
|
||||||
|
if (temp == NULL)
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
temp = middle->rest();
|
/* Cut the list into two halves.
|
||||||
middle->rest() = 0;
|
If the list has n elements, the left half has ceiling(n/2) elements
|
||||||
return merge (merge_sort (head), merge_sort (temp));
|
and the right half has floor(n/2) elements. */
|
||||||
|
KeywordExt_List *right_half = middle->rest();
|
||||||
|
middle->rest() = NULL;
|
||||||
|
|
||||||
|
/* Sort the two halves, then merge them. */
|
||||||
|
return merge (merge_sort (head), merge_sort (right_half));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Returns the frequency of occurrence of elements in the key set. */
|
/* Returns the frequency of occurrence of elements in the key set. */
|
||||||
|
|
||||||
inline int
|
inline int
|
||||||
Search::get_occurrence (KeywordExt *ptr)
|
Search::compute_occurrence (KeywordExt *ptr)
|
||||||
{
|
{
|
||||||
int value = 0;
|
int value = 0;
|
||||||
|
|
||||||
@@ -249,7 +287,7 @@ Search::reorder ()
|
|||||||
{
|
{
|
||||||
KeywordExt *keyword = ptr->first();
|
KeywordExt *keyword = ptr->first();
|
||||||
|
|
||||||
keyword->_occurrence = get_occurrence (keyword);
|
keyword->_occurrence = compute_occurrence (keyword);
|
||||||
}
|
}
|
||||||
|
|
||||||
_hash_sort = false;
|
_hash_sort = false;
|
||||||
|
|||||||
47
src/search.h
47
src/search.h
@@ -37,9 +37,13 @@ public:
|
|||||||
void optimize ();
|
void optimize ();
|
||||||
private:
|
private:
|
||||||
void prepare ();
|
void prepare ();
|
||||||
|
|
||||||
|
/* Merges two sorted lists together to form one sorted list. */
|
||||||
KeywordExt_List * merge (KeywordExt_List *list1, KeywordExt_List *list2);
|
KeywordExt_List * merge (KeywordExt_List *list1, KeywordExt_List *list2);
|
||||||
|
/* Sorts a list using the recursive merge sort algorithm. */
|
||||||
KeywordExt_List * merge_sort (KeywordExt_List *head);
|
KeywordExt_List * merge_sort (KeywordExt_List *head);
|
||||||
int get_occurrence (KeywordExt *ptr);
|
|
||||||
|
int compute_occurrence (KeywordExt *ptr);
|
||||||
void set_determined (KeywordExt *ptr);
|
void set_determined (KeywordExt *ptr);
|
||||||
bool already_determined (KeywordExt *ptr);
|
bool already_determined (KeywordExt *ptr);
|
||||||
void reorder ();
|
void reorder ();
|
||||||
@@ -53,21 +57,44 @@ private:
|
|||||||
void change (KeywordExt *prior, KeywordExt *curr);
|
void change (KeywordExt *prior, KeywordExt *curr);
|
||||||
void sort ();
|
void sort ();
|
||||||
public:
|
public:
|
||||||
KeywordExt_List * _head; /* Points to the head of the linked list. */
|
|
||||||
int _total_keys; /* Total number of keys, counting duplicates. */
|
/* Linked list of keywords. */
|
||||||
int _total_duplicates; /* Total number of duplicate hash values. */
|
KeywordExt_List * _head;
|
||||||
int _max_key_len; /* Maximum length of the longest keyword. */
|
|
||||||
int _min_key_len; /* Minimum length of the shortest keyword. */
|
/* Total number of keywords, counting duplicates. */
|
||||||
|
int _total_keys;
|
||||||
|
|
||||||
|
/* Total number of duplicates that have been moved to _duplicate_link lists
|
||||||
|
(not counting their representatives which stay on the main list). */
|
||||||
|
int _total_duplicates;
|
||||||
|
|
||||||
|
/* Maximum length of the longest keyword. */
|
||||||
|
int _max_key_len;
|
||||||
|
|
||||||
|
/* Minimum length of the shortest keyword. */
|
||||||
|
int _min_key_len;
|
||||||
|
|
||||||
/* Size of alphabet. */
|
/* Size of alphabet. */
|
||||||
int const _alpha_size;
|
int const _alpha_size;
|
||||||
/* Counts occurrences of each key set character. */
|
|
||||||
|
/* Counts occurrences of each key set character.
|
||||||
|
_occurrences[c] is the number of times that c occurs among the _selchars
|
||||||
|
of a keyword. */
|
||||||
int * const _occurrences;
|
int * const _occurrences;
|
||||||
/* Value associated with each character. */
|
/* Value associated with each character. */
|
||||||
int * const _asso_values;
|
int * const _asso_values;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int _list_len; /* Length of head's Key_List, not counting duplicates. */
|
|
||||||
bool _occurrence_sort; /* True if sorting by occurrence. */
|
/* Length of _head list. Number of keywords, not counting duplicates. */
|
||||||
bool _hash_sort; /* True if sorting by hash value. */
|
int _list_len;
|
||||||
|
|
||||||
|
/* Choice of sorting criterion during Search::merge_sort. */
|
||||||
|
/* True if sorting by occurrence. */
|
||||||
|
bool _occurrence_sort;
|
||||||
|
/* True if sorting by hash value. */
|
||||||
|
bool _hash_sort;
|
||||||
|
|
||||||
bool * const _determined; /* Used in function reorder, below. */
|
bool * const _determined; /* Used in function reorder, below. */
|
||||||
int _num_done; /* Number of keywords processed without a collision. */
|
int _num_done; /* Number of keywords processed without a collision. */
|
||||||
int _fewest_collisions; /* Records fewest # of collisions for asso value. */
|
int _fewest_collisions; /* Records fewest # of collisions for asso value. */
|
||||||
|
|||||||
1424
tests/chill.exp
1424
tests/chill.exp
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user