Compute the occurrences after removal of duplicates, not before.

2025-12-02 21:19:24 +00:00 · 2002-12-20 12:22:27 +00:00
parent 1186e616cb
commit 1f70ea5dfd
6 changed files with 863 additions and 786 deletions
--- a/12
+++ b/12
@@ -1,5 +1,17 @@
 2002-11-03  Bruno Haible  <bruno@clisp.org>
 	Compute the occurrences after removal of duplicates, not before.
 	* src/keyword.h (KeywordExt::init_selchars): Remove occurrences
 	argument.
 	* src/keyword.cc (KeywordExt::init_selchars): Likewise.
 	* src/search.cc (Search::prepare): Reorder the code. Compute the
 	occurrences after removal of duplicates.
 	(Search::merge_sort): Optimize the loop.
 	(Search::compute_occurrence): Renamed from Search::get_occurrence.
 	* src/search.h (Search::compute_occurrence): Renamed from
 	Search::get_occurrence.
 	* tests/chill.exp: Regenerated.
 	Bug fix: The hash table could fail to detect duplicates, between
 	keywords of different length, when option -n (option[NOLENGTH]) was
 	given.
--- a/src/keyword.cc
+++ b/src/keyword.cc
@@ -47,7 +47,7 @@ static inline void sort_char_set (unsigned char *base, int len)
    }
 }
-/* Initialize selchars and selchars_length, and update occurrences.
+/* Initialize selchars and selchars_length.
   The hash function will be computed as
   asso_values[allchars[key_pos[0]]] + asso_values[allchars[key_pos[1]]] + ...
   We compute selchars as the multiset
@@ -57,7 +57,7 @@ static inline void sort_char_set (unsigned char *base, int len)
   Furthermore we sort the selchars array, to ease detection of duplicates
   later.
 */
-void KeywordExt::init_selchars (int *occurrences)
+void KeywordExt::init_selchars ()
 {
  const char *k = _allchars;
  unsigned char *key_set =
@@ -69,14 +69,13 @@ void KeywordExt::init_selchars (int *occurrences)
    for (int i = _allchars_length; i > 0; k++, i--)
      {
        *ptr = static_cast<unsigned char>(*k);
        occurrences[*ptr]++;
        ptr++;
      }
  else
    /* Only use those character positions specified by the user.  */
    {
-      /* Iterate through the list of key_positions, initializing occurrences
+      /* Iterate through the list of key_positions, initializing selchars
-         table and selchars (via ptr).  */
+         (via ptr).  */
      PositionIterator iter (option.get_key_positions ());
      for (int i; (i = iter.next ()) != PositionIterator::EOS; )
@@ -90,7 +89,6 @@ void KeywordExt::init_selchars (int *occurrences)
          else
            /* Out of range of KEY length, so we'll just skip it.  */
            continue;
          occurrences[*ptr]++;
          ptr++;
        }
--- a/src/keyword.h
+++ b/src/keyword.h
@@ -56,12 +56,14 @@ struct KeywordExt : public Keyword
     multiset.  */
  const unsigned char * _selchars;
  int                   _selchars_length;
-  /* Chained list of keywords having the same selchars.  */
+  /* Chained list of keywords having the same _selchars and
     - if !option[NOLENGTH] - also the same _allchars_length.
     Note that these duplicates are not members of the main keyword list.  */
  KeywordExt *          _duplicate_link;
  /* Methods depending on the keyposition list.  */
-  /* Initialize selchars and selchars_length, and update occurrences.  */
+  /* Initialize selchars and selchars_length.  */
-  void                  init_selchars (int *occurrences);
+  void                  init_selchars ();
  /* Data members used by the algorithm.  */
  int                   _occurrence; /* Frequency of key set occurrences.  */
--- a/src/search.cc
+++ b/src/search.cc
@@ -41,6 +41,8 @@ Search::Search (KeywordExt_List *list)
    _asso_values (new int[_alpha_size]),
    _determined (new bool[_alpha_size])
 {
  memset (_asso_values, 0, _alpha_size * sizeof (_asso_values[0]));
  memset (_determined, 0, _alpha_size * sizeof (_determined[0]));
 }
 void
@@ -48,42 +50,67 @@ Search::prepare ()
 {
  KeywordExt_List *temp;
  /* Compute the total number of keywords.  */
  _total_keys = 0;
  for (temp = _head; temp; temp = temp->rest())
    {
      temp->first()->init_selchars(_occurrences);
    _total_keys++;
    }
-  _list_len = _total_keys;
+  /* Initialize each keyword's _selchars array.  */
  for (temp = _head; temp; temp = temp->rest())
    temp->first()->init_selchars();
-  {
+  /* Compute the minimum and maximum keyword length.  */
    /* Make hash table for efficiency. */
    Hash_Table found_link (_list_len, option[NOLENGTH]);
    /* Test whether there are any links and also set the maximum length of
       an identifier in the keyword list. */
    _total_duplicates = 0;
  _max_key_len = INT_MIN;
  _min_key_len = INT_MAX;
    KeywordExt_List *trail = NULL;
  for (temp = _head; temp; temp = temp->rest())
    {
      KeywordExt *keyword = temp->first();
        KeywordExt *other_keyword = found_link.insert (keyword);
-        /* Check for links.  We deal with these by building an equivalence class
+      if (_max_key_len < keyword->_allchars_length)
-           of all duplicate values (i.e., links) so that only 1 keyword is
+        _max_key_len = keyword->_allchars_length;
-           representative of the entire collection.  This *greatly* simplifies
+      if (_min_key_len > keyword->_allchars_length)
-           processing during later stages of the program. */
+        _min_key_len = keyword->_allchars_length;
    }
  /* Exit program if an empty string is used as key, since the comparison
     expressions don't work correctly for looking up an empty string.  */
  if (_min_key_len == 0)
    {
      fprintf (stderr, "Empty input key is not allowed.\n"
                       "To recognize an empty input key, your code should check for\n"
                       "len == 0 before calling the gperf generated lookup function.\n");
      exit (1);
    }
  /* Check for duplicates, i.e. keywords with the same _selchars array
     (and - if !option[NOLENGTH] - also the same length).
     We deal with these by building an equivalence class, so that only
     1 keyword is representative of the entire collection.  Only this
     representative remains in the keyword list; the others are accessible
     through the _duplicate_link chain, starting at the representative.
     This *greatly* simplifies processing during later stages of the program.
     Set _total_duplicates and _list_len = _total_keys - _total_duplicates.  */
  {
    _list_len = _total_keys;
    _total_duplicates = 0;
    /* Make hash table for efficiency.  */
    Hash_Table representatives (_list_len, option[NOLENGTH]);
    KeywordExt_List *prev = NULL; /* list node before temp */
    for (temp = _head; temp; temp = temp->rest())
      {
        KeywordExt *keyword = temp->first();
        KeywordExt *other_keyword = representatives.insert (keyword);
        if (other_keyword)
          {
            _total_duplicates++;
            _list_len--;
-            trail->rest() = temp->rest();
+            /* Remove keyword from the main list.  */
-            temp->first()->_duplicate_link = other_keyword->_duplicate_link;
+            prev->rest() = temp->rest();
-            other_keyword->_duplicate_link = temp->first();
+            /* And insert it on other_keyword's duplicate list.  */
            keyword->_duplicate_link = other_keyword->_duplicate_link;
            other_keyword->_duplicate_link = keyword;
            /* Complain if user hasn't enabled the duplicate option. */
            if (!option[DUP] || option[DEBUG])
@@ -94,19 +121,16 @@ Search::prepare ()
          }
        else
          {
-            temp->first()->_duplicate_link = NULL;
+            keyword->_duplicate_link = NULL;
-            trail = temp;
+            prev = temp;
          }
        /* Update minimum and maximum keyword length, if needed. */
        if (_max_key_len < keyword->_allchars_length)
          _max_key_len = keyword->_allchars_length;
        if (_min_key_len > keyword->_allchars_length)
          _min_key_len = keyword->_allchars_length;
      }
  }
-  /* Exit program if links exists and option[DUP] not set, since we can't continue */
+  /* Exit program if duplicates exist and option[DUP] is not set, since we
     don't want to continue in this case.  (We don't want to turn on
     option[DUP] implicitly, because the generated code is usually much
     slower.)  */
  if (_total_duplicates)
    {
      if (option[DUP])
@@ -119,20 +143,23 @@ Search::prepare ()
          exit (1);
        }
    }
-  /* Exit program if an empty string is used as key, since the comparison
+
-     expressions don't work correctly for looking up an empty string. */
+  /* Compute the occurrences of each character in the alphabet.  */
-  if (_min_key_len == 0)
+  memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
  for (temp = _head; temp; temp = temp->rest())
    {
-      fprintf (stderr, "Empty input key is not allowed.\nTo recognize an empty input key, your code should check for\nlen == 0 before calling the gperf generated lookup function.\n");
+      KeywordExt *keyword = temp->first();
-      exit (1);
+      const unsigned char *ptr = keyword->_selchars;
      for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
        _occurrences[*ptr]++;
    }
 }
-/* Recursively merges two sorted lists together to form one sorted list. The
+/* Merges two sorted lists together to form one sorted list.
-   ordering criteria is by frequency of occurrence of elements in the key set
+   The sorting criterion depends on which of _occurrence_sort and _hash_sort
-   or by the hash value.  This is a kludge, but permits nice sharing of
+   is set to true.  This is a kludge, but permits nice sharing of almost
-   almost identical code without incurring the overhead of a function
+   identical code without incurring the overhead of a function call for
-   call comparison. */
+   every comparison.  */
 KeywordExt_List *
 Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
@@ -151,8 +178,10 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
          *resultp = list1;
          break;
        }
-      if (_occurrence_sort && list1->first()->_occurrence < list2->first()->_occurrence
+      if ((_occurrence_sort
-          || _hash_sort && list1->first()->_hash_value > list2->first()->_hash_value)
+           && list1->first()->_occurrence < list2->first()->_occurrence)
          || (_hash_sort
              && list1->first()->_hash_value > list2->first()->_hash_value))
        {
          *resultp = list2;
          resultp = &list2->rest(); list2 = list1; list1 = *resultp;
@@ -166,37 +195,46 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
  return result;
 }
-/* Applies the merge sort algorithm to recursively sort the key list by
+/* Sorts a list using the recursive merge sort algorithm.
-   frequency of occurrence of elements in the key set. */
+   The sorting criterion depends on which of _occurrence_sort and _hash_sort
   is set to true.  */
 KeywordExt_List *
 Search::merge_sort (KeywordExt_List *head)
 {
  if (!head || !head->rest())
    /* List of length 0 or 1.  Nothing to do.  */
    return head;
  else
    {
      /* Determine a list node in the middle.  */
      KeywordExt_List *middle = head;
-      KeywordExt_List *temp   = head->rest()->rest();
+      for (KeywordExt_List *temp = head->rest();;)
      while (temp)
        {
          temp = temp->rest();
-          middle = middle->rest();
+          if (temp == NULL)
-          if (temp)
+            break;
          temp = temp->rest();
          middle = middle->rest();
          if (temp == NULL)
            break;
        }
-      temp         = middle->rest();
+      /* Cut the list into two halves.
-      middle->rest() = 0;
+         If the list has n elements, the left half has ceiling(n/2) elements
-      return merge (merge_sort (head), merge_sort (temp));
+         and the right half has floor(n/2) elements.  */
      KeywordExt_List *right_half = middle->rest();
      middle->rest() = NULL;
      /* Sort the two halves, then merge them.  */
      return merge (merge_sort (head), merge_sort (right_half));
    }
 }
 /* Returns the frequency of occurrence of elements in the key set. */
 inline int
-Search::get_occurrence (KeywordExt *ptr)
+Search::compute_occurrence (KeywordExt *ptr)
 {
  int value = 0;
@@ -249,7 +287,7 @@ Search::reorder ()
    {
      KeywordExt *keyword = ptr->first();
-      keyword->_occurrence = get_occurrence (keyword);
+      keyword->_occurrence = compute_occurrence (keyword);
    }
  _hash_sort = false;
--- a/src/search.h
+++ b/src/search.h
@@ -37,9 +37,13 @@ public:
  void                  optimize ();
 private:
  void                  prepare ();
  /* Merges two sorted lists together to form one sorted list.  */
  KeywordExt_List *     merge (KeywordExt_List *list1, KeywordExt_List *list2);
  /* Sorts a list using the recursive merge sort algorithm.  */
  KeywordExt_List *     merge_sort (KeywordExt_List *head);
-  int                   get_occurrence (KeywordExt *ptr);
+
  int                   compute_occurrence (KeywordExt *ptr);
  void                  set_determined (KeywordExt *ptr);
  bool                  already_determined (KeywordExt *ptr);
  void                  reorder ();
@@ -53,21 +57,44 @@ private:
  void                  change (KeywordExt *prior, KeywordExt *curr);
  void                  sort ();
 public:
-  KeywordExt_List *     _head;                            /* Points to the head of the linked list. */
+
-  int                   _total_keys;                           /* Total number of keys, counting duplicates. */
+  /* Linked list of keywords.  */
-  int                   _total_duplicates;                     /* Total number of duplicate hash values. */
+  KeywordExt_List *     _head;
-  int                   _max_key_len;                          /* Maximum length of the longest keyword. */
+
-  int                   _min_key_len;                          /* Minimum length of the shortest keyword. */
+  /* Total number of keywords, counting duplicates.  */
  int                   _total_keys;
  /* Total number of duplicates that have been moved to _duplicate_link lists
     (not counting their representatives which stay on the main list).  */
  int                   _total_duplicates;
  /* Length of the longest keyword.  */
  int                   _max_key_len;
  /* Length of the shortest keyword.  */
  int                   _min_key_len;
  /* Size of alphabet.  */
  int const             _alpha_size;
-  /* Counts occurrences of each key set character. */
+
  /* Counts occurrences of each key set character.
     _occurrences[c] is the number of times that c occurs among the _selchars
     of a keyword.  */
  int * const           _occurrences;
  /* Value associated with each character. */
  int * const           _asso_values;
 private:
-  int                   _list_len;                             /* Length of head's Key_List, not counting duplicates. */
+
-  bool                  _occurrence_sort;                      /* True if sorting by occurrence. */
+  /* Length of _head list.  Number of keywords, not counting duplicates.  */
-  bool                  _hash_sort;                            /* True if sorting by hash value. */
+  int                   _list_len;
  /* Choice of sorting criterion during Search::merge_sort.  */
  /* True if sorting by occurrence.  */
  bool                  _occurrence_sort;
  /* True if sorting by hash value.  */
  bool                  _hash_sort;
  bool * const          _determined;                           /* Used in function reorder, below. */
  int                   _num_done;          /* Number of keywords processed without a collision. */
  int                   _fewest_collisions; /* Records fewest # of collisions for asso value. */
--- a/tests/chill.exp
+++ b/tests/chill.exp