Compute the occurrences after removal of duplicates, not before.

2025-12-02 13:09:22 +00:00 · 2002-12-20 12:22:27 +00:00
parent 1186e616cb
commit 1f70ea5dfd
6 changed files with 863 additions and 786 deletions
--- a/12
+++ b/12
@@ -1,5 +1,17 @@
 2002-11-03  Bruno Haible  <bruno@clisp.org>

+	Compute the occurrences after removal of duplicates, not before.
+	* src/keyword.h (KeywordExt::init_selchars): Remove occurrences
+	argument.
+	* src/keyword.cc (KeywordExt::init_selchars): Likewise.
+	* src/search.cc (Search::prepare): Reorder the code. Compute the
+	occurrences after removal of duplicates.
+	(Search::merge_sort): Optimize the loop.
+	(Search::compute_occurrence): Renamed from Search::get_occurrence.
+	* src/search.h (Search::compute_occurrence): Renamed from
+	Search::get_occurrence.
+	* tests/chill.exp: Regenerated.
+
 	Bug fix: The hash table could fail to detect duplicates, between
 	keywords of different length, when option -n (option[NOLENGTH]) was
 	given.
--- a/src/keyword.cc
+++ b/src/keyword.cc
@@ -47,7 +47,7 @@ static inline void sort_char_set (unsigned char *base, int len)
    }
 }

-/* Initialize selchars and selchars_length, and update occurrences.
+/* Initialize selchars and selchars_length.
   The hash function will be computed as
   asso_values[allchars[key_pos[0]]] + asso_values[allchars[key_pos[1]]] + ...
   We compute selchars as the multiset
@@ -57,7 +57,7 @@ static inline void sort_char_set (unsigned char *base, int len)
   Furthermore we sort the selchars array, to ease detection of duplicates
   later.
 */
-void KeywordExt::init_selchars (int *occurrences)
+void KeywordExt::init_selchars ()
 {
  const char *k = _allchars;
  unsigned char *key_set =
@@ -69,14 +69,13 @@ void KeywordExt::init_selchars (int *occurrences)
    for (int i = _allchars_length; i > 0; k++, i--)
      {
        *ptr = static_cast<unsigned char>(*k);
-        occurrences[*ptr]++;
        ptr++;
      }
  else
    /* Only use those character positions specified by the user.  */
    {
-      /* Iterate through the list of key_positions, initializing occurrences
-         table and selchars (via ptr).  */
+      /* Iterate through the list of key_positions, initializing selchars
+         (via ptr).  */
      PositionIterator iter (option.get_key_positions ());

      for (int i; (i = iter.next ()) != PositionIterator::EOS; )
@@ -90,7 +89,6 @@ void KeywordExt::init_selchars (int *occurrences)
          else
            /* Out of range of KEY length, so we'll just skip it.  */
            continue;
-          occurrences[*ptr]++;
          ptr++;
        }

--- a/src/keyword.h
+++ b/src/keyword.h
@@ -56,12 +56,14 @@ struct KeywordExt : public Keyword
     multiset.  */
  const unsigned char * _selchars;
  int                   _selchars_length;
-  /* Chained list of keywords having the same selchars.  */
+  /* Chained list of keywords having the same _selchars and
+     - if !option[NOLENGTH] - also the same _allchars_length.
+     Note that these duplicates are not members of the main keyword list.  */
  KeywordExt *          _duplicate_link;

  /* Methods depending on the keyposition list.  */
-  /* Initialize selchars and selchars_length, and update occurrences.  */
-  void                  init_selchars (int *occurrences);
+  /* Initialize selchars and selchars_length.  */
+  void                  init_selchars ();

  /* Data members used by the algorithm.  */
  int                   _occurrence; /* Frequency of key set occurrences.  */
--- a/src/search.cc
+++ b/src/search.cc
@@ -41,6 +41,8 @@ Search::Search (KeywordExt_List *list)
    _asso_values (new int[_alpha_size]),
    _determined (new bool[_alpha_size])
 {
+  memset (_asso_values, 0, _alpha_size * sizeof (_asso_values[0]));
+  memset (_determined, 0, _alpha_size * sizeof (_determined[0]));
 }

 void
@@ -48,42 +50,67 @@ Search::prepare ()
 {
  KeywordExt_List *temp;

+  /* Compute the total number of keywords.  */
  _total_keys = 0;
+  for (temp = _head; temp; temp = temp->rest())
+    _total_keys++;
+
+  /* Initialize each keyword's _selchars array.  */
+  for (temp = _head; temp; temp = temp->rest())
+    temp->first()->init_selchars();
+
+  /* Compute the minimum and maximum keyword length.  */
+  _max_key_len = INT_MIN;
+  _min_key_len = INT_MAX;
  for (temp = _head; temp; temp = temp->rest())
    {
-      temp->first()->init_selchars(_occurrences);
-      _total_keys++;
+      KeywordExt *keyword = temp->first();
+
+      if (_max_key_len < keyword->_allchars_length)
+        _max_key_len = keyword->_allchars_length;
+      if (_min_key_len > keyword->_allchars_length)
+        _min_key_len = keyword->_allchars_length;
    }

-  _list_len = _total_keys;
+  /* Exit program if an empty string is used as key, since the comparison
+     expressions don't work correctly for looking up an empty string.  */
+  if (_min_key_len == 0)
+    {
+      fprintf (stderr, "Empty input key is not allowed.\n"
+                       "To recognize an empty input key, your code should check for\n"
+                       "len == 0 before calling the gperf generated lookup function.\n");
+      exit (1);
+    }

+  /* Check for duplicates, i.e. keywords with the same _selchars array
+     (and - if !option[NOLENGTH] - also the same length).
+     We deal with these by building an equivalence class, so that only
+     1 keyword is representative of the entire collection.  Only this
+     representative remains in the keyword list; the others are accessible
+     through the _duplicate_link chain, starting at the representative.
+     This *greatly* simplifies processing during later stages of the program.
+     Set _total_duplicates and _list_len = _total_keys - _total_duplicates.  */
  {
-    /* Make hash table for efficiency. */
-    Hash_Table found_link (_list_len, option[NOLENGTH]);
-
-    /* Test whether there are any links and also set the maximum length of
-       an identifier in the keyword list. */
+    _list_len = _total_keys;
    _total_duplicates = 0;
-    _max_key_len = INT_MIN;
-    _min_key_len = INT_MAX;
-    KeywordExt_List *trail = NULL;
+    /* Make hash table for efficiency.  */
+    Hash_Table representatives (_list_len, option[NOLENGTH]);
+
+    KeywordExt_List *prev = NULL; /* list node before temp */
    for (temp = _head; temp; temp = temp->rest())
      {
        KeywordExt *keyword = temp->first();
-        KeywordExt *other_keyword = found_link.insert (keyword);
-
-        /* Check for links.  We deal with these by building an equivalence class
-           of all duplicate values (i.e., links) so that only 1 keyword is
-           representative of the entire collection.  This *greatly* simplifies
-           processing during later stages of the program. */
+        KeywordExt *other_keyword = representatives.insert (keyword);

        if (other_keyword)
          {
            _total_duplicates++;
            _list_len--;
-            trail->rest() = temp->rest();
-            temp->first()->_duplicate_link = other_keyword->_duplicate_link;
-            other_keyword->_duplicate_link = temp->first();
+            /* Remove keyword from the main list.  */
+            prev->rest() = temp->rest();
+            /* And insert it on other_keyword's duplicate list.  */
+            keyword->_duplicate_link = other_keyword->_duplicate_link;
+            other_keyword->_duplicate_link = keyword;

            /* Complain if user hasn't enabled the duplicate option. */
            if (!option[DUP] || option[DEBUG])
@@ -94,19 +121,16 @@ Search::prepare ()
          }
        else
          {
-            temp->first()->_duplicate_link = NULL;
-            trail = temp;
+            keyword->_duplicate_link = NULL;
+            prev = temp;
          }
-
-        /* Update minimum and maximum keyword length, if needed. */
-        if (_max_key_len < keyword->_allchars_length)
-          _max_key_len = keyword->_allchars_length;
-        if (_min_key_len > keyword->_allchars_length)
-          _min_key_len = keyword->_allchars_length;
      }
  }

-  /* Exit program if links exists and option[DUP] not set, since we can't continue */
+  /* Exit program if duplicates exist and option[DUP] is not set, since we
+     don't want to continue in this case.  (We don't want to turn on
+     option[DUP] implicitly, because the generated code is usually much
+     slower.)  */
  if (_total_duplicates)
    {
      if (option[DUP])
@@ -119,20 +143,23 @@ Search::prepare ()
          exit (1);
        }
    }
-  /* Exit program if an empty string is used as key, since the comparison
-     expressions don't work correctly for looking up an empty string. */
-  if (_min_key_len == 0)
+
+  /* Compute the occurrences of each character in the alphabet.  */
+  memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
+  for (temp = _head; temp; temp = temp->rest())
    {
-      fprintf (stderr, "Empty input key is not allowed.\nTo recognize an empty input key, your code should check for\nlen == 0 before calling the gperf generated lookup function.\n");
-      exit (1);
+      KeywordExt *keyword = temp->first();
+      const unsigned char *ptr = keyword->_selchars;
+      for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
+        _occurrences[*ptr]++;
    }
 }

-/* Recursively merges two sorted lists together to form one sorted list. The
-   ordering criteria is by frequency of occurrence of elements in the key set
-   or by the hash value.  This is a kludge, but permits nice sharing of
-   almost identical code without incurring the overhead of a function
-   call comparison. */
+/* Merges two sorted lists together to form one sorted list.
+   The sorting criterion depends on which of _occurrence_sort and _hash_sort
+   is set to true.  This is a kludge, but permits nice sharing of almost
+   identical code without incurring the overhead of a function call for
+   every comparison.  */

 KeywordExt_List *
 Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
@@ -151,8 +178,10 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
          *resultp = list1;
          break;
        }
-      if (_occurrence_sort && list1->first()->_occurrence < list2->first()->_occurrence
-          || _hash_sort && list1->first()->_hash_value > list2->first()->_hash_value)
+      if ((_occurrence_sort
+           && list1->first()->_occurrence < list2->first()->_occurrence)
+          || (_hash_sort
+              && list1->first()->_hash_value > list2->first()->_hash_value))
        {
          *resultp = list2;
          resultp = &list2->rest(); list2 = list1; list1 = *resultp;
@@ -166,37 +195,46 @@ Search::merge (KeywordExt_List *list1, KeywordExt_List *list2)
  return result;
 }

-/* Applies the merge sort algorithm to recursively sort the key list by
-   frequency of occurrence of elements in the key set. */
+/* Sorts a list using the recursive merge sort algorithm.
+   The sorting criterion depends on which of _occurrence_sort and _hash_sort
+   is set to true.  */

 KeywordExt_List *
 Search::merge_sort (KeywordExt_List *head)
 {
  if (!head || !head->rest())
+    /* List of length 0 or 1.  Nothing to do.  */
    return head;
  else
    {
+      /* Determine a list node in the middle.  */
      KeywordExt_List *middle = head;
-      KeywordExt_List *temp   = head->rest()->rest();
-
-      while (temp)
+      for (KeywordExt_List *temp = head->rest();;)
        {
-          temp   = temp->rest();
+          temp = temp->rest();
+          if (temp == NULL)
+            break;
+          temp = temp->rest();
          middle = middle->rest();
-          if (temp)
-            temp = temp->rest();
+          if (temp == NULL)
+            break;
        }

-      temp         = middle->rest();
-      middle->rest() = 0;
-      return merge (merge_sort (head), merge_sort (temp));
+      /* Cut the list into two halves.
+         If the list has n elements, the left half has ceiling(n/2) elements
+         and the right half has floor(n/2) elements.  */
+      KeywordExt_List *right_half = middle->rest();
+      middle->rest() = NULL;
+
+      /* Sort the two halves, then merge them.  */
+      return merge (merge_sort (head), merge_sort (right_half));
    }
 }

 /* Returns the frequency of occurrence of elements in the key set. */

 inline int
-Search::get_occurrence (KeywordExt *ptr)
+Search::compute_occurrence (KeywordExt *ptr)
 {
  int value = 0;

@@ -249,7 +287,7 @@ Search::reorder ()
    {
      KeywordExt *keyword = ptr->first();

-      keyword->_occurrence = get_occurrence (keyword);
+      keyword->_occurrence = compute_occurrence (keyword);
    }

  _hash_sort = false;
--- a/src/search.h
+++ b/src/search.h
@@ -37,9 +37,13 @@ public:
  void                  optimize ();
 private:
  void                  prepare ();
+
+  /* Merges two sorted lists together to form one sorted list.  */
  KeywordExt_List *     merge (KeywordExt_List *list1, KeywordExt_List *list2);
+  /* Sorts a list using the recursive merge sort algorithm.  */
  KeywordExt_List *     merge_sort (KeywordExt_List *head);
-  int                   get_occurrence (KeywordExt *ptr);
+
+  int                   compute_occurrence (KeywordExt *ptr);
  void                  set_determined (KeywordExt *ptr);
  bool                  already_determined (KeywordExt *ptr);
  void                  reorder ();
@@ -53,21 +57,44 @@ private:
  void                  change (KeywordExt *prior, KeywordExt *curr);
  void                  sort ();
 public:
-  KeywordExt_List *     _head;                            /* Points to the head of the linked list. */
-  int                   _total_keys;                           /* Total number of keys, counting duplicates. */
-  int                   _total_duplicates;                     /* Total number of duplicate hash values. */
-  int                   _max_key_len;                          /* Maximum length of the longest keyword. */
-  int                   _min_key_len;                          /* Minimum length of the shortest keyword. */
-  /* Size of alphabet. */
+
+  /* Linked list of keywords.  */
+  KeywordExt_List *     _head;
+
+  /* Total number of keywords, counting duplicates.  */
+  int                   _total_keys;
+
+  /* Total number of duplicates that have been moved to _duplicate_link lists
+     (not counting their representatives which stay on the main list).  */
+  int                   _total_duplicates;
+
+  /* Length of the longest keyword.  */
+  int                   _max_key_len;
+
+  /* Length of the shortest keyword.  */
+  int                   _min_key_len;
+
+  /* Size of alphabet.  */
  int const             _alpha_size;
-  /* Counts occurrences of each key set character. */
+
+  /* Counts occurrences of each key set character.
+     _occurrences[c] is the number of times that c occurs among the _selchars
+     of a keyword.  */
  int * const           _occurrences;
  /* Value associated with each character. */
  int * const           _asso_values;
+
 private:
-  int                   _list_len;                             /* Length of head's Key_List, not counting duplicates. */
-  bool                  _occurrence_sort;                      /* True if sorting by occurrence. */
-  bool                  _hash_sort;                            /* True if sorting by hash value. */
+
+  /* Length of _head list.  Number of keywords, not counting duplicates.  */
+  int                   _list_len;
+
+  /* Choice of sorting criterion during Search::merge_sort.  */
+  /* True if sorting by occurrence.  */
+  bool                  _occurrence_sort;
+  /* True if sorting by hash value.  */
+  bool                  _hash_sort;
+
  bool * const          _determined;                           /* Used in function reorder, below. */
  int                   _num_done;          /* Number of keywords processed without a collision. */
  int                   _fewest_collisions; /* Records fewest # of collisions for asso value. */
--- a/tests/chill.exp
+++ b/tests/chill.exp