diff --git a/ChangeLog b/ChangeLog index f232c55..a23252c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +2025-04-19 Bruno Haible + + Optimize: Use a hash table in compute_partition. + This reduces the execution time of gperf on large inputs by ca. 30%. + * autogen.sh (GNULIB_MODULES): Add map-c++, hash-map. + * src/keyword.h: Include <stddef.h>. + (struct KeywordExt): Add fields _undetermined_chars, + _undetermined_chars_length, _undetermined_chars_hashcode. + * src/search.cc: Include gl_map.hh, gl_hash_map.h. + (Search::prepare_asso_values): Initialize the _undetermined_chars field. + (struct EquivalenceClass): Remove the fields _undetermined_chars, + _undetermined_chars_length. + (undetermined_equals, undetermined_hashcode): New functions. + (Search::compute_partition): Initialize the _undetermined_chars* fields + of all keywords. Use a hash map instead of a loop over the equivalence + classes. + (Search::find_good_asso_values): Deallocate the _undetermined_chars field. + 2025-04-19 Bruno Haible Optimize: Make Bool_Array take less memory. 
diff --git a/autogen.sh b/autogen.sh index eab62fc..a335f61 100755 --- a/autogen.sh +++ b/autogen.sh @@ -66,6 +66,7 @@ if test $skip_gnulib = false; then GNULIB_MODULES=' filename getopt-gnu + map-c++ hash-map read-file package-version ' diff --git a/lib/.gitignore b/lib/.gitignore index 1180ab2..c50746b 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -5,12 +5,19 @@ /alloca.in.h /arg-nonnull.h /assert.in.h +/attribute.h +/basename-lgpl.c +/basename-lgpl.h /c++defs.h /cloexec.c /cloexec.h /close.c /dup2.c /errno.in.h +/error.c +/error.in.h +/exitfail.c +/exitfail.h /fcntl.c /fcntl.in.h /fd-hook.c @@ -31,9 +38,22 @@ /getopt.in.h /getopt1.c /getopt_int.h +/getprogname.c +/getprogname.h /gettext.h +/gl_anyhash1.h +/gl_anyhash2.h +/gl_anyhash_primes.h +/gl_hash_map.c +/gl_hash_map.h +/gl_map.c +/gl_map.h +/gl_map.hh +/gl_xmap.c +/gl_xmap.h /idx.h /intprops-internal.h +/intprops.h /inttypes.in.h /limits.in.h /lseek.c @@ -51,6 +71,7 @@ /read-file.c /read-file.h /realloc.c +/size_max.h /stat-time.c /stat-time.h /stat-w32.c @@ -65,6 +86,9 @@ /stdio.in.h /stdlib.c /stdlib.in.h +/strerror-override.c +/strerror-override.h +/strerror.c /string.in.h /sys_stat.in.h /sys_types.in.h @@ -74,7 +98,11 @@ /verify.h /warn-on-use.h /wchar.in.h +/xalloc-die.c /xalloc-oversized.h +/xalloc.h +/xsize.c +/xsize.h # Files generated by the autotools: /aclocal.m4 diff --git a/src/keyword.h b/src/keyword.h index 35cacef..b76dd6e 100644 --- a/src/keyword.h +++ b/src/keyword.h @@ -24,6 +24,8 @@ #ifndef keyword_h #define keyword_h 1 +#include <stddef.h> /* defines size_t */ + /* Class defined in "positions.h". */ class Positions; @@ -72,6 +74,13 @@ struct KeywordExt : public Keyword /* Deletes selchars. */ void delete_selchars (); + /* Data members used by the algorithm, specifically compute_partition. */ + /* The undetermined selected characters for this keyword, as a + canonically reordered multiset. 
*/ + unsigned int * _undetermined_chars; + unsigned int _undetermined_chars_length; + size_t _undetermined_chars_hashcode; + /* Data members used by the algorithm. */ int _hash_value; /* Hash value for the keyword. */ diff --git a/src/search.cc b/src/search.cc index 6ddfaa3..aa45a42 100644 --- a/src/search.cc +++ b/src/search.cc @@ -28,7 +28,9 @@ #include <string.h> /* declares memset(), memcmp() */ #include <time.h> /* declares time() */ #include <math.h> /* declares exp() */ -#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */ +#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX, CHAR_BIT */ +#include "gl_map.hh" +#include "gl_hash_map.h" #include "options.h" #include "hash-table.h" @@ -839,6 +841,13 @@ Search::prepare_asso_values () /* Memory allocation. */ _asso_values = new int[_alpha_size]; + /* Memory allocation in each Keyword. */ + for (temp = _head; temp; temp = temp->rest()) + { + KeywordExt *keyword = temp->first(); + keyword->_undetermined_chars = new unsigned int[keyword->_selchars_length]; + } + int non_linked_length = _list_len; unsigned int asso_value_max; @@ -943,10 +952,6 @@ struct EquivalenceClass KeywordExt_List * _keywords_last; /* The number of keywords in this equivalence class. */ unsigned int _cardinality; - /* The undetermined selected characters for the keywords in this - equivalence class, as a canonically reordered multiset. 
*/ - unsigned int * _undetermined_chars; - unsigned int _undetermined_chars_length; EquivalenceClass * _next; }; @@ -984,48 +989,78 @@ equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len) return true; } +static bool +undetermined_equals (KeywordExt *key1, KeywordExt *key2) +{ + return (key1->_undetermined_chars_length == key2 ->_undetermined_chars_length) + && equals (key1->_undetermined_chars, key2->_undetermined_chars, + key1->_undetermined_chars_length); +} + +static size_t +undetermined_hashcode (KeywordExt *key) +{ + return key->_undetermined_chars_hashcode; +} + EquivalenceClass * Search::compute_partition (bool *undetermined) const { - EquivalenceClass *partition = NULL; - EquivalenceClass *partition_last = NULL; + /* Prepare the use of the hash-map: For each keyword, + compute the undetermined characters and their hash code. */ for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) { KeywordExt *keyword = temp->first(); - /* Compute the undetermined characters for this keyword. */ - unsigned int *undetermined_chars = - new unsigned int[keyword->_selchars_length]; + /* This scratch memory, an array of length keyword->_selchars_length, + was allocated earlier. 
*/ + unsigned int *undetermined_chars = keyword->_undetermined_chars; unsigned int undetermined_chars_length = 0; for (int i = 0; i < keyword->_selchars_length; i++) if (undetermined[keyword->_selchars[i]]) undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i]; + keyword->_undetermined_chars_length = undetermined_chars_length; + + { + const int SIZE_BITS = sizeof (size_t) * CHAR_BIT; + size_t h = undetermined_chars_length; + for (unsigned int i = 0; i < undetermined_chars_length; i++) + h = undetermined_chars[i] * 641 + ((h << 9) | (h >> (SIZE_BITS - 9))); + keyword->_undetermined_chars_hashcode = h; + } + } + + EquivalenceClass *partition = NULL; + EquivalenceClass *partition_last = NULL; + /* A hash-map that maps each keyword to the EquivalenceClass that contains + it. */ + gl_Map<KeywordExt *, EquivalenceClass *> + map (GL_HASH_MAP, undetermined_equals, undetermined_hashcode, NULL, NULL); + for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) + { + KeywordExt *keyword = temp->first(); /* Look up the equivalence class to which this keyword belongs. */ - EquivalenceClass *equclass; - for (equclass = partition; equclass; equclass = equclass->_next) - if (equclass->_undetermined_chars_length == undetermined_chars_length - && equals (equclass->_undetermined_chars, undetermined_chars, - undetermined_chars_length)) - break; + EquivalenceClass *equclass = const_cast<EquivalenceClass *>(map.get(keyword)); if (equclass == NULL) { equclass = new EquivalenceClass(); equclass->_keywords = NULL; equclass->_keywords_last = NULL; equclass->_cardinality = 0; - equclass->_undetermined_chars = undetermined_chars; - equclass->_undetermined_chars_length = undetermined_chars_length; equclass->_next = NULL; + + /* Map this keyword (and all equivalent ones that will be seen later) + to equclass. 
*/ + map.put(keyword, equclass); + if (partition) partition_last->_next = equclass; else partition = equclass; partition_last = equclass; } - else - delete[] undetermined_chars; /* Add the keyword to the equivalence class. */ KeywordExt_List *cons = new KeywordExt_List(keyword); @@ -1037,10 +1072,6 @@ Search::compute_partition (bool *undetermined) const equclass->_cardinality++; } - /* Free some of the allocated memory. The caller doesn't need it. */ - for (EquivalenceClass *cls = partition; cls; cls = cls->_next) - delete[] cls->_undetermined_chars; - return partition; } @@ -1052,7 +1083,6 @@ delete_partition (EquivalenceClass *partition) EquivalenceClass *equclass = partition; partition = equclass->_next; delete_list (equclass->_keywords); - //delete[] equclass->_undetermined_chars; // already freed above delete equclass; } } @@ -1572,6 +1602,13 @@ Search::find_good_asso_values () delete[] best_asso_values; /* The keywords' _hash_value fields are recomputed below. */ } + + /* Memory deallocation in each Keyword. */ + for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) + { + KeywordExt *keyword = temp->first(); + delete[] keyword->_undetermined_chars; + } } /* ========================================================================= */