1
0
mirror of https://git.savannah.gnu.org/git/gperf.git synced 2025-12-02 13:09:22 +00:00

New option --ignore-case.

This commit is contained in:
Bruno Haible
2003-04-02 09:26:05 +00:00
parent 0093e33163
commit 7dfd32b736
15 changed files with 583 additions and 39 deletions

View File

@@ -1,5 +1,49 @@
2002-12-10 Bruno Haible <bruno@clisp.org> 2002-12-10 Bruno Haible <bruno@clisp.org>
* src/options.h (UPPERLOWER): New enum value.
* src/options.cc (Options::long_usage): Document option --ignore-case.
(Options::~Options): Update.
(long_options): Add option --ignore-case.
(Options::parse_options): Handle option --ignore-case.
* src/input.cc (Input::read_input): Recognize option %ignore-case.
* src/keyword.h (KeywordExt::init_selchars_tuple,
KeywordExt::init_selchars_multiset, KeywordExt::init_selchars_low):
Add alpha_unify argument.
* src/keyword.cc (KeywordExt::init_selchars_low): Add alpha_unify
argument.
(KeywordExt::init_selchars_tuple): Add alpha_unify argument.
(KeywordExt::init_selchars_multiset): Add alpha_unify argument.
* src/search.h (Search::compute_alpha_size,
Search::compute_alpha_unify): New declarations.
(Search::init_selchars_multiset): Add alpha_unify argument.
(Search::_alpha_unify): New field.
* src/search.cc (Search::compute_alpha_size,
Search::compute_alpha_unify): New functions.
(Search::init_selchars_tuple): Update.
(Search::find_positions): Temporarily set _alpha_unify. Perform a
case insensitive comparison if needed.
(Search::init_selchars_multiset): Add alpha_unify argument.
(Search::count_duplicates_multiset): Call compute_alpha_unify.
(Search::find_alpha_inc): Temporarily set _alpha_unify. At the end,
set _alpha_size and _alpha_unify.
(Search::prepare): Update. Don't compute _alpha_size here.
(Search::optimize): Propagate unified asso_values.
(Search::~Search): Delete _alpha_unify.
* src/output.cc (output_upperlower_strcmp, output_upperlower_strncmp,
output_upperlower_memcmp): New functions.
(Output_Compare_Strcmp::output_comparison,
Output_Compare_Strncmp::output_comparison,
Output_Compare_Memcmp::output_comparison): Use the case-insensitive
comparison function if --ignore-case was given.
(Output::output): Emit the auxiliary case-insensitive comparison
function if needed.
* tests/permutc2.gperf, tests/permutc2.exp: New files.
* tests/Makefile.in (check-test): Also check permutc2.gperf.
* tests/test-6.exp: Update.
* doc/gperf.texi (Gperf Declarations): Document %ignore-case.
(Input Details): Document option --ignore-case.
* NEWS: Update.
* src/search.cc (Search::optimize): Fill unused asso_values[] entries * src/search.cc (Search::optimize): Fill unused asso_values[] entries
with a large value. with a large value.
* src/output.h (Output::Output): Remove occurrences argument. * src/output.h (Output::Output): Remove occurrences argument.

2
NEWS
View File

@@ -10,6 +10,7 @@ New in 2.97:
* The following options can now be specified inside the input file: * The following options can now be specified inside the input file:
%delimiters=DELIMITER-LIST %delimiters=DELIMITER-LIST
%struct-type %struct-type
%ignore-case
%language=LANGUAGE-NAME %language=LANGUAGE-NAME
%define slot-name NAME %define slot-name NAME
%define hash-function-name NAME %define hash-function-name NAME
@@ -42,6 +43,7 @@ New in 2.97:
* The options -f/--fast and -o/--occurrence-sort have no effect any more. * The options -f/--fast and -o/--occurrence-sort have no effect any more.
* Added option -P/--pic that optimizes the generated code for use in shared * Added option -P/--pic that optimizes the generated code for use in shared
libraries. libraries.
* Added option --ignore-case that produces a case independent lookup function.
* Bug fixes. * Bug fixes.
New in 2.7.2: New in 2.7.2:

View File

@@ -436,6 +436,12 @@ commas or newlines.
Allows you to include a @code{struct} type declaration for generated Allows you to include a @code{struct} type declaration for generated
code; see above for an example. code; see above for an example.
@item %ignore-case
@cindex @samp{%ignore-case}
Consider upper and lower case ASCII characters as equivalent. The string
comparison will use a case insensitive character comparison. Note that
locale dependent case mappings are ignored.
@item %language=@var{language-name} @item %language=@var{language-name}
@cindex @samp{%language} @cindex @samp{%language}
Instructs @code{gperf} to generate code in the language specified by the Instructs @code{gperf} to generate code in the language specified by the
@@ -790,6 +796,17 @@ part of the type declaration. Keywords and additional fields may follow
this, one group of fields per line. A set of examples for generating this, one group of fields per line. A set of examples for generating
perfect hash tables and functions for Ada, C, C++, Pascal, Modula 2, perfect hash tables and functions for Ada, C, C++, Pascal, Modula 2,
Modula 3 and JavaScript reserved words are distributed with this release. Modula 3 and JavaScript reserved words are distributed with this release.
@item --ignore-case
Consider upper and lower case ASCII characters as equivalent. The string
comparison will use a case insensitive character comparison. Note that
locale dependent case mappings are ignored. This option is therefore not
suitable if a properly internationalized or locale aware case mapping
should be used. (For example, in a Turkish locale, the upper case equivalent
of the lowercase ASCII letter @samp{i} is the non-ASCII character
@samp{capital i with dot above}.) For this case, it is better to apply
an uppercase or lowercase conversion on the string before passing it to
the @code{gperf} generated function.
@end table @end table
@node Output Language, Output Details, Input Details, Options @node Output Language, Output Details, Input Details, Options

View File

@@ -472,6 +472,10 @@ Input::read_input ()
option.set (TYPE); option.set (TYPE);
else else
if (is_declaration (line, line_end, lineno, "ignore-case"))
option.set (UPPERLOWER);
else
if (is_declaration_with_arg (line, line_end, lineno, if (is_declaration_with_arg (line, line_end, lineno,
"language", &arg)) "language", &arg))
option.set_language (arg); option.set_language (arg);

View File

@@ -59,7 +59,7 @@ static inline void sort_char_set (unsigned int *base, int len)
*/ */
unsigned int * unsigned int *
KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc)
{ {
const char *k = _allchars; const char *k = _allchars;
unsigned int *key_set = unsigned int *key_set =
@@ -73,6 +73,8 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c
unsigned int c = static_cast<unsigned char>(*k); unsigned int c = static_cast<unsigned char>(*k);
if (alpha_inc) if (alpha_inc)
c += alpha_inc[k-_allchars]; c += alpha_inc[k-_allchars];
if (alpha_unify)
c = alpha_unify[c];
*ptr = c; *ptr = c;
ptr++; ptr++;
} }
@@ -99,6 +101,8 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c
else else
/* Out of range of KEY length, so we'll just skip it. */ /* Out of range of KEY length, so we'll just skip it. */
continue; continue;
if (alpha_unify)
c = alpha_unify[c];
*ptr = c; *ptr = c;
ptr++; ptr++;
} }
@@ -111,16 +115,16 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c
} }
void void
KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions) KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify)
{ {
init_selchars_low (use_all_chars, positions, NULL); init_selchars_low (use_all_chars, positions, alpha_unify, NULL);
} }
void void
KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc)
{ {
unsigned int *selchars = unsigned int *selchars =
init_selchars_low (use_all_chars, positions, alpha_inc); init_selchars_low (use_all_chars, positions, alpha_unify, alpha_inc);
/* Sort the selchars elements alphabetically. */ /* Sort the selchars elements alphabetically. */
sort_char_set (selchars, _selchars_length); sort_char_set (selchars, _selchars_length);

View File

@@ -68,9 +68,9 @@ struct KeywordExt : public Keyword
/* Methods depending on the keyposition list. */ /* Methods depending on the keyposition list. */
/* Initializes selchars and selchars_length, without reordering. */ /* Initializes selchars and selchars_length, without reordering. */
void init_selchars_tuple (bool use_all_chars, const Positions& positions); void init_selchars_tuple (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify);
/* Initializes selchars and selchars_length, with reordering. */ /* Initializes selchars and selchars_length, with reordering. */
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc); void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc);
/* Deletes selchars. */ /* Deletes selchars. */
void delete_selchars (); void delete_selchars ();
@@ -81,7 +81,7 @@ struct KeywordExt : public Keyword
int _final_index; int _final_index;
private: private:
unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc); unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc);
}; };
/* An abstract factory for creating Keyword instances. /* An abstract factory for creating Keyword instances.

View File

@@ -106,6 +106,10 @@ Options::long_usage (FILE * stream) const
" is considered part of the type declaration. Key\n" " is considered part of the type declaration. Key\n"
" words and additional fields may follow this, one\n" " words and additional fields may follow this, one\n"
" group of fields per line.\n"); " group of fields per line.\n");
fprintf (stream,
" --ignore-case Consider upper and lower case ASCII characters as\n"
" equivalent. Note that locale dependent case mappings\n"
" are ignored.\n");
fprintf (stream, "\n"); fprintf (stream, "\n");
fprintf (stream, fprintf (stream,
"Language for the output code:\n"); "Language for the output code:\n");
@@ -463,6 +467,7 @@ Options::~Options ()
"\nENUM is........: %s" "\nENUM is........: %s"
"\nINCLUDE is.....: %s" "\nINCLUDE is.....: %s"
"\nSEVENBIT is....: %s" "\nSEVENBIT is....: %s"
"\nUPPERLOWER is..: %s"
"\nlookup function name = %s" "\nlookup function name = %s"
"\nhash function name = %s" "\nhash function name = %s"
"\nword list name = %s" "\nword list name = %s"
@@ -492,6 +497,7 @@ Options::~Options ()
_option_word & ENUM ? "enabled" : "disabled", _option_word & ENUM ? "enabled" : "disabled",
_option_word & INCLUDE ? "enabled" : "disabled", _option_word & INCLUDE ? "enabled" : "disabled",
_option_word & SEVENBIT ? "enabled" : "disabled", _option_word & SEVENBIT ? "enabled" : "disabled",
_option_word & UPPERLOWER ? "enabled" : "disabled",
_function_name, _hash_name, _wordlist_name, _slot_name, _function_name, _hash_name, _wordlist_name, _slot_name,
_initializer_suffix, _asso_iterations, _jump, _size_multiple, _initializer_suffix, _asso_iterations, _jump, _size_multiple,
_initial_asso_value, _delimiters, _total_switches); _initial_asso_value, _delimiters, _total_switches);
@@ -605,6 +611,7 @@ Options::set_delimiters (const char *delimiters)
static const struct option long_options[] = static const struct option long_options[] =
{ {
{ "output-file", required_argument, NULL, CHAR_MAX + 1 }, { "output-file", required_argument, NULL, CHAR_MAX + 1 },
{ "ignore-case", no_argument, NULL, CHAR_MAX + 2 },
{ "delimiters", required_argument, NULL, 'e' }, { "delimiters", required_argument, NULL, 'e' },
{ "struct-type", no_argument, NULL, 't' }, { "struct-type", no_argument, NULL, 't' },
{ "language", required_argument, NULL, 'L' }, { "language", required_argument, NULL, 'L' },
@@ -949,6 +956,11 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
_output_file_name = /*getopt*/optarg; _output_file_name = /*getopt*/optarg;
break; break;
} }
case CHAR_MAX + 2: /* Case insignificant. */
{
_option_word |= UPPERLOWER;
break;
}
default: default:
short_usage (stderr); short_usage (stderr);
exit (1); exit (1);

View File

@@ -98,7 +98,10 @@ enum Option_Type
SEVENBIT = 1 << 19, SEVENBIT = 1 << 19,
/* Optimize for position-independent code. */ /* Optimize for position-independent code. */
SHAREDLIB = 1 << 20 SHAREDLIB = 1 << 20,
/* Ignore case of ASCII characters. */
UPPERLOWER = 1 << 21
}; };
/* Class manager for gperf program Options. */ /* Class manager for gperf program Options. */

View File

@@ -232,6 +232,131 @@ Output::output_constants (struct Output_Constants& style) const
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
/* Writes to stdout the definition of gperf_case_strcmp, a replacement for
   strcmp that treats the ASCII letters 'A'..'Z' like 'a'..'z'.
   The emitted definition is wrapped in a GPERF_CASE_STRCMP preprocessor
   guard.  The parameter list is written in the dialect selected by the
   KRC / C / ANSIC / CPLUSPLUS options; if none of them is set, no parameter
   list is emitted at all.  */
static void
output_upperlower_strcmp ()
{
  /* Parameter declarations, in the syntax of the requested dialect.  */
  const char *parameters;
  if (option[KRC])
    parameters =
      "(s1, s2)\n"
      " register char *s1;\n"
      " register char *s2;\n";
  else if (option[C])
    parameters =
      "(s1, s2)\n"
      " register const char *s1;\n"
      " register const char *s2;\n";
  else if (option[ANSIC] | option[CPLUSPLUS])
    parameters =
      "(register const char *s1, register const char *s2)\n";
  else
    parameters = "";

  /* Guard, return type and function name.  */
  printf ("#ifndef GPERF_CASE_STRCMP\n"
          "#define GPERF_CASE_STRCMP 1\n"
          "static int\n"
          "gperf_case_strcmp ");

  /* None of the parameter strings contains a conversion specification, so
     printing through "%s" yields the same bytes.  */
  printf ("%s", parameters);

  /* Function body: fold both bytes to lower case, stop at the first
     difference or at the terminating NUL.  */
  printf ("{\n"
          " for (;;)\n"
          " {\n"
          " unsigned char c1 = *s1++;\n"
          " unsigned char c2 = *s2++;\n"
          " if (c1 >= 'A' && c1 <= 'Z')\n"
          " c1 += 'a' - 'A';\n"
          " if (c2 >= 'A' && c2 <= 'Z')\n"
          " c2 += 'a' - 'A';\n"
          " if (c1 != 0 && c1 == c2)\n"
          " continue;\n"
          " return (int)c1 - (int)c2;\n"
          " }\n"
          "}\n"
          "#endif\n\n");
}
/* Writes to stdout the definition of gperf_case_strncmp, a replacement for
   strncmp that treats the ASCII letters 'A'..'Z' like 'a'..'z'.
   The emitted definition is wrapped in a GPERF_CASE_STRNCMP preprocessor
   guard.  The parameter list is written in the dialect selected by the
   KRC / C / ANSIC / CPLUSPLUS options; if none of them is set, no parameter
   list is emitted at all.  */
static void
output_upperlower_strncmp ()
{
  /* Parameter declarations, in the syntax of the requested dialect.  */
  const char *parameters;
  if (option[KRC])
    parameters =
      "(s1, s2, n)\n"
      " register char *s1;\n"
      " register char *s2;\n"
      " register unsigned int n;\n";
  else if (option[C])
    parameters =
      "(s1, s2, n)\n"
      " register const char *s1;\n"
      " register const char *s2;\n"
      " register unsigned int n;\n";
  else if (option[ANSIC] | option[CPLUSPLUS])
    parameters =
      "(register const char *s1, register const char *s2, register unsigned int n)\n";
  else
    parameters = "";

  /* Guard, return type and function name.  */
  printf ("#ifndef GPERF_CASE_STRNCMP\n"
          "#define GPERF_CASE_STRNCMP 1\n"
          "static int\n"
          "gperf_case_strncmp ");

  /* None of the parameter strings contains a conversion specification, so
     printing through "%s" yields the same bytes.  */
  printf ("%s", parameters);

  /* Function body: compare at most n bytes after folding to lower case,
     stopping early at a difference or at a NUL.  */
  printf ("{\n"
          " for (; n > 0;)\n"
          " {\n"
          " unsigned char c1 = *s1++;\n"
          " unsigned char c2 = *s2++;\n"
          " if (c1 >= 'A' && c1 <= 'Z')\n"
          " c1 += 'a' - 'A';\n"
          " if (c2 >= 'A' && c2 <= 'Z')\n"
          " c2 += 'a' - 'A';\n"
          " if (c1 != 0 && c1 == c2)\n"
          " {\n"
          " n--;\n"
          " continue;\n"
          " }\n"
          " return (int)c1 - (int)c2;\n"
          " }\n"
          " return 0;\n"
          "}\n"
          "#endif\n\n");
}
/* Writes to stdout the definition of gperf_case_memcmp, a replacement for
   memcmp that treats the ASCII letters 'A'..'Z' like 'a'..'z'.
   The emitted definition is wrapped in a GPERF_CASE_MEMCMP preprocessor
   guard.  The parameter list is written in the dialect selected by the
   KRC / C / ANSIC / CPLUSPLUS options; if none of them is set, no parameter
   list is emitted at all.  */
static void
output_upperlower_memcmp ()
{
  /* Parameter declarations, in the syntax of the requested dialect.  */
  const char *parameters;
  if (option[KRC])
    parameters =
      "(s1, s2, n)\n"
      " register char *s1;\n"
      " register char *s2;\n"
      " register unsigned int n;\n";
  else if (option[C])
    parameters =
      "(s1, s2, n)\n"
      " register const char *s1;\n"
      " register const char *s2;\n"
      " register unsigned int n;\n";
  else if (option[ANSIC] | option[CPLUSPLUS])
    parameters =
      "(register const char *s1, register const char *s2, register unsigned int n)\n";
  else
    parameters = "";

  /* Guard, return type and function name.  */
  printf ("#ifndef GPERF_CASE_MEMCMP\n"
          "#define GPERF_CASE_MEMCMP 1\n"
          "static int\n"
          "gperf_case_memcmp ");

  /* None of the parameter strings contains a conversion specification, so
     printing through "%s" yields the same bytes.  */
  printf ("%s", parameters);

  /* Function body: compare exactly n bytes after folding to lower case.
     Unlike the str*cmp variants, a NUL byte does not end the comparison.  */
  printf ("{\n"
          " for (; n > 0;)\n"
          " {\n"
          " unsigned char c1 = *s1++;\n"
          " unsigned char c2 = *s2++;\n"
          " if (c1 >= 'A' && c1 <= 'Z')\n"
          " c1 += 'a' - 'A';\n"
          " if (c2 >= 'A' && c2 <= 'Z')\n"
          " c2 += 'a' - 'A';\n"
          " if (c1 == c2)\n"
          " {\n"
          " n--;\n"
          " continue;\n"
          " }\n"
          " return (int)c1 - (int)c2;\n"
          " }\n"
          " return 0;\n"
          "}\n"
          "#endif\n\n");
}
/* ------------------------------------------------------------------------- */
/* Outputs a keyword, as a string: enclosed in double quotes, escaping /* Outputs a keyword, as a string: enclosed in double quotes, escaping
backslashes, double quote and unprintable characters. */ backslashes, double quote and unprintable characters. */
@@ -363,7 +488,10 @@ void Output_Compare_Strcmp::output_comparison (const Output_Expr& expr1,
expr1.output_expr (); expr1.output_expr ();
printf (" == *"); printf (" == *");
expr2.output_expr (); expr2.output_expr ();
printf (" && !strcmp ("); printf (" && !");
if (option[UPPERLOWER])
printf ("gperf_case_");
printf ("strcmp (");
expr1.output_expr (); expr1.output_expr ();
printf (" + 1, "); printf (" + 1, ");
expr2.output_expr (); expr2.output_expr ();
@@ -389,7 +517,10 @@ void Output_Compare_Strncmp::output_comparison (const Output_Expr& expr1,
expr1.output_expr (); expr1.output_expr ();
printf (" == *"); printf (" == *");
expr2.output_expr (); expr2.output_expr ();
printf (" && !strncmp ("); printf (" && !");
if (option[UPPERLOWER])
printf ("gperf_case_");
printf ("strncmp (");
expr1.output_expr (); expr1.output_expr ();
printf (" + 1, "); printf (" + 1, ");
expr2.output_expr (); expr2.output_expr ();
@@ -418,7 +549,10 @@ void Output_Compare_Memcmp::output_comparison (const Output_Expr& expr1,
expr1.output_expr (); expr1.output_expr ();
printf (" == *"); printf (" == *");
expr2.output_expr (); expr2.output_expr ();
printf (" && !memcmp ("); printf (" && !");
if (option[UPPERLOWER])
printf ("gperf_case_");
printf ("memcmp (");
expr1.output_expr (); expr1.output_expr ();
printf (" + 1, "); printf (" + 1, ");
expr2.output_expr (); expr2.output_expr ();
@@ -1522,6 +1656,19 @@ Output::output ()
printf ("/* maximum key range = %d, duplicates = %d */\n\n", printf ("/* maximum key range = %d, duplicates = %d */\n\n",
_max_hash_value - _min_hash_value + 1, _total_duplicates); _max_hash_value - _min_hash_value + 1, _total_duplicates);
if (option[UPPERLOWER])
{
if (option[LENTABLE])
output_upperlower_memcmp ();
else
{
if (option[COMP])
output_upperlower_strncmp ();
else
output_upperlower_strcmp ();
}
}
if (option[CPLUSPLUS]) if (option[CPLUSPLUS])
printf ("class %s\n" printf ("class %s\n"
"{\n" "{\n"

View File

@@ -146,12 +146,39 @@ Search::preprepare ()
/* ====================== Finding good byte positions ====================== */ /* ====================== Finding good byte positions ====================== */
/* Returns the upper bound on the indices passed to asso_values[] when no
   alpha increments are in effect: 128 if option[SEVENBIT] is set,
   256 otherwise.  */
unsigned int
Search::compute_alpha_size () const
{
  if (option[SEVENBIT])
    return 128;
  return 256;
}
/* Builds the unification table between different asso_values[c], assuming
   no alpha increments.
   Returns NULL (meaning: identity) unless option[UPPERLOWER] is set.
   Otherwise returns a freshly allocated array of compute_alpha_size ()
   entries that maps 'A'..'Z' to 'a'..'z' and every other index to itself.
   The array is allocated with new[].  */
unsigned int *
Search::compute_alpha_unify () const
{
  /* Case matters: nothing to unify.  */
  if (!option[UPPERLOWER])
    return NULL;

  const unsigned int size = compute_alpha_size ();
  unsigned int *unify = new unsigned int[size];

  /* Start from the identity mapping.  */
  for (unsigned int ch = 0; ch < size; ch++)
    unify[ch] = ch;

  /* Redirect every ASCII upper case letter to its lower case twin.  */
  for (unsigned int offset = 0; offset <= 'Z' - 'A'; offset++)
    unify['A' + offset] = 'a' + offset;

  return unify;
}
/* Initializes each keyword's _selchars array. */ /* Initializes each keyword's _selchars array. */
void void
Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const
{ {
for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars_tuple(use_all_chars, positions); temp->first()->init_selchars_tuple(use_all_chars, positions, _alpha_unify);
} }
/* Deletes each keyword's _selchars array. */ /* Deletes each keyword's _selchars array. */
@@ -202,6 +229,9 @@ Search::find_positions ()
return; return;
} }
/* Compute preliminary value for _alpha_unify. */
_alpha_unify = compute_alpha_unify ();
/* 1. Find positions that must occur in order to distinguish duplicates. */ /* 1. Find positions that must occur in order to distinguish duplicates. */
Positions mandatory; Positions mandatory;
@@ -222,17 +252,42 @@ Search::find_positions ()
int n = keyword1->_allchars_length; int n = keyword1->_allchars_length;
int i; int i;
for (i = 1; i < n; i++) for (i = 1; i < n; i++)
if (keyword1->_allchars[i-1] != keyword2->_allchars[i-1])
break;
if (i < n
&& memcmp (&keyword1->_allchars[i],
&keyword2->_allchars[i],
n - i)
== 0)
{ {
/* Position i is mandatory. */ unsigned char c1 = keyword1->_allchars[i-1];
if (!mandatory.contains (i)) unsigned char c2 = keyword2->_allchars[i-1];
mandatory.add (i); if (option[UPPERLOWER])
{
if (c1 >= 'A' && c1 <= 'Z')
c1 += 'a' - 'A';
if (c2 >= 'A' && c2 <= 'Z')
c2 += 'a' - 'A';
}
if (c1 != c2)
break;
}
if (i < n)
{
int j;
for (j = i + 1; j <= n; j++)
{
unsigned char c1 = keyword1->_allchars[j-1];
unsigned char c2 = keyword2->_allchars[j-1];
if (option[UPPERLOWER])
{
if (c1 >= 'A' && c1 <= 'Z')
c1 += 'a' - 'A';
if (c2 >= 'A' && c2 <= 'Z')
c2 += 'a' - 'A';
}
if (c1 != c2)
break;
}
if (j > n)
{
/* Position i is mandatory. */
if (!mandatory.contains (i))
mandatory.add (i);
}
} }
} }
} }
@@ -379,16 +434,113 @@ Search::find_positions ()
} }
fprintf (stderr, "\n"); fprintf (stderr, "\n");
} }
/* Free preliminary value for _alpha_unify. */
delete[] _alpha_unify;
} }
/* ===================== Finding good alpha increments ===================== */ /* ===================== Finding good alpha increments ===================== */
/* Returns the upper bound on the indices passed to asso_values[] for the
   given increments: the base alphabet size (128 if option[SEVENBIT] is set,
   256 otherwise) plus the largest of the first _max_key_len entries of
   ALPHA_INC.  */
unsigned int
Search::compute_alpha_size (const unsigned int *alpha_inc) const
{
  /* Find the largest increment over all key positions.  */
  unsigned int largest_inc = 0;
  for (int pos = 0; pos < _max_key_len; pos++)
    {
      const unsigned int inc = alpha_inc[pos];
      if (inc > largest_inc)
        largest_inc = inc;
    }

  const unsigned int base_size = (option[SEVENBIT] ? 128 : 256);
  return base_size + largest_inc;
}
/* Computes the unification rules between different asso_values[c]. */
unsigned int *
Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
{
if (option[UPPERLOWER])
{
/* Without alpha increments, we would simply unify
'A' -> 'a', ..., 'Z' -> 'z'.
But when a keyword contains at position i a character c,
we have the constraint
asso_values[tolower(c) + alpha_inc[i]] ==
asso_values[toupper(c) + alpha_inc[i]].
This introduces a unification
toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
Note that this unification can extend outside the range of
ASCII letters! But still every unified character pair is at
a distance of 'a'-'A' = 32, or (after chained unification)
at a multiple of 32. So in the end the alpha_unify vector has
the form c -> c + 32 * f(c) where f(c) is a nonnegative
integer. */
unsigned int alpha_size = compute_alpha_size (alpha_inc);
unsigned int *alpha_unify = new unsigned int[alpha_size];
for (unsigned int c = 0; c < alpha_size; c++)
alpha_unify[c] = c;
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (option[ALLCHARS])
/* Iterate through all character positions. */
for (int i = 0; i < keyword->_allchars_length; i++)
{
unsigned int c = static_cast<unsigned char>(keyword->_allchars[i]);
if (c >= 'A' && c <= 'Z')
c += 'a' - 'A';
if (c >= 'a' && c <= 'z')
{
c += alpha_inc[i];
/* Unify c with c - ('a'-'A'). */
unsigned int d = alpha_unify[c];
unsigned int b = c - ('a'-'A');
for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
alpha_unify[a] = d;
}
}
else
{
/* Iterate through the selected character positions. */
PositionIterator iter (positions);
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
{
unsigned int c;
if (i == Positions::LASTCHAR)
c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
else if (i <= keyword->_allchars_length)
c = static_cast<unsigned char>(keyword->_allchars[i - 1]);
else
continue;
if (c >= 'A' && c <= 'Z')
c += 'a' - 'A';
if (c >= 'a' && c <= 'z')
{
if (i != Positions::LASTCHAR)
c += alpha_inc[i - 1];
/* Unify c with c - ('a'-'A'). */
unsigned int d = alpha_unify[c];
unsigned int b = c - ('a'-'A');
for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
alpha_unify[a] = d;
}
}
}
}
return alpha_unify;
}
else
/* Identity mapping. */
return NULL;
}
/* Initializes each keyword's _selchars array. */ /* Initializes each keyword's _selchars array. */
void void
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
{ {
for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc); temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_unify, alpha_inc);
} }
/* Count the duplicate keywords that occur with the given set of positions /* Count the duplicate keywords that occur with the given set of positions
@@ -402,7 +554,9 @@ Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
/* Run through the keyword list and count the duplicates incrementally. /* Run through the keyword list and count the duplicates incrementally.
The result does not depend on the order of the keyword list, thanks to The result does not depend on the order of the keyword list, thanks to
the formula above. */ the formula above. */
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc); init_selchars_multiset (option[ALLCHARS], _key_positions,
compute_alpha_unify (_key_positions, alpha_inc),
alpha_inc);
unsigned int count = 0; unsigned int count = 0;
{ {
@@ -428,7 +582,9 @@ Search::find_alpha_inc ()
/* The goal is to choose _alpha_inc[] such that it doesn't introduce /* The goal is to choose _alpha_inc[] such that it doesn't introduce
artificial duplicates. artificial duplicates.
In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */ In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */
_alpha_unify = compute_alpha_unify ();
unsigned int duplicates_goal = count_duplicates_tuple (_key_positions); unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);
delete[] _alpha_unify;
/* Start with zero increments. This is sufficient in most cases. */ /* Start with zero increments. This is sufficient in most cases. */
unsigned int *current = new unsigned int [_max_key_len]; unsigned int *current = new unsigned int [_max_key_len];
@@ -545,6 +701,8 @@ Search::find_alpha_inc ()
} }
_alpha_inc = current; _alpha_inc = current;
_alpha_size = compute_alpha_size (_alpha_inc);
_alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
} }
/* ======================= Finding good asso_values ======================== */ /* ======================= Finding good asso_values ======================== */
@@ -555,7 +713,8 @@ Search::prepare ()
KeywordExt_List *temp; KeywordExt_List *temp;
/* Initialize each keyword's _selchars array. */ /* Initialize each keyword's _selchars array. */
init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc); init_selchars_multiset(option[ALLCHARS], _key_positions,
_alpha_unify, _alpha_inc);
/* Check for duplicates, i.e. keywords with the same _selchars array /* Check for duplicates, i.e. keywords with the same _selchars array
(and - if !option[NOLENGTH] - also the same length). (and - if !option[NOLENGTH] - also the same length).
@@ -634,14 +793,6 @@ Search::prepare ()
} }
} }
/* Compute _alpha_size, the upper bound on the indices passed to
asso_values[]. */
unsigned int max_alpha_inc = 0;
for (int i = 0; i < _max_key_len; i++)
if (max_alpha_inc < _alpha_inc[i])
max_alpha_inc = _alpha_inc[i];
_alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
/* Compute the occurrences of each character in the alphabet. */ /* Compute the occurrences of each character in the alphabet. */
_occurrences = new int[_alpha_size]; _occurrences = new int[_alpha_size];
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0])); memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
@@ -1492,6 +1643,12 @@ Search::optimize ()
for (unsigned int c = 0; c < _alpha_size; c++) for (unsigned int c = 0; c < _alpha_size; c++)
if (_occurrences[c] == 0) if (_occurrences[c] == 0)
_asso_values[c] = max_hash_value + 1; _asso_values[c] = max_hash_value + 1;
/* Propagate unified asso_values. */
if (_alpha_unify)
for (unsigned int c = 0; c < _alpha_size; c++)
if (_alpha_unify[c] != c)
_asso_values[c] = _asso_values[_alpha_unify[c]];
} }
/* Prints out some diagnostics upon completion. */ /* Prints out some diagnostics upon completion. */
@@ -1533,5 +1690,6 @@ Search::~Search ()
} }
delete[] _asso_values; delete[] _asso_values;
delete[] _occurrences; delete[] _occurrences;
delete[] _alpha_unify;
delete[] _alpha_inc; delete[] _alpha_inc;
} }

View File

@@ -41,6 +41,14 @@ public:
private: private:
void preprepare (); void preprepare ();
/* Computes the upper bound on the indices passed to asso_values[],
assuming no alpha_increments. */
unsigned int compute_alpha_size () const;
/* Computes the unification rules between different asso_values[c],
assuming no alpha_increments. */
unsigned int * compute_alpha_unify () const;
/* Initializes each keyword's _selchars array. */ /* Initializes each keyword's _selchars array. */
void init_selchars_tuple (bool use_all_chars, const Positions& positions) const; void init_selchars_tuple (bool use_all_chars, const Positions& positions) const;
/* Deletes each keyword's _selchars array. */ /* Deletes each keyword's _selchars array. */
@@ -52,8 +60,14 @@ private:
/* Find good key positions. */ /* Find good key positions. */
void find_positions (); void find_positions ();
/* Computes the upper bound on the indices passed to asso_values[]. */
unsigned int compute_alpha_size (const unsigned int *alpha_inc) const;
/* Computes the unification rules between different asso_values[c]. */
unsigned int * compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const;
/* Initializes each keyword's _selchars array. */ /* Initializes each keyword's _selchars array. */
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const; void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const;
/* Count the duplicate keywords that occur with the given set of positions /* Count the duplicate keywords that occur with the given set of positions
and a given alpha_inc[] array. */ and a given alpha_inc[] array. */
@@ -115,13 +129,17 @@ public:
/* Adjustments to add to bytes at specific key positions. */ /* Adjustments to add to bytes at specific key positions. */
unsigned int * _alpha_inc; unsigned int * _alpha_inc;
/* Size of alphabet. */
unsigned int _alpha_size;
/* Alphabet character unification, either the identity or a mapping from
upper case characters to lower case characters (and maybe more). */
unsigned int * _alpha_unify;
/* Total number of duplicates that have been moved to _duplicate_link lists /* Total number of duplicates that have been moved to _duplicate_link lists
(not counting their representatives which stay on the main list). */ (not counting their representatives which stay on the main list). */
int _total_duplicates; int _total_duplicates;
/* Size of alphabet. */
unsigned int _alpha_size;
/* Counts occurrences of each key set character. /* Counts occurrences of each key set character.
_occurrences[c] is the number of times that c occurs among the _selchars _occurrences[c] is the number of times that c occurs among the _selchars
of a keyword. */ of a keyword. */

View File

@@ -143,6 +143,8 @@ check-test:
diff $(srcdir)/permut2.exp permut2.out diff $(srcdir)/permut2.exp permut2.out
$(GPERF) -m5 < $(srcdir)/permut3.gperf > permut3.out $(GPERF) -m5 < $(srcdir)/permut3.gperf > permut3.out
diff $(srcdir)/permut3.exp permut3.out diff $(srcdir)/permut3.exp permut3.out
$(GPERF) -m5 --ignore-case < $(srcdir)/permutc2.gperf > permutc2.out
diff $(srcdir)/permutc2.exp permutc2.out
$(GPERF) -C -E -G -I -t < $(srcdir)/charsets.gperf > charsets.out $(GPERF) -C -E -G -I -t < $(srcdir)/charsets.gperf > charsets.out
diff $(srcdir)/charsets.exp charsets.out diff $(srcdir)/charsets.exp charsets.out
$(GPERF) -C -E -G -I -t < $(srcdir)/languages.gperf > languages.out $(GPERF) -C -E -G -I -t < $(srcdir)/languages.gperf > languages.out

116
tests/permutc2.exp Normal file
View File

@@ -0,0 +1,116 @@
/* C code produced by gperf version 2.7.2 */
/* Command-line: ../src/gperf -m5 --ignore-case */
/* Computed positions: -k'1-2' */
/* Test of a hash function which has to deal with permutation and
case-independence. Without case-independence, the alpha_inc is 1.
With case-independence, the alpha_inc is 3. */
#define TOTAL_KEYWORDS 8
#define MIN_WORD_LENGTH 2
#define MAX_WORD_LENGTH 2
#define MIN_HASH_VALUE 2
#define MAX_HASH_VALUE 9
/* maximum key range = 8, duplicates = 0 */
#ifndef GPERF_CASE_STRCMP
#define GPERF_CASE_STRCMP 1
static int
gperf_case_strcmp (s1, s2)
register const char *s1;
register const char *s2;
{
for (;;)
{
unsigned char c1 = *s1++;
unsigned char c2 = *s2++;
if (c1 >= 'A' && c1 <= 'Z')
c1 += 'a' - 'A';
if (c2 >= 'A' && c2 <= 'Z')
c2 += 'a' - 'A';
if (c1 != 0 && c1 == c2)
continue;
return (int)c1 - (int)c2;
}
}
#endif
#ifdef __GNUC__
__inline
#else
#ifdef __cplusplus
inline
#endif
#endif
static unsigned int
hash (str, len)
register const char *str;
register unsigned int len;
{
static unsigned char asso_values[] =
{
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10, 1,10,10, 3,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10, 3, 1,
0, 7, 1, 0, 3,10,10, 1,10,10,
3,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
3, 1, 0, 0, 1, 0, 2,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10,10,
10,10,10,10,10,10,10,10,10
};
return len + asso_values[(unsigned char)str[1]+3] + asso_values[(unsigned char)str[0]];
}
#ifdef __GNUC__
__inline
#endif
const char *
in_word_set (str, len)
register const char *str;
register unsigned int len;
{
static const char * wordlist[] =
{
"", "",
"{w",
"az",
"ay",
"za",
"ya",
"x{",
"x[",
"[w"
};
if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
{
register int key = hash (str, len);
if (key <= MAX_HASH_VALUE && key >= 0)
{
register const char *s = wordlist[key];
if (*str == *s && !gperf_case_strcmp (str + 1, s + 1))
return s;
}
}
return 0;
}

14
tests/permutc2.gperf Normal file
View File

@@ -0,0 +1,14 @@
%{
/* Test of a hash function which has to deal with permutation and
case-independence. Without case-independence, the alpha_inc is 1.
With case-independence, the alpha_inc is 3. */
%}
%%
az
za
ay
ya
x{
x[
{w
[w

View File

@@ -20,6 +20,9 @@ Input file interpretation:
is considered part of the type declaration. Key is considered part of the type declaration. Key
words and additional fields may follow this, one words and additional fields may follow this, one
group of fields per line. group of fields per line.
--ignore-case Consider upper and lower case ASCII characters as
equivalent. Note that locale dependent case mappings
are ignored.
Language for the output code: Language for the output code:
-L, --language=LANGUAGE-NAME -L, --language=LANGUAGE-NAME