mirror of
https://git.savannah.gnu.org/git/gperf.git
synced 2025-12-02 13:09:22 +00:00
New option --ignore-case.
This commit is contained in:
44
ChangeLog
44
ChangeLog
@@ -1,5 +1,49 @@
|
||||
2002-12-10 Bruno Haible <bruno@clisp.org>
|
||||
|
||||
* src/options.h (UPPERLOWER): New enum value.
|
||||
* src/options.cc (Options::long_usage): Document option --ignore-case.
|
||||
(Options::~Options): Update.
|
||||
(long_options): Add option --ignore-case.
|
||||
(Options::parse_options): Handle option --ignore-case.
|
||||
* src/input.cc (Input::read_input): Recognize option %ignore-case.
|
||||
* src/keyword.h (KeywordExt::init_selchars_tuple,
|
||||
KeywordExt::init_selchars_multiset, KeywordExt::init_selchars_low):
|
||||
Add alpha_unify argument.
|
||||
* src/keyword.cc (KeywordExt::init_selchars_low): Add alpha_unify
|
||||
argument.
|
||||
(KeywordExt::init_selchars_tuple): Add alpha_unify argument.
|
||||
(KeywordExt::init_selchars_multiset): Add alpha_unify argument.
|
||||
* src/search.h (Search::compute_alpha_size,
|
||||
Search::compute_alpha_unify): New declarations.
|
||||
(Search::init_selchars_multiset): Add alpha_unify argument.
|
||||
(Search::_alpha_unify): New field.
|
||||
* src/search.cc (Search::compute_alpha_size,
|
||||
Search::compute_alpha_unify): New functions.
|
||||
(Search::init_selchars_tuple): Update.
|
||||
(Search::find_positions): Temporarily set _alpha_unify. Perform a
|
||||
case insensitive comparison if needed.
|
||||
(Search::init_selchars_multiset): Add alpha_unify argument.
|
||||
(Search::count_duplicates_multiset): Call compute_alpha_unify.
|
||||
(Search::find_alpha_inc): Temporarily set _alpha_unify. At the end,
|
||||
set _alpha_size and _alpha_unify.
|
||||
(Search::prepare): Update. Don't compute _alpga_size here.
|
||||
(Search::optimize): Propagate unified asso_values.
|
||||
(Search::~Search) Delete _alpha_unify.
|
||||
* src/output.cc (output_upperlower_strcmp, output_upperlower_strncmp,
|
||||
output_upperlower_memcmp): New functions.
|
||||
(Output_Compare_Strcmp::output_comparison,
|
||||
Output_Compare_Strncmp::output_comparison,
|
||||
Output_Compare_Memcmp::output_comparison): Use the case-insensitive
|
||||
comparison function if --ignore-case was given.
|
||||
(Output::output): Emit the auxiliary case-insensitive comparison
|
||||
function if needed.
|
||||
* tests/permutc2.gperf, tests/permutc2.exp: New files.
|
||||
* tests/Makefile.in (check-test): Also check permutc2.gperf.
|
||||
* tests/test-6.exp: Update.
|
||||
* doc/gperf.texi (Gperf Declarations): Document %ignore-case.
|
||||
(Input Details): Document option --ignore-case.
|
||||
* NEWS: Update.
|
||||
|
||||
* src/search.cc (Search::optimize): Fill unused asso_values[] entries
|
||||
with a large value.
|
||||
* src/output.h (Output::Output): Remove occurrences argument.
|
||||
|
||||
2
NEWS
2
NEWS
@@ -10,6 +10,7 @@ New in 2.97:
|
||||
* The following options can now be specified inside the input file:
|
||||
%delimiters=DELIMITER-LIST
|
||||
%struct-type
|
||||
%ignore-case
|
||||
%language=LANGUAGE-NAME
|
||||
%define slot-name NAME
|
||||
%define hash-function-name NAME
|
||||
@@ -42,6 +43,7 @@ New in 2.97:
|
||||
* The options -f/--fast and -o/--occurrence-sort have no effect any more.
|
||||
* Added option -P/--pic that optimizes the generated code for use in shared
|
||||
libraries.
|
||||
* Added option --ignore-case that produces a case independent lookup function.
|
||||
* Bug fixes.
|
||||
|
||||
New in 2.7.2:
|
||||
|
||||
@@ -436,6 +436,12 @@ commas or newlines.
|
||||
Allows you to include a @code{struct} type declaration for generated
|
||||
code; see above for an example.
|
||||
|
||||
@item %ignore-case
|
||||
@cindex @samp{%ignore-case}
|
||||
Consider upper and lower case ASCII characters as equivalent. The string
|
||||
comparison will use a case insignificant character comparison. Note that
|
||||
locale dependent case mappings are ignored.
|
||||
|
||||
@item %language=@var{language-name}
|
||||
@cindex @samp{%language}
|
||||
Instructs @code{gperf} to generate code in the language specified by the
|
||||
@@ -790,6 +796,17 @@ part of the type declaration. Keywords and additional fields may follow
|
||||
this, one group of fields per line. A set of examples for generating
|
||||
perfect hash tables and functions for Ada, C, C++, Pascal, Modula 2,
|
||||
Modula 3 and JavaScript reserved words are distributed with this release.
|
||||
|
||||
@item --ignore-case
|
||||
Consider upper and lower case ASCII characters as equivalent. The string
|
||||
comparison will use a case insignificant character comparison. Note that
|
||||
locale dependent case mappings are ignored. This option is therefore not
|
||||
suitable if a properly internationalized or locale aware case mapping
|
||||
should be used. (For example, in a Turkish locale, the upper case equivalent
|
||||
of the lowercase ASCII letter @samp{i} is the non-ASCII character
|
||||
@samp{capital i with dot above}.) For this case, it is better to apply
|
||||
an uppercase or lowercase conversion on the string before passing it to
|
||||
the @code{gperf} generated function.
|
||||
@end table
|
||||
|
||||
@node Output Language, Output Details, Input Details, Options
|
||||
|
||||
@@ -472,6 +472,10 @@ Input::read_input ()
|
||||
option.set (TYPE);
|
||||
else
|
||||
|
||||
if (is_declaration (line, line_end, lineno, "ignore-case"))
|
||||
option.set (UPPERLOWER);
|
||||
else
|
||||
|
||||
if (is_declaration_with_arg (line, line_end, lineno,
|
||||
"language", &arg))
|
||||
option.set_language (arg);
|
||||
|
||||
@@ -59,7 +59,7 @@ static inline void sort_char_set (unsigned int *base, int len)
|
||||
*/
|
||||
|
||||
unsigned int *
|
||||
KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
|
||||
KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc)
|
||||
{
|
||||
const char *k = _allchars;
|
||||
unsigned int *key_set =
|
||||
@@ -73,6 +73,8 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c
|
||||
unsigned int c = static_cast<unsigned char>(*k);
|
||||
if (alpha_inc)
|
||||
c += alpha_inc[k-_allchars];
|
||||
if (alpha_unify)
|
||||
c = alpha_unify[c];
|
||||
*ptr = c;
|
||||
ptr++;
|
||||
}
|
||||
@@ -99,6 +101,8 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c
|
||||
else
|
||||
/* Out of range of KEY length, so we'll just skip it. */
|
||||
continue;
|
||||
if (alpha_unify)
|
||||
c = alpha_unify[c];
|
||||
*ptr = c;
|
||||
ptr++;
|
||||
}
|
||||
@@ -111,16 +115,16 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c
|
||||
}
|
||||
|
||||
void
|
||||
KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions)
|
||||
KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify)
|
||||
{
|
||||
init_selchars_low (use_all_chars, positions, NULL);
|
||||
init_selchars_low (use_all_chars, positions, alpha_unify, NULL);
|
||||
}
|
||||
|
||||
void
|
||||
KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
|
||||
KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc)
|
||||
{
|
||||
unsigned int *selchars =
|
||||
init_selchars_low (use_all_chars, positions, alpha_inc);
|
||||
init_selchars_low (use_all_chars, positions, alpha_unify, alpha_inc);
|
||||
|
||||
/* Sort the selchars elements alphabetically. */
|
||||
sort_char_set (selchars, _selchars_length);
|
||||
|
||||
@@ -68,9 +68,9 @@ struct KeywordExt : public Keyword
|
||||
|
||||
/* Methods depending on the keyposition list. */
|
||||
/* Initializes selchars and selchars_length, without reordering. */
|
||||
void init_selchars_tuple (bool use_all_chars, const Positions& positions);
|
||||
void init_selchars_tuple (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify);
|
||||
/* Initializes selchars and selchars_length, with reordering. */
|
||||
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
|
||||
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc);
|
||||
/* Deletes selchars. */
|
||||
void delete_selchars ();
|
||||
|
||||
@@ -81,7 +81,7 @@ struct KeywordExt : public Keyword
|
||||
int _final_index;
|
||||
|
||||
private:
|
||||
unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
|
||||
unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc);
|
||||
};
|
||||
|
||||
/* An abstract factory for creating Keyword instances.
|
||||
|
||||
@@ -106,6 +106,10 @@ Options::long_usage (FILE * stream) const
|
||||
" is considered part of the type declaration. Key\n"
|
||||
" words and additional fields may follow this, one\n"
|
||||
" group of fields per line.\n");
|
||||
fprintf (stream,
|
||||
" --ignore-case Consider upper and lower case ASCII characters as\n"
|
||||
" equivalent. Note that locale dependent case mappings\n"
|
||||
" are ignored.\n");
|
||||
fprintf (stream, "\n");
|
||||
fprintf (stream,
|
||||
"Language for the output code:\n");
|
||||
@@ -463,6 +467,7 @@ Options::~Options ()
|
||||
"\nENUM is........: %s"
|
||||
"\nINCLUDE is.....: %s"
|
||||
"\nSEVENBIT is....: %s"
|
||||
"\nUPPERLOWER is..: %s"
|
||||
"\nlookup function name = %s"
|
||||
"\nhash function name = %s"
|
||||
"\nword list name = %s"
|
||||
@@ -492,6 +497,7 @@ Options::~Options ()
|
||||
_option_word & ENUM ? "enabled" : "disabled",
|
||||
_option_word & INCLUDE ? "enabled" : "disabled",
|
||||
_option_word & SEVENBIT ? "enabled" : "disabled",
|
||||
_option_word & UPPERLOWER ? "enabled" : "disabled",
|
||||
_function_name, _hash_name, _wordlist_name, _slot_name,
|
||||
_initializer_suffix, _asso_iterations, _jump, _size_multiple,
|
||||
_initial_asso_value, _delimiters, _total_switches);
|
||||
@@ -605,6 +611,7 @@ Options::set_delimiters (const char *delimiters)
|
||||
static const struct option long_options[] =
|
||||
{
|
||||
{ "output-file", required_argument, NULL, CHAR_MAX + 1 },
|
||||
{ "ignore-case", no_argument, NULL, CHAR_MAX + 2 },
|
||||
{ "delimiters", required_argument, NULL, 'e' },
|
||||
{ "struct-type", no_argument, NULL, 't' },
|
||||
{ "language", required_argument, NULL, 'L' },
|
||||
@@ -949,6 +956,11 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
|
||||
_output_file_name = /*getopt*/optarg;
|
||||
break;
|
||||
}
|
||||
case CHAR_MAX + 2: /* Case insignificant. */
|
||||
{
|
||||
_option_word |= UPPERLOWER;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
short_usage (stderr);
|
||||
exit (1);
|
||||
|
||||
@@ -98,7 +98,10 @@ enum Option_Type
|
||||
SEVENBIT = 1 << 19,
|
||||
|
||||
/* Optimize for position-independent code. */
|
||||
SHAREDLIB = 1 << 20
|
||||
SHAREDLIB = 1 << 20,
|
||||
|
||||
/* Ignore case of ASCII characters. */
|
||||
UPPERLOWER = 1 << 21
|
||||
};
|
||||
|
||||
/* Class manager for gperf program Options. */
|
||||
|
||||
153
src/output.cc
153
src/output.cc
@@ -232,6 +232,131 @@ Output::output_constants (struct Output_Constants& style) const
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
/* Output gperf's ASCII-case insensitive strcmp replacement. */
|
||||
|
||||
static void
|
||||
output_upperlower_strcmp ()
|
||||
{
|
||||
printf ("#ifndef GPERF_CASE_STRCMP\n"
|
||||
"#define GPERF_CASE_STRCMP 1\n"
|
||||
"static int\n"
|
||||
"gperf_case_strcmp ");
|
||||
printf (option[KRC] ?
|
||||
"(s1, s2)\n"
|
||||
" register char *s1;\n"
|
||||
" register char *s2;\n" :
|
||||
option[C] ?
|
||||
"(s1, s2)\n"
|
||||
" register const char *s1;\n"
|
||||
" register const char *s2;\n" :
|
||||
option[ANSIC] | option[CPLUSPLUS] ?
|
||||
"(register const char *s1, register const char *s2)\n" :
|
||||
"");
|
||||
printf ("{\n"
|
||||
" for (;;)\n"
|
||||
" {\n"
|
||||
" unsigned char c1 = *s1++;\n"
|
||||
" unsigned char c2 = *s2++;\n"
|
||||
" if (c1 >= 'A' && c1 <= 'Z')\n"
|
||||
" c1 += 'a' - 'A';\n"
|
||||
" if (c2 >= 'A' && c2 <= 'Z')\n"
|
||||
" c2 += 'a' - 'A';\n"
|
||||
" if (c1 != 0 && c1 == c2)\n"
|
||||
" continue;\n"
|
||||
" return (int)c1 - (int)c2;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"#endif\n\n");
|
||||
}
|
||||
|
||||
/* Output gperf's ASCII-case insensitive strncmp replacement. */
|
||||
|
||||
static void
|
||||
output_upperlower_strncmp ()
|
||||
{
|
||||
printf ("#ifndef GPERF_CASE_STRNCMP\n"
|
||||
"#define GPERF_CASE_STRNCMP 1\n"
|
||||
"static int\n"
|
||||
"gperf_case_strncmp ");
|
||||
printf (option[KRC] ?
|
||||
"(s1, s2, n)\n"
|
||||
" register char *s1;\n"
|
||||
" register char *s2;\n"
|
||||
" register unsigned int n;\n" :
|
||||
option[C] ?
|
||||
"(s1, s2, n)\n"
|
||||
" register const char *s1;\n"
|
||||
" register const char *s2;\n"
|
||||
" register unsigned int n;\n" :
|
||||
option[ANSIC] | option[CPLUSPLUS] ?
|
||||
"(register const char *s1, register const char *s2, register unsigned int n)\n" :
|
||||
"");
|
||||
printf ("{\n"
|
||||
" for (; n > 0;)\n"
|
||||
" {\n"
|
||||
" unsigned char c1 = *s1++;\n"
|
||||
" unsigned char c2 = *s2++;\n"
|
||||
" if (c1 >= 'A' && c1 <= 'Z')\n"
|
||||
" c1 += 'a' - 'A';\n"
|
||||
" if (c2 >= 'A' && c2 <= 'Z')\n"
|
||||
" c2 += 'a' - 'A';\n"
|
||||
" if (c1 != 0 && c1 == c2)\n"
|
||||
" {\n"
|
||||
" n--;\n"
|
||||
" continue;\n"
|
||||
" }\n"
|
||||
" return (int)c1 - (int)c2;\n"
|
||||
" }\n"
|
||||
" return 0;\n"
|
||||
"}\n"
|
||||
"#endif\n\n");
|
||||
}
|
||||
|
||||
/* Output gperf's ASCII-case insensitive memcmp replacement. */
|
||||
|
||||
static void
|
||||
output_upperlower_memcmp ()
|
||||
{
|
||||
printf ("#ifndef GPERF_CASE_MEMCMP\n"
|
||||
"#define GPERF_CASE_MEMCMP 1\n"
|
||||
"static int\n"
|
||||
"gperf_case_memcmp ");
|
||||
printf (option[KRC] ?
|
||||
"(s1, s2, n)\n"
|
||||
" register char *s1;\n"
|
||||
" register char *s2;\n"
|
||||
" register unsigned int n;\n" :
|
||||
option[C] ?
|
||||
"(s1, s2, n)\n"
|
||||
" register const char *s1;\n"
|
||||
" register const char *s2;\n"
|
||||
" register unsigned int n;\n" :
|
||||
option[ANSIC] | option[CPLUSPLUS] ?
|
||||
"(register const char *s1, register const char *s2, register unsigned int n)\n" :
|
||||
"");
|
||||
printf ("{\n"
|
||||
" for (; n > 0;)\n"
|
||||
" {\n"
|
||||
" unsigned char c1 = *s1++;\n"
|
||||
" unsigned char c2 = *s2++;\n"
|
||||
" if (c1 >= 'A' && c1 <= 'Z')\n"
|
||||
" c1 += 'a' - 'A';\n"
|
||||
" if (c2 >= 'A' && c2 <= 'Z')\n"
|
||||
" c2 += 'a' - 'A';\n"
|
||||
" if (c1 == c2)\n"
|
||||
" {\n"
|
||||
" n--;\n"
|
||||
" continue;\n"
|
||||
" }\n"
|
||||
" return (int)c1 - (int)c2;\n"
|
||||
" }\n"
|
||||
" return 0;\n"
|
||||
"}\n"
|
||||
"#endif\n\n");
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
|
||||
/* Outputs a keyword, as a string: enclosed in double quotes, escaping
|
||||
backslashes, double quote and unprintable characters. */
|
||||
|
||||
@@ -363,7 +488,10 @@ void Output_Compare_Strcmp::output_comparison (const Output_Expr& expr1,
|
||||
expr1.output_expr ();
|
||||
printf (" == *");
|
||||
expr2.output_expr ();
|
||||
printf (" && !strcmp (");
|
||||
printf (" && !");
|
||||
if (option[UPPERLOWER])
|
||||
printf ("gperf_case_");
|
||||
printf ("strcmp (");
|
||||
expr1.output_expr ();
|
||||
printf (" + 1, ");
|
||||
expr2.output_expr ();
|
||||
@@ -389,7 +517,10 @@ void Output_Compare_Strncmp::output_comparison (const Output_Expr& expr1,
|
||||
expr1.output_expr ();
|
||||
printf (" == *");
|
||||
expr2.output_expr ();
|
||||
printf (" && !strncmp (");
|
||||
printf (" && !");
|
||||
if (option[UPPERLOWER])
|
||||
printf ("gperf_case_");
|
||||
printf ("strncmp (");
|
||||
expr1.output_expr ();
|
||||
printf (" + 1, ");
|
||||
expr2.output_expr ();
|
||||
@@ -418,7 +549,10 @@ void Output_Compare_Memcmp::output_comparison (const Output_Expr& expr1,
|
||||
expr1.output_expr ();
|
||||
printf (" == *");
|
||||
expr2.output_expr ();
|
||||
printf (" && !memcmp (");
|
||||
printf (" && !");
|
||||
if (option[UPPERLOWER])
|
||||
printf ("gperf_case_");
|
||||
printf ("memcmp (");
|
||||
expr1.output_expr ();
|
||||
printf (" + 1, ");
|
||||
expr2.output_expr ();
|
||||
@@ -1522,6 +1656,19 @@ Output::output ()
|
||||
printf ("/* maximum key range = %d, duplicates = %d */\n\n",
|
||||
_max_hash_value - _min_hash_value + 1, _total_duplicates);
|
||||
|
||||
if (option[UPPERLOWER])
|
||||
{
|
||||
if (option[LENTABLE])
|
||||
output_upperlower_memcmp ();
|
||||
else
|
||||
{
|
||||
if (option[COMP])
|
||||
output_upperlower_strncmp ();
|
||||
else
|
||||
output_upperlower_strcmp ();
|
||||
}
|
||||
}
|
||||
|
||||
if (option[CPLUSPLUS])
|
||||
printf ("class %s\n"
|
||||
"{\n"
|
||||
|
||||
196
src/search.cc
196
src/search.cc
@@ -146,12 +146,39 @@ Search::preprepare ()
|
||||
|
||||
/* ====================== Finding good byte positions ====================== */
|
||||
|
||||
/* Computes the upper bound on the indices passed to asso_values[],
|
||||
assuming no alpha_increments. */
|
||||
unsigned int
|
||||
Search::compute_alpha_size () const
|
||||
{
|
||||
return (option[SEVENBIT] ? 128 : 256);
|
||||
}
|
||||
|
||||
/* Computes the unification rules between different asso_values[c],
|
||||
assuming no alpha_increments. */
|
||||
unsigned int *
|
||||
Search::compute_alpha_unify () const
|
||||
{
|
||||
if (option[UPPERLOWER])
|
||||
{
|
||||
unsigned int alpha_size = compute_alpha_size();
|
||||
unsigned int *alpha_unify = new unsigned int[alpha_size];
|
||||
for (unsigned int c = 0; c < alpha_size; c++)
|
||||
alpha_unify[c] = c;
|
||||
for (unsigned int c = 'A'; c <= 'Z'; c++)
|
||||
alpha_unify[c] = c + ('a'-'A');
|
||||
return alpha_unify;
|
||||
}
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Initializes each keyword's _selchars array. */
|
||||
void
|
||||
Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const
|
||||
{
|
||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||
temp->first()->init_selchars_tuple(use_all_chars, positions);
|
||||
temp->first()->init_selchars_tuple(use_all_chars, positions, _alpha_unify);
|
||||
}
|
||||
|
||||
/* Deletes each keyword's _selchars array. */
|
||||
@@ -202,6 +229,9 @@ Search::find_positions ()
|
||||
return;
|
||||
}
|
||||
|
||||
/* Compute preliminary value for _alpha_unify. */
|
||||
_alpha_unify = compute_alpha_unify ();
|
||||
|
||||
/* 1. Find positions that must occur in order to distinguish duplicates. */
|
||||
Positions mandatory;
|
||||
|
||||
@@ -222,13 +252,37 @@ Search::find_positions ()
|
||||
int n = keyword1->_allchars_length;
|
||||
int i;
|
||||
for (i = 1; i < n; i++)
|
||||
if (keyword1->_allchars[i-1] != keyword2->_allchars[i-1])
|
||||
{
|
||||
unsigned char c1 = keyword1->_allchars[i-1];
|
||||
unsigned char c2 = keyword2->_allchars[i-1];
|
||||
if (option[UPPERLOWER])
|
||||
{
|
||||
if (c1 >= 'A' && c1 <= 'Z')
|
||||
c1 += 'a' - 'A';
|
||||
if (c2 >= 'A' && c2 <= 'Z')
|
||||
c2 += 'a' - 'A';
|
||||
}
|
||||
if (c1 != c2)
|
||||
break;
|
||||
if (i < n
|
||||
&& memcmp (&keyword1->_allchars[i],
|
||||
&keyword2->_allchars[i],
|
||||
n - i)
|
||||
== 0)
|
||||
}
|
||||
if (i < n)
|
||||
{
|
||||
int j;
|
||||
for (j = i + 1; j <= n; j++)
|
||||
{
|
||||
unsigned char c1 = keyword1->_allchars[j-1];
|
||||
unsigned char c2 = keyword2->_allchars[j-1];
|
||||
if (option[UPPERLOWER])
|
||||
{
|
||||
if (c1 >= 'A' && c1 <= 'Z')
|
||||
c1 += 'a' - 'A';
|
||||
if (c2 >= 'A' && c2 <= 'Z')
|
||||
c2 += 'a' - 'A';
|
||||
}
|
||||
if (c1 != c2)
|
||||
break;
|
||||
}
|
||||
if (j > n)
|
||||
{
|
||||
/* Position i is mandatory. */
|
||||
if (!mandatory.contains (i))
|
||||
@@ -238,6 +292,7 @@ Search::find_positions ()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 2. Add positions, as long as this decreases the duplicates count. */
|
||||
int imax = (_max_key_len < Positions::MAX_KEY_POS
|
||||
@@ -379,16 +434,113 @@ Search::find_positions ()
|
||||
}
|
||||
fprintf (stderr, "\n");
|
||||
}
|
||||
|
||||
/* Free preliminary value for _alpha_unify. */
|
||||
delete[] _alpha_unify;
|
||||
}
|
||||
|
||||
/* ===================== Finding good alpha increments ===================== */
|
||||
|
||||
/* Computes the upper bound on the indices passed to asso_values[]. */
|
||||
unsigned int
|
||||
Search::compute_alpha_size (const unsigned int *alpha_inc) const
|
||||
{
|
||||
unsigned int max_alpha_inc = 0;
|
||||
for (int i = 0; i < _max_key_len; i++)
|
||||
if (max_alpha_inc < alpha_inc[i])
|
||||
max_alpha_inc = alpha_inc[i];
|
||||
return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
|
||||
}
|
||||
|
||||
/* Computes the unification rules between different asso_values[c]. */
|
||||
unsigned int *
|
||||
Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
|
||||
{
|
||||
if (option[UPPERLOWER])
|
||||
{
|
||||
/* Without alpha increments, we would simply unify
|
||||
'A' -> 'a', ..., 'Z' -> 'z'.
|
||||
But when a keyword contains at position i a character c,
|
||||
we have the constraint
|
||||
asso_values[tolower(c) + alpha_inc[i]] ==
|
||||
asso_values[toupper(c) + alpha_inc[i]].
|
||||
This introduces a unification
|
||||
toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
|
||||
Note that this unification can extend outside the range of
|
||||
ASCII letters! But still every unified character pair is at
|
||||
a distance of 'a'-'A' = 32, or (after chained unification)
|
||||
at a multiple of 32. So in the end the alpha_unify vector has
|
||||
the form c -> c + 32 * f(c) where f(c) is a nonnegative
|
||||
integer. */
|
||||
unsigned int alpha_size = compute_alpha_size (alpha_inc);
|
||||
|
||||
unsigned int *alpha_unify = new unsigned int[alpha_size];
|
||||
for (unsigned int c = 0; c < alpha_size; c++)
|
||||
alpha_unify[c] = c;
|
||||
|
||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||
{
|
||||
KeywordExt *keyword = temp->first();
|
||||
|
||||
if (option[ALLCHARS])
|
||||
/* Iterate through all character positions. */
|
||||
for (int i = 0; i < keyword->_allchars_length; i++)
|
||||
{
|
||||
unsigned int c = static_cast<unsigned char>(keyword->_allchars[i]);
|
||||
if (c >= 'A' && c <= 'Z')
|
||||
c += 'a' - 'A';
|
||||
if (c >= 'a' && c <= 'z')
|
||||
{
|
||||
c += alpha_inc[i];
|
||||
/* Unify c with c - ('a'-'A'). */
|
||||
unsigned int d = alpha_unify[c];
|
||||
unsigned int b = c - ('a'-'A');
|
||||
for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
|
||||
alpha_unify[a] = d;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Iterate through the selected character positions. */
|
||||
PositionIterator iter (positions);
|
||||
|
||||
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
|
||||
{
|
||||
unsigned int c;
|
||||
if (i == Positions::LASTCHAR)
|
||||
c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
|
||||
else if (i <= keyword->_allchars_length)
|
||||
c = static_cast<unsigned char>(keyword->_allchars[i - 1]);
|
||||
else
|
||||
continue;
|
||||
if (c >= 'A' && c <= 'Z')
|
||||
c += 'a' - 'A';
|
||||
if (c >= 'a' && c <= 'z')
|
||||
{
|
||||
if (i != Positions::LASTCHAR)
|
||||
c += alpha_inc[i - 1];
|
||||
/* Unify c with c - ('a'-'A'). */
|
||||
unsigned int d = alpha_unify[c];
|
||||
unsigned int b = c - ('a'-'A');
|
||||
for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
|
||||
alpha_unify[a] = d;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return alpha_unify;
|
||||
}
|
||||
else
|
||||
/* Identity mapping. */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Initializes each keyword's _selchars array. */
|
||||
void
|
||||
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const
|
||||
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
|
||||
{
|
||||
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
|
||||
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc);
|
||||
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_unify, alpha_inc);
|
||||
}
|
||||
|
||||
/* Count the duplicate keywords that occur with the given set of positions
|
||||
@@ -402,7 +554,9 @@ Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
|
||||
/* Run through the keyword list and count the duplicates incrementally.
|
||||
The result does not depend on the order of the keyword list, thanks to
|
||||
the formula above. */
|
||||
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc);
|
||||
init_selchars_multiset (option[ALLCHARS], _key_positions,
|
||||
compute_alpha_unify (_key_positions, alpha_inc),
|
||||
alpha_inc);
|
||||
|
||||
unsigned int count = 0;
|
||||
{
|
||||
@@ -428,7 +582,9 @@ Search::find_alpha_inc ()
|
||||
/* The goal is to choose _alpha_inc[] such that it doesn't introduce
|
||||
artificial duplicates.
|
||||
In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */
|
||||
_alpha_unify = compute_alpha_unify ();
|
||||
unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);
|
||||
delete[] _alpha_unify;
|
||||
|
||||
/* Start with zero increments. This is sufficient in most cases. */
|
||||
unsigned int *current = new unsigned int [_max_key_len];
|
||||
@@ -545,6 +701,8 @@ Search::find_alpha_inc ()
|
||||
}
|
||||
|
||||
_alpha_inc = current;
|
||||
_alpha_size = compute_alpha_size (_alpha_inc);
|
||||
_alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
|
||||
}
|
||||
|
||||
/* ======================= Finding good asso_values ======================== */
|
||||
@@ -555,7 +713,8 @@ Search::prepare ()
|
||||
KeywordExt_List *temp;
|
||||
|
||||
/* Initialize each keyword's _selchars array. */
|
||||
init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc);
|
||||
init_selchars_multiset(option[ALLCHARS], _key_positions,
|
||||
_alpha_unify, _alpha_inc);
|
||||
|
||||
/* Check for duplicates, i.e. keywords with the same _selchars array
|
||||
(and - if !option[NOLENGTH] - also the same length).
|
||||
@@ -634,14 +793,6 @@ Search::prepare ()
|
||||
}
|
||||
}
|
||||
|
||||
/* Compute _alpha_size, the upper bound on the indices passed to
|
||||
asso_values[]. */
|
||||
unsigned int max_alpha_inc = 0;
|
||||
for (int i = 0; i < _max_key_len; i++)
|
||||
if (max_alpha_inc < _alpha_inc[i])
|
||||
max_alpha_inc = _alpha_inc[i];
|
||||
_alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
|
||||
|
||||
/* Compute the occurrences of each character in the alphabet. */
|
||||
_occurrences = new int[_alpha_size];
|
||||
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
|
||||
@@ -1492,6 +1643,12 @@ Search::optimize ()
|
||||
for (unsigned int c = 0; c < _alpha_size; c++)
|
||||
if (_occurrences[c] == 0)
|
||||
_asso_values[c] = max_hash_value + 1;
|
||||
|
||||
/* Propagate unified asso_values. */
|
||||
if (_alpha_unify)
|
||||
for (unsigned int c = 0; c < _alpha_size; c++)
|
||||
if (_alpha_unify[c] != c)
|
||||
_asso_values[c] = _asso_values[_alpha_unify[c]];
|
||||
}
|
||||
|
||||
/* Prints out some diagnostics upon completion. */
|
||||
@@ -1533,5 +1690,6 @@ Search::~Search ()
|
||||
}
|
||||
delete[] _asso_values;
|
||||
delete[] _occurrences;
|
||||
delete[] _alpha_unify;
|
||||
delete[] _alpha_inc;
|
||||
}
|
||||
|
||||
26
src/search.h
26
src/search.h
@@ -41,6 +41,14 @@ public:
|
||||
private:
|
||||
void preprepare ();
|
||||
|
||||
/* Computes the upper bound on the indices passed to asso_values[],
|
||||
assuming no alpha_increments. */
|
||||
unsigned int compute_alpha_size () const;
|
||||
|
||||
/* Computes the unification rules between different asso_values[c],
|
||||
assuming no alpha_increments. */
|
||||
unsigned int * compute_alpha_unify () const;
|
||||
|
||||
/* Initializes each keyword's _selchars array. */
|
||||
void init_selchars_tuple (bool use_all_chars, const Positions& positions) const;
|
||||
/* Deletes each keyword's _selchars array. */
|
||||
@@ -52,8 +60,14 @@ private:
|
||||
/* Find good key positions. */
|
||||
void find_positions ();
|
||||
|
||||
/* Computes the upper bound on the indices passed to asso_values[]. */
|
||||
unsigned int compute_alpha_size (const unsigned int *alpha_inc) const;
|
||||
|
||||
/* Computes the unification rules between different asso_values[c]. */
|
||||
unsigned int * compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const;
|
||||
|
||||
/* Initializes each keyword's _selchars array. */
|
||||
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const;
|
||||
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const;
|
||||
|
||||
/* Count the duplicate keywords that occur with the given set of positions
|
||||
and a given alpha_inc[] array. */
|
||||
@@ -115,13 +129,17 @@ public:
|
||||
/* Adjustments to add to bytes add specific key positions. */
|
||||
unsigned int * _alpha_inc;
|
||||
|
||||
/* Size of alphabet. */
|
||||
unsigned int _alpha_size;
|
||||
|
||||
/* Alphabet character unification, either the identity or a mapping from
|
||||
upper case characters to lower case characters (and maybe more). */
|
||||
unsigned int * _alpha_unify;
|
||||
|
||||
/* Total number of duplicates that have been moved to _duplicate_link lists
|
||||
(not counting their representatives which stay on the main list). */
|
||||
int _total_duplicates;
|
||||
|
||||
/* Size of alphabet. */
|
||||
unsigned int _alpha_size;
|
||||
|
||||
/* Counts occurrences of each key set character.
|
||||
_occurrences[c] is the number of times that c occurs among the _selchars
|
||||
of a keyword. */
|
||||
|
||||
@@ -143,6 +143,8 @@ check-test:
|
||||
diff $(srcdir)/permut2.exp permut2.out
|
||||
$(GPERF) -m5 < $(srcdir)/permut3.gperf > permut3.out
|
||||
diff $(srcdir)/permut3.exp permut3.out
|
||||
$(GPERF) -m5 --ignore-case < $(srcdir)/permutc2.gperf > permutc2.out
|
||||
diff $(srcdir)/permutc2.exp permutc2.out
|
||||
$(GPERF) -C -E -G -I -t < $(srcdir)/charsets.gperf > charsets.out
|
||||
diff $(srcdir)/charsets.exp charsets.out
|
||||
$(GPERF) -C -E -G -I -t < $(srcdir)/languages.gperf > languages.out
|
||||
|
||||
116
tests/permutc2.exp
Normal file
116
tests/permutc2.exp
Normal file
@@ -0,0 +1,116 @@
|
||||
/* C code produced by gperf version 2.7.2 */
|
||||
/* Command-line: ../src/gperf -m5 --ignore-case */
|
||||
/* Computed positions: -k'1-2' */
|
||||
|
||||
/* Test of a hash function which has to deal with permutation and
|
||||
case-independence. Without case-independence, the alpha_inc is 1.
|
||||
With case-independence, the alpha_inc is 3. */
|
||||
|
||||
#define TOTAL_KEYWORDS 8
|
||||
#define MIN_WORD_LENGTH 2
|
||||
#define MAX_WORD_LENGTH 2
|
||||
#define MIN_HASH_VALUE 2
|
||||
#define MAX_HASH_VALUE 9
|
||||
/* maximum key range = 8, duplicates = 0 */
|
||||
|
||||
#ifndef GPERF_CASE_STRCMP
|
||||
#define GPERF_CASE_STRCMP 1
|
||||
static int
|
||||
gperf_case_strcmp (s1, s2)
|
||||
register const char *s1;
|
||||
register const char *s2;
|
||||
{
|
||||
for (;;)
|
||||
{
|
||||
unsigned char c1 = *s1++;
|
||||
unsigned char c2 = *s2++;
|
||||
if (c1 >= 'A' && c1 <= 'Z')
|
||||
c1 += 'a' - 'A';
|
||||
if (c2 >= 'A' && c2 <= 'Z')
|
||||
c2 += 'a' - 'A';
|
||||
if (c1 != 0 && c1 == c2)
|
||||
continue;
|
||||
return (int)c1 - (int)c2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
__inline
|
||||
#else
|
||||
#ifdef __cplusplus
|
||||
inline
|
||||
#endif
|
||||
#endif
|
||||
static unsigned int
|
||||
hash (str, len)
|
||||
register const char *str;
|
||||
register unsigned int len;
|
||||
{
|
||||
static unsigned char asso_values[] =
|
||||
{
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10, 1,10,10, 3,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10, 3, 1,
|
||||
0, 7, 1, 0, 3,10,10, 1,10,10,
|
||||
3,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
3, 1, 0, 0, 1, 0, 2,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10,10,
|
||||
10,10,10,10,10,10,10,10,10
|
||||
};
|
||||
return len + asso_values[(unsigned char)str[1]+3] + asso_values[(unsigned char)str[0]];
|
||||
}
|
||||
|
||||
#ifdef __GNUC__
|
||||
__inline
|
||||
#endif
|
||||
const char *
|
||||
in_word_set (str, len)
|
||||
register const char *str;
|
||||
register unsigned int len;
|
||||
{
|
||||
static const char * wordlist[] =
|
||||
{
|
||||
"", "",
|
||||
"{w",
|
||||
"az",
|
||||
"ay",
|
||||
"za",
|
||||
"ya",
|
||||
"x{",
|
||||
"x[",
|
||||
"[w"
|
||||
};
|
||||
|
||||
if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
|
||||
{
|
||||
register int key = hash (str, len);
|
||||
|
||||
if (key <= MAX_HASH_VALUE && key >= 0)
|
||||
{
|
||||
register const char *s = wordlist[key];
|
||||
|
||||
if (*str == *s && !gperf_case_strcmp (str + 1, s + 1))
|
||||
return s;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
14
tests/permutc2.gperf
Normal file
14
tests/permutc2.gperf
Normal file
@@ -0,0 +1,14 @@
|
||||
%{
|
||||
/* Test of a hash function which has to deal with permutation and
|
||||
case-independence. Without case-independence, the alpha_inc is 1.
|
||||
With case-independence, the alpha_inc is 3. */
|
||||
%}
|
||||
%%
|
||||
az
|
||||
za
|
||||
ay
|
||||
ya
|
||||
x{
|
||||
x[
|
||||
{w
|
||||
[w
|
||||
@@ -20,6 +20,9 @@ Input file interpretation:
|
||||
is considered part of the type declaration. Key
|
||||
words and additional fields may follow this, one
|
||||
group of fields per line.
|
||||
--ignore-case Consider upper and lower case ASCII characters as
|
||||
equivalent. Note that locale dependent case mappings
|
||||
are ignored.
|
||||
|
||||
Language for the output code:
|
||||
-L, --language=LANGUAGE-NAME
|
||||
|
||||
Reference in New Issue
Block a user