1
0
mirror of https://git.savannah.gnu.org/git/gperf.git synced 2025-12-02 13:09:22 +00:00

New option --ignore-case.

This commit is contained in:
Bruno Haible
2003-04-02 09:26:05 +00:00
parent 0093e33163
commit 7dfd32b736
15 changed files with 583 additions and 39 deletions

View File

@@ -472,6 +472,10 @@ Input::read_input ()
option.set (TYPE);
else
if (is_declaration (line, line_end, lineno, "ignore-case"))
option.set (UPPERLOWER);
else
if (is_declaration_with_arg (line, line_end, lineno,
"language", &arg))
option.set_language (arg);

View File

@@ -59,7 +59,7 @@ static inline void sort_char_set (unsigned int *base, int len)
*/
unsigned int *
KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc)
{
const char *k = _allchars;
unsigned int *key_set =
@@ -73,6 +73,8 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c
unsigned int c = static_cast<unsigned char>(*k);
if (alpha_inc)
c += alpha_inc[k-_allchars];
if (alpha_unify)
c = alpha_unify[c];
*ptr = c;
ptr++;
}
@@ -99,6 +101,8 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c
else
/* Out of range of KEY length, so we'll just skip it. */
continue;
if (alpha_unify)
c = alpha_unify[c];
*ptr = c;
ptr++;
}
@@ -111,16 +115,16 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c
}
void
KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions)
KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify)
{
init_selchars_low (use_all_chars, positions, NULL);
init_selchars_low (use_all_chars, positions, alpha_unify, NULL);
}
void
KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc)
KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc)
{
unsigned int *selchars =
init_selchars_low (use_all_chars, positions, alpha_inc);
init_selchars_low (use_all_chars, positions, alpha_unify, alpha_inc);
/* Sort the selchars elements alphabetically. */
sort_char_set (selchars, _selchars_length);

View File

@@ -68,9 +68,9 @@ struct KeywordExt : public Keyword
/* Methods depending on the keyposition list. */
/* Initializes selchars and selchars_length, without reordering. */
void init_selchars_tuple (bool use_all_chars, const Positions& positions);
void init_selchars_tuple (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify);
/* Initializes selchars and selchars_length, with reordering. */
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc);
/* Deletes selchars. */
void delete_selchars ();
@@ -81,7 +81,7 @@ struct KeywordExt : public Keyword
int _final_index;
private:
unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc);
unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc);
};
/* An abstract factory for creating Keyword instances.

View File

@@ -106,6 +106,10 @@ Options::long_usage (FILE * stream) const
" is considered part of the type declaration. Key\n"
" words and additional fields may follow this, one\n"
" group of fields per line.\n");
fprintf (stream,
" --ignore-case Consider upper and lower case ASCII characters as\n"
" equivalent. Note that locale dependent case mappings\n"
" are ignored.\n");
fprintf (stream, "\n");
fprintf (stream,
"Language for the output code:\n");
@@ -463,6 +467,7 @@ Options::~Options ()
"\nENUM is........: %s"
"\nINCLUDE is.....: %s"
"\nSEVENBIT is....: %s"
"\nUPPERLOWER is..: %s"
"\nlookup function name = %s"
"\nhash function name = %s"
"\nword list name = %s"
@@ -492,6 +497,7 @@ Options::~Options ()
_option_word & ENUM ? "enabled" : "disabled",
_option_word & INCLUDE ? "enabled" : "disabled",
_option_word & SEVENBIT ? "enabled" : "disabled",
_option_word & UPPERLOWER ? "enabled" : "disabled",
_function_name, _hash_name, _wordlist_name, _slot_name,
_initializer_suffix, _asso_iterations, _jump, _size_multiple,
_initial_asso_value, _delimiters, _total_switches);
@@ -605,6 +611,7 @@ Options::set_delimiters (const char *delimiters)
static const struct option long_options[] =
{
{ "output-file", required_argument, NULL, CHAR_MAX + 1 },
{ "ignore-case", no_argument, NULL, CHAR_MAX + 2 },
{ "delimiters", required_argument, NULL, 'e' },
{ "struct-type", no_argument, NULL, 't' },
{ "language", required_argument, NULL, 'L' },
@@ -949,6 +956,11 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
_output_file_name = /*getopt*/optarg;
break;
}
case CHAR_MAX + 2: /* Case insignificant. */
{
_option_word |= UPPERLOWER;
break;
}
default:
short_usage (stderr);
exit (1);

View File

@@ -98,7 +98,10 @@ enum Option_Type
SEVENBIT = 1 << 19,
/* Optimize for position-independent code. */
SHAREDLIB = 1 << 20
SHAREDLIB = 1 << 20,
/* Ignore case of ASCII characters. */
UPPERLOWER = 1 << 21
};
/* Class manager for gperf program Options. */

View File

@@ -232,6 +232,131 @@ Output::output_constants (struct Output_Constants& style) const
/* ------------------------------------------------------------------------- */
/* Prints the definition of gperf_case_strcmp, a strcmp replacement that
   folds the ASCII letters 'A'..'Z' to lower case before comparing.
   The emitted definition is protected by a GPERF_CASE_STRCMP guard. */
static void
output_upperlower_strcmp ()
{
  /* Guard, return type and function name. */
  printf ("%s",
          "#ifndef GPERF_CASE_STRCMP\n"
          "#define GPERF_CASE_STRCMP 1\n"
          "static int\n"
          "gperf_case_strcmp ");

  /* Parameter list, spelled in the syntax of the selected output language.
     Nothing is printed here when none of the language options is set. */
  const char *parameters;
  if (option[KRC])
    parameters = "(s1, s2)\n"
                 " register char *s1;\n"
                 " register char *s2;\n";
  else if (option[C])
    parameters = "(s1, s2)\n"
                 " register const char *s1;\n"
                 " register const char *s2;\n";
  else if (option[ANSIC] | option[CPLUSPLUS])
    parameters = "(register const char *s1, register const char *s2)\n";
  else
    parameters = "";
  printf ("%s", parameters);

  /* Function body and end of the guard. */
  static const char body[] =
    "{\n"
    " for (;;)\n"
    " {\n"
    " unsigned char c1 = *s1++;\n"
    " unsigned char c2 = *s2++;\n"
    " if (c1 >= 'A' && c1 <= 'Z')\n"
    " c1 += 'a' - 'A';\n"
    " if (c2 >= 'A' && c2 <= 'Z')\n"
    " c2 += 'a' - 'A';\n"
    " if (c1 != 0 && c1 == c2)\n"
    " continue;\n"
    " return (int)c1 - (int)c2;\n"
    " }\n"
    "}\n"
    "#endif\n\n";
  printf ("%s", body);
}
/* Prints the definition of gperf_case_strncmp, a strncmp replacement that
   folds the ASCII letters 'A'..'Z' to lower case before comparing.
   The emitted definition is protected by a GPERF_CASE_STRNCMP guard. */
static void
output_upperlower_strncmp ()
{
  /* Guard, return type and function name. */
  printf ("%s",
          "#ifndef GPERF_CASE_STRNCMP\n"
          "#define GPERF_CASE_STRNCMP 1\n"
          "static int\n"
          "gperf_case_strncmp ");

  /* Parameter list, spelled in the syntax of the selected output language.
     Nothing is printed here when none of the language options is set. */
  const char *parameters;
  if (option[KRC])
    parameters = "(s1, s2, n)\n"
                 " register char *s1;\n"
                 " register char *s2;\n"
                 " register unsigned int n;\n";
  else if (option[C])
    parameters = "(s1, s2, n)\n"
                 " register const char *s1;\n"
                 " register const char *s2;\n"
                 " register unsigned int n;\n";
  else if (option[ANSIC] | option[CPLUSPLUS])
    parameters = "(register const char *s1, register const char *s2, register unsigned int n)\n";
  else
    parameters = "";
  printf ("%s", parameters);

  /* Function body and end of the guard. */
  static const char body[] =
    "{\n"
    " for (; n > 0;)\n"
    " {\n"
    " unsigned char c1 = *s1++;\n"
    " unsigned char c2 = *s2++;\n"
    " if (c1 >= 'A' && c1 <= 'Z')\n"
    " c1 += 'a' - 'A';\n"
    " if (c2 >= 'A' && c2 <= 'Z')\n"
    " c2 += 'a' - 'A';\n"
    " if (c1 != 0 && c1 == c2)\n"
    " {\n"
    " n--;\n"
    " continue;\n"
    " }\n"
    " return (int)c1 - (int)c2;\n"
    " }\n"
    " return 0;\n"
    "}\n"
    "#endif\n\n";
  printf ("%s", body);
}
/* Prints the definition of gperf_case_memcmp, a memcmp replacement that
   folds the ASCII letters 'A'..'Z' to lower case before comparing.
   Unlike the strcmp/strncmp variants, the emitted loop does not stop at
   a NUL byte.  The emitted definition is protected by a GPERF_CASE_MEMCMP
   guard. */
static void
output_upperlower_memcmp ()
{
  /* Guard, return type and function name. */
  printf ("%s",
          "#ifndef GPERF_CASE_MEMCMP\n"
          "#define GPERF_CASE_MEMCMP 1\n"
          "static int\n"
          "gperf_case_memcmp ");

  /* Parameter list, spelled in the syntax of the selected output language.
     Nothing is printed here when none of the language options is set. */
  const char *parameters;
  if (option[KRC])
    parameters = "(s1, s2, n)\n"
                 " register char *s1;\n"
                 " register char *s2;\n"
                 " register unsigned int n;\n";
  else if (option[C])
    parameters = "(s1, s2, n)\n"
                 " register const char *s1;\n"
                 " register const char *s2;\n"
                 " register unsigned int n;\n";
  else if (option[ANSIC] | option[CPLUSPLUS])
    parameters = "(register const char *s1, register const char *s2, register unsigned int n)\n";
  else
    parameters = "";
  printf ("%s", parameters);

  /* Function body and end of the guard. */
  static const char body[] =
    "{\n"
    " for (; n > 0;)\n"
    " {\n"
    " unsigned char c1 = *s1++;\n"
    " unsigned char c2 = *s2++;\n"
    " if (c1 >= 'A' && c1 <= 'Z')\n"
    " c1 += 'a' - 'A';\n"
    " if (c2 >= 'A' && c2 <= 'Z')\n"
    " c2 += 'a' - 'A';\n"
    " if (c1 == c2)\n"
    " {\n"
    " n--;\n"
    " continue;\n"
    " }\n"
    " return (int)c1 - (int)c2;\n"
    " }\n"
    " return 0;\n"
    "}\n"
    "#endif\n\n";
  printf ("%s", body);
}
/* ------------------------------------------------------------------------- */
/* Outputs a keyword, as a string: enclosed in double quotes, escaping
backslashes, double quote and unprintable characters. */
@@ -363,7 +488,10 @@ void Output_Compare_Strcmp::output_comparison (const Output_Expr& expr1,
expr1.output_expr ();
printf (" == *");
expr2.output_expr ();
printf (" && !strcmp (");
printf (" && !");
if (option[UPPERLOWER])
printf ("gperf_case_");
printf ("strcmp (");
expr1.output_expr ();
printf (" + 1, ");
expr2.output_expr ();
@@ -389,7 +517,10 @@ void Output_Compare_Strncmp::output_comparison (const Output_Expr& expr1,
expr1.output_expr ();
printf (" == *");
expr2.output_expr ();
printf (" && !strncmp (");
printf (" && !");
if (option[UPPERLOWER])
printf ("gperf_case_");
printf ("strncmp (");
expr1.output_expr ();
printf (" + 1, ");
expr2.output_expr ();
@@ -418,7 +549,10 @@ void Output_Compare_Memcmp::output_comparison (const Output_Expr& expr1,
expr1.output_expr ();
printf (" == *");
expr2.output_expr ();
printf (" && !memcmp (");
printf (" && !");
if (option[UPPERLOWER])
printf ("gperf_case_");
printf ("memcmp (");
expr1.output_expr ();
printf (" + 1, ");
expr2.output_expr ();
@@ -1522,6 +1656,19 @@ Output::output ()
printf ("/* maximum key range = %d, duplicates = %d */\n\n",
_max_hash_value - _min_hash_value + 1, _total_duplicates);
if (option[UPPERLOWER])
{
if (option[LENTABLE])
output_upperlower_memcmp ();
else
{
if (option[COMP])
output_upperlower_strncmp ();
else
output_upperlower_strcmp ();
}
}
if (option[CPLUSPLUS])
printf ("class %s\n"
"{\n"

View File

@@ -146,12 +146,39 @@ Search::preprepare ()
/* ====================== Finding good byte positions ====================== */
/* Returns the upper bound on the indices passed to asso_values[] when no
   alpha increments are applied: 128 if the SEVENBIT option is set,
   256 otherwise. */
unsigned int
Search::compute_alpha_size () const
{
  if (option[SEVENBIT])
    return 128;
  return 256;
}
/* Builds the unification table between different asso_values[c], assuming
   no alpha increments.
   Returns NULL (meaning: identity) unless the UPPERLOWER option is set.
   Otherwise returns an array of compute_alpha_size() entries, allocated
   with new[], that maps 'A'..'Z' to 'a'..'z' and every other index to
   itself.  The array is not stored anywhere here, so releasing it is up
   to the caller. */
unsigned int *
Search::compute_alpha_unify () const
{
  /* Case is significant: no table needed. */
  if (!option[UPPERLOWER])
    return NULL;

  const unsigned int size = compute_alpha_size();
  unsigned int *table = new unsigned int[size];
  /* compute_alpha_size() is at least 128, so all of 'A'..'Z' lies inside
     the table and a single pass can fill in both kinds of entries. */
  for (unsigned int c = 0; c < size; c++)
    table[c] = (c >= 'A' && c <= 'Z' ? c + ('a'-'A') : c);
  return table;
}
/* Initializes each keyword's _selchars array. */
void
Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars_tuple(use_all_chars, positions);
temp->first()->init_selchars_tuple(use_all_chars, positions, _alpha_unify);
}
/* Deletes each keyword's _selchars array. */
@@ -202,6 +229,9 @@ Search::find_positions ()
return;
}
/* Compute preliminary value for _alpha_unify. */
_alpha_unify = compute_alpha_unify ();
/* 1. Find positions that must occur in order to distinguish duplicates. */
Positions mandatory;
@@ -222,17 +252,42 @@ Search::find_positions ()
int n = keyword1->_allchars_length;
int i;
for (i = 1; i < n; i++)
if (keyword1->_allchars[i-1] != keyword2->_allchars[i-1])
break;
if (i < n
&& memcmp (&keyword1->_allchars[i],
&keyword2->_allchars[i],
n - i)
== 0)
{
/* Position i is mandatory. */
if (!mandatory.contains (i))
mandatory.add (i);
unsigned char c1 = keyword1->_allchars[i-1];
unsigned char c2 = keyword2->_allchars[i-1];
if (option[UPPERLOWER])
{
if (c1 >= 'A' && c1 <= 'Z')
c1 += 'a' - 'A';
if (c2 >= 'A' && c2 <= 'Z')
c2 += 'a' - 'A';
}
if (c1 != c2)
break;
}
if (i < n)
{
int j;
for (j = i + 1; j <= n; j++)
{
unsigned char c1 = keyword1->_allchars[j-1];
unsigned char c2 = keyword2->_allchars[j-1];
if (option[UPPERLOWER])
{
if (c1 >= 'A' && c1 <= 'Z')
c1 += 'a' - 'A';
if (c2 >= 'A' && c2 <= 'Z')
c2 += 'a' - 'A';
}
if (c1 != c2)
break;
}
if (j > n)
{
/* Position i is mandatory. */
if (!mandatory.contains (i))
mandatory.add (i);
}
}
}
}
@@ -379,16 +434,113 @@ Search::find_positions ()
}
fprintf (stderr, "\n");
}
/* Free preliminary value for _alpha_unify. */
delete[] _alpha_unify;
}
/* ===================== Finding good alpha increments ===================== */
/* Returns the upper bound on the indices passed to asso_values[]:
   the size of the plain alphabet (128 with the SEVENBIT option, 256
   otherwise) plus the largest of the first _max_key_len entries of
   ALPHA_INC. */
unsigned int
Search::compute_alpha_size (const unsigned int *alpha_inc) const
{
  /* Largest increment over all key positions. */
  unsigned int largest = 0;
  for (int pos = 0; pos < _max_key_len; pos++)
    {
      const unsigned int inc = alpha_inc[pos];
      if (inc > largest)
        largest = inc;
    }

  const unsigned int plain_size = (option[SEVENBIT] ? 128 : 256);
  return plain_size + largest;
}
/* Computes the unification rules between different asso_values[c]. */
unsigned int *
Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
{
if (option[UPPERLOWER])
{
/* Without alpha increments, we would simply unify
'A' -> 'a', ..., 'Z' -> 'z'.
But when a keyword contains at position i a character c,
we have the constraint
asso_values[tolower(c) + alpha_inc[i]] ==
asso_values[toupper(c) + alpha_inc[i]].
This introduces a unification
toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
Note that this unification can extend outside the range of
ASCII letters! But still every unified character pair is at
a distance of 'a'-'A' = 32, or (after chained unification)
at a multiple of 32. So in the end the alpha_unify vector has
the form c -> c + 32 * f(c) where f(c) is a nonnegative
integer. */
unsigned int alpha_size = compute_alpha_size (alpha_inc);
unsigned int *alpha_unify = new unsigned int[alpha_size];
for (unsigned int c = 0; c < alpha_size; c++)
alpha_unify[c] = c;
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (option[ALLCHARS])
/* Iterate through all character positions. */
for (int i = 0; i < keyword->_allchars_length; i++)
{
unsigned int c = static_cast<unsigned char>(keyword->_allchars[i]);
if (c >= 'A' && c <= 'Z')
c += 'a' - 'A';
if (c >= 'a' && c <= 'z')
{
c += alpha_inc[i];
/* Unify c with c - ('a'-'A'). */
unsigned int d = alpha_unify[c];
unsigned int b = c - ('a'-'A');
for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
alpha_unify[a] = d;
}
}
else
{
/* Iterate through the selected character positions. */
PositionIterator iter (positions);
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
{
unsigned int c;
if (i == Positions::LASTCHAR)
c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
else if (i <= keyword->_allchars_length)
c = static_cast<unsigned char>(keyword->_allchars[i - 1]);
else
continue;
if (c >= 'A' && c <= 'Z')
c += 'a' - 'A';
if (c >= 'a' && c <= 'z')
{
if (i != Positions::LASTCHAR)
c += alpha_inc[i - 1];
/* Unify c with c - ('a'-'A'). */
unsigned int d = alpha_unify[c];
unsigned int b = c - ('a'-'A');
for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
alpha_unify[a] = d;
}
}
}
}
return alpha_unify;
}
else
/* Identity mapping. */
return NULL;
}
/* Initializes each keyword's _selchars array. */
void
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const
Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc);
temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_unify, alpha_inc);
}
/* Count the duplicate keywords that occur with the given set of positions
@@ -402,7 +554,9 @@ Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
/* Run through the keyword list and count the duplicates incrementally.
The result does not depend on the order of the keyword list, thanks to
the formula above. */
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc);
init_selchars_multiset (option[ALLCHARS], _key_positions,
compute_alpha_unify (_key_positions, alpha_inc),
alpha_inc);
unsigned int count = 0;
{
@@ -428,7 +582,9 @@ Search::find_alpha_inc ()
/* The goal is to choose _alpha_inc[] such that it doesn't introduce
artificial duplicates.
In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */
_alpha_unify = compute_alpha_unify ();
unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);
delete[] _alpha_unify;
/* Start with zero increments. This is sufficient in most cases. */
unsigned int *current = new unsigned int [_max_key_len];
@@ -545,6 +701,8 @@ Search::find_alpha_inc ()
}
_alpha_inc = current;
_alpha_size = compute_alpha_size (_alpha_inc);
_alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
}
/* ======================= Finding good asso_values ======================== */
@@ -555,7 +713,8 @@ Search::prepare ()
KeywordExt_List *temp;
/* Initialize each keyword's _selchars array. */
init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc);
init_selchars_multiset(option[ALLCHARS], _key_positions,
_alpha_unify, _alpha_inc);
/* Check for duplicates, i.e. keywords with the same _selchars array
(and - if !option[NOLENGTH] - also the same length).
@@ -634,14 +793,6 @@ Search::prepare ()
}
}
/* Compute _alpha_size, the upper bound on the indices passed to
asso_values[]. */
unsigned int max_alpha_inc = 0;
for (int i = 0; i < _max_key_len; i++)
if (max_alpha_inc < _alpha_inc[i])
max_alpha_inc = _alpha_inc[i];
_alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
/* Compute the occurrences of each character in the alphabet. */
_occurrences = new int[_alpha_size];
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
@@ -1492,6 +1643,12 @@ Search::optimize ()
for (unsigned int c = 0; c < _alpha_size; c++)
if (_occurrences[c] == 0)
_asso_values[c] = max_hash_value + 1;
/* Propagate unified asso_values. */
if (_alpha_unify)
for (unsigned int c = 0; c < _alpha_size; c++)
if (_alpha_unify[c] != c)
_asso_values[c] = _asso_values[_alpha_unify[c]];
}
/* Prints out some diagnostics upon completion. */
@@ -1533,5 +1690,6 @@ Search::~Search ()
}
delete[] _asso_values;
delete[] _occurrences;
delete[] _alpha_unify;
delete[] _alpha_inc;
}

View File

@@ -41,6 +41,14 @@ public:
private:
void preprepare ();
/* Computes the upper bound on the indices passed to asso_values[],
assuming no alpha_increments. */
unsigned int compute_alpha_size () const;
/* Computes the unification rules between different asso_values[c],
assuming no alpha_increments. */
unsigned int * compute_alpha_unify () const;
/* Initializes each keyword's _selchars array. */
void init_selchars_tuple (bool use_all_chars, const Positions& positions) const;
/* Deletes each keyword's _selchars array. */
@@ -52,8 +60,14 @@ private:
/* Find good key positions. */
void find_positions ();
/* Computes the upper bound on the indices passed to asso_values[]. */
unsigned int compute_alpha_size (const unsigned int *alpha_inc) const;
/* Computes the unification rules between different asso_values[c]. */
unsigned int * compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const;
/* Initializes each keyword's _selchars array. */
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const;
void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const;
/* Count the duplicate keywords that occur with the given set of positions
and a given alpha_inc[] array. */
@@ -115,13 +129,17 @@ public:
/* Adjustments to add to bytes at specific key positions. */
unsigned int * _alpha_inc;
/* Size of alphabet. */
unsigned int _alpha_size;
/* Alphabet character unification, either the identity or a mapping from
upper case characters to lower case characters (and maybe more). */
unsigned int * _alpha_unify;
/* Total number of duplicates that have been moved to _duplicate_link lists
(not counting their representatives which stay on the main list). */
int _total_duplicates;
/* Size of alphabet. */
unsigned int _alpha_size;
/* Counts occurrences of each key set character.
_occurrences[c] is the number of times that c occurs among the _selchars
of a keyword. */