gperf/src/key-list.cc

/* Routines for building, ordering, and printing the keyword list.
   Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
   written by Douglas C. Schmidt (schmidt@ics.uci.edu)

This file is part of GNU GPERF.

GNU GPERF is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 1, or (at your option)
any later version.

GNU GPERF is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU GPERF; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111, USA.  */

#include <stdio.h>
#include <string.h> /* declares strncpy(), strchr() */
#include <stdlib.h> /* declares malloc(), free(), abs(), exit(), abort() */
#include <limits.h> /* defines UCHAR_MAX etc. */
#include "options.h"
#include "read-line.h"
#include "hash-table.h"
#include "key-list.h"

/* Make the hash table 8 times larger than the number of keyword entries. */
static const int TABLE_MULTIPLE     = 10;

/* Efficiently returns the least power of two greater than or equal to X! */
#define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X)))

int Key_List::_determined[MAX_ALPHA_SIZE];

/* Destructor dumps diagnostics during debugging. */

Key_List::~Key_List ()
{
  if (option[DEBUG])
    {
      fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
               "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
               _list_len, _total_keys, _total_duplicates, _max_key_len);
      dump ();
      fprintf (stderr, "End dumping list.\n\n");
    }
}

/* Gathers the input stream into a buffer until one of two things occur:

   1. We read a '%' followed by a '%'
   2. We read a '%' followed by a '}'

   The first symbolizes the beginning of the keyword list proper,
   The second symbolizes the end of the C source code to be generated
   verbatim in the output file.

   I assume that the keys are separated from the optional preceding struct
   declaration by a consecutive % followed by either % or } starting in
   the first column. The code below uses an expandible buffer to scan off
   and return a pointer to all the code (if any) appearing before the delimiter. */

const char *
Key_List::get_special_input (char delimiter)
{
  int size  = 80;
  char *buf = new char[size];
  int c, i;

  for (i = 0; (c = getchar ()) != EOF; i++)
    {
      if (c == '%')
        {
          if ((c = getchar ()) == delimiter)
            {

              while ((c = getchar ()) != '\n')
                ; /* discard newline */

              if (i == 0)
                return "";
              else
                {
                  buf[delimiter == '%' && buf[i - 2] == ';' ? i - 2 : i - 1] = '\0';
                  return buf;
                }
            }
          else
            buf[i++] = '%';
        }
      else if (i >= size) /* Yikes, time to grow the buffer! */
        {
          char *temp = new char[size *= 2];
          int j;

          for (j = 0; j < i; j++)
            temp[j] = buf[j];

          buf = temp;
        }
      buf[i] = c;
    }

  return 0;        /* Problem here. */
}

/* Stores any C text that must be included verbatim into the
   generated code output. */

const char *
Key_List::save_include_src ()
{
  int c;

  if ((c = getchar ()) != '%')
    ungetc (c, stdin);
  else if ((c = getchar ()) != '{')
    {
      fprintf (stderr, "internal error, %c != '{' on line %d in file %s", c, __LINE__, __FILE__);
      exit (1);
    }
  else
    return get_special_input ('}');
  return "";
}

/* Determines from the input file whether the user wants to build a table
   from a user-defined struct, or whether the user is content to simply
   use the default array of keys. */

const char *
Key_List::get_array_type ()
{
  return get_special_input ('%');
}

/* strcspn - find length of initial segment of S consisting entirely
   of characters not from REJECT (borrowed from Henry Spencer's
   ANSI string package, when GNU libc comes out I'll replace this...). */

#ifndef strcspn
inline int
Key_List::strcspn (const char *s, const char *reject)
{
  const char *scan;
  const char *rej_scan;
  int   count = 0;

  for (scan = s; *scan; scan++)
    {

      for (rej_scan = reject; *rej_scan; rej_scan++)
        if (*scan == *rej_scan)
          return count;

      count++;
    }

  return count;
}
#endif

/* Sets up the Return_Type, the Struct_Tag type and the Array_Type
   based upon various user Options. */

void
Key_List::set_output_types ()
{
  if (option[TYPE])
    {
      _array_type = get_array_type ();
      if (!_array_type)
        /* Something's wrong, but we'll catch it later on, in read_keys()... */
        return;
      /* Yow, we've got a user-defined type... */
      int i = strcspn (_array_type, "{\n\0");
      /* Remove trailing whitespace. */
      while (i > 0 && strchr (" \t", _array_type[i-1]))
        i--;
      int struct_tag_length = i;

      /* Set `struct_tag' to a naked "struct something". */
      char *structtag = new char[struct_tag_length + 1];
      strncpy (structtag, _array_type, struct_tag_length);
      structtag[struct_tag_length] = '\0';
      _struct_tag = structtag;

      /* The return type of the lookup function is "struct something *".
         No "const" here, because if !option[CONST], some user code might want
         to modify the structure. */
      char *rettype = new char[struct_tag_length + 3];
      strncpy (rettype, _array_type, struct_tag_length);
      rettype[struct_tag_length] = ' ';
      rettype[struct_tag_length + 1] = '*';
      rettype[struct_tag_length + 2] = '\0';
      _return_type = rettype;
    }
}

/* Extracts a key from an input line and creates a new KeywordExt_List for
   it. */

static KeywordExt_List *
parse_line (const char *line, const char *delimiters)
{
  if (*line == '"')
    {
      /* Parse a string in ANSI C syntax. */
      char *key = new char[strlen(line)];
      char *kp = key;
      const char *lp = line + 1;

      for (; *lp;)
        {
          char c = *lp;

          if (c == '\0')
            {
              fprintf (stderr, "unterminated string: %s\n", line);
              exit (1);
            }
          else if (c == '\\')
            {
              c = *++lp;
              switch (c)
                {
                case '0': case '1': case '2': case '3':
                case '4': case '5': case '6': case '7':
                  {
                    int code = 0;
                    int count = 0;
                    while (count < 3 && *lp >= '0' && *lp <= '7')
                      {
                        code = (code << 3) + (*lp - '0');
                        lp++;
                        count++;
                      }
                    if (code > UCHAR_MAX)
                      fprintf (stderr, "octal escape out of range: %s\n", line);
                    *kp = (char) code;
                    break;
                  }
                case 'x':
                  {
                    int code = 0;
                    int count = 0;
                    lp++;
                    while ((*lp >= '0' && *lp <= '9')
                           || (*lp >= 'A' && *lp <= 'F')
                           || (*lp >= 'a' && *lp <= 'f'))
                      {
                        code = (code << 4)
                               + (*lp >= 'A' && *lp <= 'F' ? *lp - 'A' + 10 :
                                  *lp >= 'a' && *lp <= 'f' ? *lp - 'a' + 10 :
                                  *lp - '0');
                        lp++;
                        count++;
                      }
                    if (count == 0)
                      fprintf (stderr, "hexadecimal escape without any hex digits: %s\n", line);
                    if (code > UCHAR_MAX)
                      fprintf (stderr, "hexadecimal escape out of range: %s\n", line);
                    *kp = (char) code;
                    break;
                  }
                case '\\': case '\'': case '"':
                  *kp = c;
                  lp++;
                  break;
                case 'n':
                  *kp = '\n';
                  lp++;
                  break;
                case 't':
                  *kp = '\t';
                  lp++;
                  break;
                case 'r':
                  *kp = '\r';
                  lp++;
                  break;
                case 'f':
                  *kp = '\f';
                  lp++;
                  break;
                case 'b':
                  *kp = '\b';
                  lp++;
                  break;
                case 'a':
                  *kp = '\a';
                  lp++;
                  break;
                case 'v':
                  *kp = '\v';
                  lp++;
                  break;
                default:
                  fprintf (stderr, "invalid escape sequence in string: %s\n", line);
                  exit (1);
                }
            }
          else if (c == '"')
            break;
          else
            {
              *kp = c;
              lp++;
            }
          kp++;
        }
      lp++;
      if (*lp != '\0')
        {
          if (strchr (delimiters, *lp) == NULL)
            {
              fprintf (stderr, "string not followed by delimiter: %s\n", line);
              exit (1);
            }
          lp++;
        }
      return new KeywordExt_List (key, kp - key, option[TYPE] ? lp : "");
    }
  else
    {
      /* Not a string. Look for the delimiter. */
      int len = strcspn (line, delimiters);
      const char *rest;

      if (line[len] == '\0')
        rest = "";
      else
        /* Skip the first delimiter. */
        rest = &line[len + 1];
      return new KeywordExt_List (line, len, option[TYPE] ? rest : "");
    }
}

/* Reads in all keys from standard input and creates a linked list pointed
   to by Head.  This list is then quickly checked for ``links,'' i.e.,
   unhashable elements possessing identical key sets and lengths. */

void
Key_List::read_keys ()
{
  char *ptr;

  _include_src = save_include_src ();
  set_output_types ();

  /* Oops, problem with the input file. */
  if (! (ptr = Read_Line::read_next_line ()))
    {
      fprintf (stderr, "No words in input file, did you forget to prepend %s or use -t accidentally?\n", "%%");
      exit (1);
    }

  /* Read in all the keywords from the input file. */
  else
    {
      const char *delimiter = option.get_delimiter ();
      KeywordExt_List *temp;
      KeywordExt_List *trail = NULL;

      _head = parse_line (ptr, delimiter);
      _head->first()->init_selchars(this);

      for (temp = _head;
           (ptr = Read_Line::read_next_line ()) && strcmp (ptr, "%%");
           temp = temp->rest())
        {
          temp->rest() = parse_line (ptr, delimiter);
          temp->rest()->first()->init_selchars(this);
          _total_keys++;
        }

      /* See if any additional C code is included at end of this file. */
      if (ptr)
        _additional_code = 1;

      /* Hash table this number of times larger than keyword number. */
      int table_size = (_list_len = _total_keys) * TABLE_MULTIPLE;
      /* Table must be a power of 2 for the hash function scheme to work. */
      KeywordExt **table = new KeywordExt*[POW (table_size)];

      /* Make large hash table for efficiency. */
      Hash_Table found_link (table, table_size, option[NOLENGTH]);

      /* Test whether there are any links and also set the maximum length of
        an identifier in the keyword list. */

      for (temp = _head; temp; temp = temp->rest())
        {
          KeywordExt *keyword = temp->first();
          KeywordExt *other_keyword = found_link.insert (keyword);

          /* Check for links.  We deal with these by building an equivalence class
             of all duplicate values (i.e., links) so that only 1 keyword is
             representative of the entire collection.  This *greatly* simplifies
             processing during later stages of the program. */

          if (other_keyword)
            {
              _total_duplicates++;
              _list_len--;
              trail->rest() = temp->rest();
              temp->first()->_duplicate_link = other_keyword->_duplicate_link;
              other_keyword->_duplicate_link = temp->first();

              /* Complain if user hasn't enabled the duplicate option. */
              if (!option[DUP] || option[DEBUG])
                fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
                                 keyword->_allchars_length, keyword->_allchars,
                                 other_keyword->_allchars_length, other_keyword->_allchars,
                                 keyword->_selchars_length, keyword->_selchars);
            }
          else
            trail = temp;

          /* Update minimum and maximum keyword length, if needed. */
          if (_max_key_len < keyword->_allchars_length)
            _max_key_len = keyword->_allchars_length;
          if (_min_key_len > keyword->_allchars_length)
            _min_key_len = keyword->_allchars_length;
        }

      delete[] table;

      /* Exit program if links exists and option[DUP] not set, since we can't continue */
      if (_total_duplicates)
        {
          if (option[DUP])
            fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
                             _total_duplicates);
          else
            {
              fprintf (stderr, "%d input keys have identical hash values,\ntry different key positions or use option -D.\n",
                               _total_duplicates);
              exit (1);
            }
        }
      /* Exit program if an empty string is used as key, since the comparison
         expressions don't work correctly for looking up an empty string. */
      if (_min_key_len == 0)
        {
          fprintf (stderr, "Empty input key is not allowed.\nTo recognize an empty input key, your code should check for\nlen == 0 before calling the gperf generated lookup function.\n");
          exit (1);
        }
    }
}

/* Recursively merges two sorted lists together to form one sorted list. The
   ordering criteria is by frequency of occurrence of elements in the key set
   or by the hash value.  This is a kludge, but permits nice sharing of
   almost identical code without incurring the overhead of a function
   call comparison. */

KeywordExt_List *
Key_List::merge (KeywordExt_List *list1, KeywordExt_List *list2)
{
  KeywordExt_List *result;
  KeywordExt_List **resultp = &result;
  for (;;)
    {
      if (!list1)
        {
          *resultp = list2;
          break;
        }
      if (!list2)
        {
          *resultp = list1;
          break;
        }
      if (_occurrence_sort && list1->first()->_occurrence < list2->first()->_occurrence
          || _hash_sort && list1->first()->_hash_value > list2->first()->_hash_value)
        {
          *resultp = list2;
          resultp = &list2->rest(); list2 = list1; list1 = *resultp;
        }
      else
        {
          *resultp = list1;
          resultp = &list1->rest(); list1 = *resultp;
        }
    }
  return result;
}

/* Applies the merge sort algorithm to recursively sort the key list by
   frequency of occurrence of elements in the key set. */

KeywordExt_List *
Key_List::merge_sort (KeywordExt_List *head)
{
  if (!head || !head->rest())
    return head;
  else
    {
      KeywordExt_List *middle = head;
      KeywordExt_List *temp   = head->rest()->rest();

      while (temp)
        {
          temp   = temp->rest();
          middle = middle->rest();
          if (temp)
            temp = temp->rest();
        }

      temp         = middle->rest();
      middle->rest() = 0;
      return merge (merge_sort (head), merge_sort (temp));
    }
}

/* Returns the frequency of occurrence of elements in the key set. */

inline int
Key_List::get_occurrence (KeywordExt *ptr)
{
  int value = 0;

  const char *p = ptr->_selchars;
  unsigned int i = ptr->_selchars_length;
  for (; i > 0; p++, i--)
    value += _occurrences[(unsigned char)(*p)];

  return value;
}

/* Enables the index location of all key set elements that are now
   determined. */

inline void
Key_List::set_determined (KeywordExt *ptr)
{
  const char *p = ptr->_selchars;
  unsigned int i = ptr->_selchars_length;
  for (; i > 0; p++, i--)
    _determined[(unsigned char)(*p)] = 1;
}

/* Returns TRUE if PTR's key set is already completely determined. */

inline int
Key_List::already_determined (KeywordExt *ptr)
{
  int is_determined = 1;

  const char *p = ptr->_selchars;
  unsigned int i = ptr->_selchars_length;
  for (; is_determined && i > 0; p++, i--)
    is_determined = _determined[(unsigned char)(*p)];

  return is_determined;
}

/* Reorders the table by first sorting the list so that frequently occuring
   keys appear first, and then the list is reordered so that keys whose values
   are already determined will be placed towards the front of the list.  This
   helps prune the search time by handling inevitable collisions early in the
   search process.  See Cichelli's paper from Jan 1980 JACM for details.... */

void
Key_List::reorder ()
{
  KeywordExt_List *ptr;
  for (ptr = _head; ptr; ptr = ptr->rest())
    {
      KeywordExt *keyword = ptr->first();

      keyword->_occurrence = get_occurrence (keyword);
    }

  _hash_sort = 0;
  _occurrence_sort = 1;

  _head = merge_sort (_head);

  for (ptr = _head; ptr->rest(); ptr = ptr->rest())
    {
      set_determined (ptr->first());

      if (!already_determined (ptr->rest()->first()))
        {
          KeywordExt_List *trail_ptr = ptr->rest();
          KeywordExt_List *run_ptr   = trail_ptr->rest();

          for (; run_ptr; run_ptr = trail_ptr->rest())
            {

              if (already_determined (run_ptr->first()))
                {
                  trail_ptr->rest() = run_ptr->rest();
                  run_ptr->rest()   = ptr->rest();
                  ptr = ptr->rest() = run_ptr;
                }
              else
                trail_ptr = run_ptr;
            }
        }
    }
}

/* Sorts the keys by hash value. */

void
Key_List::sort ()
{
  _hash_sort       = 1;
  _occurrence_sort = 0;

  _head = merge_sort (_head);
}

/* Dumps the key list to stderr stream. */

void
Key_List::dump ()
{
  int field_width = get_max_keysig_size ();

  fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
           field_width, "selchars");

  for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
    fprintf (stderr, "%11d,%11d,%6d, %*.*s, %.*s\n",
             ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index,
             field_width, ptr->first()->_selchars_length, ptr->first()->_selchars,
             ptr->first()->_allchars_length, ptr->first()->_allchars);
}

/* Simple-minded constructor action here... */

Key_List::Key_List ()
{
  _total_keys       = 1;
  _max_key_len      = INT_MIN;
  _min_key_len      = INT_MAX;
  _array_type       = 0;
  _return_type      = 0;
  _struct_tag       = 0;
  _head             = 0;
  _total_duplicates = 0;
  _additional_code  = 0;
}

/* Returns the length of entire key list. */

int
Key_List::keyword_list_length ()
{
  return _list_len;
}

/* Returns length of longest key read. */

int
Key_List::max_key_length ()
{
  return _max_key_len;
}

/* Returns number of key positions.  */

int
Key_List::get_max_keysig_size ()
{
  return option[ALLCHARS] ? _max_key_len : option.get_max_keysig_size ();
}