Add tests for 8-bit clean comparison and binary comparison.

2025-12-02 13:09:22 +00:00 · 2000-08-20 20:35:53 +00:00
parent 3354d156d6
commit ac65860e4b
8 changed files with 161 additions and 1 deletions
--- a/tests/Makefile.in
+++ b/tests/Makefile.in
@@ -55,7 +55,7 @@ installdirs :

 uninstall :

-check : check-link-c check-link-c++ check-c check-ada check-modula3 check-pascal check-test
+check : check-link-c check-link-c++ check-c check-ada check-modula3 check-pascal check-lang-utf8 check-lang-ucs2 check-test
 	@true

 extracheck : @CHECK_LANG_SYNTAX@
@@ -101,6 +101,23 @@ check-pascal:
 	./pout -v < $(srcdir)/pascal.gperf > pascal.out
 	diff $(srcdir)/pascal.exp pascal.out

+# check for 8-bit cleanliness (compiles test.c itself rather than relying on a test.o left behind by another check target; assumes test.c lives in $(srcdir) like test2.c)
+check-lang-utf8:
+	$(GPERF) -k1 -t -I -K foreign_name < $(srcdir)/lang-utf8.gperf > lu8inset.c
+	$(CC) $(CFLAGS) -o lu8out lu8inset.c $(srcdir)/test.c
+	@echo "testing UTF-8 encoded languages, all items should be found in the set"
+	sed -e '1,6d' -e 's/,.*//' < $(srcdir)/lang-utf8.gperf | ./lu8out -v > lang-utf8.out
+	diff $(srcdir)/lang-utf8.exp lang-utf8.out
+
+# check for binary keywords with NUL bytes: look up the 2-byte-per-character words from lang-ucs2.in and diff the hex report against lang-ucs2.exp
+check-lang-ucs2:
+	$(CC) -c $(CFLAGS) $(srcdir)/test2.c
+	$(GPERF) -k4 -t -l -I -K foreign_name < $(srcdir)/lang-ucs2.gperf > lu2inset.c
+	$(CC) $(CFLAGS) -o lu2out lu2inset.c test2.o
+	@echo "testing UCS-2 encoded languages, all items should be found in the set"
+	./lu2out -v < $(srcdir)/lang-ucs2.in > lang-ucs2.out
+	diff $(srcdir)/lang-ucs2.exp lang-ucs2.out
+
 # these next 5 are demos that show off the generated code
 check-test:
 	$(GPERF) -L C -F ', 0, 0' -p -j1 -i 1 -g -o -t -G -N is_reserved_word -k1,3,'$$' < $(srcdir)/c-parse.gperf > c-parse.out
--- a/tests/lang-ucs2.exp
+++ b/tests/lang-ucs2.exp
@@ -0,0 +1,20 @@
+in word set 12A0121B122D129B
+in word set 010D00650073006B0079
+in word set 00440061006E0073006B
+in word set 0045006E0067006C006900730068
+in word set 00530075006F006D0069
+in word set 004600720061006E00E7006100690073
+in word set 0044006500750074007300630068
+in word set 039503BB03BB03B703BD03B903BA03AC
+in word set 05E205D105E805D905EA
+in word set 004900740061006C00690061006E006F
+in word set 004E006F00720073006B
+in word set 0420044304410441043A04380439
+in word set 004500730070006100F1006F006C
+in word set 005300760065006E0073006B0061
+in word set 0E200E320E290E320E440E170E22
+in word set 005400FC0072006B00E70065
+in word set 005400691EBF006E00670020005600691EC70074
+in word set 65E5672C8A9E
+in word set 4E2D6587
+in word set D55CAE00
--- a/tests/lang-ucs2.gperf
+++ b/tests/lang-ucs2.gperf
@@ -0,0 +1,26 @@
+struct language {
+  const char *foreign_name;
+  const char *english_name;
+  const char *locale;
+};
+%%
+"\x12\xA0\x12\x1B\x12\x2D\x12\x9B",       "Amharic",    NULL
+"\x01\x0D\x00\x65\x00\x73\x00\x6B\x00\x79",      "Czech",      "cs_CZ.UTF-8"
+"\x00\x44\x00\x61\x00\x6E\x00\x73\x00\x6B",      "Danish",     "da_DK.UTF-8"
+"\x00\x45\x00\x6E\x00\x67\x00\x6C\x00\x69\x00\x73\x00\x68",    "English",    "en_GB.UTF-8"
+"\x00\x53\x00\x75\x00\x6F\x00\x6D\x00\x69",      "Finnish",    "fi_FI.UTF-8"
+"\x00\x46\x00\x72\x00\x61\x00\x6E\x00\xE7\x00\x61\x00\x69\x00\x73",   "French",     "fr_FR.UTF-8"
+"\x00\x44\x00\x65\x00\x75\x00\x74\x00\x73\x00\x63\x00\x68",    "German",     "de_DE.UTF-8"
+"\x03\x95\x03\xBB\x03\xBB\x03\xB7\x03\xBD\x03\xB9\x03\xBA\x03\xAC",   "Greek",      "el_GR.UTF-8"
+"\x05\xE2\x05\xD1\x05\xE8\x05\xD9\x05\xEA",      "Hebrew",     "he_IL.UTF-8"
+"\x00\x49\x00\x74\x00\x61\x00\x6C\x00\x69\x00\x61\x00\x6E\x00\x6F",   "Italian",    "it_IT.UTF-8"
+"\x00\x4E\x00\x6F\x00\x72\x00\x73\x00\x6B",      "Norwegian",  "no_NO.UTF-8"
+"\x04\x20\x04\x43\x04\x41\x04\x41\x04\x3A\x04\x38\x04\x39",    "Russian",    "ru_RU.UTF-8"
+"\x00\x45\x00\x73\x00\x70\x00\x61\x00\xF1\x00\x6F\x00\x6C",    "Spanish",    "es_ES.UTF-8"
+"\x00\x53\x00\x76\x00\x65\x00\x6E\x00\x73\x00\x6B\x00\x61",    "Swedish",    "sv_SE.UTF-8"
+"\x0E\x20\x0E\x32\x0E\x29\x0E\x32\x0E\x44\x0E\x17\x0E\x22",    "Thai",       "th_TH.UTF-8"
+"\x00\x54\x00\xFC\x00\x72\x00\x6B\x00\xE7\x00\x65",     "Turkish",    "tr_TR.UTF-8"
+"\x00\x54\x00\x69\x1E\xBF\x00\x6E\x00\x67\x00\x20\x00\x56\x00\x69\x1E\xC7\x00\x74", "Vietnamese", "vi_VN.UTF-8"
+"\x65\xE5\x67\x2C\x8A\x9E",     "Japanese",   "ja_JP.UTF-8"
+"\x4E\x2D\x65\x87",       "Chinese",    "zh_CN.UTF-8"
+"\xD5\x5C\xAE\x00",       "Korean",     "ko_KR.UTF-8"
--- a/tests/lang-ucs2.in
+++ b/tests/lang-ucs2.in
--- a/tests/lang-utf8.exp
+++ b/tests/lang-utf8.exp
@@ -0,0 +1,20 @@
+in word set አማርኛ
+in word set česky
+in word set Dansk
+in word set English
+in word set Suomi
+in word set Français
+in word set Deutsch
+in word set Ελληνικά
+in word set עברית
+in word set Italiano
+in word set Norsk
+in word set Русский
+in word set Español
+in word set Svenska
+in word set ภาษาไทย
+in word set Türkçe
+in word set Tiếng Việt
+in word set 日本語
+in word set 中文
+in word set 한글
--- a/tests/lang-utf8.gperf
+++ b/tests/lang-utf8.gperf
@@ -0,0 +1,26 @@
+struct language {
+  const char *foreign_name;
+  const char *english_name;
+  const char *locale;
+};
+%%
+አማርኛ,       "Amharic",    NULL
+česky,      "Czech",      "cs_CZ.UTF-8"
+Dansk,      "Danish",     "da_DK.UTF-8"
+English,    "English",    "en_GB.UTF-8"
+Suomi,      "Finnish",    "fi_FI.UTF-8"
+Français,   "French",     "fr_FR.UTF-8"
+Deutsch,    "German",     "de_DE.UTF-8"
+Ελληνικά,   "Greek",      "el_GR.UTF-8"
+עברית,      "Hebrew",     "he_IL.UTF-8"
+Italiano,   "Italian",    "it_IT.UTF-8"
+Norsk,      "Norwegian",  "no_NO.UTF-8"
+Русский,    "Russian",    "ru_RU.UTF-8"
+Español,    "Spanish",    "es_ES.UTF-8"
+Svenska,    "Swedish",    "sv_SE.UTF-8"
+ภาษาไทย,    "Thai",       "th_TH.UTF-8"
+Türkçe,     "Turkish",    "tr_TR.UTF-8"
+Tiếng Việt, "Vietnamese", "vi_VN.UTF-8"
+日本語,     "Japanese",   "ja_JP.UTF-8"
+中文,       "Chinese",    "zh_CN.UTF-8"
+한글,       "Korean",     "ko_KR.UTF-8"
--- a/tests/test2.c
+++ b/tests/test2.c
@@ -0,0 +1,45 @@
+/*
+   Tests the generated perfect hash function.
+   The -v option prints diagnostics as to whether a word is in
+   the set or not.  Without -v the program is useful for timing.
+*/
+
+#include <stdio.h>
+
+#define MAX_LEN 80
+
+int
+main (argc, argv)
+     int   argc;
+     char *argv[];
+{
+  int  verbose = argc > 1 ? 1 : 0;
+  char buf[2*MAX_LEN];
+  int buflen;
+
+  for (;;)
+    {
+      /* Simulate gets(buf) with 2 bytes per character, but stop at the
+         end of buf so that an overlong line cannot overflow it.  */
+      char *p = buf;
+      while (p < buf + sizeof (buf) && fread (p, 2, 1, stdin) == 1)
+        {
+          /* Big-endian '\n'; compare bytes, don't shift a signed char.  */
+          if (p[0] == 0 && p[1] == '\n')
+            break;
+          p += 2;
+        }
+      buflen = p - buf;
+      if (buflen == 0)
+        break;
+      /* in_word_set is not declared here; the gperf output provides it.  */
+      if (in_word_set (buf, buflen) && verbose)
+        printf ("in word set ");
+      else if (verbose)
+        printf ("NOT in word set ");
+      for (p = buf; p < buf + buflen; p++)
+        printf ("%02X", (unsigned char) *p);
+      printf("\n");
+    }
+  return 0;
+}