glib/tests/unicode-normalize.c

#include <glib.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Overall verdict of the run: starts TRUE, is set to FALSE by test_form()
 * whenever a normalization result does not match, and is turned into the
 * process exit status at the end of main(). */
gboolean success = TRUE;

/*
 * decode:
 * @input: one column of the test file: hexadecimal code points separated
 *         by spaces, e.g. "0044 0307"
 *
 * Converts the column into a UTF-8 string.
 *
 * Returns: a newly allocated UTF-8 string that the caller releases with
 * g_free(), or NULL if the column contains a code point this test does
 * not handle (above 0xffff, or in the range 0xac00-0xd7ff).
 *
 * If a token cannot be parsed as a hexadecimal number, an error is
 * printed to stderr and the process exits with status 1.
 */
static char *
decode (const gchar *input)
{
  unsigned ch;
  int offset = 0;
  GString *result = g_string_new (NULL);

  /* sscanf ("%x") silently skips leading white space, but the
   * token-skipping loop below does not.  Without this, a column that
   * starts with a space would have its first code point parsed (and
   * appended) twice. */
  while (input[offset] == ' ')
    offset++;

  do
    {
      if (sscanf (input + offset, "%x", &ch) != 1)
        {
          fprintf (stderr, "Error parsing character string %s\n", input);
          exit (1);
        }

      /* FIXME: We don't handle the > BMP or Hangul syllables */
      if (ch > 0xffff ||                   /* > BMP */
          (ch >= 0xac00 && ch <= 0xd7ff))  /* Hangul syllables */
        {
          g_string_free (result, TRUE);
          return NULL;
        }

      g_string_append_unichar (result, ch);

      /* Step over the token just parsed ... */
      while (input[offset] && input[offset] != ' ')
        offset++;
      /* ... and over the separator(s) in front of the next one. */
      while (input[offset] == ' ')
        offset++;
    }
  while (input[offset]);

  /* Hand the character data to the caller; only the GString wrapper
   * is freed here. */
  return g_string_free (result, FALSE);
}

/* Printable labels for the normalization forms, looked up with a
 * GNormalizeMode value when reporting a failure.
 * NOTE(review): assumes G_NORMALIZE_NFD, _NFC, _NFKD, _NFKC have the
 * values 0..3 in exactly this order -- confirm against gunicode.h. */
const char *names[4] = { "NFD", "NFC", "NFKD", "NFKC" };

/*
 * test_form:
 * @line: line number in the test file; used only in failure reports
 * @mode: normalization form passed to g_utf8_normalize()
 * @do_compat: for a non-compatibility @mode (neither NFKC nor NFKD),
 *             selects which columns are fed in: FALSE means columns
 *             1-3, TRUE means columns 4-5.  For NFKC/NFKD all five
 *             columns are fed in regardless of this flag.
 * @expected: zero-based index of the column every result must equal
 * @c: the five decoded (UTF-8) columns of the line
 * @raw: the undecoded text fields of the same line; raw[5], when
 *       present, is printed as a description of the test case
 *
 * Normalizes each selected column and compares it with c[expected].
 * Every mismatch is reported on stderr and clears the global `success`
 * flag; the function itself returns nothing.
 */
static void
test_form (int            line,
           GNormalizeMode mode,
           gboolean       do_compat,
           int            expected,
           char         **c,
           char         **raw)
{
  int i;
  gboolean mode_is_compat = (mode == G_NORMALIZE_NFKC ||
                             mode == G_NORMALIZE_NFKD);
  /* raw[5] is the array's NULL terminator when the line has exactly
   * five fields; never hand NULL to "%s". */
  const char *description = raw[5] ? raw[5] : "";

  for (i = 0; i < 5; i++)
    {
      gboolean is_compat_column = (i >= 3);
      char *result;

      /* Canonical forms only see the group of columns chosen by
       * do_compat; compatibility forms see every column. */
      if (!mode_is_compat && (is_compat_column ? !do_compat : do_compat))
        continue;

      result = g_utf8_normalize (c[i], -1, mode);

      /* A NULL result (input rejected by g_utf8_normalize) counts as
       * a failure instead of being passed to strcmp(). */
      if (result == NULL || strcmp (result, c[expected]) != 0)
        {
          /* Columns are reported 1-based for all five columns. */
          fprintf (stderr, "\nFailure: %d/%d: %s\n", line, i + 1, description);
          fprintf (stderr, "  g_utf8_normalize (%s, %s) != %s\n",
                   raw[i], names[mode], raw[expected]);
          success = FALSE;
        }

      g_free (result);
    }
}

/*
 * process_one:
 * @line: line number in the test file, forwarded to failure reports
 * @columns: NULL-terminated array of the ';'-separated fields of one
 *           data line
 *
 * Decodes the first five fields and runs every normalization check on
 * them.  If any field contains a code point decode() does not handle,
 * the whole line is skipped without being counted as a failure.
 *
 * Returns: TRUE if the line was processed or skipped; FALSE if the line
 * is malformed (fewer than five fields), in which case an error has
 * been printed to stderr.
 */
static gboolean
process_one (int line, gchar **columns)
{
  char *c[5];
  int i;
  int n_columns = 0;
  gboolean skip = FALSE;

  /* Make sure the five fields read below really exist, so that a blank
   * or truncated line cannot make us read past the end of the array. */
  while (n_columns < 5 && columns[n_columns] != NULL)
    n_columns++;

  if (n_columns < 5)
    {
      fprintf (stderr, "Error: line %d has fewer than 5 fields\n", line);
      return FALSE;
    }

  for (i = 0; i < 5; i++)
    {
      c[i] = decode (columns[i]);
      if (!c[i])
        skip = TRUE;
    }

  if (!skip)
    {
      /* Arguments: mode, do_compat, index of the expected column. */
      test_form (line, G_NORMALIZE_NFD, FALSE, 2, c, columns);
      test_form (line, G_NORMALIZE_NFD, TRUE, 4, c, columns);
      test_form (line, G_NORMALIZE_NFC, FALSE, 1, c, columns);
      test_form (line, G_NORMALIZE_NFC, TRUE, 3, c, columns);
      test_form (line, G_NORMALIZE_NFKD, TRUE, 4, c, columns);
      test_form (line, G_NORMALIZE_NFKC, TRUE, 3, c, columns);
    }

  /* Entries left NULL by decode() are fine here: g_free (NULL) is a
   * no-op. */
  for (i = 0; i < 5; i++)
    g_free (c[i]);

  return TRUE;
}

int main (int argc, char **argv)
{
  GIOChannel *in;
  GError *error = NULL;
  GString *buffer = g_string_new (NULL);
  int line_to_do = 0;
  int line = 1;

  if (argc != 2 && argc != 3)
    {
      fprintf (stderr, "Usage: unicode-normalize NormalizationTest.txt LINE\n");
      return 1;
    }

  if (argc == 3)
    line_to_do = atoi(argv[2]);

  in = g_io_channel_new_file (argv[1], "r", &error);
  if (!in)
    {
      fprintf (stderr, "Cannot open %s: %s\n", argv[1], error->message);
      return 1;
    }

  while (TRUE)
    {
      gsize term_pos;
      gchar **columns;

      if (g_io_channel_read_line_string (in, buffer, &term_pos, &error) != G_IO_STATUS_NORMAL)
	break;
	
      if (line_to_do && line != line_to_do)
	goto next;
      
      buffer->str[term_pos] = '\0';
      
      if (buffer->str[0] == '#') /* Comment */
	goto next;
      if (buffer->str[0] == '@') /* Part */
	{
	  fprintf (stderr, "\nProcessing %s\n", buffer->str + 1);
	  goto next;
	}
      
      columns = g_strsplit (buffer->str, ";", -1);
      if (!process_one (line, columns))
	return 1;
      g_strfreev (columns);

    next:
      g_string_truncate (buffer, 0);
      line++;
    }

  if (error)
    {
      fprintf (stderr, "Error reading test file, %s\n", error->message);
      return 1;
    }

  g_io_channel_close (in);
  g_string_free (buffer, TRUE);

  return !success;
}
Use G_N_ELEMENTS rather than a custom macro. Sun Jul 1 20:16:25 2001 Owen Taylor <otaylor@redhat.com> * glib/guniprop.c (g_unichar_totitle): Use G_N_ELEMENTS rather than a custom macro. * glib/gen-unicode-tables.pl: Adapt to changes in table formats for Unicode 3.1 * glib/gunicode.h glib/guniprop.c glib/gunichartables.h glib/gen-unicode-tables.pl: Add case conversion functions g_utf8_casefold, g_utf8_strup, g_utf8_strdown. * tests/unicode-caseconv.c tests/gen-casefold-txt.pl tests/gen-casemap-txt.pl tests/casefold.txt tests/casemap.txt: Test cases for case conversion. * glib/gunicode.h glib/gunidecomp.[ch] glib/gunicomp.h glib/gen-unicode-tables.pl: Add function to do Unicode normalization g_utf8_normalize(). * tests/unicode-normalize.c: Test program for case conversion. * glib/gunicode.h glib/gunicollate.c: Add collation functions g_utf8_collate, g_utf8_collate_key. * test/unicode-collate.c: Test program for collation. * glib/gdate.c (g_date_fill_parse_tokens): Fix uninitialized variable. * glib/gdate.c (g_date_strftime) docs/Changes-2.0.txt: Make work with UTF-8 even if the locale isn't UTF-8 based. Still somewhat of broken, if the format string contains characters not representable in the current locale, will warn and not work. * glib/gdate.c: Use UTF-8 normalization and casefolding. 2001-07-02 02:49:21 +02:00			`#include <glib.h>`
			`#include <stdio.h>`
			`#include <stdlib.h>`
Fix. Mon Jul 2 16:03:21 2001 Owen Taylor <otaylor@redhat.com> * glib/giochannel.c (g_io_channel_get_buffer_condition): Fix. * glib/giunix.c: Fix prepare/check/dispatch for watches. * tests/unicode-normalize.c: #include <string.h> 2001-07-02 22:26:38 +02:00			`#include <string.h>`
Use G_N_ELEMENTS rather than a custom macro. Sun Jul 1 20:16:25 2001 Owen Taylor <otaylor@redhat.com> * glib/guniprop.c (g_unichar_totitle): Use G_N_ELEMENTS rather than a custom macro. * glib/gen-unicode-tables.pl: Adapt to changes in table formats for Unicode 3.1 * glib/gunicode.h glib/guniprop.c glib/gunichartables.h glib/gen-unicode-tables.pl: Add case conversion functions g_utf8_casefold, g_utf8_strup, g_utf8_strdown. * tests/unicode-caseconv.c tests/gen-casefold-txt.pl tests/gen-casemap-txt.pl tests/casefold.txt tests/casemap.txt: Test cases for case conversion. * glib/gunicode.h glib/gunidecomp.[ch] glib/gunicomp.h glib/gen-unicode-tables.pl: Add function to do Unicode normalization g_utf8_normalize(). * tests/unicode-normalize.c: Test program for case conversion. * glib/gunicode.h glib/gunicollate.c: Add collation functions g_utf8_collate, g_utf8_collate_key. * test/unicode-collate.c: Test program for collation. * glib/gdate.c (g_date_fill_parse_tokens): Fix uninitialized variable. * glib/gdate.c (g_date_strftime) docs/Changes-2.0.txt: Make work with UTF-8 even if the locale isn't UTF-8 based. Still somewhat of broken, if the format string contains characters not representable in the current locale, will warn and not work. * glib/gdate.c: Use UTF-8 normalization and casefolding. 2001-07-02 02:49:21 +02:00
			`gboolean success = TRUE;`

			`static char *`
			`decode (const gchar *input)`
			`{`
			`unsigned ch;`
			`int offset = 0;`
			`GString *result = g_string_new (NULL);`

			`do`
			`{`
			`if (sscanf (input + offset, "%x", &ch) != 1)`
			`{`
			`fprintf (stderr, "Error parsing character string %s\n", input);`
			`exit (1);`
			`}`

			`/* FIXME: We don't handle the > BMP or Hangul syllables */`
			`if (ch > 0xffff \|\| /* > BMP */`
			`(ch >= 0xac00 && ch <= 0xd7ff)) /* Hangul syllables */`
			`{`
			`g_string_free (result, TRUE);`
			`return NULL;`
			`}`

Add functions to insert a unichar as UTF-8, since this is reasonably Fri Jul 13 19:20:06 2001 Owen Taylor <otaylor@redhat.com> * glib/gstring.c (g_string_insert/append/prepend_unichar): Add functions to insert a unichar as UTF-8, since this is reasonably common. * glib/gutf8.c glib/gunicode.h (g_utf8_get_char_validated): New function exposing iterating through possibly invalid/incomplete UTF-8 to unicode to the outside world. * glib/gutf8.c (g_utf8_get_char_extended): Fix max_len argument to be gssize, not gsize. 2001-07-19 16:35:48 +02:00			`g_string_append_unichar (result, ch);`
Use G_N_ELEMENTS rather than a custom macro. Sun Jul 1 20:16:25 2001 Owen Taylor <otaylor@redhat.com> * glib/guniprop.c (g_unichar_totitle): Use G_N_ELEMENTS rather than a custom macro. * glib/gen-unicode-tables.pl: Adapt to changes in table formats for Unicode 3.1 * glib/gunicode.h glib/guniprop.c glib/gunichartables.h glib/gen-unicode-tables.pl: Add case conversion functions g_utf8_casefold, g_utf8_strup, g_utf8_strdown. * tests/unicode-caseconv.c tests/gen-casefold-txt.pl tests/gen-casemap-txt.pl tests/casefold.txt tests/casemap.txt: Test cases for case conversion. * glib/gunicode.h glib/gunidecomp.[ch] glib/gunicomp.h glib/gen-unicode-tables.pl: Add function to do Unicode normalization g_utf8_normalize(). * tests/unicode-normalize.c: Test program for case conversion. * glib/gunicode.h glib/gunicollate.c: Add collation functions g_utf8_collate, g_utf8_collate_key. * test/unicode-collate.c: Test program for collation. * glib/gdate.c (g_date_fill_parse_tokens): Fix uninitialized variable. * glib/gdate.c (g_date_strftime) docs/Changes-2.0.txt: Make work with UTF-8 even if the locale isn't UTF-8 based. Still somewhat of broken, if the format string contains characters not representable in the current locale, will warn and not work. * glib/gdate.c: Use UTF-8 normalization and casefolding. 2001-07-02 02:49:21 +02:00
			`while (input[offset] && input[offset] != ' ')`
			`offset++;`
			`while (input[offset] && input[offset] == ' ')`
			`offset++;`
			`}`
			`while (input[offset]);`

			`return g_string_free (result, FALSE);`
			`}`

			`const char *names[4] = {`
			`"NFD",`
			`"NFC",`
			`"NFKD",`
			`"NFKC"`
			`};`

			`static void`
			`test_form (int line,`
			`GNormalizeMode mode,`
			`gboolean do_compat,`
			`int expected,`
			`char **c,`
			`char **raw)`
			`{`
			`int i;`

			`gboolean mode_is_compat = (mode == G_NORMALIZE_NFKC \|\|`
			`mode == G_NORMALIZE_NFKD);`

			`if (mode_is_compat \|\| !do_compat)`
			`{`
			`for (i = 0; i < 3; i++)`
			`{`
Add length arguments to g_utf8_{strup,strdown,casefold,collate_key}. Fri Jul 6 22:34:32 2001 Owen Taylor <otaylor@redhat.com> * glib/gunicode.h glib/gunidecomp.c glib/guniprop.c glib/gunicollate.c: Add length arguments to g_utf8_{strup,strdown,casefold,collate_key}. * glib/gdate.c: Fix for above. 2001-07-07 04:42:49 +02:00			`char *result = g_utf8_normalize (c[i], -1, mode);`
Use G_N_ELEMENTS rather than a custom macro. Sun Jul 1 20:16:25 2001 Owen Taylor <otaylor@redhat.com> * glib/guniprop.c (g_unichar_totitle): Use G_N_ELEMENTS rather than a custom macro. * glib/gen-unicode-tables.pl: Adapt to changes in table formats for Unicode 3.1 * glib/gunicode.h glib/guniprop.c glib/gunichartables.h glib/gen-unicode-tables.pl: Add case conversion functions g_utf8_casefold, g_utf8_strup, g_utf8_strdown. * tests/unicode-caseconv.c tests/gen-casefold-txt.pl tests/gen-casemap-txt.pl tests/casefold.txt tests/casemap.txt: Test cases for case conversion. * glib/gunicode.h glib/gunidecomp.[ch] glib/gunicomp.h glib/gen-unicode-tables.pl: Add function to do Unicode normalization g_utf8_normalize(). * tests/unicode-normalize.c: Test program for case conversion. * glib/gunicode.h glib/gunicollate.c: Add collation functions g_utf8_collate, g_utf8_collate_key. * test/unicode-collate.c: Test program for collation. * glib/gdate.c (g_date_fill_parse_tokens): Fix uninitialized variable. * glib/gdate.c (g_date_strftime) docs/Changes-2.0.txt: Make work with UTF-8 even if the locale isn't UTF-8 based. Still somewhat of broken, if the format string contains characters not representable in the current locale, will warn and not work. * glib/gdate.c: Use UTF-8 normalization and casefolding. 2001-07-02 02:49:21 +02:00			`if (strcmp (result, c[expected]) != 0)`
			`{`
			`fprintf (stderr, "\nFailure: %d/%d: %s\n", line, i + 1, raw[5]);`
			`fprintf (stderr, " g_utf8_normalize (%s, %s) != %s\n",`
			`raw[i], names[mode], raw[expected]);`
			`success = FALSE;`
			`}`

			`g_free (result);`
			`}`
			`}`
			`if (mode_is_compat \|\| do_compat)`
			`{`
			`for (i = 3; i < 5; i++)`
			`{`
Add length arguments to g_utf8_{strup,strdown,casefold,collate_key}. Fri Jul 6 22:34:32 2001 Owen Taylor <otaylor@redhat.com> * glib/gunicode.h glib/gunidecomp.c glib/guniprop.c glib/gunicollate.c: Add length arguments to g_utf8_{strup,strdown,casefold,collate_key}. * glib/gdate.c: Fix for above. 2001-07-07 04:42:49 +02:00			`char *result = g_utf8_normalize (c[i], -1, mode);`
Use G_N_ELEMENTS rather than a custom macro. Sun Jul 1 20:16:25 2001 Owen Taylor <otaylor@redhat.com> * glib/guniprop.c (g_unichar_totitle): Use G_N_ELEMENTS rather than a custom macro. * glib/gen-unicode-tables.pl: Adapt to changes in table formats for Unicode 3.1 * glib/gunicode.h glib/guniprop.c glib/gunichartables.h glib/gen-unicode-tables.pl: Add case conversion functions g_utf8_casefold, g_utf8_strup, g_utf8_strdown. * tests/unicode-caseconv.c tests/gen-casefold-txt.pl tests/gen-casemap-txt.pl tests/casefold.txt tests/casemap.txt: Test cases for case conversion. * glib/gunicode.h glib/gunidecomp.[ch] glib/gunicomp.h glib/gen-unicode-tables.pl: Add function to do Unicode normalization g_utf8_normalize(). * tests/unicode-normalize.c: Test program for case conversion. * glib/gunicode.h glib/gunicollate.c: Add collation functions g_utf8_collate, g_utf8_collate_key. * test/unicode-collate.c: Test program for collation. * glib/gdate.c (g_date_fill_parse_tokens): Fix uninitialized variable. * glib/gdate.c (g_date_strftime) docs/Changes-2.0.txt: Make work with UTF-8 even if the locale isn't UTF-8 based. Still somewhat of broken, if the format string contains characters not representable in the current locale, will warn and not work. * glib/gdate.c: Use UTF-8 normalization and casefolding. 2001-07-02 02:49:21 +02:00			`if (strcmp (result, c[expected]) != 0)`
			`{`
			`fprintf (stderr, "\nFailure: %d/%d: %s\n", line, i, raw[5]);`
			`fprintf (stderr, " g_utf8_normalize (%s, %s) != %s\n",`
			`raw[i], names[mode], raw[expected]);`
			`success = FALSE;`
			`}`

			`g_free (result);`
			`}`
			`}`
			`}`

			`static gboolean`
			`process_one (int line, gchar **columns)`
			`{`
			`char *c[5];`
			`int i;`
			`gboolean skip = FALSE;`

			`for (i=0; i < 5; i++)`
			`{`
			`c[i] = decode(columns[i]);`
			`if (!c[i])`
			`skip = TRUE;`
			`}`

			`if (!skip)`
			`{`
			`test_form (line, G_NORMALIZE_NFD, FALSE, 2, c, columns);`
			`test_form (line, G_NORMALIZE_NFD, TRUE, 4, c, columns);`
			`test_form (line, G_NORMALIZE_NFC, FALSE, 1, c, columns);`
			`test_form (line, G_NORMALIZE_NFC, TRUE, 3, c, columns);`
			`test_form (line, G_NORMALIZE_NFKD, TRUE, 4, c, columns);`
			`test_form (line, G_NORMALIZE_NFKC, TRUE, 3, c, columns);`
			`}`

			`for (i=0; i < 5; i++)`
			`g_free (c[i]);`

			`return TRUE;`
			`}`

			`int main (int argc, char **argv)`
			`{`
			`GIOChannel *in;`
			`GError *error = NULL;`
			`GString *buffer = g_string_new (NULL);`
			`int line_to_do = 0;`
			`int line = 1;`

			`if (argc != 2 && argc != 3)`
			`{`
			`fprintf (stderr, "Usage: unicode-normalize NormalizationTest.txt LINE\n");`
			`return 1;`
			`}`

			`if (argc == 3)`
			`line_to_do = atoi(argv[2]);`

Modified Files: glib/ChangeLog glib/glib.def glib/glib/giochannel.c Modified Files: glib/ChangeLog glib/glib.def glib/glib/giochannel.c glib/glib/giochannel.h glib/glib/giounix.c glib/glib/giowin32.c glib/docs/reference/glib/glib-sections.txt glib/tests/iochannel-test.c glib/tests/unicode-collate.c glib/tests/unicode-normalize.c Added Files: glib/tests/iochannel-test-infile * glib/giochannel.c: API changes, fixes to error handling, some internal restructuring * glib/giochannel.h: API changes, documentation for elements in GIOChannel structure * glib/giounix.c: Matched API changes, implemented backend to set is_readable, is_writeable, is_seekable flags, added a test to catch large values of count for which the behavior of write() is undefined * glib/giowin32.c: Changed to match new prototypes for io_close() and io_seek(), removed references to G_IO_STATUS_INTR, set is_seekable flag in channel creation functions * glib.def: Renamed g_channel_error_quark() and g_channel_error_from_errno() to g_io_channel_error_quark() and g_io_channel_error_from_errno(); added new functions g_io_channel_get_buffered() and g_io_channel_set_buffered() * docs/reference/glib/glib-sections.txt: Modified iochannel section to reflect new functions and API changes * tests/iochannel-test.c: Fixed to work with API changes * tests/iochannel-test-infile: New file; input file for iochannel-test * tests/unicode-collate.c tests/unicode-normalize.c: Changed G_IO_FILE_MODE_READ to "r" to match API change 2001-07-20 22:14:37 +02:00			`in = g_io_channel_new_file (argv[1], "r", &error);`
Use G_N_ELEMENTS rather than a custom macro. Sun Jul 1 20:16:25 2001 Owen Taylor <otaylor@redhat.com> * glib/guniprop.c (g_unichar_totitle): Use G_N_ELEMENTS rather than a custom macro. * glib/gen-unicode-tables.pl: Adapt to changes in table formats for Unicode 3.1 * glib/gunicode.h glib/guniprop.c glib/gunichartables.h glib/gen-unicode-tables.pl: Add case conversion functions g_utf8_casefold, g_utf8_strup, g_utf8_strdown. * tests/unicode-caseconv.c tests/gen-casefold-txt.pl tests/gen-casemap-txt.pl tests/casefold.txt tests/casemap.txt: Test cases for case conversion. * glib/gunicode.h glib/gunidecomp.[ch] glib/gunicomp.h glib/gen-unicode-tables.pl: Add function to do Unicode normalization g_utf8_normalize(). * tests/unicode-normalize.c: Test program for case conversion. * glib/gunicode.h glib/gunicollate.c: Add collation functions g_utf8_collate, g_utf8_collate_key. * test/unicode-collate.c: Test program for collation. * glib/gdate.c (g_date_fill_parse_tokens): Fix uninitialized variable. * glib/gdate.c (g_date_strftime) docs/Changes-2.0.txt: Make work with UTF-8 even if the locale isn't UTF-8 based. Still somewhat of broken, if the format string contains characters not representable in the current locale, will warn and not work. * glib/gdate.c: Use UTF-8 normalization and casefolding. 2001-07-02 02:49:21 +02:00			`if (!in)`
			`{`
			`fprintf (stderr, "Cannot open %s: %s\n", argv[1], error->message);`
			`return 1;`
			`}`

			`while (TRUE)`
			`{`
			`gsize term_pos;`
			`gchar **columns;`

			`if (g_io_channel_read_line_string (in, buffer, &term_pos, &error) != G_IO_STATUS_NORMAL)`
			`break;`

			`if (line_to_do && line != line_to_do)`
			`goto next;`

			`buffer->str[term_pos] = '\0';`

			`if (buffer->str[0] == '#') /* Comment */`
			`goto next;`
			`if (buffer->str[0] == '@') /* Part */`
			`{`
			`fprintf (stderr, "\nProcessing %s\n", buffer->str + 1);`
			`goto next;`
			`}`

			`columns = g_strsplit (buffer->str, ";", -1);`
			`if (!process_one (line, columns))`
			`return 1;`
			`g_strfreev (columns);`

			`next:`
			`g_string_truncate (buffer, 0);`
			`line++;`
			`}`

			`if (error)`
			`{`
			`fprintf (stderr, "Error reading test file, %s\n", error->message);`
			`return 1;`
			`}`

			`g_io_channel_close (in);`
			`g_string_free (buffer, TRUE);`

			`return !success;`
			`}`