Update Unicode data to 4.0. (#107974)

2003-07-30 Noah Levitt <nlevitt@columbia.edu> * glib/gen-unicode-tables.pl: * glib/gunibreak.c: * glib/gunibreak.h: * glib/gunichartables.h: * glib/gunicode.h: * glib/gunicomp.h: * glib/gunidecomp.c: * glib/gunidecomp.h: * glib/guniprop.c: * tests/casefold.txt: * tests/casemap.txt: * tests/gen-casefold-txt.pl: * tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
2025-10-04 04:39:20 +02:00 · 2003-07-31 02:27:56 +00:00
parent cdf72b09e6
commit 05f99527eb
19 changed files with 22213 additions and 8644 deletions
--- a/16
+++ b/16
@@ -1,3 +1,19 @@
 2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
 	* glib/gen-unicode-tables.pl:
 	* glib/gunibreak.c:
 	* glib/gunibreak.h:
 	* glib/gunichartables.h:
 	* glib/gunicode.h:
 	* glib/gunicomp.h:
 	* glib/gunidecomp.c:
 	* glib/gunidecomp.h:
 	* glib/guniprop.c:
 	* tests/casefold.txt:
 	* tests/casemap.txt:
 	* tests/gen-casefold-txt.pl:
 	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
 2003-07-31  Tor Lillqvist  <tml@iki.fi>
 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-10
+++ b/ChangeLog.pre-2-10
@@ -1,3 +1,19 @@
 2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
 	* glib/gen-unicode-tables.pl:
 	* glib/gunibreak.c:
 	* glib/gunibreak.h:
 	* glib/gunichartables.h:
 	* glib/gunicode.h:
 	* glib/gunicomp.h:
 	* glib/gunidecomp.c:
 	* glib/gunidecomp.h:
 	* glib/guniprop.c:
 	* tests/casefold.txt:
 	* tests/casemap.txt:
 	* tests/gen-casefold-txt.pl:
 	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
 2003-07-31  Tor Lillqvist  <tml@iki.fi>
 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-12
+++ b/ChangeLog.pre-2-12
@@ -1,3 +1,19 @@
 2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
 	* glib/gen-unicode-tables.pl:
 	* glib/gunibreak.c:
 	* glib/gunibreak.h:
 	* glib/gunichartables.h:
 	* glib/gunicode.h:
 	* glib/gunicomp.h:
 	* glib/gunidecomp.c:
 	* glib/gunidecomp.h:
 	* glib/guniprop.c:
 	* tests/casefold.txt:
 	* tests/casemap.txt:
 	* tests/gen-casefold-txt.pl:
 	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
 2003-07-31  Tor Lillqvist  <tml@iki.fi>
 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-4
+++ b/ChangeLog.pre-2-4
@@ -1,3 +1,19 @@
 2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
 	* glib/gen-unicode-tables.pl:
 	* glib/gunibreak.c:
 	* glib/gunibreak.h:
 	* glib/gunichartables.h:
 	* glib/gunicode.h:
 	* glib/gunicomp.h:
 	* glib/gunidecomp.c:
 	* glib/gunidecomp.h:
 	* glib/guniprop.c:
 	* tests/casefold.txt:
 	* tests/casemap.txt:
 	* tests/gen-casefold-txt.pl:
 	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
 2003-07-31  Tor Lillqvist  <tml@iki.fi>
 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-6
+++ b/ChangeLog.pre-2-6
@@ -1,3 +1,19 @@
 2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
 	* glib/gen-unicode-tables.pl:
 	* glib/gunibreak.c:
 	* glib/gunibreak.h:
 	* glib/gunichartables.h:
 	* glib/gunicode.h:
 	* glib/gunicomp.h:
 	* glib/gunidecomp.c:
 	* glib/gunidecomp.h:
 	* glib/guniprop.c:
 	* tests/casefold.txt:
 	* tests/casemap.txt:
 	* tests/gen-casefold-txt.pl:
 	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
 2003-07-31  Tor Lillqvist  <tml@iki.fi>
 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-8
+++ b/ChangeLog.pre-2-8
@@ -1,3 +1,19 @@
 2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
 	* glib/gen-unicode-tables.pl:
 	* glib/gunibreak.c:
 	* glib/gunibreak.h:
 	* glib/gunichartables.h:
 	* glib/gunicode.h:
 	* glib/gunicomp.h:
 	* glib/gunidecomp.c:
 	* glib/gunidecomp.h:
 	* glib/guniprop.c:
 	* tests/casefold.txt:
 	* tests/casemap.txt:
 	* tests/gen-casefold-txt.pl:
 	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
 2003-07-31  Tor Lillqvist  <tml@iki.fi>
 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/glib/gen-unicode-tables.pl
+++ b/glib/gen-unicode-tables.pl
@@ -31,8 +31,12 @@
 # * For decomp table it might make sense to use a shift count other
 #   than 8.  We could easily compute the perfect shift count.
 # we use some perl unicode features
 require 5.006;
 use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
 # Names of fields in Unicode data table.
 $CODE = 0;
 $NAME = 1;
@@ -134,6 +138,8 @@ $FOLDING_MAPPING = 2;
     'PO' => "G_UNICODE_BREAK_POSTFIX",
     'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT",
     'AI' => "G_UNICODE_BREAK_AMBIGUOUS",
     'NL' => "G_UNICODE_BREAK_NEXT_LINE",
     'WJ' => "G_UNICODE_BREAK_WORD_JOINER",
     'XX' => "G_UNICODE_BREAK_UNKNOWN"
     );
@@ -143,8 +149,9 @@ $FOLDING_MAPPING = 2;
 # Maximum length of special-case strings
 my $special_case_len = 0;
 my @special_cases;
 my @special_case_offsets;
 my $special_case_offset = 0;
 $do_decomp = 0;
 $do_props = 1;
@@ -193,6 +200,9 @@ print "Unicode data from $ARGV[1]\n";
 open (INPUT, "< $ARGV[1]") || exit 1;
 # we save memory by skipping the huge empty area before U+E0000
 my $pages_before_e0000;
 $last_code = -1;
 while (<INPUT>)
 {
@@ -205,7 +215,10 @@ while (<INPUT>)
    $code = hex ($fields[$CODE]);
-    last if ($code > 0xFFFF); # ignore characters out of the basic plane
+    if ($code >= 0xE0000 and $last_code < 0xE0000)
    {
        $pages_before_e0000 = ($last_code >> 8) + 1;
    }
    if ($code > $last_code + 1)
    {
@@ -237,12 +250,12 @@ close INPUT;
@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
 	    '', '', '', '');
-for (++$last_code; $last_code < 0x10000; ++$last_code)
+for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
 {
    $gfields{$CODE} = sprintf ("%04x", $last_code);
    &process_one ($last_code, @gfields);
 }
--$last_code;			# Want last to be 0xFFFF.
+--$last_code;			# Want last to be 0x10FFFF.
 print "Creating line break table\n";
@@ -268,7 +281,7 @@ while (<INPUT>)
 	next;
    }
-    if ($fields[$CODE] =~ /([A-F0-9]{4})..([A-F0-9]{4})/) 
+    if ($fields[$CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/) 
    {
 	$start_code = hex ($1);
 	$end_code = hex ($2);
@@ -277,8 +290,6 @@ while (<INPUT>)
    }
    last if ($start_code > 0xFFFF); # FIXME ignore characters out of the basic plane 
    if ($start_code > $last_code + 1)
    {
 	# The gap represents undefined characters. If assigned,
@@ -306,7 +317,7 @@ while (<INPUT>)
 close INPUT;
-for (++$last_code; $last_code < 0x10000; ++$last_code)
+for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
 {
  if ($type[$last_code] eq 'Cn')
    {
@@ -317,9 +328,9 @@ for (++$last_code; $last_code < 0x10000; ++$last_code)
      $break_props[$last_code] = 'AL';
    }
 }
--$last_code;			# Want last to be 0xFFFF.
+--$last_code;			# Want last to be 0x10FFFF.
-print STDERR "Last code is not 0xFFFF" if ($last_code != 0xFFFF);
+print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF);
 print "Reading special-casing table for case conversion\n";
@@ -403,22 +414,21 @@ while (<INPUT>)
    $raw_code = $fields[$FOLDING_CODE];
    $code = hex ($raw_code);
    next if $code > 0xffff;	# FIXME!
    if ($#fields != 3)
    {
 	printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
 	next;
    }
-    next if ($fields[$FOLDING_STATUS] eq 'S');
+    # we don't use Simple or Turkic rules here
    next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);
    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
    # Check simple case
    if (@values == 1 && 
-	!(defined $value[$code] && $value[$code] >= 0xd800 && $value[$code] < 0xdc00) &&
+	!(defined $value[$code] && $value[$code] >= 0x1000000) &&
 	defined $type[$code]) {
 	my $lower;
@@ -441,13 +451,12 @@ while (<INPUT>)
    }
    my $string = pack ("U*", @values);
    $string =~ s/([\x80-\xff])/sprintf "\\x%02x",ord($1)/eg;
-    if (1 + length $string > $casefoldlen) {
+    if (1 + &length_in_bytes ($string) > $casefoldlen) {
-	$casefoldlen = 1 + length $string;
+	$casefoldlen = 1 + &length_in_bytes ($string);
    }
-    push @casefold, [ $code, $string ];
+    push @casefold, [ $code, &escape ($string) ];
 }
 close INPUT;
@@ -464,6 +473,16 @@ if ($do_decomp) {
 exit 0;
 # Perl's built-in "length" counts characters; this helper reports the
 # number of bytes in the string's stored representation instead.
 sub length_in_bytes
 {
     # Lexically scoped pragma: inside this sub, length() counts bytes.
     use bytes;
     my $text = shift;
     return length ($text);
 }
 # Process a single character.
 sub process_one
 {
@@ -528,7 +547,11 @@ sub print_tables
    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
-    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
+    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
    my $last_part1 = ($pages_before_e0000 * 256) - 1;
    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;
    $table_index = 0;
    printf OUT "static const char type_data[][256] = {\n";
@@ -538,8 +561,9 @@ sub print_tables
    }
    printf OUT "\n};\n\n";
-    print OUT "static const short type_table[256] = {\n";
+    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
-    for ($count = 0; $count <= $last; $count += 256)
+    print OUT "static const gint16 type_table_part1[$pages_before_e0000] = {\n";
    for ($count = 0; $count <= $last_part1; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
@@ -547,21 +571,32 @@ sub print_tables
    }
    print OUT "\n};\n\n";
    printf OUT "/* U+E0000 through U+%04X */\n", $last;
    print OUT "static const gint16 type_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256)
    {
 	print OUT ",\n" if $count > 0xE0000;
 	print OUT "  ", $row[$count / 256];
 	$bytes_out += 2;
    }
    print OUT "\n};\n\n";
    #
    # Now print attribute table.
    #
    $table_index = 0;
-    printf OUT "static const unsigned short attr_data[][256] = {\n";
+    printf OUT "static const gunichar attr_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
-	$row[$count / 256] = &print_row ($count, 2, \&fetch_attr);
+	$row[$count / 256] = &print_row ($count, 4, \&fetch_attr);
    }
    printf OUT "\n};\n\n";
-    print OUT "static const short attr_table[256] = {\n";
+    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
-    for ($count = 0; $count <= $last; $count += 256)
+    print OUT "static const gint16 attr_table_part1[$pages_before_e0000] = {\n";
    for ($count = 0; $count <= $last_part1; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
@@ -569,12 +604,21 @@ sub print_tables
    }
    print OUT "\n};\n\n";
    printf OUT "/* U+E0000 through U+%04X */\n", $last;
    print OUT "static const gint16 attr_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256)
    {
 	print OUT ",\n" if $count > 0xE0000;
 	print OUT "  ", $row[$count / 256];
 	$bytes_out += 2;
    }
    print OUT "\n};\n\n";
    #
    # print title case table
    #
-    # FIXME: type.
+    print OUT "static const gunichar title_table[][3] = {\n";
    print OUT "static const unsigned short title_table[][3] = {\n";
    my ($item);
    my ($first) = 1;
    foreach $item (sort keys %title_to_lower)
@@ -583,7 +627,7 @@ sub print_tables
 	    unless $first;
 	$first = 0;
 	printf OUT "  { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
-	$bytes_out += 6;
+	$bytes_out += 12;
    }
    print OUT "\n};\n\n";
@@ -666,6 +710,40 @@ sub print_row
    return sprintf "%d /* page %d */", $table_index++, $start / 256;
 }
 # Return $string with every byte of its stored representation written as
 # a C-style "\xNN" escape (ASCII bytes included), so the result can be
 # pasted into a C string literal.  For the strings this script builds
 # with pack ("U", ...), those bytes are the UTF-8 encoding.
 #
 # The previous implementation matched single bytes with the regex escape
 # \C, which was removed in Perl 5.24 and no longer compiles.  Walking the
 # string under "use bytes" yields the same output on old and new perls.
 sub escape
 {
     my ($string) = @_;
     # Within this sub, length/substr/ord operate on bytes, not characters.
     use bytes;
     my $escaped = "";
     for (my $i = 0; $i < length ($string); $i++)
     {
         $escaped .= sprintf ("\\x%02x", ord (substr ($string, $i, 1)));
     }
     return $escaped;
 }
 # Find or allocate the position of $decomp inside the decomposition
 # string accumulated in $$decomp_string_ref, and return that position.
 #
 #   $decomp                    decomposition text, or undef if there is none
 #   $decomp_offsets_ref        hash ref: decomposition text => offset already assigned
 #   $decomp_string_ref         scalar ref: C source text of the string built so far
 #   $decomp_string_offset_ref  scalar ref: offset the next new entry will receive
 #
 # Returns the literal "G_UNICODE_NOT_PRESENT_OFFSET" when $decomp is
 # undefined.  A decomposition seen before reuses its recorded offset and
 # leaves the referenced variables untouched; a new one is appended as an
 # escaped, "\0"-terminated fragment and the running offset is advanced.
 sub handle_decomp ($$$$)
 {
     my ($decomp, $decomp_offsets_ref, $decomp_string_ref, $decomp_string_offset_ref) = @_;
     return "G_UNICODE_NOT_PRESENT_OFFSET" unless defined $decomp;
     # Already emitted once: share the existing copy.
     my $known = $decomp_offsets_ref->{$decomp};
     return $known if defined $known;
     # First occurrence: it starts at the current end of the string.
     my $offset = $$decomp_string_offset_ref;
     $decomp_offsets_ref->{$decomp} = $offset;
     $$decomp_string_ref .= "\n  \"" . &escape ($decomp) . "\\0\" /* offset $offset */";
     # Advance by the entry's byte length plus one for its "\0" terminator.
     $$decomp_string_offset_ref += &length_in_bytes ($decomp) + 1;
     return $offset;
 }
 # Generate the character decomposition header.
 sub print_decomp
 {
@@ -684,19 +762,26 @@ sub print_decomp
    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
-    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
+    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX (0x110000 / 256)\n\n";
    my $last_part1 = ($pages_before_e0000 * 256) - 1;
    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;
    $NOT_PRESENT_OFFSET = 65535;
    print OUT "#define G_UNICODE_NOT_PRESENT_OFFSET $NOT_PRESENT_OFFSET\n\n";
    my ($count, @row);
    $table_index = 0;
-    printf OUT "static const unsigned char cclass_data[][256] = {\n";
+    printf OUT "static const guchar cclass_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
 	$row[$count / 256] = &print_row ($count, 1, \&fetch_cclass);
    }
    printf OUT "\n};\n\n";
-    print OUT "static const short combining_class_table[256] = {\n";
+    print OUT "static const gint16 combining_class_table_part1[$pages_before_e0000] = {\n";
-    for ($count = 0; $count <= $last; $count += 256)
+    for ($count = 0; $count <= $last_part1; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
@@ -704,12 +789,19 @@ sub print_decomp
    }
    print OUT "\n};\n\n";
    print OUT "static const gint16 combining_class_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256)
    {
 	print OUT ",\n" if $count > 0xE0000;
 	print OUT "  ", $row[$count / 256];
 	$bytes_out += 2;
    }
    print OUT "\n};\n\n";
    print OUT "typedef struct\n{\n";
-    # FIXME: type.
+    print OUT "  gunichar ch;\n";
-    print OUT "  unsigned short ch;\n";
+    print OUT "  guint16 canon_offset;\n";
-    print OUT "  unsigned char canon_offset;\n";
+    print OUT "  guint16 compat_offset;\n";
    print OUT "  unsigned char compat_offset;\n";
    print OUT "  unsigned short expansion_offset;\n";
    print OUT "} decomposition;\n\n";
    print OUT "static const decomposition decomp_table[] =\n{\n";
@@ -737,40 +829,19 @@ sub print_decomp
 		undef $compat_decomp; 
 	    }
-	    my $string = "";
+	    my $canon_offset = handle_decomp ($canon_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
-	    my $canon_offset = 0xff;
+	    my $compat_offset = handle_decomp ($compat_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
 	    my $compat_offset = 0xff;
-	    if (defined $canon_decomp) {
+            die if $decomp_string_offset > $NOT_PRESENT_OFFSET;
 		$canon_offset = 0;
 		$string .= $canon_decomp;
 	    }
 	    if (defined $compat_decomp) {
 		if (defined $canon_decomp) {
 		    $string .= "\\x00\\x00";
 		}
 		$compat_offset = (length $string) / 4;
 		$string .= $compat_decomp;
 	    }
            if (!defined($decomp_offsets{$string})) {
                $decomp_offsets{$string} = $decomp_string_offset;
                $decomp_string .= "\n  \"".$string."\\0\\0\" /* offset ".
                    $decomp_string_offset." */";
                $decomp_string_offset += ((length $string) / 4) + 2;
                $bytes_out += (length $string) / 4 + 2; # "\x20"
            }
            printf OUT qq(  { 0x%04x, %u, %u, %d }), 
                $count, $canon_offset, $compat_offset, $decomp_offsets{$string};
 	    $bytes_out += 6;
            printf OUT qq(  { 0x%04x, $canon_offset, $compat_offset }), $count;
 	    $bytes_out += 8;
 	}
    }
    print OUT "\n};\n\n";
    $bytes_out += $decomp_string_offset + 1;
-    printf OUT "static const guchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
+    printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
    print OUT "#endif /* DECOMP_H */\n";
@@ -796,20 +867,25 @@ sub print_line_break
    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";
-    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
+    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last;
-    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
+    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
    my $last_part1 = ($pages_before_e0000 * 256) - 1;
    printf OUT "/* the last code point that should be looked up in break_property_table_part1 */\n";
    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
    $table_index = 0;
-    printf OUT "static const char break_property_data[][256] = {\n";
+    printf OUT "static const gint8 break_property_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
 	$row[$count / 256] = &print_row ($count, 1, \&fetch_break_type);
    }
    printf OUT "\n};\n\n";
-    print OUT "static const short break_property_table[256] = {\n";
+    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
-    for ($count = 0; $count <= $last; $count += 256)
+    print OUT "static const gint16 break_property_table_part1[$pages_before_e0000] = {\n";
    for ($count = 0; $count <= $last_part1; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
@@ -817,6 +893,17 @@ sub print_line_break
    }
    print OUT "\n};\n\n";
    printf OUT "/* U+E0000 through U+%04X */\n", $last;
    print OUT "static const gint16 break_property_table_part2[768] = {\n";
    for ($count = 0xE0000; $count <= $last; $count += 256)
    {
 	print OUT ",\n" if $count > 0xE0000;
 	print OUT "  ", $row[$count / 256];
 	$bytes_out += 2;
    }
    print OUT "\n};\n\n";
    print OUT "#endif /* BREAKTABLES_H */\n";
    close (OUT);
@@ -870,7 +957,7 @@ sub make_decomp
    my $result = "";
    foreach $iter (&expand_decomp ($code, $compat))
    {
-	$result .= sprintf "\\x%02x\\x%02x", $iter / 256, $iter & 0xff;
+	$result .= pack ("U", $iter);  # to utf-8
    }
    $result;
@@ -888,21 +975,17 @@ sub add_special_case
    for $value (@values) {
-	$result .= sprintf ("\\x%02x\\x%02x", $value / 256, $value & 0xff);
+	$result .= pack ("U", $value);  # to utf-8
    }
-    $result .= "\\0";
+    push @special_case_offsets, $special_case_offset;
-    if (2 * @values + 2 > $special_case_len) {
+    # We encode special cases up in the 0x1000000 space
-	$special_case_len = 2 * @values + 2;
+    $value[$code] = 0x1000000 + $special_case_offset;
    }
-    push @special_cases, $result;
+    $special_case_offset += 1 + &length_in_bytes ($result);
-    #
+    push @special_cases, &escape ($result);
    # We encode special cases in the surrogate pair space
    #
    $value[$code] = 0xD800 + scalar(@special_cases) - 1;
 }
 sub output_special_case_table
@@ -915,13 +998,15 @@ sub output_special_case_table
 * First, the best single character mapping to lowercase if Lu, 
 * and to uppercase if Ll, followed by the output mapping for the two cases 
 * other than the case of the codepoint, in the order [Ll],[Lu],[Lt],
- * separated and terminated by a double NUL.
+ * encoded in UTF-8, separated and terminated by a null character.
 */
-static const guchar special_case_table[][$special_case_len] = {
+static const gchar special_case_table[] = {
 EOT
    my $i = 0;
    for $case (@special_cases) {
-	print $out qq( "$case",\n);
+	print $out qq( "$case\\0" /* offset ${special_case_offsets[$i]} */\n);
        $i++;
    }
    print $out <<EOT;
@@ -929,7 +1014,7 @@ EOT
 EOT
-    print STDERR "Generated ", ($special_case_len * scalar @special_cases), " bytes in special case table\n";
+    print STDERR "Generated " . ($special_case_offset + 1) . " bytes in special case table\n";
 }
 sub enumerate_ordered
@@ -962,16 +1047,22 @@ sub output_composition_table
    # decompositions. At the same time, record
    # the first and second character of each decomposition
-    for $code (keys %compositions) {
+    for $code (keys %compositions) 
    {
 	@values = map { hex ($_) } split /\s+/, $compositions{$code};
        # non-starters
 	if ($cclass[$values[0]]) {
 	    delete $compositions{$code};
 	    next;
 	}
        # single-character decompositions
 	if (@values == 1) {
 	    delete $compositions{$code};
 	    next;
 	}
 	if (@values != 2) {
 	    die "$code has more than two elements in its decomposition!\n";
 	}
@@ -983,10 +1074,10 @@ sub output_composition_table
 	}
    }
-    # Assign integer indicices, removing singletons
+    # Assign integer indices, removing singletons
    my $n_first = enumerate_ordered (\%first);
-    # Now record the second character if each (non-singleton) decomposition
+    # Now record the second character of each (non-singleton) decomposition
    for $code (keys %compositions) {
 	@values = map { hex ($_) } split /\s+/, $compositions{$code};
@@ -1065,27 +1156,31 @@ sub output_composition_table
    my @row;						  
    $table_index = 0;
-    printf OUT "static const gushort compose_data[][256] = {\n";
+    printf OUT "static const guint16 compose_data[][256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
 	$row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; });
    }
    printf OUT "\n};\n\n";
-    print OUT "static const short compose_table[256] = {\n";
+    print OUT "static const gint16 compose_table[256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
 	$bytes_out += 4;
    }
    print OUT "\n};\n\n";
    $bytes_out += 256 * 2;
    # Output first singletons
-    print OUT "static const gushort compose_first_single[][2] = {\n";
+    print OUT "static const guint16 compose_first_single[][2] = {\n";
    $i = 0;				     
    for $record (@first_singletons) {
        if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
            die "time to switch compose_first_single to gunichar" ;
        }
 	print OUT ",\n" if $i++ > 0;
 	printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
@@ -1095,9 +1190,12 @@ sub output_composition_table
    # Output second singletons
-    print OUT "static const gushort compose_second_single[][2] = {\n";
+    print OUT "static const guint16 compose_second_single[][2] = {\n";
    $i = 0;				     
    for $record (@second_singletons) {
        if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
            die "time to switch compose_second_single to gunichar";
        }
 	print OUT ",\n" if $i++ > 0;
 	printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
@@ -1108,7 +1206,7 @@ sub output_composition_table
    # Output array of composition pairs
    print OUT <<EOT;
-static const gushort compose_array[$n_first][$n_second] = {
+static const guint16 compose_array[$n_first][$n_second] = {
 EOT
    for (my $i = 0; $i < $n_first; $i++) {
@@ -1117,7 +1215,10 @@ EOT
 	for (my $j = 0; $j < $n_second; $j++) {
 	    print OUT ", " if $j;
 	    if (exists $reverse{"$i|$j"}) {
-		printf OUT "%#06x", $reverse{"$i|$j"};
+                if ($reverse{"$i|$j"} > 0xFFFF) {
                    die "time to switch compose_array to gunichar" ;
                }
 		printf OUT "0x%04x", $reverse{"$i|$j"};
 	    } else {
 		print OUT "     0";
            }
@@ -1151,10 +1252,16 @@ EOT
   @casefold = sort { $a->[0] <=> $b->[0] } @casefold; 
-   for $case (@casefold) {
+   for $case (@casefold) 
   {
       $code = $case->[0];
       $string = $case->[1];
-       print $out sprintf(qq({ %#04x, "$string" },\n), $code);
+
       if ($code > 0xFFFF) {
           die "time to switch casefold_table to gunichar" ;
       }
       print $out sprintf(qq(  { 0x%04x, "$string" },\n), $code);
   }
--- a/glib/gunibreak.c
+++ b/glib/gunibreak.c
@@ -25,13 +25,22 @@
 #include "glib.h"
 #include "gunibreak.h"
 /* Break property lookup through break_property_table_part1.
  * PROP() invokes this with Page = (Char) >> 8 and Char = (Char) & 0xff.
  * A table entry >= G_UNICODE_MAX_TABLE_INDEX holds the result directly
  * (entry minus G_UNICODE_MAX_TABLE_INDEX) -- presumably used for pages
  * whose characters all share one value; confirm against the generator.
  * Any other entry is a row index into break_property_data.
  */
 #define TPROP_PART1(Page, Char) \
  ((break_property_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (break_property_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (break_property_data[break_property_table_part1[Page]][Char]))
-#define TPROP(Page, Char) \
+#define TPROP_PART2(Page, Char) \
-  ((break_property_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+  ((break_property_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
-   ? (break_property_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   ? (break_property_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
-   : (break_property_data[break_property_table[Page]][Char]))
+   : (break_property_data[break_property_table_part2[Page]][Char]))
-#define PROP(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_BREAK_UNKNOWN : TPROP ((Char) >> 8, (Char) & 0xff))
+#define PROP(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TPROP_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TPROP_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_BREAK_UNKNOWN))
 /**
 * g_unichar_break_type:
--- a/glib/gunibreak.h
+++ b/glib/gunibreak.h
--- a/glib/gunichartables.h
+++ b/glib/gunichartables.h
--- a/glib/gunicode.h
+++ b/glib/gunicode.h
@@ -100,7 +100,9 @@ typedef enum
  G_UNICODE_BREAK_POSTFIX,
  G_UNICODE_BREAK_COMPLEX_CONTEXT,
  G_UNICODE_BREAK_AMBIGUOUS,
-  G_UNICODE_BREAK_UNKNOWN
+  G_UNICODE_BREAK_UNKNOWN,
  G_UNICODE_BREAK_NEXT_LINE,
  G_UNICODE_BREAK_WORD_JOINER
 } GUnicodeBreakType;
 /* Returns TRUE if current locale uses UTF-8 charset.  If CHARSET is
--- a/glib/gunicomp.h
+++ b/glib/gunicomp.h
@@ -3,7 +3,7 @@
 #define COMPOSE_SECOND_START 357
 #define COMPOSE_SECOND_SINGLE_START 388
-static const gushort compose_data[][256] = {
+static const guint16 compose_data[][256] = {
  { /* page 0, index 0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
@@ -222,7 +222,7 @@ static const gushort compose_data[][256] = {
  }
 };
-static const short compose_table[256] = {
+static const gint16 compose_table[256] = {
  0 /* page 0 */,
  1 /* page 1 */,
  2 /* page 2 */,
@@ -274,7 +274,7 @@ static const short compose_table[256] = {
  15 /* page 48 */
 };
-static const gushort compose_first_single[][2] = {
+static const guint16 compose_first_single[][2] = {
 { 0x0338, 0x226e },
 { 0x0338, 0x2260 },
 { 0x0338, 0x226f },
@@ -486,7 +486,7 @@ static const gushort compose_first_single[][2] = {
 { 0x3099, 0x30fa },
 { 0x3099, 0x30fe }
 };
-static const gushort compose_second_single[][2] = {
+static const guint16 compose_second_single[][2] = {
 { 0x0627, 0x0622 },
 { 0x0627, 0x0623 },
 { 0x0627, 0x0625 },
@@ -506,7 +506,7 @@ static const gushort compose_second_single[][2] = {
 { 0x0dd9, 0x0ddc },
 { 0x0dd9, 0x0dde }
 };
-static const gushort compose_array[146][31] = {
+static const guint16 compose_array[146][31] = {
 { 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x0100, 0x0102, 0x0226, 0x00c4, 0x1ea2, 0x00c5,      0, 0x01cd, 0x0200, 0x0202,      0,      0,      0, 0x1ea0,      0, 0x1e00,      0,      0, 0x0104,      0,      0,      0,      0,      0,      0,      0,      0 },
 {      0,      0,      0,      0,      0,      0, 0x1e02,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0, 0x1e04,      0,      0,      0,      0,      0,      0,      0,      0, 0x1e06,      0,      0,      0,      0 },
 {      0, 0x0106, 0x0108,      0,      0,      0, 0x010a,      0,      0,      0,      0, 0x010c,      0,      0,      0,      0,      0,      0,      0,      0,      0, 0x00c7,      0,      0,      0,      0,      0,      0,      0,      0,      0 },
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@@ -28,13 +28,22 @@
 #include "gunicomp.h"
-#define CC(Page, Char) \
+#define CC_PART1(Page, Char) \
-  ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
-   ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
-   : (cclass_data[combining_class_table[Page]][Char]))
+   : (cclass_data[combining_class_table_part1[Page]][Char]))
 /* Combining class lookup through combining_class_table_part2.
  * COMBINING_CLASS() invokes this with Page = ((Char) - 0xe0000) >> 8,
  * i.e. Page is counted from U+E0000, and Char = (Char) & 0xff.
  * A table entry >= G_UNICODE_MAX_TABLE_INDEX holds the result directly
  * (entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry is a row
  * index into cclass_data.
  */
 #define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))
 #define COMBINING_CLASS(Char) \
-     (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
+  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
 /**
 * g_unicode_canonical_ordering:
@@ -84,7 +93,8 @@ g_unicode_canonical_ordering (gunichar *string,
    }
 }
-static const guchar *
+/* returns a pointer to a null-terminated UTF-8 string */
 static const gchar *
 find_decomposition (gunichar ch,
 		    gboolean compat)
 {
@@ -104,17 +114,17 @@ find_decomposition (gunichar ch,
 	      if (compat)
 		{
 		  offset = decomp_table[half].compat_offset;
-		  if (offset == 0xff)
+		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 		    offset = decomp_table[half].canon_offset;
 		}
 	      else
 		{
 		  offset = decomp_table[half].canon_offset;
-		  if (offset == 0xff)
+		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 		    return NULL;
 		}
-	      return &(decomp_expansion_string[decomp_table[half].expansion_offset + offset]);
+	      return &(decomp_expansion_string[offset]);
 	    }
 	  else if (half == start)
 	    break;
@@ -142,27 +152,20 @@ gunichar *
 g_unicode_canonical_decomposition (gunichar ch,
 				   gsize   *result_len)
 {
-  const guchar *decomp = find_decomposition (ch, FALSE);
+  const gchar *decomp = find_decomposition (ch, FALSE);
  const gchar *p;
  gunichar *r;
  if (decomp)
    {
      /* Found it.  */
-      int i, len;
+      int i;
      /* We store as a double-nul terminated string.  */
      for (len = 0; (decomp[len] || decomp[len + 1]);
 	   len += 2)
 	;
-      /* We've counted twice as many bytes as there are
+      *result_len = g_utf8_strlen (decomp, -1);
-	 characters.  */
+      r = g_malloc (*result_len * sizeof (gunichar));
      *result_len = len / 2;
      r = g_malloc (len / 2 * sizeof (gunichar));
-      for (i = 0; i < len; i += 2)
+      for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
-	{
+        r[i] = g_utf8_get_char (p);
 	  r[i / 2] = (decomp[i] << 8 | decomp[i + 1]);
 	}
    }
  else
    {
@@ -194,6 +197,7 @@ combine (gunichar  a,
  gushort index_a, index_b;
  index_a = COMPOSE_INDEX(a);
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
    {
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
@@ -206,6 +210,7 @@ combine (gunichar  a,
    }
  index_b = COMPOSE_INDEX(b);
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
    {
      if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
@@ -252,17 +257,10 @@ _g_utf8_normalize_wc (const gchar    *str,
    {
      gunichar wc = g_utf8_get_char (p);
-      const guchar *decomp = find_decomposition (wc, do_compat);
+      const gchar *decomp = find_decomposition (wc, do_compat);
      if (decomp)
-	{
+        n_wc += g_utf8_strlen (decomp, -1);
 	  int len;
 	  /* We store as a double-nul terminated string.  */
 	  for (len = 0; (decomp[len] || decomp[len + 1]);
 	       len += 2)
 	    ;
 	  n_wc += len / 2;
 	}
      else
 	n_wc++;
@@ -277,7 +275,7 @@ _g_utf8_normalize_wc (const gchar    *str,
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      gunichar wc = g_utf8_get_char (p);
-      const guchar *decomp;
+      const gchar *decomp;
      int cc;
      gsize old_n_wc = n_wc;
@@ -285,11 +283,9 @@ _g_utf8_normalize_wc (const gchar    *str,
      if (decomp)
 	{
-	  int len;
+          const char *pd;
-	  /* We store as a double-nul terminated string.  */
+          for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
-	  for (len = 0; (decomp[len] || decomp[len + 1]);
+            wc_buffer[n_wc++] = g_utf8_get_char (pd);
 	       len += 2)
 	    wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
 	}
      else
 	wc_buffer[n_wc++] = wc;
@@ -318,7 +314,6 @@ _g_utf8_normalize_wc (const gchar    *str,
  /* All decomposed and reordered */ 
  if (do_compose && n_wc > 0)
    {
      gsize i, j;
--- a/glib/gunidecomp.h
+++ b/glib/gunidecomp.h
--- a/glib/guniprop.c
+++ b/glib/guniprop.c
@@ -28,17 +28,30 @@
 #include "glib.h"
 #include "gunichartables.h"
 #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
                          ? attr_table_part1[Page] \
                          : attr_table_part2[(Page) - 0xe00])
 #define ATTTABLE(Page, Char) \
-  ((attr_table[Page] == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[attr_table[Page]][Char]))
+  ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))
-#define TTYPE(Page, Char) \
+#define TTYPE_PART1(Page, Char) \
-  ((type_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
-   ? (type_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
-   : (type_data[type_table[Page]][Char]))
+   : (type_data[type_table_part1[Page]][Char]))
 #define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[Page]][Char]))
 #define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))
 #define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
 #define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER	\
 		       || (Type) == G_UNICODE_LETTER_NUMBER	\
@@ -361,10 +374,10 @@ g_unichar_toupper (gunichar c)
  if (t == G_UNICODE_LOWERCASE_LETTER)
    {
      gunichar val = ATTTABLE (c >> 8, c & 0xff);
-      if (val >= 0xd800 && val < 0xdc00)
+      if (val >= 0x1000000)
 	{
-	  const guchar *p = special_case_table[val - 0xd800];
+	  const guchar *p = special_case_table + val - 0x1000000;
-	  return p[0] * 256 + p[1];
+	  return g_utf8_get_char (p);
 	}
      else
 	return val ? val : c;
@@ -398,10 +411,10 @@ g_unichar_tolower (gunichar c)
  if (t == G_UNICODE_UPPERCASE_LETTER)
    {
      gunichar val = ATTTABLE (c >> 8, c & 0xff);
-      if (val >= 0xd800 && val < 0xdc00)
+      if (val >= 0x1000000)
 	{
-	  const guchar *p = special_case_table[val - 0xd800];
+	  const guchar *p = special_case_table + val - 0x1000000;
-	  return p[0] * 256 + p[1];
+	  return g_utf8_get_char (p);
 	}
      else
 	return val ? val : c;
@@ -561,31 +574,22 @@ output_marks (const char **p_inout,
 static gsize
 output_special_case (gchar *out_buffer,
 		     gsize  len,
-		     int    index,
+		     int    offset,
 		     int    type,
 		     int    which)
 {
-  const guchar *p = special_case_table[index];
+  const guchar *p = special_case_table + offset;
  gint len;
  if (type != G_UNICODE_TITLECASE_LETTER)
-    p += 2; /* +2 to skip over "best single match" */
+    p = g_utf8_next_char (p);
  if (which == 1)
-    {
+    p += strlen (p) + 1;
      while (p[0] || p[1])
 	p += 2;
      p += 2;
    }
-  while (TRUE)
+  len = strlen (p);
-    {
+  if (out_buffer)
-      gunichar ch = p[0] * 256 + p[1];
+    memcpy (out_buffer, p, len);
      if (!ch)
 	break;
      len += g_unichar_to_utf8 (ch, out_buffer ? out_buffer + len : NULL);
      p += 2;
    }
  return len;
 }
@@ -662,9 +666,9 @@ real_toupper (const gchar *str,
 	{
 	  val = ATTTABLE (c >> 8, c & 0xff);
-	  if (val >= 0xd800 && val < 0xdc00)
+	  if (val >= 0x1000000)
 	    {
-	      len += output_special_case (out_buffer, len, val - 0xd800, t,
+	      len += output_special_case (out_buffer, len, val - 0x1000000, t,
 					  t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
 	    }
 	  else
@@ -785,9 +789,9 @@ real_tolower (const gchar *str,
 	{
 	  val = ATTTABLE (c >> 8, c & 0xff);
-	  if (val >= 0xd800 && val < 0xdc00)
+	  if (val >= 0x1000000)
 	    {
-	      len += output_special_case (out_buffer, len, val - 0xd800, t, 0);
+	      len += output_special_case (out_buffer, len, val - 0x1000000, t, 0);
 	    }
 	  else
 	    {
--- a/tests/casefold.txt
+++ b/tests/casefold.txt
@@ -1,4 +1,4 @@
-# Test cases generated from Unicode 3.1 data
+# Test cases generated from Unicode 4.0 data
 # by gen-casefold-test.pl. Do not edit.
 #
 # Some special hand crafted tests
@@ -89,8 +89,7 @@ Z	z
 Ī	ī
 Ĭ	ĭ
 Į	į
-İ	i
+İ	i̇
 ı	i
 Ĳ	ĳ
 Ĵ	ĵ
 Ķ	ķ
@@ -216,6 +215,7 @@ Z	z
 Ț	ț
 Ȝ	ȝ
 Ȟ	ȟ
 Ƞ	ƞ
 Ȣ	ȣ
 Ȥ	ȥ
 Ȧ	ȧ
@@ -266,6 +266,7 @@ Z	z
 ϑ	θ
 ϕ	φ
 ϖ	π
 Ϙ	ϙ
 Ϛ	ϛ
 Ϝ	ϝ
 Ϟ	ϟ
@@ -279,9 +280,11 @@ Z	z
 Ϯ	ϯ
 ϰ	κ
 ϱ	ρ
 ϲ	σ
 ϴ	θ
 ϵ	ε
 Ϸ	ϸ
 Ϲ	ϲ
 Ϻ	ϻ
 Ѐ	ѐ
 Ё	ё
 Ђ	ђ
@@ -347,6 +350,7 @@ Z	z
 Ѽ	ѽ
 Ѿ	ѿ
 Ҁ	ҁ
 Ҋ	ҋ
 Ҍ	ҍ
 Ҏ	ҏ
 Ґ	ґ
@@ -375,8 +379,11 @@ Z	z
 Ҿ	ҿ
 Ӂ	ӂ
 Ӄ	ӄ
 Ӆ	ӆ
 Ӈ	ӈ
 Ӊ	ӊ
 Ӌ	ӌ
 Ӎ	ӎ
 Ӑ	ӑ
 Ӓ	ӓ
 Ӕ	ӕ
@@ -397,6 +404,14 @@ Z	z
 Ӳ	ӳ
 Ӵ	ӵ
 Ӹ	ӹ
 Ԁ	ԁ
 Ԃ	ԃ
 Ԅ	ԅ
 Ԇ	ԇ
 Ԉ	ԉ
 Ԋ	ԋ
 Ԍ	ԍ
 Ԏ	ԏ
 Ա	ա
 Բ	բ
 Գ	գ
@@ -794,3 +809,43 @@ Z	z
 Ｘ	ｘ
 Ｙ	ｙ
 Ｚ	ｚ
 𐐀	𐐨
 𐐁	𐐩
 𐐂	𐐪
 𐐃	𐐫
 𐐄	𐐬
 𐐅	𐐭
 𐐆	𐐮
 𐐇	𐐯
 𐐈	𐐰
 𐐉	𐐱
 𐐊	𐐲
 𐐋	𐐳
 𐐌	𐐴
 𐐍	𐐵
 𐐎	𐐶
 𐐏	𐐷
 𐐐	𐐸
 𐐑	𐐹
 𐐒	𐐺
 𐐓	𐐻
 𐐔	𐐼
 𐐕	𐐽
 𐐖	𐐾
 𐐗	𐐿
 𐐘	𐑀
 𐐙	𐑁
 𐐚	𐑂
 𐐛	𐑃
 𐐜	𐑄
 𐐝	𐑅
 𐐞	𐑆
 𐐟	𐑇
 𐐠	𐑈
 𐐡	𐑉
 𐐢	𐑊
 𐐣	𐑋
 𐐤	𐑌
 𐐥	𐑍
 𐐦	𐑎
 𐐧	𐑏
--- a/tests/casemap.txt
+++ b/tests/casemap.txt
--- a/tests/gen-casefold-txt.pl
+++ b/tests/gen-casefold-txt.pl
@@ -24,6 +24,8 @@
 # I consider the output of this program to be unrestricted.  Use it as
 # you will.
 require 5.006;
 # Names of fields in the CaseFolding table
 $FOLDING_CODE = 0;
 $FOLDING_STATUS = 1;
@@ -49,6 +51,7 @@ AaBbCc@@\taabbcc@@
 #
 EOT
 binmode STDOUT, ":utf8";
 open (INPUT, "< $ARGV[1]") || exit 1;
 while (<INPUT>)
@@ -65,15 +68,14 @@ while (<INPUT>)
    my $raw_code = $fields[$FOLDING_CODE];
    my $code = hex ($raw_code);
    next if $code > 0xffff;	# FIXME!
    if ($#fields != 3)
    {
 	printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
 	next;
    }
-    next if ($fields[$FOLDING_STATUS] eq 'S');
+    # skip simple and Turkic mappings
    next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);
    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
    printf ("%s\t%s\n", pack ("U", $code), pack ("U*", @values));
--- a/tests/gen-casemap-txt.pl
+++ b/tests/gen-casemap-txt.pl
@@ -23,6 +23,7 @@
 # I consider the output of this program to be unrestricted.  Use it as
 # you will.
 require 5.006;
 use utf8;
 if (@ARGV != 3) {
@@ -60,6 +61,7 @@ my @upper;
 my @title;
 my @lower;
 binmode STDOUT, ":utf8";
 open (INPUT, "< $ARGV[1]") || exit 1;
 $last_code = -1;
@@ -74,8 +76,6 @@ while (<INPUT>)
    $code = hex ($fields[$CODE]);
    last if ($code > 0xFFFF); # ignore characters out of the basic plane
    if ($code > $last_code + 1)
    {
 	# Found a gap.
@@ -196,7 +196,7 @@ sub process_one
 sub print_tests
 {
-    for ($i = 0; $i < 0xffff; $i++) {
+    for ($i = 0; $i < 0x10ffff; $i++) {
 	if ($i == 0x3A3) {
 	    # Greek sigma needs special tests
 	    next;