Update Unicode data to 4.0. (#107974)

2003-07-30 Noah Levitt <nlevitt@columbia.edu> * glib/gen-unicode-tables.pl: * glib/gunibreak.c: * glib/gunibreak.h: * glib/gunichartables.h: * glib/gunicode.h: * glib/gunicomp.h: * glib/gunidecomp.c: * glib/gunidecomp.h: * glib/guniprop.c: * tests/casefold.txt: * tests/casemap.txt: * tests/gen-casefold-txt.pl: * tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
2025-08-22 08:58:54 +02:00 · 2003-07-31 02:27:56 +00:00
parent cdf72b09e6
commit 05f99527eb
19 changed files with 22213 additions and 8644 deletions
--- a/16
+++ b/16
@@ -1,3 +1,19 @@
+2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
+
+	* glib/gen-unicode-tables.pl:
+	* glib/gunibreak.c:
+	* glib/gunibreak.h:
+	* glib/gunichartables.h:
+	* glib/gunicode.h:
+	* glib/gunicomp.h:
+	* glib/gunidecomp.c:
+	* glib/gunidecomp.h:
+	* glib/guniprop.c:
+	* tests/casefold.txt:
+	* tests/casemap.txt:
+	* tests/gen-casefold-txt.pl:
+	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
+
 2003-07-31  Tor Lillqvist  <tml@iki.fi>

 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-10
+++ b/ChangeLog.pre-2-10
@@ -1,3 +1,19 @@
+2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
+
+	* glib/gen-unicode-tables.pl:
+	* glib/gunibreak.c:
+	* glib/gunibreak.h:
+	* glib/gunichartables.h:
+	* glib/gunicode.h:
+	* glib/gunicomp.h:
+	* glib/gunidecomp.c:
+	* glib/gunidecomp.h:
+	* glib/guniprop.c:
+	* tests/casefold.txt:
+	* tests/casemap.txt:
+	* tests/gen-casefold-txt.pl:
+	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
+
 2003-07-31  Tor Lillqvist  <tml@iki.fi>

 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-12
+++ b/ChangeLog.pre-2-12
@@ -1,3 +1,19 @@
+2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
+
+	* glib/gen-unicode-tables.pl:
+	* glib/gunibreak.c:
+	* glib/gunibreak.h:
+	* glib/gunichartables.h:
+	* glib/gunicode.h:
+	* glib/gunicomp.h:
+	* glib/gunidecomp.c:
+	* glib/gunidecomp.h:
+	* glib/guniprop.c:
+	* tests/casefold.txt:
+	* tests/casemap.txt:
+	* tests/gen-casefold-txt.pl:
+	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
+
 2003-07-31  Tor Lillqvist  <tml@iki.fi>

 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-4
+++ b/ChangeLog.pre-2-4
@@ -1,3 +1,19 @@
+2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
+
+	* glib/gen-unicode-tables.pl:
+	* glib/gunibreak.c:
+	* glib/gunibreak.h:
+	* glib/gunichartables.h:
+	* glib/gunicode.h:
+	* glib/gunicomp.h:
+	* glib/gunidecomp.c:
+	* glib/gunidecomp.h:
+	* glib/guniprop.c:
+	* tests/casefold.txt:
+	* tests/casemap.txt:
+	* tests/gen-casefold-txt.pl:
+	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
+
 2003-07-31  Tor Lillqvist  <tml@iki.fi>

 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-6
+++ b/ChangeLog.pre-2-6
@@ -1,3 +1,19 @@
+2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
+
+	* glib/gen-unicode-tables.pl:
+	* glib/gunibreak.c:
+	* glib/gunibreak.h:
+	* glib/gunichartables.h:
+	* glib/gunicode.h:
+	* glib/gunicomp.h:
+	* glib/gunidecomp.c:
+	* glib/gunidecomp.h:
+	* glib/guniprop.c:
+	* tests/casefold.txt:
+	* tests/casemap.txt:
+	* tests/gen-casefold-txt.pl:
+	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
+
 2003-07-31  Tor Lillqvist  <tml@iki.fi>

 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/ChangeLog.pre-2-8
+++ b/ChangeLog.pre-2-8
@@ -1,3 +1,19 @@
+2003-07-30  Noah Levitt  <nlevitt@columbia.edu>
+
+	* glib/gen-unicode-tables.pl:
+	* glib/gunibreak.c:
+	* glib/gunibreak.h:
+	* glib/gunichartables.h:
+	* glib/gunicode.h:
+	* glib/gunicomp.h:
+	* glib/gunidecomp.c:
+	* glib/gunidecomp.h:
+	* glib/guniprop.c:
+	* tests/casefold.txt:
+	* tests/casemap.txt:
+	* tests/gen-casefold-txt.pl:
+	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
+
 2003-07-31  Tor Lillqvist  <tml@iki.fi>

 	* glib/gspawn-win32.c: When possible, manage without the helper
--- a/glib/gen-unicode-tables.pl
+++ b/glib/gen-unicode-tables.pl
@@ -31,8 +31,12 @@
 # * For decomp table it might make sense to use a shift count other
 #   than 8.  We could easily compute the perfect shift count.

+# We rely on Perl's Unicode support (pack "U" and the bytes pragma).
+require 5.006;
+
 use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);

+
 # Names of fields in Unicode data table.
 $CODE = 0;
 $NAME = 1;
@@ -134,6 +138,8 @@ $FOLDING_MAPPING = 2;
     'PO' => "G_UNICODE_BREAK_POSTFIX",
     'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT",
     'AI' => "G_UNICODE_BREAK_AMBIGUOUS",
+     'NL' => "G_UNICODE_BREAK_NEXT_LINE",
+     'WJ' => "G_UNICODE_BREAK_WORD_JOINER",
     'XX' => "G_UNICODE_BREAK_UNKNOWN"
     );

@@ -143,8 +149,9 @@ $FOLDING_MAPPING = 2;

 # Maximum length of special-case strings

-my $special_case_len = 0;
 my @special_cases;
+my @special_case_offsets;
+my $special_case_offset = 0;

 $do_decomp = 0;
 $do_props = 1;
@@ -193,6 +200,9 @@ print "Unicode data from $ARGV[1]\n";

 open (INPUT, "< $ARGV[1]") || exit 1;

+# we save memory by skipping the huge empty area before U+E0000
+my $pages_before_e0000;
+
 $last_code = -1;
 while (<INPUT>)
 {
@@ -205,7 +215,10 @@ while (<INPUT>)

    $code = hex ($fields[$CODE]);

-    last if ($code > 0xFFFF); # ignore characters out of the basic plane
+    if ($code >= 0xE0000 and $last_code < 0xE0000)
+    {
+        $pages_before_e0000 = ($last_code >> 8) + 1;
+    }

    if ($code > $last_code + 1)
    {
@@ -237,12 +250,12 @@ close INPUT;

@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
 	    '', '', '', '');
-for (++$last_code; $last_code < 0x10000; ++$last_code)
+for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
 {
    $gfields{$CODE} = sprintf ("%04x", $last_code);
    &process_one ($last_code, @gfields);
 }
--$last_code;			# Want last to be 0xFFFF.
+--$last_code;			# Want last to be 0x10FFFF.

 print "Creating line break table\n";

@@ -268,7 +281,7 @@ while (<INPUT>)
 	next;
    }

-    if ($fields[$CODE] =~ /([A-F0-9]{4})..([A-F0-9]{4})/) 
+    if ($fields[$CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/) 
    {
 	$start_code = hex ($1);
 	$end_code = hex ($2);
@@ -277,8 +290,6 @@ while (<INPUT>)
 	
    }

-    last if ($start_code > 0xFFFF); # FIXME ignore characters out of the basic plane 
-
    if ($start_code > $last_code + 1)
    {
 	# The gap represents undefined characters. If assigned,
@@ -306,7 +317,7 @@ while (<INPUT>)

 close INPUT;

-for (++$last_code; $last_code < 0x10000; ++$last_code)
+for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
 {
  if ($type[$last_code] eq 'Cn')
    {
@@ -317,9 +328,9 @@ for (++$last_code; $last_code < 0x10000; ++$last_code)
      $break_props[$last_code] = 'AL';
    }
 }
--$last_code;			# Want last to be 0xFFFF.
+--$last_code;			# Want last to be 0x10FFFF.

-print STDERR "Last code is not 0xFFFF" if ($last_code != 0xFFFF);
+print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF);

 print "Reading special-casing table for case conversion\n";

@@ -362,18 +373,18 @@ while (<INPUT>)
    {
 	(hex $fields[$CASE_UPPER] == $code) || die "$raw_code is Lu and UCD_Upper($raw_code) != $raw_code";

-	&add_special_case ($code, $value[$code],$fields[$CASE_LOWER], $fields[$CASE_TITLE]);
+	&add_special_case ($code, $value[$code], $fields[$CASE_LOWER], $fields[$CASE_TITLE]);
 	
    } elsif ($type[$code] eq 'Lt') 
    {
 	(hex $fields[$CASE_TITLE] == $code) || die "$raw_code is Lt and UCD_Title($raw_code) != $raw_code";
 	
-	&add_special_case ($code, undef,$fields[$CASE_LOWER], $fields[$CASE_UPPER]);
+	&add_special_case ($code, undef, $fields[$CASE_LOWER], $fields[$CASE_UPPER]);
    } elsif ($type[$code] eq 'Ll') 
    {
 	(hex $fields[$CASE_LOWER] == $code) || die "$raw_code is Ll and UCD_Lower($raw_code) != $raw_code";
 	
-	&add_special_case ($code, $value[$code],$fields[$CASE_UPPER], $fields[$CASE_TITLE]);
+	&add_special_case ($code, $value[$code], $fields[$CASE_UPPER], $fields[$CASE_TITLE]);
    } else {
 	printf STDERR "Special case for non-alphabetic code point: $raw_code\n";
 	next;
@@ -403,22 +414,21 @@ while (<INPUT>)
    $raw_code = $fields[$FOLDING_CODE];
    $code = hex ($raw_code);

-    next if $code > 0xffff;	# FIXME!
-    
    if ($#fields != 3)
    {
 	printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
 	next;
    }

-    next if ($fields[$FOLDING_STATUS] eq 'S');
+    # we don't use Simple or Turkic rules here
+    next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);

    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];

    # Check simple case

    if (@values == 1 && 
-	!(defined $value[$code] && $value[$code] >= 0xd800 && $value[$code] < 0xdc00) &&
+	!(defined $value[$code] && $value[$code] >= 0x1000000) &&
 	defined $type[$code]) {

 	my $lower;
@@ -441,13 +451,12 @@ while (<INPUT>)
    }

    my $string = pack ("U*", @values);
-    $string =~ s/([\x80-\xff])/sprintf "\\x%02x",ord($1)/eg;
-    
-    if (1 + length $string > $casefoldlen) {
-	$casefoldlen = 1 + length $string;
+
+    if (1 + &length_in_bytes ($string) > $casefoldlen) {
+	$casefoldlen = 1 + &length_in_bytes ($string);
    }

-    push @casefold, [ $code, $string ];
+    push @casefold, [ $code, &escape ($string) ];
 }

 close INPUT;
@@ -464,6 +473,16 @@ if ($do_decomp) {

 exit 0;

+
+# Perl's "length" counts characters, so use the bytes pragma to count bytes instead.
+sub length_in_bytes
+{
+    my ($string) = @_;
+
+    use bytes;
+    return length $string;
+}
+
 # Process a single character.
 sub process_one
 {
@@ -528,7 +547,11 @@ sub print_tables

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

-    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
+    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
+
+    my $last_part1 = ($pages_before_e0000 * 256) - 1;
+    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
+    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;

    $table_index = 0;
    printf OUT "static const char type_data[][256] = {\n";
@@ -538,8 +561,9 @@ sub print_tables
    }
    printf OUT "\n};\n\n";

-    print OUT "static const short type_table[256] = {\n";
-    for ($count = 0; $count <= $last; $count += 256)
+    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
+    print OUT "static const gint16 type_table_part1[$pages_before_e0000] = {\n";
+    for ($count = 0; $count <= $last_part1; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
@@ -547,21 +571,32 @@ sub print_tables
    }
    print OUT "\n};\n\n";

+    printf OUT "/* U+E0000 through U+%04X */\n", $last;
+    print OUT "static const gint16 type_table_part2[768] = {\n";
+    for ($count = 0xE0000; $count <= $last; $count += 256)
+    {
+	print OUT ",\n" if $count > 0xE0000;
+	print OUT "  ", $row[$count / 256];
+	$bytes_out += 2;
+    }
+    print OUT "\n};\n\n";
+

    #
    # Now print attribute table.
    #

    $table_index = 0;
-    printf OUT "static const unsigned short attr_data[][256] = {\n";
+    printf OUT "static const gunichar attr_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
-	$row[$count / 256] = &print_row ($count, 2, \&fetch_attr);
+	$row[$count / 256] = &print_row ($count, 4, \&fetch_attr);
    }
    printf OUT "\n};\n\n";

-    print OUT "static const short attr_table[256] = {\n";
-    for ($count = 0; $count <= $last; $count += 256)
+    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
+    print OUT "static const gint16 attr_table_part1[$pages_before_e0000] = {\n";
+    for ($count = 0; $count <= $last_part1; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
@@ -569,12 +604,21 @@ sub print_tables
    }
    print OUT "\n};\n\n";

+    printf OUT "/* U+E0000 through U+%04X */\n", $last;
+    print OUT "static const gint16 attr_table_part2[768] = {\n";
+    for ($count = 0xE0000; $count <= $last; $count += 256)
+    {
+	print OUT ",\n" if $count > 0xE0000;
+	print OUT "  ", $row[$count / 256];
+	$bytes_out += 2;
+    }
+    print OUT "\n};\n\n";
+
    #
    # print title case table
    #

-    # FIXME: type.
-    print OUT "static const unsigned short title_table[][3] = {\n";
+    print OUT "static const gunichar title_table[][3] = {\n";
    my ($item);
    my ($first) = 1;
    foreach $item (sort keys %title_to_lower)
@@ -583,7 +627,7 @@ sub print_tables
 	    unless $first;
 	$first = 0;
 	printf OUT "  { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
-	$bytes_out += 6;
+	$bytes_out += 12;
    }
    print OUT "\n};\n\n";

@@ -666,6 +710,40 @@ sub print_row
    return sprintf "%d /* page %d */", $table_index++, $start / 256;
 }

+sub escape
+{
+    my ($string) = @_;
+
+    $string =~ s/(\C)/sprintf "\\x%02x",ord($1)/eg;
+
+    return $string;
+}
+
+# Returns the byte offset of $decomp in the expansion string (or the not-present
+# marker if $decomp is undef), appending it via the references if it is new.
+sub handle_decomp ($$$$)
+{
+    my ($decomp, $decomp_offsets_ref, $decomp_string_ref, $decomp_string_offset_ref) = @_;
+    my $offset = "G_UNICODE_NOT_PRESENT_OFFSET";
+
+    if (defined $decomp)
+    {
+        if (defined $decomp_offsets_ref->{$decomp})
+        {
+            $offset = $decomp_offsets_ref->{$decomp};
+        }
+        else
+        {
+            $offset = ${$decomp_string_offset_ref};
+            $decomp_offsets_ref->{$decomp} = $offset;
+            ${$decomp_string_ref} .= "\n  \"" . &escape ($decomp) . "\\0\" /* offset ${$decomp_string_offset_ref} */";
+            ${$decomp_string_offset_ref} += &length_in_bytes ($decomp) + 1;
+        }
+    }
+
+    return $offset;
+}
+
 # Generate the character decomposition header.
 sub print_decomp
 {
@@ -684,19 +762,26 @@ sub print_decomp

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

-    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
+    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX (0x110000 / 256)\n\n";
+
+    my $last_part1 = ($pages_before_e0000 * 256) - 1;
+    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
+    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;
+
+    $NOT_PRESENT_OFFSET = 65535;
+    print OUT "#define G_UNICODE_NOT_PRESENT_OFFSET $NOT_PRESENT_OFFSET\n\n";

    my ($count, @row);
    $table_index = 0;
-    printf OUT "static const unsigned char cclass_data[][256] = {\n";
+    printf OUT "static const guchar cclass_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
 	$row[$count / 256] = &print_row ($count, 1, \&fetch_cclass);
    }
    printf OUT "\n};\n\n";

-    print OUT "static const short combining_class_table[256] = {\n";
-    for ($count = 0; $count <= $last; $count += 256)
+    print OUT "static const gint16 combining_class_table_part1[$pages_before_e0000] = {\n";
+    for ($count = 0; $count <= $last_part1; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
@@ -704,12 +789,19 @@ sub print_decomp
    }
    print OUT "\n};\n\n";

+    print OUT "static const gint16 combining_class_table_part2[768] = {\n";
+    for ($count = 0xE0000; $count <= $last; $count += 256)
+    {
+	print OUT ",\n" if $count > 0xE0000;
+	print OUT "  ", $row[$count / 256];
+	$bytes_out += 2;
+    }
+    print OUT "\n};\n\n";
+
    print OUT "typedef struct\n{\n";
-    # FIXME: type.
-    print OUT "  unsigned short ch;\n";
-    print OUT "  unsigned char canon_offset;\n";
-    print OUT "  unsigned char compat_offset;\n";
-    print OUT "  unsigned short expansion_offset;\n";
+    print OUT "  gunichar ch;\n";
+    print OUT "  guint16 canon_offset;\n";
+    print OUT "  guint16 compat_offset;\n";
    print OUT "} decomposition;\n\n";

    print OUT "static const decomposition decomp_table[] =\n{\n";
@@ -737,40 +829,19 @@ sub print_decomp
 		undef $compat_decomp; 
 	    }

-	    my $string = "";
-	    my $canon_offset = 0xff;
-	    my $compat_offset = 0xff;
-	    
-	    if (defined $canon_decomp) {
-		$canon_offset = 0;
-		$string .= $canon_decomp;
-	    }
-	    if (defined $compat_decomp) {
-		if (defined $canon_decomp) {
-		    $string .= "\\x00\\x00";
-		}
-		$compat_offset = (length $string) / 4;
-		$string .= $compat_decomp;
-	    }
+	    my $canon_offset = handle_decomp ($canon_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
+	    my $compat_offset = handle_decomp ($compat_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);

-            if (!defined($decomp_offsets{$string})) {
-                $decomp_offsets{$string} = $decomp_string_offset;
-                $decomp_string .= "\n  \"".$string."\\0\\0\" /* offset ".
-                    $decomp_string_offset." */";
-                $decomp_string_offset += ((length $string) / 4) + 2;
-	    
-                $bytes_out += (length $string) / 4 + 2; # "\x20"
-            }
-	    
-            printf OUT qq(  { 0x%04x, %u, %u, %d }), 
-                $count, $canon_offset, $compat_offset, $decomp_offsets{$string};
-	    $bytes_out += 6;
+            die if $decomp_string_offset > $NOT_PRESENT_OFFSET;

+            printf OUT qq(  { 0x%04x, $canon_offset, $compat_offset }), $count;
+	    $bytes_out += 8;
 	}
    }
    print OUT "\n};\n\n";
+    $bytes_out += $decomp_string_offset + 1;

-    printf OUT "static const guchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
+    printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;

    print OUT "#endif /* DECOMP_H */\n";

@@ -796,20 +867,25 @@ sub print_line_break

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

-    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
+    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last;

-    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
+    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
+
+    my $last_part1 = ($pages_before_e0000 * 256) - 1;
+    printf OUT "/* the last code point that should be looked up in break_property_table_part1 */\n";
+    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;

    $table_index = 0;
-    printf OUT "static const char break_property_data[][256] = {\n";
+    printf OUT "static const gint8 break_property_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
 	$row[$count / 256] = &print_row ($count, 1, \&fetch_break_type);
    }
    printf OUT "\n};\n\n";

-    print OUT "static const short break_property_table[256] = {\n";
-    for ($count = 0; $count <= $last; $count += 256)
+    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
+    print OUT "static const gint16 break_property_table_part1[$pages_before_e0000] = {\n";
+    for ($count = 0; $count <= $last_part1; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
@@ -817,6 +893,17 @@ sub print_line_break
    }
    print OUT "\n};\n\n";

+    printf OUT "/* U+E0000 through U+%04X */\n", $last;
+    print OUT "static const gint16 break_property_table_part2[768] = {\n";
+    for ($count = 0xE0000; $count <= $last; $count += 256)
+    {
+	print OUT ",\n" if $count > 0xE0000;
+	print OUT "  ", $row[$count / 256];
+	$bytes_out += 2;
+    }
+    print OUT "\n};\n\n";
+
+
    print OUT "#endif /* BREAKTABLES_H */\n";

    close (OUT);
@@ -870,7 +957,7 @@ sub make_decomp
    my $result = "";
    foreach $iter (&expand_decomp ($code, $compat))
    {
-	$result .= sprintf "\\x%02x\\x%02x", $iter / 256, $iter & 0xff;
+	$result .= pack ("U", $iter);  # to utf-8
    }

    $result;
@@ -888,21 +975,17 @@ sub add_special_case


    for $value (@values) {
-	$result .= sprintf ("\\x%02x\\x%02x", $value / 256, $value & 0xff);
+	$result .= pack ("U", $value);  # to utf-8
    }
-
-    $result .= "\\0";
    
-    if (2 * @values + 2 > $special_case_len) {
-	$special_case_len = 2 * @values + 2;
-    }
+    push @special_case_offsets, $special_case_offset;

-    push @special_cases, $result;
+    # Special cases are encoded as 0x1000000 (beyond U+10FFFF) plus their byte offset
+    $value[$code] = 0x1000000 + $special_case_offset;

-    #
-    # We encode special cases in the surrogate pair space
-    #
-    $value[$code] = 0xD800 + scalar(@special_cases) - 1;
+    $special_case_offset += 1 + &length_in_bytes ($result);
+
+    push @special_cases, &escape ($result);
 }

 sub output_special_case_table
@@ -915,13 +998,15 @@ sub output_special_case_table
 * First, the best single character mapping to lowercase if Lu, 
 * and to uppercase if Ll, followed by the output mapping for the two cases 
 * other than the case of the codepoint, in the order [Ll],[Lu],[Lt],
- * separated and terminated by a double NUL.
+ * encoded in UTF-8, separated and terminated by a null character.
 */
-static const guchar special_case_table[][$special_case_len] = {
+static const gchar special_case_table[] = {
 EOT

+    my $i = 0;
    for $case (@special_cases) {
-	print $out qq( "$case",\n);
+	print $out qq( "$case\\0" /* offset ${special_case_offsets[$i]} */\n);
+        $i++;
    }

    print $out <<EOT;
@@ -929,7 +1014,7 @@ EOT

 EOT

-    print STDERR "Generated ", ($special_case_len * scalar @special_cases), " bytes in special case table\n";
+    print STDERR "Generated " . ($special_case_offset + 1) . " bytes in special case table\n";
 }

 sub enumerate_ordered
@@ -962,16 +1047,22 @@ sub output_composition_table
    # decompositions. At the same time, record
    # the first and second character of each decomposition
    
-    for $code (keys %compositions) {
+    for $code (keys %compositions) 
+    {
 	@values = map { hex ($_) } split /\s+/, $compositions{$code};
+
+        # non-starters
 	if ($cclass[$values[0]]) {
 	    delete $compositions{$code};
 	    next;
 	}
+
+        # single-character decompositions
 	if (@values == 1) {
 	    delete $compositions{$code};
 	    next;
 	}
+
 	if (@values != 2) {
 	    die "$code has more than two elements in its decomposition!\n";
 	}
@@ -983,10 +1074,10 @@ sub output_composition_table
 	}
    }

-    # Assign integer indicices, removing singletons
+    # Assign integer indices, removing singletons
    my $n_first = enumerate_ordered (\%first);

-    # Now record the second character if each (non-singleton) decomposition
+    # Now record the second character of each (non-singleton) decomposition
    for $code (keys %compositions) {
 	@values = map { hex ($_) } split /\s+/, $compositions{$code};

@@ -1065,39 +1156,46 @@ sub output_composition_table

    my @row;						  
    $table_index = 0;
-    printf OUT "static const gushort compose_data[][256] = {\n";
+    printf OUT "static const guint16 compose_data[][256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
 	$row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; });
    }
    printf OUT "\n};\n\n";

-    print OUT "static const short compose_table[256] = {\n";
+    print OUT "static const gint16 compose_table[256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
 	print OUT ",\n" if $count > 0;
 	print OUT "  ", $row[$count / 256];
-	$bytes_out += 4;
    }
    print OUT "\n};\n\n";

+    $bytes_out += 256 * 2;
+
    # Output first singletons

-    print OUT "static const gushort compose_first_single[][2] = {\n";
+    print OUT "static const guint16 compose_first_single[][2] = {\n";
    $i = 0;				     
    for $record (@first_singletons) {
+        if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
+            die "time to switch compose_first_single to gunichar" ;
+        }
 	print OUT ",\n" if $i++ > 0;
 	printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
    print OUT "\n};\n";
 				     
-    $bytes_out += @first_singletons * 4;				     
+    $bytes_out += @first_singletons * 4;
 		  
    # Output second singletons

-    print OUT "static const gushort compose_second_single[][2] = {\n";
+    print OUT "static const guint16 compose_second_single[][2] = {\n";
    $i = 0;				     
    for $record (@second_singletons) {
+        if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
+            die "time to switch compose_second_single to gunichar";
+        }
 	print OUT ",\n" if $i++ > 0;
 	printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
@@ -1108,7 +1206,7 @@ sub output_composition_table
    # Output array of composition pairs

    print OUT <<EOT;
-static const gushort compose_array[$n_first][$n_second] = {
+static const guint16 compose_array[$n_first][$n_second] = {
 EOT
 			
    for (my $i = 0; $i < $n_first; $i++) {
@@ -1117,7 +1215,10 @@ EOT
 	for (my $j = 0; $j < $n_second; $j++) {
 	    print OUT ", " if $j;
 	    if (exists $reverse{"$i|$j"}) {
-		printf OUT "%#06x", $reverse{"$i|$j"};
+                if ($reverse{"$i|$j"} > 0xFFFF) {
+                    die "time to switch compose_array to gunichar" ;
+                }
+		printf OUT "0x%04x", $reverse{"$i|$j"};
 	    } else {
 		print OUT "     0";
            }
@@ -1151,10 +1252,16 @@ EOT

   @casefold = sort { $a->[0] <=> $b->[0] } @casefold; 
    
-   for $case (@casefold) {
+   for $case (@casefold) 
+   {
       $code = $case->[0];
       $string = $case->[1];
-       print $out sprintf(qq({ %#04x, "$string" },\n), $code);
+
+       if ($code > 0xFFFF) {
+           die "time to switch casefold_table to gunichar" ;
+       }
+
+       print $out sprintf(qq(  { 0x%04x, "$string" },\n), $code);
    
   }

--- a/glib/gunibreak.c
+++ b/glib/gunibreak.c
@@ -25,13 +25,22 @@
 #include "glib.h"
 #include "gunibreak.h"

+#define TPROP_PART1(Page, Char) \
+  ((break_property_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+   ? (break_property_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   : (break_property_data[break_property_table_part1[Page]][Char]))

-#define TPROP(Page, Char) \
-  ((break_property_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
-   ? (break_property_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
-   : (break_property_data[break_property_table[Page]][Char]))
+#define TPROP_PART2(Page, Char) \
+  ((break_property_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+   ? (break_property_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   : (break_property_data[break_property_table_part2[Page]][Char]))

-#define PROP(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_BREAK_UNKNOWN : TPROP ((Char) >> 8, (Char) & 0xff))
+#define PROP(Char) \
+  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
+   ? TPROP_PART1 ((Char) >> 8, (Char) & 0xff) \
+   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
+      ? TPROP_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
+      : G_UNICODE_BREAK_UNKNOWN))

 /**
 * g_unichar_break_type:
--- a/glib/gunibreak.h
+++ b/glib/gunibreak.h
--- a/glib/gunichartables.h
+++ b/glib/gunichartables.h
--- a/glib/gunicode.h
+++ b/glib/gunicode.h
@@ -100,7 +100,9 @@ typedef enum
  G_UNICODE_BREAK_POSTFIX,
  G_UNICODE_BREAK_COMPLEX_CONTEXT,
  G_UNICODE_BREAK_AMBIGUOUS,
-  G_UNICODE_BREAK_UNKNOWN
+  G_UNICODE_BREAK_UNKNOWN,
+  G_UNICODE_BREAK_NEXT_LINE,
+  G_UNICODE_BREAK_WORD_JOINER
 } GUnicodeBreakType;

 /* Returns TRUE if current locale uses UTF-8 charset.  If CHARSET is
--- a/glib/gunicomp.h
+++ b/glib/gunicomp.h
@@ -3,7 +3,7 @@
 #define COMPOSE_SECOND_START 357
 #define COMPOSE_SECOND_SINGLE_START 388

-static const gushort compose_data[][256] = {
+static const guint16 compose_data[][256] = {
  { /* page 0, index 0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
@@ -222,7 +222,7 @@ static const gushort compose_data[][256] = {
  }
 };

-static const short compose_table[256] = {
+static const gint16 compose_table[256] = {
  0 /* page 0 */,
  1 /* page 1 */,
  2 /* page 2 */,
@@ -274,7 +274,7 @@ static const short compose_table[256] = {
  15 /* page 48 */
 };

-static const gushort compose_first_single[][2] = {
+static const guint16 compose_first_single[][2] = {
 { 0x0338, 0x226e },
 { 0x0338, 0x2260 },
 { 0x0338, 0x226f },
@@ -486,7 +486,7 @@ static const gushort compose_first_single[][2] = {
 { 0x3099, 0x30fa },
 { 0x3099, 0x30fe }
 };
-static const gushort compose_second_single[][2] = {
+static const guint16 compose_second_single[][2] = {
 { 0x0627, 0x0622 },
 { 0x0627, 0x0623 },
 { 0x0627, 0x0625 },
@@ -506,7 +506,7 @@ static const gushort compose_second_single[][2] = {
 { 0x0dd9, 0x0ddc },
 { 0x0dd9, 0x0dde }
 };
-static const gushort compose_array[146][31] = {
+static const guint16 compose_array[146][31] = {
 { 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x0100, 0x0102, 0x0226, 0x00c4, 0x1ea2, 0x00c5,      0, 0x01cd, 0x0200, 0x0202,      0,      0,      0, 0x1ea0,      0, 0x1e00,      0,      0, 0x0104,      0,      0,      0,      0,      0,      0,      0,      0 },
 {      0,      0,      0,      0,      0,      0, 0x1e02,      0,      0,      0,      0,      0,      0,      0,      0,      0,      0, 0x1e04,      0,      0,      0,      0,      0,      0,      0,      0, 0x1e06,      0,      0,      0,      0 },
 {      0, 0x0106, 0x0108,      0,      0,      0, 0x010a,      0,      0,      0,      0, 0x010c,      0,      0,      0,      0,      0,      0,      0,      0,      0, 0x00c7,      0,      0,      0,      0,      0,      0,      0,      0,      0 },
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@@ -28,13 +28,22 @@
 #include "gunicomp.h"


-#define CC(Page, Char) \
-  ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
-   ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
-   : (cclass_data[combining_class_table[Page]][Char]))
+#define CC_PART1(Page, Char) \
+  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   : (cclass_data[combining_class_table_part1[Page]][Char]))
+
+#define CC_PART2(Page, Char) \
+  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   : (cclass_data[combining_class_table_part2[Page]][Char]))

 #define COMBINING_CLASS(Char) \
-     (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
+  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
+   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
+   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
+      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
+      : 0))

 /**
 * g_unicode_canonical_ordering:
@@ -84,7 +93,8 @@ g_unicode_canonical_ordering (gunichar *string,
    }
 }

-static const guchar *
+/* returns a pointer to a null-terminated UTF-8 string */
+static const gchar *
 find_decomposition (gunichar ch,
 		    gboolean compat)
 {
@@ -104,17 +114,17 @@ find_decomposition (gunichar ch,
 	      if (compat)
 		{
 		  offset = decomp_table[half].compat_offset;
-		  if (offset == 0xff)
+		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 		    offset = decomp_table[half].canon_offset;
 		}
 	      else
 		{
 		  offset = decomp_table[half].canon_offset;
-		  if (offset == 0xff)
+		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 		    return NULL;
 		}
 	      
-	      return &(decomp_expansion_string[decomp_table[half].expansion_offset + offset]);
+	      return &(decomp_expansion_string[offset]);
 	    }
 	  else if (half == start)
 	    break;
@@ -142,27 +152,20 @@ gunichar *
 g_unicode_canonical_decomposition (gunichar ch,
 				   gsize   *result_len)
 {
-  const guchar *decomp = find_decomposition (ch, FALSE);
+  const gchar *decomp = find_decomposition (ch, FALSE);
+  const gchar *p;
  gunichar *r;

  if (decomp)
    {
      /* Found it.  */
-      int i, len;
-      /* We store as a double-nul terminated string.  */
-      for (len = 0; (decomp[len] || decomp[len + 1]);
-	   len += 2)
-	;
+      int i;
      
-      /* We've counted twice as many bytes as there are
-	 characters.  */
-      *result_len = len / 2;
-      r = g_malloc (len / 2 * sizeof (gunichar));
+      *result_len = g_utf8_strlen (decomp, -1);
+      r = g_malloc (*result_len * sizeof (gunichar));
      
-      for (i = 0; i < len; i += 2)
-	{
-	  r[i / 2] = (decomp[i] << 8 | decomp[i + 1]);
-	}
+      for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
+        r[i] = g_utf8_get_char (p);
    }
  else
    {
@@ -194,6 +197,7 @@ combine (gunichar  a,
  gushort index_a, index_b;

  index_a = COMPOSE_INDEX(a);
+
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
    {
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
@@ -202,10 +206,11 @@ combine (gunichar  a,
 	  return TRUE;
 	}
      else
-	return FALSE;
+        return FALSE;
    }
  
  index_b = COMPOSE_INDEX(b);
+
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
    {
      if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
@@ -214,7 +219,7 @@ combine (gunichar  a,
 	  return TRUE;
 	}
      else
-	return FALSE;
+        return FALSE;
    }

  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
@@ -252,17 +257,10 @@ _g_utf8_normalize_wc (const gchar    *str,
    {
      gunichar wc = g_utf8_get_char (p);

-      const guchar *decomp = find_decomposition (wc, do_compat);
+      const gchar *decomp = find_decomposition (wc, do_compat);

      if (decomp)
-	{
-	  int len;
-	  /* We store as a double-nul terminated string.  */
-	  for (len = 0; (decomp[len] || decomp[len + 1]);
-	       len += 2)
-	    ;
-	  n_wc += len / 2;
-	}
+        n_wc += g_utf8_strlen (decomp, -1);
      else
 	n_wc++;

@@ -277,7 +275,7 @@ _g_utf8_normalize_wc (const gchar    *str,
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      gunichar wc = g_utf8_get_char (p);
-      const guchar *decomp;
+      const gchar *decomp;
      int cc;
      gsize old_n_wc = n_wc;
 	  
@@ -285,11 +283,9 @@ _g_utf8_normalize_wc (const gchar    *str,
 	  
      if (decomp)
 	{
-	  int len;
-	  /* We store as a double-nul terminated string.  */
-	  for (len = 0; (decomp[len] || decomp[len + 1]);
-	       len += 2)
-	    wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
+          const char *pd;
+          for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
+            wc_buffer[n_wc++] = g_utf8_get_char (pd);
 	}
      else
 	wc_buffer[n_wc++] = wc;
@@ -318,7 +314,6 @@ _g_utf8_normalize_wc (const gchar    *str,

  /* All decomposed and reordered */ 

-
  if (do_compose && n_wc > 0)
    {
      gsize i, j;
@@ -402,7 +397,7 @@ g_utf8_normalize (const gchar    *str,
 {
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
  gchar *result;
-  
+
  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
  g_free (result_wc);

--- a/glib/gunidecomp.h
+++ b/glib/gunidecomp.h
--- a/glib/guniprop.c
+++ b/glib/guniprop.c
@@ -28,17 +28,30 @@
 #include "glib.h"
 #include "gunichartables.h"

+#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
+                          ? attr_table_part1[Page] \
+                          : attr_table_part2[(Page) - 0xe00])

 #define ATTTABLE(Page, Char) \
-  ((attr_table[Page] == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[attr_table[Page]][Char]))
+  ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))

-#define TTYPE(Page, Char) \
-  ((type_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
-   ? (type_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
-   : (type_data[type_table[Page]][Char]))
+#define TTYPE_PART1(Page, Char) \
+  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   : (type_data[type_table_part1[Page]][Char]))

+#define TTYPE_PART2(Page, Char) \
+  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+   : (type_data[type_table_part2[Page]][Char]))
+
+#define TYPE(Char) \
+  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
+   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
+   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
+      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
+      : G_UNICODE_UNASSIGNED))

-#define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))

 #define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER	\
 		       || (Type) == G_UNICODE_LETTER_NUMBER	\
@@ -361,10 +374,10 @@ g_unichar_toupper (gunichar c)
  if (t == G_UNICODE_LOWERCASE_LETTER)
    {
      gunichar val = ATTTABLE (c >> 8, c & 0xff);
-      if (val >= 0xd800 && val < 0xdc00)
+      if (val >= 0x1000000)
 	{
-	  const guchar *p = special_case_table[val - 0xd800];
-	  return p[0] * 256 + p[1];
+	  const guchar *p = special_case_table + val - 0x1000000;
+	  return g_utf8_get_char (p);
 	}
      else
 	return val ? val : c;
@@ -398,10 +411,10 @@ g_unichar_tolower (gunichar c)
  if (t == G_UNICODE_UPPERCASE_LETTER)
    {
      gunichar val = ATTTABLE (c >> 8, c & 0xff);
-      if (val >= 0xd800 && val < 0xdc00)
+      if (val >= 0x1000000)
 	{
-	  const guchar *p = special_case_table[val - 0xd800];
-	  return p[0] * 256 + p[1];
+	  const guchar *p = special_case_table + val - 0x1000000;
+	  return g_utf8_get_char (p);
 	}
      else
 	return val ? val : c;
@@ -561,31 +574,22 @@ output_marks (const char **p_inout,
 static gsize
 output_special_case (gchar *out_buffer,
 		     gsize  len,
-		     int    index,
+		     int    offset,
 		     int    type,
 		     int    which)
 {
-  const guchar *p = special_case_table[index];
+  const guchar *p = special_case_table + offset;
+  gsize n; /* bytes in the selected mapping; a local named len would redeclare the parameter */

  if (type != G_UNICODE_TITLECASE_LETTER)
-    p += 2; /* +2 to skip over "best single match" */
+    p = g_utf8_next_char (p); /* skip over "best single match" */

  if (which == 1)
-    {
-      while (p[0] || p[1])
-	p += 2;
-      p += 2;
-    }
+    p += strlen (p) + 1; /* step past the first NUL-terminated mapping to the second */

-  while (TRUE)
-    {
-      gunichar ch = p[0] * 256 + p[1];
-      if (!ch)
-	break;
-
-      len += g_unichar_to_utf8 (ch, out_buffer ? out_buffer + len : NULL);
-      p += 2;
-    }
+  n = strlen (p);
+  if (out_buffer)
+    memcpy (out_buffer + len, p, n); /* append after the len bytes already written */

-  return len;
+  return n; /* callers do len += output_special_case (...), so return only the bytes added */
 }
@@ -662,9 +666,9 @@ real_toupper (const gchar *str,
 	{
 	  val = ATTTABLE (c >> 8, c & 0xff);

-	  if (val >= 0xd800 && val < 0xdc00)
+	  if (val >= 0x1000000)
 	    {
-	      len += output_special_case (out_buffer, len, val - 0xd800, t,
+	      len += output_special_case (out_buffer, len, val - 0x1000000, t,
 					  t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
 	    }
 	  else
@@ -785,9 +789,9 @@ real_tolower (const gchar *str,
 	{
 	  val = ATTTABLE (c >> 8, c & 0xff);

-	  if (val >= 0xd800 && val < 0xdc00)
+	  if (val >= 0x1000000)
 	    {
-	      len += output_special_case (out_buffer, len, val - 0xd800, t, 0);
+	      len += output_special_case (out_buffer, len, val - 0x1000000, t, 0);
 	    }
 	  else
 	    {
@@ -891,7 +895,7 @@ g_utf8_casefold (const gchar *str,
      int end = G_N_ELEMENTS (casefold_table);

      if (ch >= casefold_table[start].ch &&
-	  ch <= casefold_table[end - 1].ch)
+          ch <= casefold_table[end - 1].ch)
 	{
 	  while (TRUE)
 	    {
--- a/tests/casefold.txt
+++ b/tests/casefold.txt
@@ -1,4 +1,4 @@
-# Test cases generated from Unicode 3.1 data
+# Test cases generated from Unicode 4.0 data
 # by gen-casefold-test.pl. Do not edit.
 #
 # Some special hand crafted tests
@@ -89,8 +89,7 @@ Z	z
 Ī	ī
 Ĭ	ĭ
 Į	į
-İ	i
-ı	i
+İ	i̇
 Ĳ	ĳ
 Ĵ	ĵ
 Ķ	ķ
@@ -216,6 +215,7 @@ Z	z
 Ț	ț
 Ȝ	ȝ
 Ȟ	ȟ
+Ƞ	ƞ
 Ȣ	ȣ
 Ȥ	ȥ
 Ȧ	ȧ
@@ -266,6 +266,7 @@ Z	z
 ϑ	θ
 ϕ	φ
 ϖ	π
+Ϙ	ϙ
 Ϛ	ϛ
 Ϝ	ϝ
 Ϟ	ϟ
@@ -279,9 +280,11 @@ Z	z
 Ϯ	ϯ
 ϰ	κ
 ϱ	ρ
-ϲ	σ
 ϴ	θ
 ϵ	ε
+Ϸ	ϸ
+Ϲ	ϲ
+Ϻ	ϻ
 Ѐ	ѐ
 Ё	ё
 Ђ	ђ
@@ -347,6 +350,7 @@ Z	z
 Ѽ	ѽ
 Ѿ	ѿ
 Ҁ	ҁ
+Ҋ	ҋ
 Ҍ	ҍ
 Ҏ	ҏ
 Ґ	ґ
@@ -375,8 +379,11 @@ Z	z
 Ҿ	ҿ
 Ӂ	ӂ
 Ӄ	ӄ
+Ӆ	ӆ
 Ӈ	ӈ
+Ӊ	ӊ
 Ӌ	ӌ
+Ӎ	ӎ
 Ӑ	ӑ
 Ӓ	ӓ
 Ӕ	ӕ
@@ -397,6 +404,14 @@ Z	z
 Ӳ	ӳ
 Ӵ	ӵ
 Ӹ	ӹ
+Ԁ	ԁ
+Ԃ	ԃ
+Ԅ	ԅ
+Ԇ	ԇ
+Ԉ	ԉ
+Ԋ	ԋ
+Ԍ	ԍ
+Ԏ	ԏ
 Ա	ա
 Բ	բ
 Գ	գ
@@ -794,3 +809,43 @@ Z	z
 Ｘ	ｘ
 Ｙ	ｙ
 Ｚ	ｚ
+𐐀	𐐨
+𐐁	𐐩
+𐐂	𐐪
+𐐃	𐐫
+𐐄	𐐬
+𐐅	𐐭
+𐐆	𐐮
+𐐇	𐐯
+𐐈	𐐰
+𐐉	𐐱
+𐐊	𐐲
+𐐋	𐐳
+𐐌	𐐴
+𐐍	𐐵
+𐐎	𐐶
+𐐏	𐐷
+𐐐	𐐸
+𐐑	𐐹
+𐐒	𐐺
+𐐓	𐐻
+𐐔	𐐼
+𐐕	𐐽
+𐐖	𐐾
+𐐗	𐐿
+𐐘	𐑀
+𐐙	𐑁
+𐐚	𐑂
+𐐛	𐑃
+𐐜	𐑄
+𐐝	𐑅
+𐐞	𐑆
+𐐟	𐑇
+𐐠	𐑈
+𐐡	𐑉
+𐐢	𐑊
+𐐣	𐑋
+𐐤	𐑌
+𐐥	𐑍
+𐐦	𐑎
+𐐧	𐑏
--- a/tests/casemap.txt
+++ b/tests/casemap.txt
--- a/tests/gen-casefold-txt.pl
+++ b/tests/gen-casefold-txt.pl
@@ -24,6 +24,8 @@
 # I consider the output of this program to be unrestricted.  Use it as
 # you will.

+require 5.006;
+
 # Names of fields in the CaseFolding table
 $FOLDING_CODE = 0;
 $FOLDING_STATUS = 1;
@@ -49,6 +51,7 @@ AaBbCc@@\taabbcc@@
 #
 EOT

+binmode STDOUT, ":utf8";
 open (INPUT, "< $ARGV[1]") || exit 1;

 while (<INPUT>)
@@ -65,15 +68,14 @@ while (<INPUT>)
    my $raw_code = $fields[$FOLDING_CODE];
    my $code = hex ($raw_code);

-    next if $code > 0xffff;	# FIXME!
-    
    if ($#fields != 3)
    {
 	printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
 	next;
    }

-    next if ($fields[$FOLDING_STATUS] eq 'S');
+    # skip simple and Turkic mappings
+    next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);

    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
    printf ("%s\t%s\n", pack ("U", $code), pack ("U*", @values));
--- a/tests/gen-casemap-txt.pl
+++ b/tests/gen-casemap-txt.pl
@@ -23,6 +23,7 @@
 # I consider the output of this program to be unrestricted.  Use it as
 # you will.

+require 5.006;
 use utf8;

 if (@ARGV != 3) {
@@ -60,6 +61,7 @@ my @upper;
 my @title;
 my @lower;

+binmode STDOUT, ":utf8";
 open (INPUT, "< $ARGV[1]") || exit 1;

 $last_code = -1;
@@ -74,8 +76,6 @@ while (<INPUT>)

    $code = hex ($fields[$CODE]);

-    last if ($code > 0xFFFF); # ignore characters out of the basic plane
-
    if ($code > $last_code + 1)
    {
 	# Found a gap.
@@ -196,7 +196,7 @@ sub process_one

 sub print_tests
 {
-    for ($i = 0; $i < 0xffff; $i++) {
+    for ($i = 0; $i < 0x10ffff; $i++) {
 	if ($i == 0x3A3) {
 	    # Greek sigma needs special tests
 	    next;