Update Unicode data to 4.0. (#107974)

2003-07-30  Noah Levitt  <nlevitt@columbia.edu>

	* glib/gen-unicode-tables.pl:
	* glib/gunibreak.c:
	* glib/gunibreak.h:
	* glib/gunichartables.h:
	* glib/gunicode.h:
	* glib/gunicomp.h:
	* glib/gunidecomp.c:
	* glib/gunidecomp.h:
	* glib/guniprop.c:
	* tests/casefold.txt:
	* tests/casemap.txt:
	* tests/gen-casefold-txt.pl:
	* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
This commit is contained in:
Noah Levitt 2003-07-31 02:27:56 +00:00 committed by Noah Levitt
parent cdf72b09e6
commit 05f99527eb
19 changed files with 22213 additions and 8644 deletions

View File

@ -1,3 +1,19 @@
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
* glib/gen-unicode-tables.pl:
* glib/gunibreak.c:
* glib/gunibreak.h:
* glib/gunichartables.h:
* glib/gunicode.h:
* glib/gunicomp.h:
* glib/gunidecomp.c:
* glib/gunidecomp.h:
* glib/guniprop.c:
* tests/casefold.txt:
* tests/casemap.txt:
* tests/gen-casefold-txt.pl:
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
2003-07-31 Tor Lillqvist <tml@iki.fi>
* glib/gspawn-win32.c: When possible, manage without the helper

View File

@ -1,3 +1,19 @@
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
* glib/gen-unicode-tables.pl:
* glib/gunibreak.c:
* glib/gunibreak.h:
* glib/gunichartables.h:
* glib/gunicode.h:
* glib/gunicomp.h:
* glib/gunidecomp.c:
* glib/gunidecomp.h:
* glib/guniprop.c:
* tests/casefold.txt:
* tests/casemap.txt:
* tests/gen-casefold-txt.pl:
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
2003-07-31 Tor Lillqvist <tml@iki.fi>
* glib/gspawn-win32.c: When possible, manage without the helper

View File

@ -1,3 +1,19 @@
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
* glib/gen-unicode-tables.pl:
* glib/gunibreak.c:
* glib/gunibreak.h:
* glib/gunichartables.h:
* glib/gunicode.h:
* glib/gunicomp.h:
* glib/gunidecomp.c:
* glib/gunidecomp.h:
* glib/guniprop.c:
* tests/casefold.txt:
* tests/casemap.txt:
* tests/gen-casefold-txt.pl:
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
2003-07-31 Tor Lillqvist <tml@iki.fi>
* glib/gspawn-win32.c: When possible, manage without the helper

View File

@ -1,3 +1,19 @@
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
* glib/gen-unicode-tables.pl:
* glib/gunibreak.c:
* glib/gunibreak.h:
* glib/gunichartables.h:
* glib/gunicode.h:
* glib/gunicomp.h:
* glib/gunidecomp.c:
* glib/gunidecomp.h:
* glib/guniprop.c:
* tests/casefold.txt:
* tests/casemap.txt:
* tests/gen-casefold-txt.pl:
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
2003-07-31 Tor Lillqvist <tml@iki.fi>
* glib/gspawn-win32.c: When possible, manage without the helper

View File

@ -1,3 +1,19 @@
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
* glib/gen-unicode-tables.pl:
* glib/gunibreak.c:
* glib/gunibreak.h:
* glib/gunichartables.h:
* glib/gunicode.h:
* glib/gunicomp.h:
* glib/gunidecomp.c:
* glib/gunidecomp.h:
* glib/guniprop.c:
* tests/casefold.txt:
* tests/casemap.txt:
* tests/gen-casefold-txt.pl:
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
2003-07-31 Tor Lillqvist <tml@iki.fi>
* glib/gspawn-win32.c: When possible, manage without the helper

View File

@ -1,3 +1,19 @@
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
* glib/gen-unicode-tables.pl:
* glib/gunibreak.c:
* glib/gunibreak.h:
* glib/gunichartables.h:
* glib/gunicode.h:
* glib/gunicomp.h:
* glib/gunidecomp.c:
* glib/gunidecomp.h:
* glib/guniprop.c:
* tests/casefold.txt:
* tests/casemap.txt:
* tests/gen-casefold-txt.pl:
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
2003-07-31 Tor Lillqvist <tml@iki.fi>
* glib/gspawn-win32.c: When possible, manage without the helper

View File

@ -31,8 +31,12 @@
# * For decomp table it might make sense to use a shift count other
# than 8. We could easily compute the perfect shift count.
# we use some perl unicode features
require 5.006;
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
# Names of fields in Unicode data table.
$CODE = 0;
$NAME = 1;
@ -134,6 +138,8 @@ $FOLDING_MAPPING = 2;
'PO' => "G_UNICODE_BREAK_POSTFIX",
'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT",
'AI' => "G_UNICODE_BREAK_AMBIGUOUS",
'NL' => "G_UNICODE_BREAK_NEXT_LINE",
'WJ' => "G_UNICODE_BREAK_WORD_JOINER",
'XX' => "G_UNICODE_BREAK_UNKNOWN"
);
@ -143,8 +149,9 @@ $FOLDING_MAPPING = 2;
# Maximum length of special-case strings
my $special_case_len = 0;
my @special_cases;
my @special_case_offsets;
my $special_case_offset = 0;
$do_decomp = 0;
$do_props = 1;
@ -193,6 +200,9 @@ print "Unicode data from $ARGV[1]\n";
open (INPUT, "< $ARGV[1]") || exit 1;
# we save memory by skipping the huge empty area before U+E0000
my $pages_before_e0000;
$last_code = -1;
while (<INPUT>)
{
@ -205,7 +215,10 @@ while (<INPUT>)
$code = hex ($fields[$CODE]);
last if ($code > 0xFFFF); # ignore characters out of the basic plane
if ($code >= 0xE0000 and $last_code < 0xE0000)
{
$pages_before_e0000 = ($last_code >> 8) + 1;
}
if ($code > $last_code + 1)
{
@ -237,12 +250,12 @@ close INPUT;
@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
'', '', '', '');
for (++$last_code; $last_code < 0x10000; ++$last_code)
for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
{
$gfields{$CODE} = sprintf ("%04x", $last_code);
&process_one ($last_code, @gfields);
}
--$last_code; # Want last to be 0xFFFF.
--$last_code; # Want last to be 0x10FFFF.
print "Creating line break table\n";
@ -268,7 +281,7 @@ while (<INPUT>)
next;
}
if ($fields[$CODE] =~ /([A-F0-9]{4})..([A-F0-9]{4})/)
if ($fields[$CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/)
{
$start_code = hex ($1);
$end_code = hex ($2);
@ -277,8 +290,6 @@ while (<INPUT>)
}
last if ($start_code > 0xFFFF); # FIXME ignore characters out of the basic plane
if ($start_code > $last_code + 1)
{
# The gap represents undefined characters. If assigned,
@ -306,7 +317,7 @@ while (<INPUT>)
close INPUT;
for (++$last_code; $last_code < 0x10000; ++$last_code)
for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
{
if ($type[$last_code] eq 'Cn')
{
@ -317,9 +328,9 @@ for (++$last_code; $last_code < 0x10000; ++$last_code)
$break_props[$last_code] = 'AL';
}
}
--$last_code; # Want last to be 0xFFFF.
--$last_code; # Want last to be 0x10FFFF.
print STDERR "Last code is not 0xFFFF" if ($last_code != 0xFFFF);
print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF);
print "Reading special-casing table for case conversion\n";
@ -362,18 +373,18 @@ while (<INPUT>)
{
(hex $fields[$CASE_UPPER] == $code) || die "$raw_code is Lu and UCD_Upper($raw_code) != $raw_code";
&add_special_case ($code, $value[$code],$fields[$CASE_LOWER], $fields[$CASE_TITLE]);
&add_special_case ($code, $value[$code], $fields[$CASE_LOWER], $fields[$CASE_TITLE]);
} elsif ($type[$code] eq 'Lt')
{
(hex $fields[$CASE_TITLE] == $code) || die "$raw_code is Lt and UCD_Title($raw_code) != $raw_code";
&add_special_case ($code, undef,$fields[$CASE_LOWER], $fields[$CASE_UPPER]);
&add_special_case ($code, undef, $fields[$CASE_LOWER], $fields[$CASE_UPPER]);
} elsif ($type[$code] eq 'Ll')
{
(hex $fields[$CASE_LOWER] == $code) || die "$raw_code is Ll and UCD_Lower($raw_code) != $raw_code";
&add_special_case ($code, $value[$code],$fields[$CASE_UPPER], $fields[$CASE_TITLE]);
&add_special_case ($code, $value[$code], $fields[$CASE_UPPER], $fields[$CASE_TITLE]);
} else {
printf STDERR "Special case for non-alphabetic code point: $raw_code\n";
next;
@ -403,22 +414,21 @@ while (<INPUT>)
$raw_code = $fields[$FOLDING_CODE];
$code = hex ($raw_code);
next if $code > 0xffff; # FIXME!
if ($#fields != 3)
{
printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
next;
}
next if ($fields[$FOLDING_STATUS] eq 'S');
# we don't use Simple or Turkic rules here
next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);
@values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
# Check simple case
if (@values == 1 &&
!(defined $value[$code] && $value[$code] >= 0xd800 && $value[$code] < 0xdc00) &&
!(defined $value[$code] && $value[$code] >= 0x1000000) &&
defined $type[$code]) {
my $lower;
@ -441,13 +451,12 @@ while (<INPUT>)
}
my $string = pack ("U*", @values);
$string =~ s/([\x80-\xff])/sprintf "\\x%02x",ord($1)/eg;
if (1 + length $string > $casefoldlen) {
$casefoldlen = 1 + length $string;
if (1 + &length_in_bytes ($string) > $casefoldlen) {
$casefoldlen = 1 + &length_in_bytes ($string);
}
push @casefold, [ $code, $string ];
push @casefold, [ $code, &escape ($string) ];
}
close INPUT;
@ -464,6 +473,16 @@ if ($do_decomp) {
exit 0;
# Perl's builtin "length" counts characters, not octets.  This helper
# reports how many bytes the string occupies instead; callers use it to
# compute byte offsets and sizes for the emitted string tables.
sub length_in_bytes
{
    my $text = shift;

    # The bytes pragma is lexically scoped, so only the length() call
    # inside this sub switches to byte semantics.
    use bytes;
    my $n_octets = length ($text);

    return $n_octets;
}
# Process a single character.
sub process_one
{
@ -528,7 +547,11 @@ sub print_tables
printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
my $last_part1 = ($pages_before_e0000 * 256) - 1;
printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;
$table_index = 0;
printf OUT "static const char type_data[][256] = {\n";
@ -538,8 +561,9 @@ sub print_tables
}
printf OUT "\n};\n\n";
print OUT "static const short type_table[256] = {\n";
for ($count = 0; $count <= $last; $count += 256)
printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
print OUT "static const gint16 type_table_part1[$pages_before_e0000] = {\n";
for ($count = 0; $count <= $last_part1; $count += 256)
{
print OUT ",\n" if $count > 0;
print OUT " ", $row[$count / 256];
@ -547,21 +571,32 @@ sub print_tables
}
print OUT "\n};\n\n";
printf OUT "/* U+E0000 through U+%04X */\n", $last;
print OUT "static const gint16 type_table_part2[768] = {\n";
for ($count = 0xE0000; $count <= $last; $count += 256)
{
print OUT ",\n" if $count > 0xE0000;
print OUT " ", $row[$count / 256];
$bytes_out += 2;
}
print OUT "\n};\n\n";
#
# Now print attribute table.
#
$table_index = 0;
printf OUT "static const unsigned short attr_data[][256] = {\n";
printf OUT "static const gunichar attr_data[][256] = {\n";
for ($count = 0; $count <= $last; $count += 256)
{
$row[$count / 256] = &print_row ($count, 2, \&fetch_attr);
$row[$count / 256] = &print_row ($count, 4, \&fetch_attr);
}
printf OUT "\n};\n\n";
print OUT "static const short attr_table[256] = {\n";
for ($count = 0; $count <= $last; $count += 256)
printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
print OUT "static const gint16 attr_table_part1[$pages_before_e0000] = {\n";
for ($count = 0; $count <= $last_part1; $count += 256)
{
print OUT ",\n" if $count > 0;
print OUT " ", $row[$count / 256];
@ -569,12 +604,21 @@ sub print_tables
}
print OUT "\n};\n\n";
printf OUT "/* U+E0000 through U+%04X */\n", $last;
print OUT "static const gint16 attr_table_part2[768] = {\n";
for ($count = 0xE0000; $count <= $last; $count += 256)
{
print OUT ",\n" if $count > 0xE0000;
print OUT " ", $row[$count / 256];
$bytes_out += 2;
}
print OUT "\n};\n\n";
#
# print title case table
#
# FIXME: type.
print OUT "static const unsigned short title_table[][3] = {\n";
print OUT "static const gunichar title_table[][3] = {\n";
my ($item);
my ($first) = 1;
foreach $item (sort keys %title_to_lower)
@ -583,7 +627,7 @@ sub print_tables
unless $first;
$first = 0;
printf OUT " { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
$bytes_out += 6;
$bytes_out += 12;
}
print OUT "\n};\n\n";
@ -666,6 +710,40 @@ sub print_row
return sprintf "%d /* page %d */", $table_index++, $start / 256;
}
# Convert a string into a run of C "\xNN" escape sequences, one per byte
# of the string's stored representation (UTF-8 for strings built with
# pack ("U", ...)).  Returns the escaped text; the argument is not
# modified.
#
# The previous implementation matched single bytes with the regex escape
# \C, which was deprecated in Perl 5.20 and removed in 5.24 (it is now a
# compile-time error).  Walking the string under the bytes pragma yields
# the same output on old and new perls alike.
sub escape
{
    my ($string) = @_;

    # Lexically scoped: length, substr and ord below address octets
    # rather than characters.
    use bytes;

    my $escaped = '';
    for my $i (0 .. length ($string) - 1)
    {
        $escaped .= sprintf "\\x%02x", ord (substr ($string, $i, 1));
    }

    return $escaped;
}
# Look up (or allocate) the offset of $decomp inside the decomposition
# expansion string.
#
#   $decomp                    decomposition string, or undef if there is none
#   $decomp_offsets_ref        hash ref: decomposition => offset already assigned
#   $decomp_string_ref         scalar ref: C string-literal text built so far
#   $decomp_string_offset_ref  scalar ref: next free byte offset
#
# Returns the literal token "G_UNICODE_NOT_PRESENT_OFFSET" when $decomp is
# undefined, the previously recorded offset when the decomposition has
# been seen before, and otherwise the newly assigned offset.  Only in the
# last case are the referenced hash, string and counter updated.
sub handle_decomp ($$$$)
{
    my ($decomp, $decomp_offsets_ref, $decomp_string_ref, $decomp_string_offset_ref) = @_;

    # No decomposition at all: hand back the sentinel macro name.
    return "G_UNICODE_NOT_PRESENT_OFFSET" unless defined $decomp;

    # Already emitted once: reuse its offset so the text is shared.
    my $known = $decomp_offsets_ref->{$decomp};
    return $known if defined $known;

    # First occurrence: claim the current offset, append the escaped,
    # NUL-terminated literal, and advance the counter past it (+1 for
    # the terminating NUL byte).
    my $offset = ${$decomp_string_offset_ref};
    $decomp_offsets_ref->{$decomp} = $offset;
    ${$decomp_string_ref} .= "\n \"" . &escape ($decomp) . "\\0\" /* offset ${$decomp_string_offset_ref} */";
    ${$decomp_string_offset_ref} += &length_in_bytes ($decomp) + 1;

    return $offset;
}
# Generate the character decomposition header.
sub print_decomp
{
@ -684,19 +762,26 @@ sub print_decomp
printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX (0x110000 / 256)\n\n";
my $last_part1 = ($pages_before_e0000 * 256) - 1;
printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;
$NOT_PRESENT_OFFSET = 65535;
print OUT "#define G_UNICODE_NOT_PRESENT_OFFSET $NOT_PRESENT_OFFSET\n\n";
my ($count, @row);
$table_index = 0;
printf OUT "static const unsigned char cclass_data[][256] = {\n";
printf OUT "static const guchar cclass_data[][256] = {\n";
for ($count = 0; $count <= $last; $count += 256)
{
$row[$count / 256] = &print_row ($count, 1, \&fetch_cclass);
}
printf OUT "\n};\n\n";
print OUT "static const short combining_class_table[256] = {\n";
for ($count = 0; $count <= $last; $count += 256)
print OUT "static const gint16 combining_class_table_part1[$pages_before_e0000] = {\n";
for ($count = 0; $count <= $last_part1; $count += 256)
{
print OUT ",\n" if $count > 0;
print OUT " ", $row[$count / 256];
@ -704,12 +789,19 @@ sub print_decomp
}
print OUT "\n};\n\n";
print OUT "static const gint16 combining_class_table_part2[768] = {\n";
for ($count = 0xE0000; $count <= $last; $count += 256)
{
print OUT ",\n" if $count > 0xE0000;
print OUT " ", $row[$count / 256];
$bytes_out += 2;
}
print OUT "\n};\n\n";
print OUT "typedef struct\n{\n";
# FIXME: type.
print OUT " unsigned short ch;\n";
print OUT " unsigned char canon_offset;\n";
print OUT " unsigned char compat_offset;\n";
print OUT " unsigned short expansion_offset;\n";
print OUT " gunichar ch;\n";
print OUT " guint16 canon_offset;\n";
print OUT " guint16 compat_offset;\n";
print OUT "} decomposition;\n\n";
print OUT "static const decomposition decomp_table[] =\n{\n";
@ -737,40 +829,19 @@ sub print_decomp
undef $compat_decomp;
}
my $string = "";
my $canon_offset = 0xff;
my $compat_offset = 0xff;
if (defined $canon_decomp) {
$canon_offset = 0;
$string .= $canon_decomp;
}
if (defined $compat_decomp) {
if (defined $canon_decomp) {
$string .= "\\x00\\x00";
}
$compat_offset = (length $string) / 4;
$string .= $compat_decomp;
}
my $canon_offset = handle_decomp ($canon_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
my $compat_offset = handle_decomp ($compat_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
if (!defined($decomp_offsets{$string})) {
$decomp_offsets{$string} = $decomp_string_offset;
$decomp_string .= "\n \"".$string."\\0\\0\" /* offset ".
$decomp_string_offset." */";
$decomp_string_offset += ((length $string) / 4) + 2;
$bytes_out += (length $string) / 4 + 2; # "\x20"
}
printf OUT qq( { 0x%04x, %u, %u, %d }),
$count, $canon_offset, $compat_offset, $decomp_offsets{$string};
$bytes_out += 6;
die if $decomp_string_offset > $NOT_PRESENT_OFFSET;
printf OUT qq( { 0x%04x, $canon_offset, $compat_offset }), $count;
$bytes_out += 8;
}
}
print OUT "\n};\n\n";
$bytes_out += $decomp_string_offset + 1;
printf OUT "static const guchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
print OUT "#endif /* DECOMP_H */\n";
@ -796,20 +867,25 @@ sub print_line_break
print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";
printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last;
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
my $last_part1 = ($pages_before_e0000 * 256) - 1;
printf OUT "/* the last code point that should be looked up in break_property_table_part1 */\n";
printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
$table_index = 0;
printf OUT "static const char break_property_data[][256] = {\n";
printf OUT "static const gint8 break_property_data[][256] = {\n";
for ($count = 0; $count <= $last; $count += 256)
{
$row[$count / 256] = &print_row ($count, 1, \&fetch_break_type);
}
printf OUT "\n};\n\n";
print OUT "static const short break_property_table[256] = {\n";
for ($count = 0; $count <= $last; $count += 256)
printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
print OUT "static const gint16 break_property_table_part1[$pages_before_e0000] = {\n";
for ($count = 0; $count <= $last_part1; $count += 256)
{
print OUT ",\n" if $count > 0;
print OUT " ", $row[$count / 256];
@ -817,6 +893,17 @@ sub print_line_break
}
print OUT "\n};\n\n";
printf OUT "/* U+E0000 through U+%04X */\n", $last;
print OUT "static const gint16 break_property_table_part2[768] = {\n";
for ($count = 0xE0000; $count <= $last; $count += 256)
{
print OUT ",\n" if $count > 0xE0000;
print OUT " ", $row[$count / 256];
$bytes_out += 2;
}
print OUT "\n};\n\n";
print OUT "#endif /* BREAKTABLES_H */\n";
close (OUT);
@ -870,7 +957,7 @@ sub make_decomp
my $result = "";
foreach $iter (&expand_decomp ($code, $compat))
{
$result .= sprintf "\\x%02x\\x%02x", $iter / 256, $iter & 0xff;
$result .= pack ("U", $iter); # to utf-8
}
$result;
@ -888,21 +975,17 @@ sub add_special_case
for $value (@values) {
$result .= sprintf ("\\x%02x\\x%02x", $value / 256, $value & 0xff);
$result .= pack ("U", $value); # to utf-8
}
$result .= "\\0";
if (2 * @values + 2 > $special_case_len) {
$special_case_len = 2 * @values + 2;
}
push @special_case_offsets, $special_case_offset;
push @special_cases, $result;
# We encode special cases up in the 0x1000000 space
$value[$code] = 0x1000000 + $special_case_offset;
#
# We encode special cases in the surrogate pair space
#
$value[$code] = 0xD800 + scalar(@special_cases) - 1;
$special_case_offset += 1 + &length_in_bytes ($result);
push @special_cases, &escape ($result);
}
sub output_special_case_table
@ -915,13 +998,15 @@ sub output_special_case_table
* First, the best single character mapping to lowercase if Lu,
* and to uppercase if Ll, followed by the output mapping for the two cases
* other than the case of the codepoint, in the order [Ll],[Lu],[Lt],
* separated and terminated by a double NUL.
* encoded in UTF-8, separated and terminated by a null character.
*/
static const guchar special_case_table[][$special_case_len] = {
static const gchar special_case_table[] = {
EOT
my $i = 0;
for $case (@special_cases) {
print $out qq( "$case",\n);
print $out qq( "$case\\0" /* offset ${special_case_offsets[$i]} */\n);
$i++;
}
print $out <<EOT;
@ -929,7 +1014,7 @@ EOT
EOT
print STDERR "Generated ", ($special_case_len * scalar @special_cases), " bytes in special case table\n";
print STDERR "Generated " . ($special_case_offset + 1) . " bytes in special case table\n";
}
sub enumerate_ordered
@ -962,16 +1047,22 @@ sub output_composition_table
# decompositions. At the same time, record
# the first and second character of each decomposition
for $code (keys %compositions) {
for $code (keys %compositions)
{
@values = map { hex ($_) } split /\s+/, $compositions{$code};
# non-starters
if ($cclass[$values[0]]) {
delete $compositions{$code};
next;
}
# single-character decompositions
if (@values == 1) {
delete $compositions{$code};
next;
}
if (@values != 2) {
die "$code has more than two elements in its decomposition!\n";
}
@ -983,10 +1074,10 @@ sub output_composition_table
}
}
# Assign integer indicices, removing singletons
# Assign integer indices, removing singletons
my $n_first = enumerate_ordered (\%first);
# Now record the second character if each (non-singleton) decomposition
# Now record the second character of each (non-singleton) decomposition
for $code (keys %compositions) {
@values = map { hex ($_) } split /\s+/, $compositions{$code};
@ -1065,39 +1156,46 @@ sub output_composition_table
my @row;
$table_index = 0;
printf OUT "static const gushort compose_data[][256] = {\n";
printf OUT "static const guint16 compose_data[][256] = {\n";
for (my $count = 0; $count <= $last; $count += 256)
{
$row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; });
}
printf OUT "\n};\n\n";
print OUT "static const short compose_table[256] = {\n";
print OUT "static const gint16 compose_table[256] = {\n";
for (my $count = 0; $count <= $last; $count += 256)
{
print OUT ",\n" if $count > 0;
print OUT " ", $row[$count / 256];
$bytes_out += 4;
}
print OUT "\n};\n\n";
$bytes_out += 256 * 2;
# Output first singletons
print OUT "static const gushort compose_first_single[][2] = {\n";
print OUT "static const guint16 compose_first_single[][2] = {\n";
$i = 0;
for $record (@first_singletons) {
if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
die "time to switch compose_first_single to gunichar" ;
}
print OUT ",\n" if $i++ > 0;
printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
}
print OUT "\n};\n";
$bytes_out += @first_singletons * 4;
$bytes_out += @first_singletons * 4;
# Output second singletons
print OUT "static const gushort compose_second_single[][2] = {\n";
print OUT "static const guint16 compose_second_single[][2] = {\n";
$i = 0;
for $record (@second_singletons) {
if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
die "time to switch compose_second_single to gunichar";
}
print OUT ",\n" if $i++ > 0;
printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
}
@ -1108,7 +1206,7 @@ sub output_composition_table
# Output array of composition pairs
print OUT <<EOT;
static const gushort compose_array[$n_first][$n_second] = {
static const guint16 compose_array[$n_first][$n_second] = {
EOT
for (my $i = 0; $i < $n_first; $i++) {
@ -1117,7 +1215,10 @@ EOT
for (my $j = 0; $j < $n_second; $j++) {
print OUT ", " if $j;
if (exists $reverse{"$i|$j"}) {
printf OUT "%#06x", $reverse{"$i|$j"};
if ($reverse{"$i|$j"} > 0xFFFF) {
die "time to switch compose_array to gunichar" ;
}
printf OUT "0x%04x", $reverse{"$i|$j"};
} else {
print OUT " 0";
}
@ -1151,10 +1252,16 @@ EOT
@casefold = sort { $a->[0] <=> $b->[0] } @casefold;
for $case (@casefold) {
for $case (@casefold)
{
$code = $case->[0];
$string = $case->[1];
print $out sprintf(qq({ %#04x, "$string" },\n), $code);
if ($code > 0xFFFF) {
die "time to switch casefold_table to gunichar" ;
}
print $out sprintf(qq( { 0x%04x, "$string" },\n), $code);
}

View File

@ -25,13 +25,22 @@
#include "glib.h"
#include "gunibreak.h"
#define TPROP_PART1(Page, Char) \
((break_property_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (break_property_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
: (break_property_data[break_property_table_part1[Page]][Char]))
#define TPROP(Page, Char) \
((break_property_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (break_property_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
: (break_property_data[break_property_table[Page]][Char]))
#define TPROP_PART2(Page, Char) \
((break_property_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (break_property_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
: (break_property_data[break_property_table_part2[Page]][Char]))
#define PROP(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_BREAK_UNKNOWN : TPROP ((Char) >> 8, (Char) & 0xff))
#define PROP(Char) \
(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
? TPROP_PART1 ((Char) >> 8, (Char) & 0xff) \
: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
? TPROP_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
: G_UNICODE_BREAK_UNKNOWN))
/**
* g_unichar_break_type:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -100,7 +100,9 @@ typedef enum
G_UNICODE_BREAK_POSTFIX,
G_UNICODE_BREAK_COMPLEX_CONTEXT,
G_UNICODE_BREAK_AMBIGUOUS,
G_UNICODE_BREAK_UNKNOWN
G_UNICODE_BREAK_UNKNOWN,
G_UNICODE_BREAK_NEXT_LINE,
G_UNICODE_BREAK_WORD_JOINER
} GUnicodeBreakType;
/* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is

View File

@ -3,7 +3,7 @@
#define COMPOSE_SECOND_START 357
#define COMPOSE_SECOND_SINGLE_START 388
static const gushort compose_data[][256] = {
static const guint16 compose_data[][256] = {
{ /* page 0, index 0 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -222,7 +222,7 @@ static const gushort compose_data[][256] = {
}
};
static const short compose_table[256] = {
static const gint16 compose_table[256] = {
0 /* page 0 */,
1 /* page 1 */,
2 /* page 2 */,
@ -274,7 +274,7 @@ static const short compose_table[256] = {
15 /* page 48 */
};
static const gushort compose_first_single[][2] = {
static const guint16 compose_first_single[][2] = {
{ 0x0338, 0x226e },
{ 0x0338, 0x2260 },
{ 0x0338, 0x226f },
@ -486,7 +486,7 @@ static const gushort compose_first_single[][2] = {
{ 0x3099, 0x30fa },
{ 0x3099, 0x30fe }
};
static const gushort compose_second_single[][2] = {
static const guint16 compose_second_single[][2] = {
{ 0x0627, 0x0622 },
{ 0x0627, 0x0623 },
{ 0x0627, 0x0625 },
@ -506,7 +506,7 @@ static const gushort compose_second_single[][2] = {
{ 0x0dd9, 0x0ddc },
{ 0x0dd9, 0x0dde }
};
static const gushort compose_array[146][31] = {
static const guint16 compose_array[146][31] = {
{ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x0100, 0x0102, 0x0226, 0x00c4, 0x1ea2, 0x00c5, 0, 0x01cd, 0x0200, 0x0202, 0, 0, 0, 0x1ea0, 0, 0x1e00, 0, 0, 0x0104, 0, 0, 0, 0, 0, 0, 0, 0 },
{ 0, 0, 0, 0, 0, 0, 0x1e02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x1e04, 0, 0, 0, 0, 0, 0, 0, 0, 0x1e06, 0, 0, 0, 0 },
{ 0, 0x0106, 0x0108, 0, 0, 0, 0x010a, 0, 0, 0, 0, 0x010c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00c7, 0, 0, 0, 0, 0, 0, 0, 0, 0 },

View File

@ -28,13 +28,22 @@
#include "gunicomp.h"
#define CC(Page, Char) \
((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
: (cclass_data[combining_class_table[Page]][Char]))
#define CC_PART1(Page, Char) \
((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
: (cclass_data[combining_class_table_part1[Page]][Char]))
#define CC_PART2(Page, Char) \
((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
: (cclass_data[combining_class_table_part2[Page]][Char]))
#define COMBINING_CLASS(Char) \
(((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
: 0))
/**
* g_unicode_canonical_ordering:
@ -84,7 +93,8 @@ g_unicode_canonical_ordering (gunichar *string,
}
}
static const guchar *
/* returns a pointer to a null-terminated UTF-8 string */
static const gchar *
find_decomposition (gunichar ch,
gboolean compat)
{
@ -104,17 +114,17 @@ find_decomposition (gunichar ch,
if (compat)
{
offset = decomp_table[half].compat_offset;
if (offset == 0xff)
if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
offset = decomp_table[half].canon_offset;
}
else
{
offset = decomp_table[half].canon_offset;
if (offset == 0xff)
if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
return NULL;
}
return &(decomp_expansion_string[decomp_table[half].expansion_offset + offset]);
return &(decomp_expansion_string[offset]);
}
else if (half == start)
break;
@ -142,27 +152,20 @@ gunichar *
g_unicode_canonical_decomposition (gunichar ch,
gsize *result_len)
{
const guchar *decomp = find_decomposition (ch, FALSE);
const gchar *decomp = find_decomposition (ch, FALSE);
const gchar *p;
gunichar *r;
if (decomp)
{
/* Found it. */
int i, len;
/* We store as a double-nul terminated string. */
for (len = 0; (decomp[len] || decomp[len + 1]);
len += 2)
;
int i;
/* We've counted twice as many bytes as there are
characters. */
*result_len = len / 2;
r = g_malloc (len / 2 * sizeof (gunichar));
*result_len = g_utf8_strlen (decomp, -1);
r = g_malloc (*result_len * sizeof (gunichar));
for (i = 0; i < len; i += 2)
{
r[i / 2] = (decomp[i] << 8 | decomp[i + 1]);
}
for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
r[i] = g_utf8_get_char (p);
}
else
{
@ -194,6 +197,7 @@ combine (gunichar a,
gushort index_a, index_b;
index_a = COMPOSE_INDEX(a);
if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
{
if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
@ -202,10 +206,11 @@ combine (gunichar a,
return TRUE;
}
else
return FALSE;
return FALSE;
}
index_b = COMPOSE_INDEX(b);
if (index_b >= COMPOSE_SECOND_SINGLE_START)
{
if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
@ -214,7 +219,7 @@ combine (gunichar a,
return TRUE;
}
else
return FALSE;
return FALSE;
}
if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
@ -252,17 +257,10 @@ _g_utf8_normalize_wc (const gchar *str,
{
gunichar wc = g_utf8_get_char (p);
const guchar *decomp = find_decomposition (wc, do_compat);
const gchar *decomp = find_decomposition (wc, do_compat);
if (decomp)
{
int len;
/* We store as a double-nul terminated string. */
for (len = 0; (decomp[len] || decomp[len + 1]);
len += 2)
;
n_wc += len / 2;
}
n_wc += g_utf8_strlen (decomp, -1);
else
n_wc++;
@ -277,7 +275,7 @@ _g_utf8_normalize_wc (const gchar *str,
while ((max_len < 0 || p < str + max_len) && *p)
{
gunichar wc = g_utf8_get_char (p);
const guchar *decomp;
const gchar *decomp;
int cc;
gsize old_n_wc = n_wc;
@ -285,11 +283,9 @@ _g_utf8_normalize_wc (const gchar *str,
if (decomp)
{
int len;
/* We store as a double-nul terminated string. */
for (len = 0; (decomp[len] || decomp[len + 1]);
len += 2)
wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
const char *pd;
for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
wc_buffer[n_wc++] = g_utf8_get_char (pd);
}
else
wc_buffer[n_wc++] = wc;
@ -318,7 +314,6 @@ _g_utf8_normalize_wc (const gchar *str,
/* All decomposed and reordered */
if (do_compose && n_wc > 0)
{
gsize i, j;
@ -402,7 +397,7 @@ g_utf8_normalize (const gchar *str,
{
gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
gchar *result;
result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
g_free (result_wc);

File diff suppressed because it is too large Load Diff

View File

@ -28,17 +28,30 @@
#include "glib.h"
#include "gunichartables.h"
#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
? attr_table_part1[Page] \
: attr_table_part2[(Page) - 0xe00])
#define ATTTABLE(Page, Char) \
((attr_table[Page] == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[attr_table[Page]][Char]))
((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))
#define TTYPE(Page, Char) \
((type_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (type_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
: (type_data[type_table[Page]][Char]))
#define TTYPE_PART1(Page, Char) \
((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
: (type_data[type_table_part1[Page]][Char]))
#define TTYPE_PART2(Page, Char) \
((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
: (type_data[type_table_part2[Page]][Char]))
#define TYPE(Char) \
(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
: G_UNICODE_UNASSIGNED))
#define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER \
|| (Type) == G_UNICODE_LETTER_NUMBER \
@ -361,10 +374,10 @@ g_unichar_toupper (gunichar c)
if (t == G_UNICODE_LOWERCASE_LETTER)
{
gunichar val = ATTTABLE (c >> 8, c & 0xff);
if (val >= 0xd800 && val < 0xdc00)
if (val >= 0x1000000)
{
const guchar *p = special_case_table[val - 0xd800];
return p[0] * 256 + p[1];
const guchar *p = special_case_table + val - 0x1000000;
return g_utf8_get_char (p);
}
else
return val ? val : c;
@ -398,10 +411,10 @@ g_unichar_tolower (gunichar c)
if (t == G_UNICODE_UPPERCASE_LETTER)
{
gunichar val = ATTTABLE (c >> 8, c & 0xff);
if (val >= 0xd800 && val < 0xdc00)
if (val >= 0x1000000)
{
const guchar *p = special_case_table[val - 0xd800];
return p[0] * 256 + p[1];
const guchar *p = special_case_table + val - 0x1000000;
return g_utf8_get_char (p);
}
else
return val ? val : c;
@ -561,31 +574,22 @@ output_marks (const char **p_inout,
static gsize
output_special_case (gchar *out_buffer,
gsize len,
int index,
int offset,
int type,
int which)
{
const guchar *p = special_case_table[index];
const guchar *p = special_case_table + offset;
gint len;
if (type != G_UNICODE_TITLECASE_LETTER)
p += 2; /* +2 to skip over "best single match" */
p = g_utf8_next_char (p);
if (which == 1)
{
while (p[0] || p[1])
p += 2;
p += 2;
}
p += strlen (p) + 1;
while (TRUE)
{
gunichar ch = p[0] * 256 + p[1];
if (!ch)
break;
len += g_unichar_to_utf8 (ch, out_buffer ? out_buffer + len : NULL);
p += 2;
}
len = strlen (p);
if (out_buffer)
memcpy (out_buffer, p, len);
return len;
}
@ -662,9 +666,9 @@ real_toupper (const gchar *str,
{
val = ATTTABLE (c >> 8, c & 0xff);
if (val >= 0xd800 && val < 0xdc00)
if (val >= 0x1000000)
{
len += output_special_case (out_buffer, len, val - 0xd800, t,
len += output_special_case (out_buffer, len, val - 0x1000000, t,
t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
}
else
@ -785,9 +789,9 @@ real_tolower (const gchar *str,
{
val = ATTTABLE (c >> 8, c & 0xff);
if (val >= 0xd800 && val < 0xdc00)
if (val >= 0x1000000)
{
len += output_special_case (out_buffer, len, val - 0xd800, t, 0);
len += output_special_case (out_buffer, len, val - 0x1000000, t, 0);
}
else
{
@ -891,7 +895,7 @@ g_utf8_casefold (const gchar *str,
int end = G_N_ELEMENTS (casefold_table);
if (ch >= casefold_table[start].ch &&
ch <= casefold_table[end - 1].ch)
ch <= casefold_table[end - 1].ch)
{
while (TRUE)
{

View File

@ -1,4 +1,4 @@
# Test cases generated from Unicode 3.1 data
# Test cases generated from Unicode 4.0 data
# by gen-casefold-txt.pl. Do not edit.
#
# Some special hand crafted tests
@ -89,8 +89,7 @@ Z z
Ī ī
Ĭ ĭ
Į į
İ i
ı i
İ i̇
IJ ij
Ĵ ĵ
Ķ ķ
@ -216,6 +215,7 @@ Z z
Ț ț
Ȝ ȝ
Ȟ ȟ
Ƞ ƞ
Ȣ ȣ
Ȥ ȥ
Ȧ ȧ
@ -266,6 +266,7 @@ Z z
ϑ θ
ϕ φ
ϖ π
Ϙ ϙ
Ϛ ϛ
Ϝ ϝ
Ϟ ϟ
@ -279,9 +280,11 @@ Z z
Ϯ ϯ
ϰ κ
ϱ ρ
ϲ σ
ϴ θ
ϵ ε
Ϸ ϸ
Ϲ ϲ
Ϻ ϻ
Ѐ ѐ
Ё ё
Ђ ђ
@ -347,6 +350,7 @@ Z z
Ѽ ѽ
Ѿ ѿ
Ҁ ҁ
Ҋ ҋ
Ҍ ҍ
Ҏ ҏ
Ґ ґ
@ -375,8 +379,11 @@ Z z
Ҿ ҿ
Ӂ ӂ
Ӄ ӄ
Ӆ ӆ
Ӈ ӈ
Ӊ ӊ
Ӌ ӌ
Ӎ ӎ
Ӑ ӑ
Ӓ ӓ
Ӕ ӕ
@ -397,6 +404,14 @@ Z z
Ӳ ӳ
Ӵ ӵ
Ӹ ӹ
Ԁ ԁ
Ԃ ԃ
Ԅ ԅ
Ԇ ԇ
Ԉ ԉ
Ԋ ԋ
Ԍ ԍ
Ԏ ԏ
Ա ա
Բ բ
Գ գ
@ -794,3 +809,43 @@ Z z
𐐀 𐐨
𐐁 𐐩
𐐂 𐐪
𐐃 𐐫
𐐄 𐐬
𐐅 𐐭
𐐆 𐐮
𐐇 𐐯
𐐈 𐐰
𐐉 𐐱
𐐊 𐐲
𐐋 𐐳
𐐌 𐐴
𐐍 𐐵
𐐎 𐐶
𐐏 𐐷
𐐐 𐐸
𐐑 𐐹
𐐒 𐐺
𐐓 𐐻
𐐔 𐐼
𐐕 𐐽
𐐖 𐐾
𐐗 𐐿
𐐘 𐑀
𐐙 𐑁
𐐚 𐑂
𐐛 𐑃
𐐜 𐑄
𐐝 𐑅
𐐞 𐑆
𐐟 𐑇
𐐠 𐑈
𐐡 𐑉
𐐢 𐑊
𐐣 𐑋
𐐤 𐑌
𐐥 𐑍
𐐦 𐑎
𐐧 𐑏

File diff suppressed because it is too large Load Diff

View File

@ -24,6 +24,8 @@
# I consider the output of this program to be unrestricted. Use it as
# you will.
require 5.006;
# Names of fields in the CaseFolding table
$FOLDING_CODE = 0;
$FOLDING_STATUS = 1;
@ -49,6 +51,7 @@ AaBbCc@@\taabbcc@@
#
EOT
binmode STDOUT, ":utf8";
open (INPUT, "< $ARGV[1]") || exit 1;
while (<INPUT>)
@ -65,15 +68,14 @@ while (<INPUT>)
my $raw_code = $fields[$FOLDING_CODE];
my $code = hex ($raw_code);
next if $code > 0xffff; # FIXME!
if ($#fields != 3)
{
printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
next;
}
next if ($fields[$FOLDING_STATUS] eq 'S');
# skip simple and Turkic mappings
next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);
@values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
printf ("%s\t%s\n", pack ("U", $code), pack ("U*", @values));

View File

@ -23,6 +23,7 @@
# I consider the output of this program to be unrestricted. Use it as
# you will.
require 5.006;
use utf8;
if (@ARGV != 3) {
@ -60,6 +61,7 @@ my @upper;
my @title;
my @lower;
binmode STDOUT, ":utf8";
open (INPUT, "< $ARGV[1]") || exit 1;
$last_code = -1;
@ -74,8 +76,6 @@ while (<INPUT>)
$code = hex ($fields[$CODE]);
last if ($code > 0xFFFF); # ignore characters out of the basic plane
if ($code > $last_code + 1)
{
# Found a gap.
@ -196,7 +196,7 @@ sub process_one
sub print_tests
{
for ($i = 0; $i < 0xffff; $i++) {
for ($i = 0; $i < 0x10ffff; $i++) {
if ($i == 0x3A3) {
# Greek sigma needs special tests
next;