mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2024-11-10 03:16:17 +01:00
Update Unicode data to 4.0. (#107974)
2003-07-30 Noah Levitt <nlevitt@columbia.edu> * glib/gen-unicode-tables.pl: * glib/gunibreak.c: * glib/gunibreak.h: * glib/gunichartables.h: * glib/gunicode.h: * glib/gunicomp.h: * glib/gunidecomp.c: * glib/gunidecomp.h: * glib/guniprop.c: * tests/casefold.txt: * tests/casemap.txt: * tests/gen-casefold-txt.pl: * tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
This commit is contained in:
parent
cdf72b09e6
commit
05f99527eb
16
ChangeLog
16
ChangeLog
@ -1,3 +1,19 @@
|
||||
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gen-unicode-tables.pl:
|
||||
* glib/gunibreak.c:
|
||||
* glib/gunibreak.h:
|
||||
* glib/gunichartables.h:
|
||||
* glib/gunicode.h:
|
||||
* glib/gunicomp.h:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/gunidecomp.h:
|
||||
* glib/guniprop.c:
|
||||
* tests/casefold.txt:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casefold-txt.pl:
|
||||
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
|
||||
|
||||
2003-07-31 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* glib/gspawn-win32.c: When possible, manage without the helper
|
||||
|
@ -1,3 +1,19 @@
|
||||
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gen-unicode-tables.pl:
|
||||
* glib/gunibreak.c:
|
||||
* glib/gunibreak.h:
|
||||
* glib/gunichartables.h:
|
||||
* glib/gunicode.h:
|
||||
* glib/gunicomp.h:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/gunidecomp.h:
|
||||
* glib/guniprop.c:
|
||||
* tests/casefold.txt:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casefold-txt.pl:
|
||||
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
|
||||
|
||||
2003-07-31 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* glib/gspawn-win32.c: When possible, manage without the helper
|
||||
|
@ -1,3 +1,19 @@
|
||||
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gen-unicode-tables.pl:
|
||||
* glib/gunibreak.c:
|
||||
* glib/gunibreak.h:
|
||||
* glib/gunichartables.h:
|
||||
* glib/gunicode.h:
|
||||
* glib/gunicomp.h:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/gunidecomp.h:
|
||||
* glib/guniprop.c:
|
||||
* tests/casefold.txt:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casefold-txt.pl:
|
||||
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
|
||||
|
||||
2003-07-31 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* glib/gspawn-win32.c: When possible, manage without the helper
|
||||
|
@ -1,3 +1,19 @@
|
||||
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gen-unicode-tables.pl:
|
||||
* glib/gunibreak.c:
|
||||
* glib/gunibreak.h:
|
||||
* glib/gunichartables.h:
|
||||
* glib/gunicode.h:
|
||||
* glib/gunicomp.h:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/gunidecomp.h:
|
||||
* glib/guniprop.c:
|
||||
* tests/casefold.txt:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casefold-txt.pl:
|
||||
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
|
||||
|
||||
2003-07-31 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* glib/gspawn-win32.c: When possible, manage without the helper
|
||||
|
@ -1,3 +1,19 @@
|
||||
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gen-unicode-tables.pl:
|
||||
* glib/gunibreak.c:
|
||||
* glib/gunibreak.h:
|
||||
* glib/gunichartables.h:
|
||||
* glib/gunicode.h:
|
||||
* glib/gunicomp.h:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/gunidecomp.h:
|
||||
* glib/guniprop.c:
|
||||
* tests/casefold.txt:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casefold-txt.pl:
|
||||
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
|
||||
|
||||
2003-07-31 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* glib/gspawn-win32.c: When possible, manage without the helper
|
||||
|
@ -1,3 +1,19 @@
|
||||
2003-07-30 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gen-unicode-tables.pl:
|
||||
* glib/gunibreak.c:
|
||||
* glib/gunibreak.h:
|
||||
* glib/gunichartables.h:
|
||||
* glib/gunicode.h:
|
||||
* glib/gunicomp.h:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/gunidecomp.h:
|
||||
* glib/guniprop.c:
|
||||
* tests/casefold.txt:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casefold-txt.pl:
|
||||
* tests/gen-casemap-txt.pl: Update Unicode data to 4.0. (#107974)
|
||||
|
||||
2003-07-31 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* glib/gspawn-win32.c: When possible, manage without the helper
|
||||
|
@ -31,8 +31,12 @@
|
||||
# * For decomp table it might make sense to use a shift count other
|
||||
# than 8. We could easily compute the perfect shift count.
|
||||
|
||||
# we use some perl unicode features
|
||||
require 5.006;
|
||||
|
||||
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
|
||||
|
||||
|
||||
# Names of fields in Unicode data table.
|
||||
$CODE = 0;
|
||||
$NAME = 1;
|
||||
@ -134,6 +138,8 @@ $FOLDING_MAPPING = 2;
|
||||
'PO' => "G_UNICODE_BREAK_POSTFIX",
|
||||
'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT",
|
||||
'AI' => "G_UNICODE_BREAK_AMBIGUOUS",
|
||||
'NL' => "G_UNICODE_BREAK_NEXT_LINE",
|
||||
'WJ' => "G_UNICODE_BREAK_WORD_JOINER",
|
||||
'XX' => "G_UNICODE_BREAK_UNKNOWN"
|
||||
);
|
||||
|
||||
@ -143,8 +149,9 @@ $FOLDING_MAPPING = 2;
|
||||
|
||||
# Maximum length of special-case strings
|
||||
|
||||
my $special_case_len = 0;
|
||||
my @special_cases;
|
||||
my @special_case_offsets;
|
||||
my $special_case_offset = 0;
|
||||
|
||||
$do_decomp = 0;
|
||||
$do_props = 1;
|
||||
@ -193,6 +200,9 @@ print "Unicode data from $ARGV[1]\n";
|
||||
|
||||
open (INPUT, "< $ARGV[1]") || exit 1;
|
||||
|
||||
# we save memory by skipping the huge empty area before U+E0000
|
||||
my $pages_before_e0000;
|
||||
|
||||
$last_code = -1;
|
||||
while (<INPUT>)
|
||||
{
|
||||
@ -205,7 +215,10 @@ while (<INPUT>)
|
||||
|
||||
$code = hex ($fields[$CODE]);
|
||||
|
||||
last if ($code > 0xFFFF); # ignore characters out of the basic plane
|
||||
if ($code >= 0xE0000 and $last_code < 0xE0000)
|
||||
{
|
||||
$pages_before_e0000 = ($last_code >> 8) + 1;
|
||||
}
|
||||
|
||||
if ($code > $last_code + 1)
|
||||
{
|
||||
@ -237,12 +250,12 @@ close INPUT;
|
||||
|
||||
@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
|
||||
'', '', '', '');
|
||||
for (++$last_code; $last_code < 0x10000; ++$last_code)
|
||||
for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
|
||||
{
|
||||
$gfields{$CODE} = sprintf ("%04x", $last_code);
|
||||
&process_one ($last_code, @gfields);
|
||||
}
|
||||
--$last_code; # Want last to be 0xFFFF.
|
||||
--$last_code; # Want last to be 0x10FFFF.
|
||||
|
||||
print "Creating line break table\n";
|
||||
|
||||
@ -268,7 +281,7 @@ while (<INPUT>)
|
||||
next;
|
||||
}
|
||||
|
||||
if ($fields[$CODE] =~ /([A-F0-9]{4})..([A-F0-9]{4})/)
|
||||
if ($fields[$CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/)
|
||||
{
|
||||
$start_code = hex ($1);
|
||||
$end_code = hex ($2);
|
||||
@ -277,8 +290,6 @@ while (<INPUT>)
|
||||
|
||||
}
|
||||
|
||||
last if ($start_code > 0xFFFF); # FIXME ignore characters out of the basic plane
|
||||
|
||||
if ($start_code > $last_code + 1)
|
||||
{
|
||||
# The gap represents undefined characters. If assigned,
|
||||
@ -306,7 +317,7 @@ while (<INPUT>)
|
||||
|
||||
close INPUT;
|
||||
|
||||
for (++$last_code; $last_code < 0x10000; ++$last_code)
|
||||
for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
|
||||
{
|
||||
if ($type[$last_code] eq 'Cn')
|
||||
{
|
||||
@ -317,9 +328,9 @@ for (++$last_code; $last_code < 0x10000; ++$last_code)
|
||||
$break_props[$last_code] = 'AL';
|
||||
}
|
||||
}
|
||||
--$last_code; # Want last to be 0xFFFF.
|
||||
--$last_code; # Want last to be 0x10FFFF.
|
||||
|
||||
print STDERR "Last code is not 0xFFFF" if ($last_code != 0xFFFF);
|
||||
print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF);
|
||||
|
||||
print "Reading special-casing table for case conversion\n";
|
||||
|
||||
@ -362,18 +373,18 @@ while (<INPUT>)
|
||||
{
|
||||
(hex $fields[$CASE_UPPER] == $code) || die "$raw_code is Lu and UCD_Upper($raw_code) != $raw_code";
|
||||
|
||||
&add_special_case ($code, $value[$code],$fields[$CASE_LOWER], $fields[$CASE_TITLE]);
|
||||
&add_special_case ($code, $value[$code], $fields[$CASE_LOWER], $fields[$CASE_TITLE]);
|
||||
|
||||
} elsif ($type[$code] eq 'Lt')
|
||||
{
|
||||
(hex $fields[$CASE_TITLE] == $code) || die "$raw_code is Lt and UCD_Title($raw_code) != $raw_code";
|
||||
|
||||
&add_special_case ($code, undef,$fields[$CASE_LOWER], $fields[$CASE_UPPER]);
|
||||
&add_special_case ($code, undef, $fields[$CASE_LOWER], $fields[$CASE_UPPER]);
|
||||
} elsif ($type[$code] eq 'Ll')
|
||||
{
|
||||
(hex $fields[$CASE_LOWER] == $code) || die "$raw_code is Ll and UCD_Lower($raw_code) != $raw_code";
|
||||
|
||||
&add_special_case ($code, $value[$code],$fields[$CASE_UPPER], $fields[$CASE_TITLE]);
|
||||
&add_special_case ($code, $value[$code], $fields[$CASE_UPPER], $fields[$CASE_TITLE]);
|
||||
} else {
|
||||
printf STDERR "Special case for non-alphabetic code point: $raw_code\n";
|
||||
next;
|
||||
@ -403,22 +414,21 @@ while (<INPUT>)
|
||||
$raw_code = $fields[$FOLDING_CODE];
|
||||
$code = hex ($raw_code);
|
||||
|
||||
next if $code > 0xffff; # FIXME!
|
||||
|
||||
if ($#fields != 3)
|
||||
{
|
||||
printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
|
||||
next;
|
||||
}
|
||||
|
||||
next if ($fields[$FOLDING_STATUS] eq 'S');
|
||||
# we don't use Simple or Turkic rules here
|
||||
next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);
|
||||
|
||||
@values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
|
||||
|
||||
# Check simple case
|
||||
|
||||
if (@values == 1 &&
|
||||
!(defined $value[$code] && $value[$code] >= 0xd800 && $value[$code] < 0xdc00) &&
|
||||
!(defined $value[$code] && $value[$code] >= 0x1000000) &&
|
||||
defined $type[$code]) {
|
||||
|
||||
my $lower;
|
||||
@ -441,13 +451,12 @@ while (<INPUT>)
|
||||
}
|
||||
|
||||
my $string = pack ("U*", @values);
|
||||
$string =~ s/([\x80-\xff])/sprintf "\\x%02x",ord($1)/eg;
|
||||
|
||||
if (1 + length $string > $casefoldlen) {
|
||||
$casefoldlen = 1 + length $string;
|
||||
|
||||
if (1 + &length_in_bytes ($string) > $casefoldlen) {
|
||||
$casefoldlen = 1 + &length_in_bytes ($string);
|
||||
}
|
||||
|
||||
push @casefold, [ $code, $string ];
|
||||
push @casefold, [ $code, &escape ($string) ];
|
||||
}
|
||||
|
||||
close INPUT;
|
||||
@ -464,6 +473,16 @@ if ($do_decomp) {
|
||||
|
||||
exit 0;
|
||||
|
||||
|
||||
# Return the number of bytes $string occupies once encoded as UTF-8.
#
# perl's built-in "length" returns the length in characters, so the string
# is encoded first and the octets are counted.  $string is treated as a
# character string; only our private copy is modified.  Encoding explicitly
# (instead of relying on "use bytes") keeps the result independent of how
# perl happens to store the string internally.
#
# NOTE(review): utf8::encode needs perl 5.8 or later, while the script
# header only asks for 5.006 -- confirm the minimum supported version.
sub length_in_bytes
{
    my ($string) = @_;

    utf8::encode ($string);    # in place on the copy: characters -> UTF-8 octets
    return length $string;
}
|
||||
|
||||
# Process a single character.
|
||||
sub process_one
|
||||
{
|
||||
@ -528,7 +547,11 @@ sub print_tables
|
||||
|
||||
printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
|
||||
|
||||
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
|
||||
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
|
||||
|
||||
my $last_part1 = ($pages_before_e0000 * 256) - 1;
|
||||
printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
|
||||
printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;
|
||||
|
||||
$table_index = 0;
|
||||
printf OUT "static const char type_data[][256] = {\n";
|
||||
@ -538,8 +561,9 @@ sub print_tables
|
||||
}
|
||||
printf OUT "\n};\n\n";
|
||||
|
||||
print OUT "static const short type_table[256] = {\n";
|
||||
for ($count = 0; $count <= $last; $count += 256)
|
||||
printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
|
||||
print OUT "static const gint16 type_table_part1[$pages_before_e0000] = {\n";
|
||||
for ($count = 0; $count <= $last_part1; $count += 256)
|
||||
{
|
||||
print OUT ",\n" if $count > 0;
|
||||
print OUT " ", $row[$count / 256];
|
||||
@ -547,21 +571,32 @@ sub print_tables
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
printf OUT "/* U+E0000 through U+%04X */\n", $last;
|
||||
print OUT "static const gint16 type_table_part2[768] = {\n";
|
||||
for ($count = 0xE0000; $count <= $last; $count += 256)
|
||||
{
|
||||
print OUT ",\n" if $count > 0xE0000;
|
||||
print OUT " ", $row[$count / 256];
|
||||
$bytes_out += 2;
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
|
||||
#
|
||||
# Now print attribute table.
|
||||
#
|
||||
|
||||
$table_index = 0;
|
||||
printf OUT "static const unsigned short attr_data[][256] = {\n";
|
||||
printf OUT "static const gunichar attr_data[][256] = {\n";
|
||||
for ($count = 0; $count <= $last; $count += 256)
|
||||
{
|
||||
$row[$count / 256] = &print_row ($count, 2, \&fetch_attr);
|
||||
$row[$count / 256] = &print_row ($count, 4, \&fetch_attr);
|
||||
}
|
||||
printf OUT "\n};\n\n";
|
||||
|
||||
print OUT "static const short attr_table[256] = {\n";
|
||||
for ($count = 0; $count <= $last; $count += 256)
|
||||
printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
|
||||
print OUT "static const gint16 attr_table_part1[$pages_before_e0000] = {\n";
|
||||
for ($count = 0; $count <= $last_part1; $count += 256)
|
||||
{
|
||||
print OUT ",\n" if $count > 0;
|
||||
print OUT " ", $row[$count / 256];
|
||||
@ -569,12 +604,21 @@ sub print_tables
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
printf OUT "/* U+E0000 through U+%04X */\n", $last;
|
||||
print OUT "static const gint16 attr_table_part2[768] = {\n";
|
||||
for ($count = 0xE0000; $count <= $last; $count += 256)
|
||||
{
|
||||
print OUT ",\n" if $count > 0xE0000;
|
||||
print OUT " ", $row[$count / 256];
|
||||
$bytes_out += 2;
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
#
|
||||
# print title case table
|
||||
#
|
||||
|
||||
# FIXME: type.
|
||||
print OUT "static const unsigned short title_table[][3] = {\n";
|
||||
print OUT "static const gunichar title_table[][3] = {\n";
|
||||
my ($item);
|
||||
my ($first) = 1;
|
||||
foreach $item (sort keys %title_to_lower)
|
||||
@ -583,7 +627,7 @@ sub print_tables
|
||||
unless $first;
|
||||
$first = 0;
|
||||
printf OUT " { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
|
||||
$bytes_out += 6;
|
||||
$bytes_out += 12;
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
@ -666,6 +710,40 @@ sub print_row
|
||||
return sprintf "%d /* page %d */", $table_index++, $start / 256;
|
||||
}
|
||||
|
||||
# Return $string rewritten so that every byte of its UTF-8 encoding appears
# as a "\xNN" escape (two lower-case hex digits), ready to be pasted into a
# C string literal.  ASCII characters are escaped too, so the result
# consists of escapes only; an empty string yields an empty result.
#
# The string is encoded explicitly and then hex-dumped.  The "\C" regex
# escape that used to walk over the bytes was removed in perl 5.24.
#
# NOTE(review): utf8::encode needs perl 5.8 or later, while the script
# header only asks for 5.006 -- confirm the minimum supported version.
sub escape
{
    my ($string) = @_;

    utf8::encode ($string);    # in place on the copy: characters -> UTF-8 octets

    # "H*" yields two lower-case hex digits per octet, the same digits
    # "%02x" would print; prefix each pair with a literal backslash-x.
    my $hex = unpack ("H*", $string);
    $hex =~ s/(..)/\\x$1/g;

    return $hex;
}
|
||||
|
||||
# Look up, or allocate, the position of $decomp inside the shared
# expansion string and return that position.
#
#   $decomp                    decomposition to place, or undef for "none"
#   $decomp_offsets_ref        hash ref: decomposition => offset already assigned
#   $decomp_string_ref         scalar ref: source text of the expansion string
#   $decomp_string_offset_ref  scalar ref: next free offset in that string
#
# An undefined $decomp yields the literal text "G_UNICODE_NOT_PRESENT_OFFSET".
# A decomposition that already has an offset recorded reuses it and nothing
# is modified.  Otherwise the escaped decomposition, followed by a "\0"
# terminator and a comment naming its offset, is appended to the expansion
# string, the offset is remembered in the hash, and the running offset
# advances by the decomposition's byte length plus one for the terminator.
sub handle_decomp ($$$$)
{
    my ($decomp, $decomp_offsets_ref, $decomp_string_ref, $decomp_string_offset_ref) = @_;

    # Nothing to place.
    return "G_UNICODE_NOT_PRESENT_OFFSET" unless defined $decomp;

    # Already placed earlier: hand back the recorded position.
    my $known_offset = $decomp_offsets_ref->{$decomp};
    return $known_offset if defined $known_offset;

    # First sighting: claim the next free position ...
    my $new_offset = $$decomp_string_offset_ref;
    $decomp_offsets_ref->{$decomp} = $new_offset;

    # ... emit the literal fragment annotated with where it starts ...
    my $fragment = "\n \"";
    $fragment .= escape ($decomp);
    $fragment .= "\\0\" /* offset $new_offset */";
    $$decomp_string_ref .= $fragment;

    # ... and step past the data plus its terminating NUL.
    $$decomp_string_offset_ref += length_in_bytes ($decomp) + 1;

    return $new_offset;
}
|
||||
|
||||
# Generate the character decomposition header.
|
||||
sub print_decomp
|
||||
{
|
||||
@ -684,19 +762,26 @@ sub print_decomp
|
||||
|
||||
printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
|
||||
|
||||
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
|
||||
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX (0x110000 / 256)\n\n";
|
||||
|
||||
my $last_part1 = ($pages_before_e0000 * 256) - 1;
|
||||
printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
|
||||
printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;
|
||||
|
||||
$NOT_PRESENT_OFFSET = 65535;
|
||||
print OUT "#define G_UNICODE_NOT_PRESENT_OFFSET $NOT_PRESENT_OFFSET\n\n";
|
||||
|
||||
my ($count, @row);
|
||||
$table_index = 0;
|
||||
printf OUT "static const unsigned char cclass_data[][256] = {\n";
|
||||
printf OUT "static const guchar cclass_data[][256] = {\n";
|
||||
for ($count = 0; $count <= $last; $count += 256)
|
||||
{
|
||||
$row[$count / 256] = &print_row ($count, 1, \&fetch_cclass);
|
||||
}
|
||||
printf OUT "\n};\n\n";
|
||||
|
||||
print OUT "static const short combining_class_table[256] = {\n";
|
||||
for ($count = 0; $count <= $last; $count += 256)
|
||||
print OUT "static const gint16 combining_class_table_part1[$pages_before_e0000] = {\n";
|
||||
for ($count = 0; $count <= $last_part1; $count += 256)
|
||||
{
|
||||
print OUT ",\n" if $count > 0;
|
||||
print OUT " ", $row[$count / 256];
|
||||
@ -704,12 +789,19 @@ sub print_decomp
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
print OUT "static const gint16 combining_class_table_part2[768] = {\n";
|
||||
for ($count = 0xE0000; $count <= $last; $count += 256)
|
||||
{
|
||||
print OUT ",\n" if $count > 0xE0000;
|
||||
print OUT " ", $row[$count / 256];
|
||||
$bytes_out += 2;
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
print OUT "typedef struct\n{\n";
|
||||
# FIXME: type.
|
||||
print OUT " unsigned short ch;\n";
|
||||
print OUT " unsigned char canon_offset;\n";
|
||||
print OUT " unsigned char compat_offset;\n";
|
||||
print OUT " unsigned short expansion_offset;\n";
|
||||
print OUT " gunichar ch;\n";
|
||||
print OUT " guint16 canon_offset;\n";
|
||||
print OUT " guint16 compat_offset;\n";
|
||||
print OUT "} decomposition;\n\n";
|
||||
|
||||
print OUT "static const decomposition decomp_table[] =\n{\n";
|
||||
@ -737,40 +829,19 @@ sub print_decomp
|
||||
undef $compat_decomp;
|
||||
}
|
||||
|
||||
my $string = "";
|
||||
my $canon_offset = 0xff;
|
||||
my $compat_offset = 0xff;
|
||||
|
||||
if (defined $canon_decomp) {
|
||||
$canon_offset = 0;
|
||||
$string .= $canon_decomp;
|
||||
}
|
||||
if (defined $compat_decomp) {
|
||||
if (defined $canon_decomp) {
|
||||
$string .= "\\x00\\x00";
|
||||
}
|
||||
$compat_offset = (length $string) / 4;
|
||||
$string .= $compat_decomp;
|
||||
}
|
||||
my $canon_offset = handle_decomp ($canon_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
|
||||
my $compat_offset = handle_decomp ($compat_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
|
||||
|
||||
if (!defined($decomp_offsets{$string})) {
|
||||
$decomp_offsets{$string} = $decomp_string_offset;
|
||||
$decomp_string .= "\n \"".$string."\\0\\0\" /* offset ".
|
||||
$decomp_string_offset." */";
|
||||
$decomp_string_offset += ((length $string) / 4) + 2;
|
||||
|
||||
$bytes_out += (length $string) / 4 + 2; # "\x20"
|
||||
}
|
||||
|
||||
printf OUT qq( { 0x%04x, %u, %u, %d }),
|
||||
$count, $canon_offset, $compat_offset, $decomp_offsets{$string};
|
||||
$bytes_out += 6;
|
||||
die if $decomp_string_offset > $NOT_PRESENT_OFFSET;
|
||||
|
||||
printf OUT qq( { 0x%04x, $canon_offset, $compat_offset }), $count;
|
||||
$bytes_out += 8;
|
||||
}
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
$bytes_out += $decomp_string_offset + 1;
|
||||
|
||||
printf OUT "static const guchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
|
||||
printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
|
||||
|
||||
print OUT "#endif /* DECOMP_H */\n";
|
||||
|
||||
@ -796,20 +867,25 @@ sub print_line_break
|
||||
|
||||
print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";
|
||||
|
||||
printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
|
||||
printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last;
|
||||
|
||||
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";
|
||||
printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
|
||||
|
||||
my $last_part1 = ($pages_before_e0000 * 256) - 1;
|
||||
printf OUT "/* the last code point that should be looked up in break_property_table_part1 */\n";
|
||||
printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
|
||||
|
||||
$table_index = 0;
|
||||
printf OUT "static const char break_property_data[][256] = {\n";
|
||||
printf OUT "static const gint8 break_property_data[][256] = {\n";
|
||||
for ($count = 0; $count <= $last; $count += 256)
|
||||
{
|
||||
$row[$count / 256] = &print_row ($count, 1, \&fetch_break_type);
|
||||
}
|
||||
printf OUT "\n};\n\n";
|
||||
|
||||
print OUT "static const short break_property_table[256] = {\n";
|
||||
for ($count = 0; $count <= $last; $count += 256)
|
||||
printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
|
||||
print OUT "static const gint16 break_property_table_part1[$pages_before_e0000] = {\n";
|
||||
for ($count = 0; $count <= $last_part1; $count += 256)
|
||||
{
|
||||
print OUT ",\n" if $count > 0;
|
||||
print OUT " ", $row[$count / 256];
|
||||
@ -817,6 +893,17 @@ sub print_line_break
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
printf OUT "/* U+E0000 through U+%04X */\n", $last;
|
||||
print OUT "static const gint16 break_property_table_part2[768] = {\n";
|
||||
for ($count = 0xE0000; $count <= $last; $count += 256)
|
||||
{
|
||||
print OUT ",\n" if $count > 0xE0000;
|
||||
print OUT " ", $row[$count / 256];
|
||||
$bytes_out += 2;
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
|
||||
print OUT "#endif /* BREAKTABLES_H */\n";
|
||||
|
||||
close (OUT);
|
||||
@ -870,7 +957,7 @@ sub make_decomp
|
||||
my $result = "";
|
||||
foreach $iter (&expand_decomp ($code, $compat))
|
||||
{
|
||||
$result .= sprintf "\\x%02x\\x%02x", $iter / 256, $iter & 0xff;
|
||||
$result .= pack ("U", $iter); # to utf-8
|
||||
}
|
||||
|
||||
$result;
|
||||
@ -888,21 +975,17 @@ sub add_special_case
|
||||
|
||||
|
||||
for $value (@values) {
|
||||
$result .= sprintf ("\\x%02x\\x%02x", $value / 256, $value & 0xff);
|
||||
$result .= pack ("U", $value); # to utf-8
|
||||
}
|
||||
|
||||
$result .= "\\0";
|
||||
|
||||
if (2 * @values + 2 > $special_case_len) {
|
||||
$special_case_len = 2 * @values + 2;
|
||||
}
|
||||
push @special_case_offsets, $special_case_offset;
|
||||
|
||||
push @special_cases, $result;
|
||||
# We encode special cases up in the 0x1000000 space
|
||||
$value[$code] = 0x1000000 + $special_case_offset;
|
||||
|
||||
#
|
||||
# We encode special cases in the surrogate pair space
|
||||
#
|
||||
$value[$code] = 0xD800 + scalar(@special_cases) - 1;
|
||||
$special_case_offset += 1 + &length_in_bytes ($result);
|
||||
|
||||
push @special_cases, &escape ($result);
|
||||
}
|
||||
|
||||
sub output_special_case_table
|
||||
@ -915,13 +998,15 @@ sub output_special_case_table
|
||||
* First, the best single character mapping to lowercase if Lu,
|
||||
* and to uppercase if Ll, followed by the output mapping for the two cases
|
||||
* other than the case of the codepoint, in the order [Ll],[Lu],[Lt],
|
||||
* separated and terminated by a double NUL.
|
||||
* encoded in UTF-8, separated and terminated by a null character.
|
||||
*/
|
||||
static const guchar special_case_table[][$special_case_len] = {
|
||||
static const gchar special_case_table[] = {
|
||||
EOT
|
||||
|
||||
my $i = 0;
|
||||
for $case (@special_cases) {
|
||||
print $out qq( "$case",\n);
|
||||
print $out qq( "$case\\0" /* offset ${special_case_offsets[$i]} */\n);
|
||||
$i++;
|
||||
}
|
||||
|
||||
print $out <<EOT;
|
||||
@ -929,7 +1014,7 @@ EOT
|
||||
|
||||
EOT
|
||||
|
||||
print STDERR "Generated ", ($special_case_len * scalar @special_cases), " bytes in special case table\n";
|
||||
print STDERR "Generated " . ($special_case_offset + 1) . " bytes in special case table\n";
|
||||
}
|
||||
|
||||
sub enumerate_ordered
|
||||
@ -962,16 +1047,22 @@ sub output_composition_table
|
||||
# decompositions. At the same time, record
|
||||
# the first and second character of each decomposition
|
||||
|
||||
for $code (keys %compositions) {
|
||||
for $code (keys %compositions)
|
||||
{
|
||||
@values = map { hex ($_) } split /\s+/, $compositions{$code};
|
||||
|
||||
# non-starters
|
||||
if ($cclass[$values[0]]) {
|
||||
delete $compositions{$code};
|
||||
next;
|
||||
}
|
||||
|
||||
# single-character decompositions
|
||||
if (@values == 1) {
|
||||
delete $compositions{$code};
|
||||
next;
|
||||
}
|
||||
|
||||
if (@values != 2) {
|
||||
die "$code has more than two elements in its decomposition!\n";
|
||||
}
|
||||
@ -983,10 +1074,10 @@ sub output_composition_table
|
||||
}
|
||||
}
|
||||
|
||||
# Assign integer indicices, removing singletons
|
||||
# Assign integer indices, removing singletons
|
||||
my $n_first = enumerate_ordered (\%first);
|
||||
|
||||
# Now record the second character if each (non-singleton) decomposition
|
||||
# Now record the second character of each (non-singleton) decomposition
|
||||
for $code (keys %compositions) {
|
||||
@values = map { hex ($_) } split /\s+/, $compositions{$code};
|
||||
|
||||
@ -1065,39 +1156,46 @@ sub output_composition_table
|
||||
|
||||
my @row;
|
||||
$table_index = 0;
|
||||
printf OUT "static const gushort compose_data[][256] = {\n";
|
||||
printf OUT "static const guint16 compose_data[][256] = {\n";
|
||||
for (my $count = 0; $count <= $last; $count += 256)
|
||||
{
|
||||
$row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; });
|
||||
}
|
||||
printf OUT "\n};\n\n";
|
||||
|
||||
print OUT "static const short compose_table[256] = {\n";
|
||||
print OUT "static const gint16 compose_table[256] = {\n";
|
||||
for (my $count = 0; $count <= $last; $count += 256)
|
||||
{
|
||||
print OUT ",\n" if $count > 0;
|
||||
print OUT " ", $row[$count / 256];
|
||||
$bytes_out += 4;
|
||||
}
|
||||
print OUT "\n};\n\n";
|
||||
|
||||
$bytes_out += 256 * 2;
|
||||
|
||||
# Output first singletons
|
||||
|
||||
print OUT "static const gushort compose_first_single[][2] = {\n";
|
||||
print OUT "static const guint16 compose_first_single[][2] = {\n";
|
||||
$i = 0;
|
||||
for $record (@first_singletons) {
|
||||
if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
|
||||
die "time to switch compose_first_single to gunichar" ;
|
||||
}
|
||||
print OUT ",\n" if $i++ > 0;
|
||||
printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
|
||||
}
|
||||
print OUT "\n};\n";
|
||||
|
||||
$bytes_out += @first_singletons * 4;
|
||||
$bytes_out += @first_singletons * 4;
|
||||
|
||||
# Output second singletons
|
||||
|
||||
print OUT "static const gushort compose_second_single[][2] = {\n";
|
||||
print OUT "static const guint16 compose_second_single[][2] = {\n";
|
||||
$i = 0;
|
||||
for $record (@second_singletons) {
|
||||
if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
|
||||
die "time to switch compose_second_single to gunichar";
|
||||
}
|
||||
print OUT ",\n" if $i++ > 0;
|
||||
printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
|
||||
}
|
||||
@ -1108,7 +1206,7 @@ sub output_composition_table
|
||||
# Output array of composition pairs
|
||||
|
||||
print OUT <<EOT;
|
||||
static const gushort compose_array[$n_first][$n_second] = {
|
||||
static const guint16 compose_array[$n_first][$n_second] = {
|
||||
EOT
|
||||
|
||||
for (my $i = 0; $i < $n_first; $i++) {
|
||||
@ -1117,7 +1215,10 @@ EOT
|
||||
for (my $j = 0; $j < $n_second; $j++) {
|
||||
print OUT ", " if $j;
|
||||
if (exists $reverse{"$i|$j"}) {
|
||||
printf OUT "%#06x", $reverse{"$i|$j"};
|
||||
if ($reverse{"$i|$j"} > 0xFFFF) {
|
||||
die "time to switch compose_array to gunichar" ;
|
||||
}
|
||||
printf OUT "0x%04x", $reverse{"$i|$j"};
|
||||
} else {
|
||||
print OUT " 0";
|
||||
}
|
||||
@ -1151,10 +1252,16 @@ EOT
|
||||
|
||||
@casefold = sort { $a->[0] <=> $b->[0] } @casefold;
|
||||
|
||||
for $case (@casefold) {
|
||||
for $case (@casefold)
|
||||
{
|
||||
$code = $case->[0];
|
||||
$string = $case->[1];
|
||||
print $out sprintf(qq({ %#04x, "$string" },\n), $code);
|
||||
|
||||
if ($code > 0xFFFF) {
|
||||
die "time to switch casefold_table to gunichar" ;
|
||||
}
|
||||
|
||||
print $out sprintf(qq( { 0x%04x, "$string" },\n), $code);
|
||||
|
||||
}
|
||||
|
||||
|
@ -25,13 +25,22 @@
|
||||
#include "glib.h"
|
||||
#include "gunibreak.h"
|
||||
|
||||
#define TPROP_PART1(Page, Char) \
|
||||
((break_property_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
|
||||
? (break_property_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
|
||||
: (break_property_data[break_property_table_part1[Page]][Char]))
|
||||
|
||||
#define TPROP(Page, Char) \
|
||||
((break_property_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
|
||||
? (break_property_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
|
||||
: (break_property_data[break_property_table[Page]][Char]))
|
||||
#define TPROP_PART2(Page, Char) \
|
||||
((break_property_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
|
||||
? (break_property_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
|
||||
: (break_property_data[break_property_table_part2[Page]][Char]))
|
||||
|
||||
#define PROP(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_BREAK_UNKNOWN : TPROP ((Char) >> 8, (Char) & 0xff))
|
||||
#define PROP(Char) \
|
||||
(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
|
||||
? TPROP_PART1 ((Char) >> 8, (Char) & 0xff) \
|
||||
: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
|
||||
? TPROP_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
|
||||
: G_UNICODE_BREAK_UNKNOWN))
|
||||
|
||||
/**
|
||||
* g_unichar_break_type:
|
||||
|
5736
glib/gunibreak.h
5736
glib/gunibreak.h
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -100,7 +100,9 @@ typedef enum
|
||||
G_UNICODE_BREAK_POSTFIX,
|
||||
G_UNICODE_BREAK_COMPLEX_CONTEXT,
|
||||
G_UNICODE_BREAK_AMBIGUOUS,
|
||||
G_UNICODE_BREAK_UNKNOWN
|
||||
G_UNICODE_BREAK_UNKNOWN,
|
||||
G_UNICODE_BREAK_NEXT_LINE,
|
||||
G_UNICODE_BREAK_WORD_JOINER
|
||||
} GUnicodeBreakType;
|
||||
|
||||
/* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is
|
||||
|
@ -3,7 +3,7 @@
|
||||
#define COMPOSE_SECOND_START 357
|
||||
#define COMPOSE_SECOND_SINGLE_START 388
|
||||
|
||||
static const gushort compose_data[][256] = {
|
||||
static const guint16 compose_data[][256] = {
|
||||
{ /* page 0, index 0 */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
@ -222,7 +222,7 @@ static const gushort compose_data[][256] = {
|
||||
}
|
||||
};
|
||||
|
||||
static const short compose_table[256] = {
|
||||
static const gint16 compose_table[256] = {
|
||||
0 /* page 0 */,
|
||||
1 /* page 1 */,
|
||||
2 /* page 2 */,
|
||||
@ -274,7 +274,7 @@ static const short compose_table[256] = {
|
||||
15 /* page 48 */
|
||||
};
|
||||
|
||||
static const gushort compose_first_single[][2] = {
|
||||
static const guint16 compose_first_single[][2] = {
|
||||
{ 0x0338, 0x226e },
|
||||
{ 0x0338, 0x2260 },
|
||||
{ 0x0338, 0x226f },
|
||||
@ -486,7 +486,7 @@ static const gushort compose_first_single[][2] = {
|
||||
{ 0x3099, 0x30fa },
|
||||
{ 0x3099, 0x30fe }
|
||||
};
|
||||
static const gushort compose_second_single[][2] = {
|
||||
static const guint16 compose_second_single[][2] = {
|
||||
{ 0x0627, 0x0622 },
|
||||
{ 0x0627, 0x0623 },
|
||||
{ 0x0627, 0x0625 },
|
||||
@ -506,7 +506,7 @@ static const gushort compose_second_single[][2] = {
|
||||
{ 0x0dd9, 0x0ddc },
|
||||
{ 0x0dd9, 0x0dde }
|
||||
};
|
||||
static const gushort compose_array[146][31] = {
|
||||
static const guint16 compose_array[146][31] = {
|
||||
{ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x0100, 0x0102, 0x0226, 0x00c4, 0x1ea2, 0x00c5, 0, 0x01cd, 0x0200, 0x0202, 0, 0, 0, 0x1ea0, 0, 0x1e00, 0, 0, 0x0104, 0, 0, 0, 0, 0, 0, 0, 0 },
|
||||
{ 0, 0, 0, 0, 0, 0, 0x1e02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x1e04, 0, 0, 0, 0, 0, 0, 0, 0, 0x1e06, 0, 0, 0, 0 },
|
||||
{ 0, 0x0106, 0x0108, 0, 0, 0, 0x010a, 0, 0, 0, 0, 0x010c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00c7, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
|
||||
|
@ -28,13 +28,22 @@
|
||||
#include "gunicomp.h"
|
||||
|
||||
|
||||
#define CC(Page, Char) \
|
||||
((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
|
||||
? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
|
||||
: (cclass_data[combining_class_table[Page]][Char]))
|
||||
#define CC_PART1(Page, Char) \
|
||||
((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
|
||||
? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
|
||||
: (cclass_data[combining_class_table_part1[Page]][Char]))
|
||||
|
||||
#define CC_PART2(Page, Char) \
|
||||
((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
|
||||
? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
|
||||
: (cclass_data[combining_class_table_part2[Page]][Char]))
|
||||
|
||||
#define COMBINING_CLASS(Char) \
|
||||
(((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
|
||||
(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
|
||||
? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
|
||||
: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
|
||||
? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
|
||||
: 0))
|
||||
|
||||
/**
|
||||
* g_unicode_canonical_ordering:
|
||||
@ -84,7 +93,8 @@ g_unicode_canonical_ordering (gunichar *string,
|
||||
}
|
||||
}
|
||||
|
||||
static const guchar *
|
||||
/* returns a pointer to a null-terminated UTF-8 string */
|
||||
static const gchar *
|
||||
find_decomposition (gunichar ch,
|
||||
gboolean compat)
|
||||
{
|
||||
@ -104,17 +114,17 @@ find_decomposition (gunichar ch,
|
||||
if (compat)
|
||||
{
|
||||
offset = decomp_table[half].compat_offset;
|
||||
if (offset == 0xff)
|
||||
if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
|
||||
offset = decomp_table[half].canon_offset;
|
||||
}
|
||||
else
|
||||
{
|
||||
offset = decomp_table[half].canon_offset;
|
||||
if (offset == 0xff)
|
||||
if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return &(decomp_expansion_string[decomp_table[half].expansion_offset + offset]);
|
||||
return &(decomp_expansion_string[offset]);
|
||||
}
|
||||
else if (half == start)
|
||||
break;
|
||||
@ -142,27 +152,20 @@ gunichar *
|
||||
g_unicode_canonical_decomposition (gunichar ch,
|
||||
gsize *result_len)
|
||||
{
|
||||
const guchar *decomp = find_decomposition (ch, FALSE);
|
||||
const gchar *decomp = find_decomposition (ch, FALSE);
|
||||
const gchar *p;
|
||||
gunichar *r;
|
||||
|
||||
if (decomp)
|
||||
{
|
||||
/* Found it. */
|
||||
int i, len;
|
||||
/* We store as a double-nul terminated string. */
|
||||
for (len = 0; (decomp[len] || decomp[len + 1]);
|
||||
len += 2)
|
||||
;
|
||||
int i;
|
||||
|
||||
/* We've counted twice as many bytes as there are
|
||||
characters. */
|
||||
*result_len = len / 2;
|
||||
r = g_malloc (len / 2 * sizeof (gunichar));
|
||||
*result_len = g_utf8_strlen (decomp, -1);
|
||||
r = g_malloc (*result_len * sizeof (gunichar));
|
||||
|
||||
for (i = 0; i < len; i += 2)
|
||||
{
|
||||
r[i / 2] = (decomp[i] << 8 | decomp[i + 1]);
|
||||
}
|
||||
for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
|
||||
r[i] = g_utf8_get_char (p);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -194,6 +197,7 @@ combine (gunichar a,
|
||||
gushort index_a, index_b;
|
||||
|
||||
index_a = COMPOSE_INDEX(a);
|
||||
|
||||
if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
|
||||
{
|
||||
if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
|
||||
@ -202,10 +206,11 @@ combine (gunichar a,
|
||||
return TRUE;
|
||||
}
|
||||
else
|
||||
return FALSE;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
index_b = COMPOSE_INDEX(b);
|
||||
|
||||
if (index_b >= COMPOSE_SECOND_SINGLE_START)
|
||||
{
|
||||
if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
|
||||
@ -214,7 +219,7 @@ combine (gunichar a,
|
||||
return TRUE;
|
||||
}
|
||||
else
|
||||
return FALSE;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
|
||||
@ -252,17 +257,10 @@ _g_utf8_normalize_wc (const gchar *str,
|
||||
{
|
||||
gunichar wc = g_utf8_get_char (p);
|
||||
|
||||
const guchar *decomp = find_decomposition (wc, do_compat);
|
||||
const gchar *decomp = find_decomposition (wc, do_compat);
|
||||
|
||||
if (decomp)
|
||||
{
|
||||
int len;
|
||||
/* We store as a double-nul terminated string. */
|
||||
for (len = 0; (decomp[len] || decomp[len + 1]);
|
||||
len += 2)
|
||||
;
|
||||
n_wc += len / 2;
|
||||
}
|
||||
n_wc += g_utf8_strlen (decomp, -1);
|
||||
else
|
||||
n_wc++;
|
||||
|
||||
@ -277,7 +275,7 @@ _g_utf8_normalize_wc (const gchar *str,
|
||||
while ((max_len < 0 || p < str + max_len) && *p)
|
||||
{
|
||||
gunichar wc = g_utf8_get_char (p);
|
||||
const guchar *decomp;
|
||||
const gchar *decomp;
|
||||
int cc;
|
||||
gsize old_n_wc = n_wc;
|
||||
|
||||
@ -285,11 +283,9 @@ _g_utf8_normalize_wc (const gchar *str,
|
||||
|
||||
if (decomp)
|
||||
{
|
||||
int len;
|
||||
/* We store as a double-nul terminated string. */
|
||||
for (len = 0; (decomp[len] || decomp[len + 1]);
|
||||
len += 2)
|
||||
wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
|
||||
const char *pd;
|
||||
for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
|
||||
wc_buffer[n_wc++] = g_utf8_get_char (pd);
|
||||
}
|
||||
else
|
||||
wc_buffer[n_wc++] = wc;
|
||||
@ -318,7 +314,6 @@ _g_utf8_normalize_wc (const gchar *str,
|
||||
|
||||
/* All decomposed and reordered */
|
||||
|
||||
|
||||
if (do_compose && n_wc > 0)
|
||||
{
|
||||
gsize i, j;
|
||||
@ -402,7 +397,7 @@ g_utf8_normalize (const gchar *str,
|
||||
{
|
||||
gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
|
||||
gchar *result;
|
||||
|
||||
|
||||
result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
|
||||
g_free (result_wc);
|
||||
|
||||
|
16241
glib/gunidecomp.h
16241
glib/gunidecomp.h
File diff suppressed because it is too large
Load Diff
@ -28,17 +28,30 @@
|
||||
#include "glib.h"
|
||||
#include "gunichartables.h"
|
||||
|
||||
#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
|
||||
? attr_table_part1[Page] \
|
||||
: attr_table_part2[(Page) - 0xe00])
|
||||
|
||||
#define ATTTABLE(Page, Char) \
|
||||
((attr_table[Page] == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[attr_table[Page]][Char]))
|
||||
((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))
|
||||
|
||||
#define TTYPE(Page, Char) \
|
||||
((type_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
|
||||
? (type_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
|
||||
: (type_data[type_table[Page]][Char]))
|
||||
#define TTYPE_PART1(Page, Char) \
|
||||
((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
|
||||
? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
|
||||
: (type_data[type_table_part1[Page]][Char]))
|
||||
|
||||
#define TTYPE_PART2(Page, Char) \
|
||||
((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
|
||||
? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
|
||||
: (type_data[type_table_part2[Page]][Char]))
|
||||
|
||||
#define TYPE(Char) \
|
||||
(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
|
||||
? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
|
||||
: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
|
||||
? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
|
||||
: G_UNICODE_UNASSIGNED))
|
||||
|
||||
#define TYPE(Char) (((Char) > (G_UNICODE_LAST_CHAR)) ? G_UNICODE_UNASSIGNED : TTYPE ((Char) >> 8, (Char) & 0xff))
|
||||
|
||||
#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER \
|
||||
|| (Type) == G_UNICODE_LETTER_NUMBER \
|
||||
@ -361,10 +374,10 @@ g_unichar_toupper (gunichar c)
|
||||
if (t == G_UNICODE_LOWERCASE_LETTER)
|
||||
{
|
||||
gunichar val = ATTTABLE (c >> 8, c & 0xff);
|
||||
if (val >= 0xd800 && val < 0xdc00)
|
||||
if (val >= 0x1000000)
|
||||
{
|
||||
const guchar *p = special_case_table[val - 0xd800];
|
||||
return p[0] * 256 + p[1];
|
||||
const guchar *p = special_case_table + val - 0x1000000;
|
||||
return g_utf8_get_char (p);
|
||||
}
|
||||
else
|
||||
return val ? val : c;
|
||||
@ -398,10 +411,10 @@ g_unichar_tolower (gunichar c)
|
||||
if (t == G_UNICODE_UPPERCASE_LETTER)
|
||||
{
|
||||
gunichar val = ATTTABLE (c >> 8, c & 0xff);
|
||||
if (val >= 0xd800 && val < 0xdc00)
|
||||
if (val >= 0x1000000)
|
||||
{
|
||||
const guchar *p = special_case_table[val - 0xd800];
|
||||
return p[0] * 256 + p[1];
|
||||
const guchar *p = special_case_table + val - 0x1000000;
|
||||
return g_utf8_get_char (p);
|
||||
}
|
||||
else
|
||||
return val ? val : c;
|
||||
@ -561,31 +574,22 @@ output_marks (const char **p_inout,
|
||||
static gsize
|
||||
output_special_case (gchar *out_buffer,
|
||||
gsize len,
|
||||
int index,
|
||||
int offset,
|
||||
int type,
|
||||
int which)
|
||||
{
|
||||
const guchar *p = special_case_table[index];
|
||||
const guchar *p = special_case_table + offset;
|
||||
gint len;
|
||||
|
||||
if (type != G_UNICODE_TITLECASE_LETTER)
|
||||
p += 2; /* +2 to skip over "best single match" */
|
||||
p = g_utf8_next_char (p);
|
||||
|
||||
if (which == 1)
|
||||
{
|
||||
while (p[0] || p[1])
|
||||
p += 2;
|
||||
p += 2;
|
||||
}
|
||||
p += strlen (p) + 1;
|
||||
|
||||
while (TRUE)
|
||||
{
|
||||
gunichar ch = p[0] * 256 + p[1];
|
||||
if (!ch)
|
||||
break;
|
||||
|
||||
len += g_unichar_to_utf8 (ch, out_buffer ? out_buffer + len : NULL);
|
||||
p += 2;
|
||||
}
|
||||
len = strlen (p);
|
||||
if (out_buffer)
|
||||
memcpy (out_buffer, p, len);
|
||||
|
||||
return len;
|
||||
}
|
||||
@ -662,9 +666,9 @@ real_toupper (const gchar *str,
|
||||
{
|
||||
val = ATTTABLE (c >> 8, c & 0xff);
|
||||
|
||||
if (val >= 0xd800 && val < 0xdc00)
|
||||
if (val >= 0x1000000)
|
||||
{
|
||||
len += output_special_case (out_buffer, len, val - 0xd800, t,
|
||||
len += output_special_case (out_buffer, len, val - 0x1000000, t,
|
||||
t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
|
||||
}
|
||||
else
|
||||
@ -785,9 +789,9 @@ real_tolower (const gchar *str,
|
||||
{
|
||||
val = ATTTABLE (c >> 8, c & 0xff);
|
||||
|
||||
if (val >= 0xd800 && val < 0xdc00)
|
||||
if (val >= 0x1000000)
|
||||
{
|
||||
len += output_special_case (out_buffer, len, val - 0xd800, t, 0);
|
||||
len += output_special_case (out_buffer, len, val - 0x1000000, t, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -891,7 +895,7 @@ g_utf8_casefold (const gchar *str,
|
||||
int end = G_N_ELEMENTS (casefold_table);
|
||||
|
||||
if (ch >= casefold_table[start].ch &&
|
||||
ch <= casefold_table[end - 1].ch)
|
||||
ch <= casefold_table[end - 1].ch)
|
||||
{
|
||||
while (TRUE)
|
||||
{
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Test cases generated from Unicode 3.1 data
|
||||
# Test cases generated from Unicode 4.0 data
|
||||
# by gen-casefold-test.pl. Do not edit.
|
||||
#
|
||||
# Some special hand crafted tests
|
||||
@ -89,8 +89,7 @@ Z z
|
||||
Ī ī
|
||||
Ĭ ĭ
|
||||
Į į
|
||||
İ i
|
||||
ı i
|
||||
İ i̇
|
||||
IJ ij
|
||||
Ĵ ĵ
|
||||
Ķ ķ
|
||||
@ -216,6 +215,7 @@ Z z
|
||||
Ț ț
|
||||
Ȝ ȝ
|
||||
Ȟ ȟ
|
||||
Ƞ ƞ
|
||||
Ȣ ȣ
|
||||
Ȥ ȥ
|
||||
Ȧ ȧ
|
||||
@ -266,6 +266,7 @@ Z z
|
||||
ϑ θ
|
||||
ϕ φ
|
||||
ϖ π
|
||||
Ϙ ϙ
|
||||
Ϛ ϛ
|
||||
Ϝ ϝ
|
||||
Ϟ ϟ
|
||||
@ -279,9 +280,11 @@ Z z
|
||||
Ϯ ϯ
|
||||
ϰ κ
|
||||
ϱ ρ
|
||||
ϲ σ
|
||||
ϴ θ
|
||||
ϵ ε
|
||||
Ϸ ϸ
|
||||
Ϲ ϲ
|
||||
Ϻ ϻ
|
||||
Ѐ ѐ
|
||||
Ё ё
|
||||
Ђ ђ
|
||||
@ -347,6 +350,7 @@ Z z
|
||||
Ѽ ѽ
|
||||
Ѿ ѿ
|
||||
Ҁ ҁ
|
||||
Ҋ ҋ
|
||||
Ҍ ҍ
|
||||
Ҏ ҏ
|
||||
Ґ ґ
|
||||
@ -375,8 +379,11 @@ Z z
|
||||
Ҿ ҿ
|
||||
Ӂ ӂ
|
||||
Ӄ ӄ
|
||||
Ӆ ӆ
|
||||
Ӈ ӈ
|
||||
Ӊ ӊ
|
||||
Ӌ ӌ
|
||||
Ӎ ӎ
|
||||
Ӑ ӑ
|
||||
Ӓ ӓ
|
||||
Ӕ ӕ
|
||||
@ -397,6 +404,14 @@ Z z
|
||||
Ӳ ӳ
|
||||
Ӵ ӵ
|
||||
Ӹ ӹ
|
||||
Ԁ ԁ
|
||||
Ԃ ԃ
|
||||
Ԅ ԅ
|
||||
Ԇ ԇ
|
||||
Ԉ ԉ
|
||||
Ԋ ԋ
|
||||
Ԍ ԍ
|
||||
Ԏ ԏ
|
||||
Ա ա
|
||||
Բ բ
|
||||
Գ գ
|
||||
@ -794,3 +809,43 @@ Z z
|
||||
X x
|
||||
Y y
|
||||
Z z
|
||||
𐐀 𐐨
|
||||
𐐁 𐐩
|
||||
𐐂 𐐪
|
||||
𐐃 𐐫
|
||||
𐐄 𐐬
|
||||
𐐅 𐐭
|
||||
𐐆 𐐮
|
||||
𐐇 𐐯
|
||||
𐐈 𐐰
|
||||
𐐉 𐐱
|
||||
𐐊 𐐲
|
||||
𐐋 𐐳
|
||||
𐐌 𐐴
|
||||
𐐍 𐐵
|
||||
𐐎 𐐶
|
||||
𐐏 𐐷
|
||||
𐐐 𐐸
|
||||
𐐑 𐐹
|
||||
𐐒 𐐺
|
||||
𐐓 𐐻
|
||||
𐐔 𐐼
|
||||
𐐕 𐐽
|
||||
𐐖 𐐾
|
||||
𐐗 𐐿
|
||||
𐐘 𐑀
|
||||
𐐙 𐑁
|
||||
𐐚 𐑂
|
||||
𐐛 𐑃
|
||||
𐐜 𐑄
|
||||
𐐝 𐑅
|
||||
𐐞 𐑆
|
||||
𐐟 𐑇
|
||||
𐐠 𐑈
|
||||
𐐡 𐑉
|
||||
𐐢 𐑊
|
||||
𐐣 𐑋
|
||||
𐐤 𐑌
|
||||
𐐥 𐑍
|
||||
𐐦 𐑎
|
||||
𐐧 𐑏
|
||||
|
1121
tests/casemap.txt
1121
tests/casemap.txt
File diff suppressed because it is too large
Load Diff
@ -24,6 +24,8 @@
|
||||
# I consider the output of this program to be unrestricted. Use it as
|
||||
# you will.
|
||||
|
||||
require 5.006;
|
||||
|
||||
# Names of fields in the CaseFolding table
|
||||
$FOLDING_CODE = 0;
|
||||
$FOLDING_STATUS = 1;
|
||||
@ -49,6 +51,7 @@ AaBbCc@@\taabbcc@@
|
||||
#
|
||||
EOT
|
||||
|
||||
binmode STDOUT, ":utf8";
|
||||
open (INPUT, "< $ARGV[1]") || exit 1;
|
||||
|
||||
while (<INPUT>)
|
||||
@ -65,15 +68,14 @@ while (<INPUT>)
|
||||
my $raw_code = $fields[$FOLDING_CODE];
|
||||
my $code = hex ($raw_code);
|
||||
|
||||
next if $code > 0xffff; # FIXME!
|
||||
|
||||
if ($#fields != 3)
|
||||
{
|
||||
printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
|
||||
next;
|
||||
}
|
||||
|
||||
next if ($fields[$FOLDING_STATUS] eq 'S');
|
||||
# skip simple and Turkic mappings
|
||||
next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);
|
||||
|
||||
@values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
|
||||
printf ("%s\t%s\n", pack ("U", $code), pack ("U*", @values));
|
||||
|
@ -23,6 +23,7 @@
|
||||
# I consider the output of this program to be unrestricted. Use it as
|
||||
# you will.
|
||||
|
||||
require 5.006;
|
||||
use utf8;
|
||||
|
||||
if (@ARGV != 3) {
|
||||
@ -60,6 +61,7 @@ my @upper;
|
||||
my @title;
|
||||
my @lower;
|
||||
|
||||
binmode STDOUT, ":utf8";
|
||||
open (INPUT, "< $ARGV[1]") || exit 1;
|
||||
|
||||
$last_code = -1;
|
||||
@ -74,8 +76,6 @@ while (<INPUT>)
|
||||
|
||||
$code = hex ($fields[$CODE]);
|
||||
|
||||
last if ($code > 0xFFFF); # ignore characters out of the basic plane
|
||||
|
||||
if ($code > $last_code + 1)
|
||||
{
|
||||
# Found a gap.
|
||||
@ -196,7 +196,7 @@ sub process_one
|
||||
|
||||
sub print_tests
|
||||
{
|
||||
for ($i = 0; $i < 0xffff; $i++) {
|
||||
for ($i = 0; $i < 0x10ffff; $i++) {
|
||||
if ($i == 0x3A3) {
|
||||
# Greek sigma needs special tests
|
||||
next;
|
||||
|
Loading…
Reference in New Issue
Block a user