mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-01-26 22:16:16 +01:00
s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/
2000-11-29 Havoc Pennington <hp@redhat.com> * gunidecomp.c (COMBINING_CLASS): s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/ * gunichartables.h: Update for data in Unicode 3.0.1, and include tables for line break properties * gunidecomp.h: Ditto * gunicode.h (GUnicodeBreakType): Enum for line break properties (g_unichar_break_type): Get the break property for a char * gunibreak.h: Autogenerated line break property tables * gunibreak.c (g_unichar_break_type): added * Makefile.am (EXTRA_DIST): dist gen-unicode-tables.pl * Makefile.am (libglib_1_3_la_SOURCES): Add gunibreak.h, gunibreak.c * gen-unicode-tables.pl: Include the script to update the unicode char tables
This commit is contained in:
parent
61c2307387
commit
7fce6d3abc
23
ChangeLog
23
ChangeLog
@ -1,3 +1,26 @@
|
||||
2000-11-29 Havoc Pennington <hp@redhat.com>
|
||||
|
||||
* gunidecomp.c (COMBINING_CLASS): s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/
|
||||
|
||||
* gunichartables.h: Update for data in Unicode 3.0.1,
|
||||
and include tables for line break properties
|
||||
|
||||
* gunidecomp.h: Ditto
|
||||
|
||||
* gunicode.h (GUnicodeBreakType): Enum for line break properties
|
||||
(g_unichar_break_type): Get the break property for a char
|
||||
|
||||
* gunibreak.h: Autogenerated line break property tables
|
||||
|
||||
* gunibreak.c (g_unichar_break_type): added
|
||||
|
||||
* Makefile.am (EXTRA_DIST): dist gen-unicode-tables.pl
|
||||
|
||||
* Makefile.am (libglib_1_3_la_SOURCES): Add gunibreak.h, gunibreak.c
|
||||
|
||||
* gen-unicode-tables.pl: Include the script to update the unicode
|
||||
char tables
|
||||
|
||||
2000-11-28 Elliot Lee <sopwith@redhat.com>
|
||||
|
||||
* gmarkup.c: Fix warnings.
|
||||
|
@ -1,3 +1,26 @@
|
||||
2000-11-29 Havoc Pennington <hp@redhat.com>
|
||||
|
||||
* gunidecomp.c (COMBINING_CLASS): s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/
|
||||
|
||||
* gunichartables.h: Update for data in Unicode 3.0.1,
|
||||
and include tables for line break properties
|
||||
|
||||
* gunidecomp.h: Ditto
|
||||
|
||||
* gunicode.h (GUnicodeBreakType): Enum for line break properties
|
||||
(g_unichar_break_type): Get the break property for a char
|
||||
|
||||
* gunibreak.h: Autogenerated line break property tables
|
||||
|
||||
* gunibreak.c (g_unichar_break_type): added
|
||||
|
||||
* Makefile.am (EXTRA_DIST): dist gen-unicode-tables.pl
|
||||
|
||||
* Makefile.am (libglib_1_3_la_SOURCES): Add gunibreak.h, gunibreak.c
|
||||
|
||||
* gen-unicode-tables.pl: Include the script to update the unicode
|
||||
char tables
|
||||
|
||||
2000-11-28 Elliot Lee <sopwith@redhat.com>
|
||||
|
||||
* gmarkup.c: Fix warnings.
|
||||
|
@ -1,3 +1,26 @@
|
||||
2000-11-29 Havoc Pennington <hp@redhat.com>
|
||||
|
||||
* gunidecomp.c (COMBINING_CLASS): s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/
|
||||
|
||||
* gunichartables.h: Update for data in Unicode 3.0.1,
|
||||
and include tables for line break properties
|
||||
|
||||
* gunidecomp.h: Ditto
|
||||
|
||||
* gunicode.h (GUnicodeBreakType): Enum for line break properties
|
||||
(g_unichar_break_type): Get the break property for a char
|
||||
|
||||
* gunibreak.h: Autogenerated line break property tables
|
||||
|
||||
* gunibreak.c (g_unichar_break_type): added
|
||||
|
||||
* Makefile.am (EXTRA_DIST): dist gen-unicode-tables.pl
|
||||
|
||||
* Makefile.am (libglib_1_3_la_SOURCES): Add gunibreak.h, gunibreak.c
|
||||
|
||||
* gen-unicode-tables.pl: Include the script to update the unicode
|
||||
char tables
|
||||
|
||||
2000-11-28 Elliot Lee <sopwith@redhat.com>
|
||||
|
||||
* gmarkup.c: Fix warnings.
|
||||
|
@ -1,3 +1,26 @@
|
||||
2000-11-29 Havoc Pennington <hp@redhat.com>
|
||||
|
||||
* gunidecomp.c (COMBINING_CLASS): s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/
|
||||
|
||||
* gunichartables.h: Update for data in Unicode 3.0.1,
|
||||
and include tables for line break properties
|
||||
|
||||
* gunidecomp.h: Ditto
|
||||
|
||||
* gunicode.h (GUnicodeBreakType): Enum for line break properties
|
||||
(g_unichar_break_type): Get the break property for a char
|
||||
|
||||
* gunibreak.h: Autogenerated line break property tables
|
||||
|
||||
* gunibreak.c (g_unichar_break_type): added
|
||||
|
||||
* Makefile.am (EXTRA_DIST): dist gen-unicode-tables.pl
|
||||
|
||||
* Makefile.am (libglib_1_3_la_SOURCES): Add gunibreak.h, gunibreak.c
|
||||
|
||||
* gen-unicode-tables.pl: Include the script to update the unicode
|
||||
char tables
|
||||
|
||||
2000-11-28 Elliot Lee <sopwith@redhat.com>
|
||||
|
||||
* gmarkup.c: Fix warnings.
|
||||
|
@ -1,3 +1,26 @@
|
||||
2000-11-29 Havoc Pennington <hp@redhat.com>
|
||||
|
||||
* gunidecomp.c (COMBINING_CLASS): s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/
|
||||
|
||||
* gunichartables.h: Update for data in Unicode 3.0.1,
|
||||
and include tables for line break properties
|
||||
|
||||
* gunidecomp.h: Ditto
|
||||
|
||||
* gunicode.h (GUnicodeBreakType): Enum for line break properties
|
||||
(g_unichar_break_type): Get the break property for a char
|
||||
|
||||
* gunibreak.h: Autogenerated line break property tables
|
||||
|
||||
* gunibreak.c (g_unichar_break_type): added
|
||||
|
||||
* Makefile.am (EXTRA_DIST): dist gen-unicode-tables.pl
|
||||
|
||||
* Makefile.am (libglib_1_3_la_SOURCES): Add gunibreak.h, gunibreak.c
|
||||
|
||||
* gen-unicode-tables.pl: Include the script to update the unicode
|
||||
char tables
|
||||
|
||||
2000-11-28 Elliot Lee <sopwith@redhat.com>
|
||||
|
||||
* gmarkup.c: Fix warnings.
|
||||
|
@ -1,3 +1,26 @@
|
||||
2000-11-29 Havoc Pennington <hp@redhat.com>
|
||||
|
||||
* gunidecomp.c (COMBINING_CLASS): s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/
|
||||
|
||||
* gunichartables.h: Update for data in Unicode 3.0.1,
|
||||
and include tables for line break properties
|
||||
|
||||
* gunidecomp.h: Ditto
|
||||
|
||||
* gunicode.h (GUnicodeBreakType): Enum for line break properties
|
||||
(g_unichar_break_type): Get the break property for a char
|
||||
|
||||
* gunibreak.h: Autogenerated line break property tables
|
||||
|
||||
* gunibreak.c (g_unichar_break_type): added
|
||||
|
||||
* Makefile.am (EXTRA_DIST): dist gen-unicode-tables.pl
|
||||
|
||||
* Makefile.am (libglib_1_3_la_SOURCES): Add gunibreak.h, gunibreak.c
|
||||
|
||||
* gen-unicode-tables.pl: Include the script to update the unicode
|
||||
char tables
|
||||
|
||||
2000-11-28 Elliot Lee <sopwith@redhat.com>
|
||||
|
||||
* gmarkup.c: Fix warnings.
|
||||
|
@ -1,3 +1,26 @@
|
||||
2000-11-29 Havoc Pennington <hp@redhat.com>
|
||||
|
||||
* gunidecomp.c (COMBINING_CLASS): s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/
|
||||
|
||||
* gunichartables.h: Update for data in Unicode 3.0.1,
|
||||
and include tables for line break properties
|
||||
|
||||
* gunidecomp.h: Ditto
|
||||
|
||||
* gunicode.h (GUnicodeBreakType): Enum for line break properties
|
||||
(g_unichar_break_type): Get the break property for a char
|
||||
|
||||
* gunibreak.h: Autogenerated line break property tables
|
||||
|
||||
* gunibreak.c (g_unichar_break_type): added
|
||||
|
||||
* Makefile.am (EXTRA_DIST): dist gen-unicode-tables.pl
|
||||
|
||||
* Makefile.am (libglib_1_3_la_SOURCES): Add gunibreak.h, gunibreak.c
|
||||
|
||||
* gen-unicode-tables.pl: Include the script to update the unicode
|
||||
char tables
|
||||
|
||||
2000-11-28 Elliot Lee <sopwith@redhat.com>
|
||||
|
||||
* gmarkup.c: Fix warnings.
|
||||
|
@ -1,3 +1,26 @@
|
||||
2000-11-29 Havoc Pennington <hp@redhat.com>
|
||||
|
||||
* gunidecomp.c (COMBINING_CLASS): s/UNICODE_LAST_CHAR/G_UNICODE_LAST_CHAR/
|
||||
|
||||
* gunichartables.h: Update for data in Unicode 3.0.1,
|
||||
and include tables for line break properties
|
||||
|
||||
* gunidecomp.h: Ditto
|
||||
|
||||
* gunicode.h (GUnicodeBreakType): Enum for line break properties
|
||||
(g_unichar_break_type): Get the break property for a char
|
||||
|
||||
* gunibreak.h: Autogenerated line break property tables
|
||||
|
||||
* gunibreak.c (g_unichar_break_type): added
|
||||
|
||||
* Makefile.am (EXTRA_DIST): dist gen-unicode-tables.pl
|
||||
|
||||
* Makefile.am (libglib_1_3_la_SOURCES): Add gunibreak.h, gunibreak.c
|
||||
|
||||
* gen-unicode-tables.pl: Include the script to update the unicode
|
||||
char tables
|
||||
|
||||
2000-11-28 Elliot Lee <sopwith@redhat.com>
|
||||
|
||||
* gmarkup.c: Fix warnings.
|
||||
|
@ -32,7 +32,8 @@ EXTRA_DIST = \
|
||||
glib-2.0.pc.in \
|
||||
gobject-2.0.pc.in \
|
||||
gmodule-2.0.pc.in \
|
||||
gthread-2.0.pc.in
|
||||
gthread-2.0.pc.in \
|
||||
gen-unicode-tables.pl
|
||||
|
||||
# These may be in the builddir too
|
||||
BUILT_EXTRA_DIST = \
|
||||
@ -83,6 +84,8 @@ libglib_1_3_la_SOURCES = \
|
||||
gtree.c \
|
||||
guniprop.c \
|
||||
gutf8.c \
|
||||
gunibreak.h \
|
||||
gunibreak.c \
|
||||
gunichartables.h \
|
||||
gunidecomp.h \
|
||||
gunidecomp.c \
|
||||
|
609
gen-unicode-tables.pl
Executable file
609
gen-unicode-tables.pl
Executable file
@ -0,0 +1,609 @@
|
||||
#! /usr/bin/perl -w
|
||||
|
||||
# Copyright (C) 1998, 1999 Tom Tromey
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
||||
# 02111-1307, USA.
|
||||
|
||||
# gen-unicode-tables.pl - Generate tables for libunicode from Unicode data.
|
||||
# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
|
||||
# Usage: gen-unicode-tables.pl [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt
|
||||
# I consider the output of this program to be unrestricted. Use it as
|
||||
# you will.
|
||||
|
||||
# FIXME:
|
||||
# * We could save even more space in the generated table by using
|
||||
# indexes and not pointers.
|
||||
# * For decomp table it might make sense to use a shift count other
|
||||
# than 8. We could easily compute the perfect shift count.
|
||||
|
||||
# Column indices used to address the ';'-separated fields of the two
# input files.  The list must name exactly the variables assigned below:
# the line break property index is $BREAK_PROPERTY (that is the name the
# rest of the script reads), not $BREAK_CATEGORY.
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY
            $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE
            $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE
            $BREAK_CODE $BREAK_PROPERTY $BREAK_NAME);

# Names of fields in Unicode data table.
$CODE = 0;
$NAME = 1;
$CATEGORY = 2;
$COMBINING_CLASSES = 3;
$BIDI_CATEGORY = 4;
$DECOMPOSITION = 5;
$DECIMAL_VALUE = 6;
$DIGIT_VALUE = 7;
$NUMERIC_VALUE = 8;
$MIRRORED = 9;
$OLD_NAME = 10;
$COMMENT = 11;
$UPPER = 12;
$LOWER = 13;
$TITLE = 14;

# Names of fields in the line break table
$BREAK_CODE = 0;
$BREAK_PROPERTY = 1;
$BREAK_NAME = 2;
|
||||
|
||||
# General category abbreviation (third field of UnicodeData.txt) mapped
# to the name of the C constant that is written into the type table.
%mappings =
  (
   # Normative.
   'Lu' => 'G_UNICODE_UPPERCASE_LETTER',
   'Ll' => 'G_UNICODE_LOWERCASE_LETTER',
   'Lt' => 'G_UNICODE_TITLECASE_LETTER',
   'Mn' => 'G_UNICODE_NON_SPACING_MARK',
   'Mc' => 'G_UNICODE_COMBINING_MARK',
   'Me' => 'G_UNICODE_ENCLOSING_MARK',
   'Nd' => 'G_UNICODE_DECIMAL_NUMBER',
   'Nl' => 'G_UNICODE_LETTER_NUMBER',
   'No' => 'G_UNICODE_OTHER_NUMBER',
   'Zs' => 'G_UNICODE_SPACE_SEPARATOR',
   'Zl' => 'G_UNICODE_LINE_SEPARATOR',
   'Zp' => 'G_UNICODE_PARAGRAPH_SEPARATOR',
   'Cc' => 'G_UNICODE_CONTROL',
   'Cf' => 'G_UNICODE_FORMAT',
   'Cs' => 'G_UNICODE_SURROGATE',
   'Co' => 'G_UNICODE_PRIVATE_USE',
   'Cn' => 'G_UNICODE_UNASSIGNED',

   # Informative.
   'Lm' => 'G_UNICODE_MODIFIER_LETTER',
   'Lo' => 'G_UNICODE_OTHER_LETTER',
   'Pc' => 'G_UNICODE_CONNECT_PUNCTUATION',
   'Pd' => 'G_UNICODE_DASH_PUNCTUATION',
   'Ps' => 'G_UNICODE_OPEN_PUNCTUATION',
   'Pe' => 'G_UNICODE_CLOSE_PUNCTUATION',
   'Pi' => 'G_UNICODE_INITIAL_PUNCTUATION',
   'Pf' => 'G_UNICODE_FINAL_PUNCTUATION',
   'Po' => 'G_UNICODE_OTHER_PUNCTUATION',
   'Sm' => 'G_UNICODE_MATH_SYMBOL',
   'Sc' => 'G_UNICODE_CURRENCY_SYMBOL',
   'Sk' => 'G_UNICODE_MODIFIER_SYMBOL',
   'So' => 'G_UNICODE_OTHER_SYMBOL',
  );
|
||||
|
||||
# Two-letter line break class (second field of the line break data file)
# mapped to the name of the C constant written into the break table.
%break_mappings =
  (
   'BK' => 'G_UNICODE_BREAK_MANDATORY',
   'CR' => 'G_UNICODE_BREAK_CARRIAGE_RETURN',
   'LF' => 'G_UNICODE_BREAK_LINE_FEED',
   'CM' => 'G_UNICODE_BREAK_COMBINING_MARK',
   'SG' => 'G_UNICODE_BREAK_SURROGATE',
   'ZW' => 'G_UNICODE_BREAK_ZERO_WIDTH_SPACE',
   'IN' => 'G_UNICODE_BREAK_INSEPARABLE',
   'GL' => 'G_UNICODE_BREAK_NON_BREAKING_GLUE',
   'CB' => 'G_UNICODE_BREAK_CONTINGENT',
   'SP' => 'G_UNICODE_BREAK_SPACE',
   'BA' => 'G_UNICODE_BREAK_AFTER',
   'BB' => 'G_UNICODE_BREAK_BEFORE',
   'B2' => 'G_UNICODE_BREAK_BEFORE_AND_AFTER',
   'HY' => 'G_UNICODE_BREAK_HYPHEN',
   'NS' => 'G_UNICODE_BREAK_NON_STARTER',
   'OP' => 'G_UNICODE_BREAK_OPEN_PUNCTUATION',
   'CL' => 'G_UNICODE_BREAK_CLOSE_PUNCTUATION',
   'QU' => 'G_UNICODE_BREAK_QUOTATION',
   'EX' => 'G_UNICODE_BREAK_EXCLAMATION',
   'ID' => 'G_UNICODE_BREAK_IDEOGRAPHIC',
   'NU' => 'G_UNICODE_BREAK_NUMERIC',
   'IS' => 'G_UNICODE_BREAK_INFIX_SEPARATOR',
   'SY' => 'G_UNICODE_BREAK_SYMBOL',
   'AL' => 'G_UNICODE_BREAK_ALPHABETIC',
   'PR' => 'G_UNICODE_BREAK_PREFIX',
   'PO' => 'G_UNICODE_BREAK_POSTFIX',
   'SA' => 'G_UNICODE_BREAK_COMPLEX_CONTEXT',
   'AI' => 'G_UNICODE_BREAK_AMBIGUOUS',
   'XX' => 'G_UNICODE_BREAK_UNKNOWN',
  );
|
||||
|
||||
# Title case mappings.
%title_to_lower = ();
%title_to_upper = ();

# Select the outputs.  The property table is written by default,
# "-decomp" writes only the decomposition table and "-both" writes both.
# @ARGV is tested first so that running the script without arguments does
# not compare an undefined value under "perl -w".
$do_decomp = 0;
$do_props = 1;
if (@ARGV && $ARGV[0] eq '-decomp')
  {
    $do_decomp = 1;
    $do_props = 0;
    shift @ARGV;
  }
elsif (@ARGV && $ARGV[0] eq '-both')
  {
    $do_decomp = 1;
    shift @ARGV;
  }

# Once the optional flag is gone the script reads $ARGV[0] (version),
# $ARGV[1] (Unicode data file) and $ARGV[2] (line break data file).
# Without them the run used to end with a silent "exit 1" when an open
# failed; fail up front with the same status and an explanation instead.
if (@ARGV < 3)
  {
    print STDERR "Usage: $0 [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt\n";
    exit 1;
  }

print "Creating decomp table\n" if ($do_decomp);
print "Creating property table\n" if ($do_props);
|
||||
|
||||
print "Unicode data from $ARGV[1]\n";

# Keep the historical exit status (1) but report why the run is aborted.
if (! open (INPUT, "< $ARGV[1]"))
  {
    print STDERR "Cannot open $ARGV[1]: $!\n";
    exit 1;
  }

# Walk the data file in code point order and hand every code point up to
# 0xFFFF to process_one, including the ones the file does not list.
$last_code = -1;
while (<INPUT>)
  {
    # chomp only strips a trailing newline, so a final line without one
    # keeps its last field intact (chop would remove a real character).
    chomp;
    @fields = split (';', $_, 30);
    if ($#fields != 14)
      {
        printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
      }

    $code = hex ($fields[$CODE]);

    last if ($code > 0xFFFF); # ignore characters out of the basic plane

    if ($code > $last_code + 1)
      {
        # Found a gap.
        if ($fields[$NAME] =~ /Last>/)
          {
            # Fill the gap with the last character read,
            # since this was a range specified in the char database
            @gfields = @fields;
          }
        else
          {
            # The gap represents undefined characters.  Only the type
            # matters.
            @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
                        '', '', '', '');
          }
        for (++$last_code; $last_code < $code; ++$last_code)
          {
            # Store the code in the field list itself; the former
            # $gfields{$CODE} wrote to an unrelated hash named %gfields.
            $gfields[$CODE] = sprintf ("%04x", $last_code);
            &process_one ($last_code, @gfields);
          }
      }
    &process_one ($code, @fields);
    $last_code = $code;
  }
close (INPUT);

# Everything after the last entry read is unassigned up to 0xFFFF.
@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
            '', '', '', '');
for (++$last_code; $last_code < 0x10000; ++$last_code)
  {
    $gfields[$CODE] = sprintf ("%04x", $last_code);
    &process_one ($last_code, @gfields);
  }
--$last_code; # Want last to be 0xFFFF.
|
||||
|
||||
print "Creating line break table\n";

print "Line break data from $ARGV[2]\n";

# Keep the historical exit status (1) but report why the run is aborted.
if (! open (INPUT, "< $ARGV[2]"))
  {
    print STDERR "Cannot open $ARGV[2]: $!\n";
    exit 1;
  }

# Fill @break_props with one line break class per code point up to 0xFFFF.
$last_code = -1;
while (<INPUT>)
  {
    # chomp only strips a trailing newline; chop would remove a real
    # character from a final line that lacks one.
    chomp;

    next if /^#/;

    @fields = split (';', $_, 30);
    if ($#fields != 2)
      {
        printf STDERR ("Entry for $fields[$BREAK_CODE] has wrong number of fields (%d)\n", $#fields);
      }

    $code = hex ($fields[$BREAK_CODE]);

    last if ($code > 0xFFFF); # ignore characters out of the basic plane

    if ($code > $last_code + 1)
      {
        # Found a gap.  The range marker is looked for in the name column
        # of this file ($BREAK_NAME); $NAME is the name column of the
        # Unicode data table and addresses the property column here, so
        # testing it could never recognise a range.
        if (defined $fields[$BREAK_NAME] && $fields[$BREAK_NAME] =~ /Last>/)
          {
            # Fill the gap with the last character read,
            # since this was a range specified in the char database
            $gap_break_prop = $fields[$BREAK_PROPERTY];
            for (++$last_code; $last_code < $code; ++$last_code)
              {
                $break_props[$last_code] = $gap_break_prop;
              }
          }
        else
          {
            # The gap represents undefined characters. If assigned,
            # they are AL, if not assigned, XX
            for (++$last_code; $last_code < $code; ++$last_code)
              {
                $break_props[$last_code] = ($type[$last_code] eq 'Cn') ? 'XX' : 'AL';
              }
          }
      }
    $break_props[$code] = $fields[$BREAK_PROPERTY];
    $last_code = $code;
  }
close (INPUT);

# Code points after the last entry read get the same defaults as a gap.
for (++$last_code; $last_code < 0x10000; ++$last_code)
  {
    $break_props[$last_code] = ($type[$last_code] eq 'Cn') ? 'XX' : 'AL';
  }
--$last_code; # Want last to be 0xFFFF.
|
||||
|
||||
# Sanity check on the fill loops above; the message now ends in a newline
# so that it does not run into whatever is printed next.
print STDERR "Last code is not 0xFFFF\n" if ($last_code != 0xFFFF);

# Write the headers that were requested on the command line.
&print_tables ($last_code)
    if $do_props;
&print_decomp ($last_code)
    if $do_decomp;

# The line break header is written on every run.
&print_line_break ($last_code);

exit 0;
|
||||
|
||||
# Record the data of one character.
#
# $code is the numeric code point and @fields the split fields of its
# entry.  Fills the global tables @type, @value, @cclass and
# @decompositions as well as %title_to_lower and %title_to_upper.
sub process_one
{
  my ($code, @fields) = @_;
  my ($category) = $fields[$CATEGORY];

  $type[$code] = $category;

  # @value holds the digit value for decimal digits and the opposite-case
  # code point for lower and upper case letters.
  if ($category eq 'Nd')
    {
      $value[$code] = int ($fields[$DECIMAL_VALUE]);
    }
  elsif ($category eq 'Ll')
    {
      $value[$code] = hex ($fields[$UPPER]);
    }
  elsif ($category eq 'Lu')
    {
      $value[$code] = hex ($fields[$LOWER]);
    }

  # Title case letters keep both of their mappings in separate hashes.
  if ($category eq 'Lt')
    {
      $title_to_lower{$code} = hex ($fields[$LOWER]);
      $title_to_upper{$code} = hex ($fields[$UPPER]);
    }

  $cclass[$code] = $fields[$COMBINING_CLASSES];

  # Handle decompositions.  Entries carrying a <tag> are left out.
  my ($decomp) = $fields[$DECOMPOSITION];
  $decompositions[$code] = $decomp
    unless $decomp eq '' || $decomp =~ /\<.*\>/;
}
|
||||
|
||||
# Write gunichartables.h: the two-level type table, the two-level
# attribute table and the title case table.
#
# $last is the highest code point to cover; pages are emitted in steps of
# 256 code points starting at 0.  Exits with status 1 when the output
# file cannot be created.
sub print_tables
{
  my ($last) = @_;
  my ($outfile) = "gunichartables.h";
  # Lexical here, as in print_decomp, so nothing leaks into globals.
  my ($count, @row);

  # print_row adds to $bytes_out, which it sees because "local" gives the
  # package variable a temporary value for the duration of this call.
  local ($bytes_out) = 0;

  print "Writing $outfile...\n";

  if (! open (OUT, "> $outfile"))
    {
      print STDERR "Cannot write $outfile: $!\n";
      exit 1;
    }

  print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
  print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

  print OUT "#ifndef CHARTABLES_H\n";
  print OUT "#define CHARTABLES_H\n\n";

  print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

  printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

  # print_row writes the page arrays right away and returns, per page,
  # the expression to store in the top-level table printed afterwards.
  for ($count = 0; $count <= $last; $count += 256)
    {
      $row[$count / 256] = &print_row ($count, '(char *) ', 'char', 1,
                                       'page', \&fetch_type);
    }

  print OUT "static char *type_table[256] = {\n";
  for ($count = 0; $count <= $last; $count += 256)
    {
      print OUT ",\n" if $count > 0;
      print OUT " ", $row[$count / 256];
      $bytes_out += 4;
    }
  print OUT "\n};\n\n";

  #
  # Now print attribute table.
  #

  for ($count = 0; $count <= $last; $count += 256)
    {
      $row[$count / 256] = &print_row ($count, '', 'unsigned short', 2,
                                       'attrpage', \&fetch_attr);
    }
  print OUT "static unsigned short *attr_table[256] = {\n";
  for ($count = 0; $count <= $last; $count += 256)
    {
      print OUT ",\n" if $count > 0;
      print OUT " ", $row[$count / 256];
      $bytes_out += 4;
    }
  print OUT "\n};\n\n";

  # FIXME: type.
  print OUT "static unsigned short title_table[][3] = {\n";
  my ($item);
  my ($first) = 1;
  # The keys are code points, so order them as numbers; the default
  # string comparison would put 10000 before 453.
  foreach $item (sort { $a <=> $b } keys %title_to_lower)
    {
      print OUT ",\n"
        unless $first;
      $first = 0;
      printf OUT " { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
      $bytes_out += 6;
    }
  print OUT "\n};\n\n";

  print OUT "#endif /* CHARTABLES_H */\n";

  close (OUT);

  printf STDERR "Generated %d bytes in tables\n", $bytes_out;
}
|
||||
|
||||
# Fetcher for the type table: the %mappings entry for the category that
# was recorded in @type for code point $_[0].
sub fetch_type
{
  my ($category) = $type[$_[0]];
  return $mappings{$category};
}
|
||||
|
||||
# Fetcher for the attribute table: the @value entry of a code point as a
# four-digit C hex literal, or "0x0000" when none was recorded.
sub fetch_attr
{
  my ($index) = @_;

  return "0x0000"
    unless defined $value[$index];
  return sprintf ("0x%04x", $value[$index]);
}
|
||||
|
||||
# Emit one 256-entry page of a two-level table.
#
# $fetcher is called once for each of the 256 indexes starting at $start.
# When all results are equal nothing is written and $def_pfx followed by
# that value is returned.  Otherwise a static array of $typname called
# $name plus the page number is printed to OUT, $bytes_out grows by
# 256 * $typsize, and the array name is returned.
sub print_row
{
  my ($start, $def_pfx, $typname, $typsize, $name, $fetcher) = @_;

  # "scalar" keeps the call in scalar context, as a plain assignment would.
  my (@values) = map { scalar ($fetcher->($_ + $start)) } 0 .. 255;

  # A page whose entries all agree collapses to a single value.
  return $def_pfx . $values[0]
    unless grep { $_ ne $values[0] } @values;

  my ($page) = $start / 256;
  printf OUT "static %s %s%d[256] = {\n ", $typname, $name, $page;

  # $column tracks the width of the current output line so that it can be
  # wrapped before it grows past 78 characters.
  my ($column) = 2;
  my ($seen) = 0;
  foreach my $text (@values)
    {
      print OUT ", "
        if $seen++ > 0;
      if (length ($text) + $column + 2 > 78)
        {
          print OUT "\n ";
          $column = 2;
        }
      print OUT $text;
      $column += length ($text) + 2;
    }
  print OUT "\n};\n\n";

  $bytes_out += 256 * $typsize;

  return sprintf "%s%d", $name, $page;
}
|
||||
|
||||
# Generate the character decomposition header.
#
# Writes gunidecomp.h: the two-level combining class table followed by a
# table of fully expanded decompositions.  $last is the highest code
# point to cover.  Exits with status 1 when the file cannot be created.
sub print_decomp
{
  my ($last) = @_;
  my ($outfile) = "gunidecomp.h";

  # print_row adds to $bytes_out, which it sees because "local" gives the
  # package variable a temporary value for the duration of this call.
  local ($bytes_out) = 0;

  print "Writing $outfile...\n";

  # Same exit status as before, now with the reason on STDERR.
  if (! open (OUT, "> $outfile"))
    {
      print STDERR "Cannot write $outfile: $!\n";
      exit 1;
    }

  print OUT "/* This file is automatically generated. DO NOT EDIT! */\n\n";
  print OUT "#ifndef DECOMP_H\n";
  print OUT "#define DECOMP_H\n\n";

  printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

  my ($count, @row);
  for ($count = 0; $count <= $last; $count += 256)
    {
      $row[$count / 256] = &print_row ($count, '(unsigned char *) ',
                                       'unsigned char', 1, 'cclass',
                                       \&fetch_cclass);
    }

  print OUT "static unsigned char *combining_class_table[256] = {\n";
  for ($count = 0; $count <= $last; $count += 256)
    {
      print OUT ",\n" if $count > 0;
      print OUT " ", $row[$count / 256];
      $bytes_out += 4;
    }
  print OUT "\n};\n\n";

  print OUT "typedef struct\n{\n";
  # FIXME: type.
  print OUT " unsigned short ch;\n";
  print OUT " unsigned char *expansion;\n";
  print OUT "} decomposition;\n\n";

  print OUT "static decomposition decomp_table[] =\n{\n";
  my ($iter);
  my ($first) = 1;
  for ($count = 0; $count <= $last; ++$count)
    {
      if (defined $decompositions[$count])
        {
          print OUT ",\n"
            if ! $first;
          $first = 0;
          printf OUT " { 0x%04x, \"", $count;
          $bytes_out += 2;
          # Each expanded code point is written as two escaped bytes,
          # high byte first.
          foreach $iter (&expand_decomp ($count))
            {
              printf OUT "\\x%02x\\x%02x", $iter / 256, $iter & 0xff;
              $bytes_out += 2;
            }
          # Only a single terminator because one is implied in the string.
          print OUT "\\0\" }";
          $bytes_out += 2;
        }
    }
  print OUT "\n};\n\n";

  print OUT "#endif /* DECOMP_H */\n";

  # Close the handle like the other header writers do instead of leaving
  # the flush to the next open of OUT or to program exit.
  close (OUT);

  printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;
}
|
||||
|
||||
# Write gunibreak.h: the two-level table of line break properties.
#
# $last is the highest code point to cover; pages are emitted in steps of
# 256 code points starting at 0.  Exits with status 1 when the output
# file cannot be created.
sub print_line_break
{
  my ($last) = @_;
  my ($outfile) = "gunibreak.h";
  # Lexical here, as in print_decomp, so nothing leaks into globals.
  my ($count, @row);

  # print_row adds to $bytes_out, which it sees because "local" gives the
  # package variable a temporary value for the duration of this call.
  local ($bytes_out) = 0;

  print "Writing $outfile...\n";

  if (! open (OUT, "> $outfile"))
    {
      print STDERR "Cannot write $outfile: $!\n";
      exit 1;
    }

  print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
  print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

  print OUT "#ifndef BREAKTABLES_H\n";
  print OUT "#define BREAKTABLES_H\n\n";

  print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

  printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

  # print_row writes the page arrays right away and returns, per page,
  # the expression to store in the top-level table printed afterwards.
  for ($count = 0; $count <= $last; $count += 256)
    {
      $row[$count / 256] = &print_row ($count, '(char *) ', 'char', 1,
                                       'page',
                                       \&fetch_break_type);
    }

  print OUT "static char *break_property_table[256] = {\n";
  for ($count = 0; $count <= $last; $count += 256)
    {
      print OUT ",\n" if $count > 0;
      print OUT " ", $row[$count / 256];
      $bytes_out += 4;
    }
  print OUT "\n};\n\n";

  print OUT "#endif /* BREAKTABLES_H */\n";

  close (OUT);

  printf STDERR "Generated %d bytes in break tables\n", $bytes_out;
}
|
||||
|
||||
|
||||
# Fetcher for the break table: the %break_mappings entry for the class
# that was recorded in @break_props for code point $_[0].
sub fetch_break_type
{
  my ($class) = $break_props[$_[0]];
  return $break_mappings{$class};
}
|
||||
|
||||
# Fetcher for the combining class table: the @cclass entry of the code
# point passed as the only argument.
sub fetch_cclass
{
  return $cclass[$_[0]];
}
|
||||
|
||||
# Fully expand the decomposition of $code.
#
# Every element of the stored decomposition that has a decomposition of
# its own is replaced, recursively, by that expansion.  Returns the
# resulting list of numeric code points.
sub expand_decomp
{
  my ($code) = @_;

  return map {
    my ($val) = hex ($_);
    defined $decompositions[$val] ? &expand_decomp ($val) : $val;
  } split (' ', $decompositions[$code]);
}
|
@ -32,7 +32,8 @@ EXTRA_DIST = \
|
||||
glib-2.0.pc.in \
|
||||
gobject-2.0.pc.in \
|
||||
gmodule-2.0.pc.in \
|
||||
gthread-2.0.pc.in
|
||||
gthread-2.0.pc.in \
|
||||
gen-unicode-tables.pl
|
||||
|
||||
# These may be in the builddir too
|
||||
BUILT_EXTRA_DIST = \
|
||||
@ -83,6 +84,8 @@ libglib_1_3_la_SOURCES = \
|
||||
gtree.c \
|
||||
guniprop.c \
|
||||
gutf8.c \
|
||||
gunibreak.h \
|
||||
gunibreak.c \
|
||||
gunichartables.h \
|
||||
gunidecomp.h \
|
||||
gunidecomp.c \
|
||||
|
609
glib/gen-unicode-tables.pl
Executable file
609
glib/gen-unicode-tables.pl
Executable file
@ -0,0 +1,609 @@
|
||||
#! /usr/bin/perl -w
|
||||
|
||||
# Copyright (C) 1998, 1999 Tom Tromey
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
||||
# 02111-1307, USA.
|
||||
|
||||
# gen-unicode-tables.pl - Generate tables for libunicode from Unicode data.
|
||||
# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
|
||||
# Usage: gen-unicode-tables.pl [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt
|
||||
# I consider the output of this program to be unrestricted. Use it as
|
||||
# you will.
|
||||
|
||||
# FIXME:
|
||||
# * We could save even more space in the generated table by using
|
||||
# indexes and not pointers.
|
||||
# * For decomp table it might make sense to use a shift count other
|
||||
# than 8. We could easily compute the perfect shift count.
|
||||
|
||||
# Column indices used to address the ';'-separated fields of the two
# input files.  The list must name exactly the variables assigned below:
# the line break property index is $BREAK_PROPERTY (that is the name the
# rest of the script reads), not $BREAK_CATEGORY.
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY
            $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE
            $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE
            $BREAK_CODE $BREAK_PROPERTY $BREAK_NAME);

# Names of fields in Unicode data table.
$CODE = 0;
$NAME = 1;
$CATEGORY = 2;
$COMBINING_CLASSES = 3;
$BIDI_CATEGORY = 4;
$DECOMPOSITION = 5;
$DECIMAL_VALUE = 6;
$DIGIT_VALUE = 7;
$NUMERIC_VALUE = 8;
$MIRRORED = 9;
$OLD_NAME = 10;
$COMMENT = 11;
$UPPER = 12;
$LOWER = 13;
$TITLE = 14;

# Names of fields in the line break table
$BREAK_CODE = 0;
$BREAK_PROPERTY = 1;
$BREAK_NAME = 2;
|
||||
|
||||
# General category abbreviation (third field of UnicodeData.txt) mapped
# to the name of the C constant that is written into the type table.
%mappings =
  (
   # Normative.
   'Lu' => 'G_UNICODE_UPPERCASE_LETTER',
   'Ll' => 'G_UNICODE_LOWERCASE_LETTER',
   'Lt' => 'G_UNICODE_TITLECASE_LETTER',
   'Mn' => 'G_UNICODE_NON_SPACING_MARK',
   'Mc' => 'G_UNICODE_COMBINING_MARK',
   'Me' => 'G_UNICODE_ENCLOSING_MARK',
   'Nd' => 'G_UNICODE_DECIMAL_NUMBER',
   'Nl' => 'G_UNICODE_LETTER_NUMBER',
   'No' => 'G_UNICODE_OTHER_NUMBER',
   'Zs' => 'G_UNICODE_SPACE_SEPARATOR',
   'Zl' => 'G_UNICODE_LINE_SEPARATOR',
   'Zp' => 'G_UNICODE_PARAGRAPH_SEPARATOR',
   'Cc' => 'G_UNICODE_CONTROL',
   'Cf' => 'G_UNICODE_FORMAT',
   'Cs' => 'G_UNICODE_SURROGATE',
   'Co' => 'G_UNICODE_PRIVATE_USE',
   'Cn' => 'G_UNICODE_UNASSIGNED',

   # Informative.
   'Lm' => 'G_UNICODE_MODIFIER_LETTER',
   'Lo' => 'G_UNICODE_OTHER_LETTER',
   'Pc' => 'G_UNICODE_CONNECT_PUNCTUATION',
   'Pd' => 'G_UNICODE_DASH_PUNCTUATION',
   'Ps' => 'G_UNICODE_OPEN_PUNCTUATION',
   'Pe' => 'G_UNICODE_CLOSE_PUNCTUATION',
   'Pi' => 'G_UNICODE_INITIAL_PUNCTUATION',
   'Pf' => 'G_UNICODE_FINAL_PUNCTUATION',
   'Po' => 'G_UNICODE_OTHER_PUNCTUATION',
   'Sm' => 'G_UNICODE_MATH_SYMBOL',
   'Sc' => 'G_UNICODE_CURRENCY_SYMBOL',
   'Sk' => 'G_UNICODE_MODIFIER_SYMBOL',
   'So' => 'G_UNICODE_OTHER_SYMBOL',
  );
|
||||
|
||||
# Two-letter line break class (second field of the line break data file)
# mapped to the name of the C constant written into the break table.
%break_mappings =
  (
   'BK' => 'G_UNICODE_BREAK_MANDATORY',
   'CR' => 'G_UNICODE_BREAK_CARRIAGE_RETURN',
   'LF' => 'G_UNICODE_BREAK_LINE_FEED',
   'CM' => 'G_UNICODE_BREAK_COMBINING_MARK',
   'SG' => 'G_UNICODE_BREAK_SURROGATE',
   'ZW' => 'G_UNICODE_BREAK_ZERO_WIDTH_SPACE',
   'IN' => 'G_UNICODE_BREAK_INSEPARABLE',
   'GL' => 'G_UNICODE_BREAK_NON_BREAKING_GLUE',
   'CB' => 'G_UNICODE_BREAK_CONTINGENT',
   'SP' => 'G_UNICODE_BREAK_SPACE',
   'BA' => 'G_UNICODE_BREAK_AFTER',
   'BB' => 'G_UNICODE_BREAK_BEFORE',
   'B2' => 'G_UNICODE_BREAK_BEFORE_AND_AFTER',
   'HY' => 'G_UNICODE_BREAK_HYPHEN',
   'NS' => 'G_UNICODE_BREAK_NON_STARTER',
   'OP' => 'G_UNICODE_BREAK_OPEN_PUNCTUATION',
   'CL' => 'G_UNICODE_BREAK_CLOSE_PUNCTUATION',
   'QU' => 'G_UNICODE_BREAK_QUOTATION',
   'EX' => 'G_UNICODE_BREAK_EXCLAMATION',
   'ID' => 'G_UNICODE_BREAK_IDEOGRAPHIC',
   'NU' => 'G_UNICODE_BREAK_NUMERIC',
   'IS' => 'G_UNICODE_BREAK_INFIX_SEPARATOR',
   'SY' => 'G_UNICODE_BREAK_SYMBOL',
   'AL' => 'G_UNICODE_BREAK_ALPHABETIC',
   'PR' => 'G_UNICODE_BREAK_PREFIX',
   'PO' => 'G_UNICODE_BREAK_POSTFIX',
   'SA' => 'G_UNICODE_BREAK_COMPLEX_CONTEXT',
   'AI' => 'G_UNICODE_BREAK_AMBIGUOUS',
   'XX' => 'G_UNICODE_BREAK_UNKNOWN',
  );
|
||||
|
||||
# Title case mappings.
%title_to_lower = ();
%title_to_upper = ();

# Select the outputs.  The property table is written by default,
# "-decomp" writes only the decomposition table and "-both" writes both.
# @ARGV is tested first so that running the script without arguments does
# not compare an undefined value under "perl -w".
$do_decomp = 0;
$do_props = 1;
if (@ARGV && $ARGV[0] eq '-decomp')
  {
    $do_decomp = 1;
    $do_props = 0;
    shift @ARGV;
  }
elsif (@ARGV && $ARGV[0] eq '-both')
  {
    $do_decomp = 1;
    shift @ARGV;
  }

# Once the optional flag is gone the script reads $ARGV[0] (version),
# $ARGV[1] (Unicode data file) and $ARGV[2] (line break data file).
# Without them the run used to end with a silent "exit 1" when an open
# failed; fail up front with the same status and an explanation instead.
if (@ARGV < 3)
  {
    print STDERR "Usage: $0 [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt\n";
    exit 1;
  }

print "Creating decomp table\n" if ($do_decomp);
print "Creating property table\n" if ($do_props);
|
||||
|
||||
print "Unicode data from $ARGV[1]\n";

# Keep the historical exit status (1) but report why the run is aborted.
if (! open (INPUT, "< $ARGV[1]"))
  {
    print STDERR "Cannot open $ARGV[1]: $!\n";
    exit 1;
  }

# Walk the data file in code point order and hand every code point up to
# 0xFFFF to process_one, including the ones the file does not list.
$last_code = -1;
while (<INPUT>)
  {
    # chomp only strips a trailing newline, so a final line without one
    # keeps its last field intact (chop would remove a real character).
    chomp;
    @fields = split (';', $_, 30);
    if ($#fields != 14)
      {
        printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
      }

    $code = hex ($fields[$CODE]);

    last if ($code > 0xFFFF); # ignore characters out of the basic plane

    if ($code > $last_code + 1)
      {
        # Found a gap.
        if ($fields[$NAME] =~ /Last>/)
          {
            # Fill the gap with the last character read,
            # since this was a range specified in the char database
            @gfields = @fields;
          }
        else
          {
            # The gap represents undefined characters.  Only the type
            # matters.
            @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
                        '', '', '', '');
          }
        for (++$last_code; $last_code < $code; ++$last_code)
          {
            # Store the code in the field list itself; the former
            # $gfields{$CODE} wrote to an unrelated hash named %gfields.
            $gfields[$CODE] = sprintf ("%04x", $last_code);
            &process_one ($last_code, @gfields);
          }
      }
    &process_one ($code, @fields);
    $last_code = $code;
  }
close (INPUT);

# Everything after the last entry read is unassigned up to 0xFFFF.
@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
            '', '', '', '');
for (++$last_code; $last_code < 0x10000; ++$last_code)
  {
    $gfields[$CODE] = sprintf ("%04x", $last_code);
    &process_one ($last_code, @gfields);
  }
--$last_code; # Want last to be 0xFFFF.
|
||||
|
||||
# Read the line break data file ($ARGV[2]) and record the break property
# of every code point of the basic plane in @break_props.
print "Creating line break table\n";

print "Line break data from $ARGV[2]\n";

unless (open (INPUT, "< $ARGV[2]"))
{
    print STDERR "Cannot open $ARGV[2]: $!\n";
    exit 1;
}

$last_code = -1;
while (<INPUT>)
{
    # chomp, not chop: a final line without a newline must not lose its
    # last data character.
    chomp;

    next if /^#/;
    # A blank line would otherwise be parsed as code point 0 and reset
    # $last_code, corrupting everything read so far.
    next if /^\s*$/;

    @fields = split (';', $_, 30);
    if ($#fields != 2)
    {
        printf STDERR ("Entry for %s has wrong number of fields (%d)\n",
                       $fields[$CODE], $#fields);
    }

    $code = hex ($fields[$CODE]);

    last if ($code > 0xFFFF); # ignore characters out of the basic plane

    if ($code > $last_code + 1)
    {
        # Found a gap.
        #
        # The "<..., Last>" marker lives in the character name.  $NAME is
        # the name index used for the Unicode data records above; a line
        # break record has only three fields, with the code and the break
        # property taking two of them, so test the remaining (last) field.
        # NOTE(review): assumes the name is the final field of a line
        # break record -- confirm against the data file format.
        if ($fields[$#fields] =~ /Last>/)
        {
            # Fill the gap with the last character read,
            # since this was a range specified in the char database
            $gap_break_prop = $fields[$BREAK_PROPERTY];
            for (++$last_code; $last_code < $code; ++$last_code)
            {
                $break_props[$last_code] = $gap_break_prop;
            }
        }
        else
        {
            # The gap represents undefined characters.  If assigned,
            # they are AL, if not assigned, XX
            for (++$last_code; $last_code < $code; ++$last_code)
            {
                if ($type[$last_code] eq 'Cn')
                {
                    $break_props[$last_code] = 'XX';
                }
                else
                {
                    $break_props[$last_code] = 'AL';
                }
            }
        }
    }
    $break_props[$code] = $fields[$BREAK_PROPERTY];
    $last_code = $code;
}
close (INPUT);

# Code points after the last record: AL if assigned, XX otherwise.
for (++$last_code; $last_code < 0x10000; ++$last_code)
{
    if ($type[$last_code] eq 'Cn')
    {
        $break_props[$last_code] = 'XX';
    }
    else
    {
        $break_props[$last_code] = 'AL';
    }
}
--$last_code; # Want last to be 0xFFFF.
|
||||
|
||||
# Sanity check, then write the requested headers.  The line break table
# is always written; the other two depend on the command line flag.
# (The diagnostic now ends in a newline so it does not run into
# whatever is printed next on the terminal.)
print STDERR "Last code is not 0xFFFF\n" if ($last_code != 0xFFFF);

&print_tables ($last_code)
    if $do_props;
&print_decomp ($last_code)
    if $do_decomp;

&print_line_break ($last_code);

exit 0;
|
||||
|
||||
# Record the data of one character in the global tables.
#   $code   - code point (number)
#   @fields - the fields of the character's record, indexed by the
#             $CATEGORY, $DECIMAL_VALUE, $UPPER, ... constants
# Fills @type, @value, @cclass, @decompositions and, for titlecase
# characters, %title_to_lower / %title_to_upper.
sub process_one
{
    my ($code, @fields) = @_;

    my $category = $fields[$CATEGORY];
    $type[$code] = $category;

    # What is stored as the character's value depends on its category:
    # the digit value for decimal digits, the opposite-case mapping for
    # lower and upper case letters.  Titlecase letters go into their own
    # pair of hashes instead.
    if ($category eq 'Nd')
    {
        $value[$code] = int ($fields[$DECIMAL_VALUE]);
    }
    elsif ($category eq 'Ll')
    {
        $value[$code] = hex ($fields[$UPPER]);
    }
    elsif ($category eq 'Lu')
    {
        $value[$code] = hex ($fields[$LOWER]);
    }
    elsif ($category eq 'Lt')
    {
        $title_to_lower{$code} = hex ($fields[$LOWER]);
        $title_to_upper{$code} = hex ($fields[$UPPER]);
    }

    $cclass[$code] = $fields[$COMBINING_CLASSES];

    # Handle decompositions: keep only non-empty ones that carry no
    # <tag>.
    my $decomp = $fields[$DECOMPOSITION];
    unless ($decomp eq '' || $decomp =~ /\<.*\>/)
    {
        $decompositions[$code] = $decomp;
    }
}
|
||||
|
||||
# Write gunichartables.h: the two-level character type table, the
# two-level attribute table and the title case table.
#   $last - highest code point to cover
# Exits with status 1 if the output file cannot be created.
sub print_tables
{
    my ($last) = @_;
    my ($outfile) = "gunichartables.h";
    # Keep the loop counter and the row list private to this function.
    my ($count, @row);

    # "local" rather than "my": print_row adds to $bytes_out as well.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    unless (open (OUT, "> $outfile"))
    {
        print STDERR "Cannot create $outfile: $!\n";
        exit 1;
    }

    print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
    print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

    print OUT "#ifndef CHARTABLES_H\n";
    print OUT "#define CHARTABLES_H\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    # Emit the per-page type arrays first; print_row returns the
    # expression that goes into the top-level table for that page.
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, '(char *) ', 'char', 1,
                                         'page', \&fetch_type);
    }

    print OUT "static char *type_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 4;
    }
    print OUT "\n};\n\n";

    #
    # Now print attribute table.
    #

    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, '', 'unsigned short', 2,
                                         'attrpage', \&fetch_attr);
    }
    print OUT "static unsigned short *attr_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 4;
    }
    print OUT "\n};\n\n";

    # FIXME: type.
    print OUT "static unsigned short title_table[][3] = {\n";
    my ($item);
    my ($first) = 1;
    # The keys are code points: order them numerically, not as strings.
    foreach $item (sort { $a <=> $b } keys %title_to_lower)
    {
        print OUT ",\n"
            unless $first;
        $first = 0;
        printf OUT " { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
        $bytes_out += 6;
    }
    print OUT "\n};\n\n";

    print OUT "#endif /* CHARTABLES_H */\n";

    close (OUT);

    printf STDERR "Generated %d bytes in tables\n", $bytes_out;
}
|
||||
|
||||
# Fetcher for the type table: look up the category recorded in @type
# for the given code point and translate it through %mappings.
sub fetch_type
{
    my ($code) = @_;
    my $category = $type[$code];
    return $mappings{$category};
}
|
||||
|
||||
# Fetcher for the attribute table: the value recorded in @value for the
# given code point as a four-digit hex literal, or "0x0000" when no
# value was recorded.
sub fetch_attr
{
    my ($code) = @_;
    return "0x0000" unless defined $value[$code];
    return sprintf ("0x%04x", $value[$code]);
}
|
||||
|
||||
# Emit one 256-entry "row" (page) of a two-level table.
#   $start   - first code point of the page
#   $def_pfx - text put in front of the value when the page is uniform
#   $typname - C element type of the page array
#   $typsize - size of one element, only used for the byte statistics
#   $name    - name prefix of the page array; the page number is appended
#   $fetcher - code reference returning the table text for a code point
# When all 256 entries are equal nothing is written and the prefixed
# common value is returned.  Otherwise the page array is written to OUT,
# $bytes_out is increased, and the name of the array is returned.
sub print_row
{
    my ($start, $def_pfx, $typname, $typsize, $name, $fetcher) = @_;

    my $page = $start / 256;
    my @values = map { $fetcher->($start + $_) } (0 .. 255);

    # A page whose entries are all the same needs no array of its own.
    unless (grep { $_ ne $values[0] } @values)
    {
        return $def_pfx . $values[0];
    }

    printf OUT "static %s %s%d[256] = {\n ", $typname, $name, $page;

    my $column = 2;
    my $separator = '';
    foreach my $text (@values)
    {
        # The separator goes out before the wrap decision, so a wrapped
        # line ends in ", ".
        print OUT $separator;
        $separator = ', ';
        if ($column + length ($text) + 2 > 78)
        {
            print OUT "\n ";
            $column = 2;
        }
        print OUT $text;
        $column += length ($text) + 2;
    }
    print OUT "\n};\n\n";

    $bytes_out += 256 * $typsize;

    return sprintf ("%s%d", $name, $page);
}
|
||||
|
||||
# Generate the character decomposition header, gunidecomp.h: the
# two-level combining class table and the table of fully expanded
# decompositions.
#   $last - highest code point to cover
# Exits with status 1 if the output file cannot be created.
sub print_decomp
{
    my ($last) = @_;
    my ($outfile) = "gunidecomp.h";

    # "local" rather than "my": print_row adds to $bytes_out as well.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    unless (open (OUT, "> $outfile"))
    {
        print STDERR "Cannot create $outfile: $!\n";
        exit 1;
    }

    print OUT "/* This file is automatically generated. DO NOT EDIT! */\n\n";
    print OUT "#ifndef DECOMP_H\n";
    print OUT "#define DECOMP_H\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    # Emit the per-page arrays first; print_row returns the expression
    # that goes into the top-level table for that page.
    my ($count, @row);
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, '(unsigned char *) ',
                                         'unsigned char', 1, 'cclass',
                                         \&fetch_cclass);
    }

    print OUT "static unsigned char *combining_class_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 4;
    }
    print OUT "\n};\n\n";

    print OUT "typedef struct\n{\n";
    # FIXME: type.
    print OUT " unsigned short ch;\n";
    print OUT " unsigned char *expansion;\n";
    print OUT "} decomposition;\n\n";

    print OUT "static decomposition decomp_table[] =\n{\n";
    my ($iter);
    my ($first) = 1;
    for ($count = 0; $count <= $last; ++$count)
    {
        if (defined $decompositions[$count])
        {
            print OUT ",\n"
                if ! $first;
            $first = 0;
            printf OUT " { 0x%04x, \"", $count;
            $bytes_out += 2;
            # Each expanded code point is written as two bytes, high
            # byte first.
            foreach $iter (&expand_decomp ($count))
            {
                printf OUT "\\x%02x\\x%02x", $iter / 256, $iter & 0xff;
                $bytes_out += 2;
            }
            # Only a single terminator because one is implied in the string.
            print OUT "\\0\" }";
            $bytes_out += 2;
        }
    }
    print OUT "\n};\n\n";

    print OUT "#endif /* DECOMP_H */\n";

    # Close explicitly so the header is flushed before the next table
    # writer reuses the OUT handle.
    close (OUT);

    printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;
}
|
||||
|
||||
# Write gunibreak.h: the two-level table of line break properties.
#   $last - highest code point to cover
# Exits with status 1 if the output file cannot be created.
sub print_line_break
{
    my ($last) = @_;
    my ($outfile) = "gunibreak.h";
    # Keep the loop counter and the row list private to this function.
    my ($count, @row);

    # "local" rather than "my": print_row adds to $bytes_out as well.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    unless (open (OUT, "> $outfile"))
    {
        print STDERR "Cannot create $outfile: $!\n";
        exit 1;
    }

    print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
    print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

    print OUT "#ifndef BREAKTABLES_H\n";
    print OUT "#define BREAKTABLES_H\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    # Emit the per-page arrays first; print_row returns the expression
    # that goes into the top-level table for that page.
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, '(char *) ', 'char', 1,
                                         'page',
                                         \&fetch_break_type);
    }

    print OUT "static char *break_property_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 4;
    }
    print OUT "\n};\n\n";

    print OUT "#endif /* BREAKTABLES_H */\n";

    close (OUT);

    printf STDERR "Generated %d bytes in break tables\n", $bytes_out;
}
|
||||
|
||||
|
||||
# Fetcher for the break properties table: look up the break property
# recorded in @break_props for the given code point and translate it
# through %break_mappings.
sub fetch_break_type
{
    my ($code) = @_;
    my $property = $break_props[$code];
    return $break_mappings{$property};
}
|
||||
|
||||
# Fetcher for the combining class table: the entry recorded in @cclass
# for the given code point, returned unchanged.
sub fetch_cclass
{
    my ($code) = @_;
    my $class = $cclass[$code];
    return $class;
}
|
||||
|
||||
# Expand the decomposition of a character recursively.
#   $code - code point that has an entry in @decompositions
# Returns the list of code points (numbers) obtained by replacing, in
# order, every element of the decomposition that itself has a
# decomposition by its own expansion.
sub expand_decomp
{
    my ($code) = @_;

    return map {
        my $part = hex ($_);
        defined $decompositions[$part] ? &expand_decomp ($part) : $part;
    } split (' ', $decompositions[$code]);
}
|
7254
glib/gunibreak.h
Normal file
7254
glib/gunibreak.h
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -30,8 +30,11 @@ G_BEGIN_DECLS
|
||||
typedef guint32 gunichar;
|
||||
typedef guint16 gunichar2;
|
||||
|
||||
/* These are the possible character classifications. */
|
||||
typedef enum {
|
||||
/* These are the possible character classifications.
|
||||
* See http://www.unicode.org/Public/UNIDATA/UnicodeData.html
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
G_UNICODE_CONTROL,
|
||||
G_UNICODE_FORMAT,
|
||||
G_UNICODE_UNASSIGNED,
|
||||
@ -64,6 +67,42 @@ typedef enum {
|
||||
G_UNICODE_SPACE_SEPARATOR
|
||||
} GUnicodeType;
|
||||
|
||||
/* These are the possible line break classifications.
 * See http://www.unicode.org/unicode/reports/tr14/
 *
 * The two-letter code next to each value is the key that the
 * table generator maps to that value.
 */
typedef enum
{
  G_UNICODE_BREAK_MANDATORY,          /* BK */
  G_UNICODE_BREAK_CARRIAGE_RETURN,    /* CR */
  G_UNICODE_BREAK_LINE_FEED,          /* LF */
  G_UNICODE_BREAK_COMBINING_MARK,     /* CM */
  G_UNICODE_BREAK_SURROGATE,          /* SG */
  G_UNICODE_BREAK_ZERO_WIDTH_SPACE,   /* ZW */
  G_UNICODE_BREAK_INSEPARABLE,        /* IN */
  G_UNICODE_BREAK_NON_BREAKING_GLUE,  /* GL */
  G_UNICODE_BREAK_CONTINGENT,         /* CB */
  G_UNICODE_BREAK_SPACE,              /* SP */
  G_UNICODE_BREAK_AFTER,              /* BA */
  G_UNICODE_BREAK_BEFORE,             /* BB */
  G_UNICODE_BREAK_BEFORE_AND_AFTER,   /* B2 */
  G_UNICODE_BREAK_HYPHEN,             /* HY */
  G_UNICODE_BREAK_NON_STARTER,        /* NS */
  G_UNICODE_BREAK_OPEN_PUNCTUATION,   /* OP */
  G_UNICODE_BREAK_CLOSE_PUNCTUATION,  /* CL */
  G_UNICODE_BREAK_QUOTATION,          /* QU */
  G_UNICODE_BREAK_EXCLAMATION,        /* EX */
  G_UNICODE_BREAK_IDEOGRAPHIC,        /* ID */
  G_UNICODE_BREAK_NUMERIC,            /* NU */
  G_UNICODE_BREAK_INFIX_SEPARATOR,    /* IS */
  G_UNICODE_BREAK_SYMBOL,             /* SY */
  G_UNICODE_BREAK_ALPHABETIC,         /* AL */
  G_UNICODE_BREAK_PREFIX,             /* PR */
  G_UNICODE_BREAK_POSTFIX,            /* PO */
  G_UNICODE_BREAK_COMPLEX_CONTEXT,    /* SA */
  G_UNICODE_BREAK_AMBIGUOUS,          /* AI */
  G_UNICODE_BREAK_UNKNOWN             /* XX */
} GUnicodeBreakType;
|
||||
|
||||
/* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is
|
||||
* not null, sets *CHARSET to the name of the current locale's
|
||||
* charset. This value is statically allocated.
|
||||
@ -102,6 +141,8 @@ gint g_unichar_xdigit_value (gunichar c) G_GNUC_CONST;
|
||||
/* Return the Unicode character type of a given character. */
|
||||
GUnicodeType g_unichar_type (gunichar c) G_GNUC_CONST;
|
||||
|
||||
/* Return the line break property for a given character */
|
||||
GUnicodeBreakType g_unichar_break_type (gunichar c) G_GNUC_CONST;
|
||||
|
||||
|
||||
/* Compute canonical ordering of a string in-place. This rearranges
|
||||
|
@ -35,7 +35,7 @@
|
||||
: (combining_class_table[Page][Char]))
|
||||
|
||||
#define COMBINING_CLASS(Char) \
|
||||
(((Char) > (UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
|
||||
(((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
|
||||
|
||||
/* Compute the canonical ordering of a string in-place. */
|
||||
void
|
||||
|
2674
glib/gunidecomp.h
2674
glib/gunidecomp.h
File diff suppressed because it is too large
Load Diff
7254
gunibreak.h
Normal file
7254
gunibreak.h
Normal file
File diff suppressed because it is too large
Load Diff
4403
gunichartables.h
4403
gunichartables.h
File diff suppressed because it is too large
Load Diff
45
gunicode.h
45
gunicode.h
@ -30,8 +30,11 @@ G_BEGIN_DECLS
|
||||
typedef guint32 gunichar;
|
||||
typedef guint16 gunichar2;
|
||||
|
||||
/* These are the possible character classifications. */
|
||||
typedef enum {
|
||||
/* These are the possible character classifications.
|
||||
* See http://www.unicode.org/Public/UNIDATA/UnicodeData.html
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
G_UNICODE_CONTROL,
|
||||
G_UNICODE_FORMAT,
|
||||
G_UNICODE_UNASSIGNED,
|
||||
@ -64,6 +67,42 @@ typedef enum {
|
||||
G_UNICODE_SPACE_SEPARATOR
|
||||
} GUnicodeType;
|
||||
|
||||
/* These are the possible line break classifications.
 * See http://www.unicode.org/unicode/reports/tr14/
 *
 * The two-letter code next to each value is the key that the
 * table generator maps to that value.
 */
typedef enum
{
  G_UNICODE_BREAK_MANDATORY,          /* BK */
  G_UNICODE_BREAK_CARRIAGE_RETURN,    /* CR */
  G_UNICODE_BREAK_LINE_FEED,          /* LF */
  G_UNICODE_BREAK_COMBINING_MARK,     /* CM */
  G_UNICODE_BREAK_SURROGATE,          /* SG */
  G_UNICODE_BREAK_ZERO_WIDTH_SPACE,   /* ZW */
  G_UNICODE_BREAK_INSEPARABLE,        /* IN */
  G_UNICODE_BREAK_NON_BREAKING_GLUE,  /* GL */
  G_UNICODE_BREAK_CONTINGENT,         /* CB */
  G_UNICODE_BREAK_SPACE,              /* SP */
  G_UNICODE_BREAK_AFTER,              /* BA */
  G_UNICODE_BREAK_BEFORE,             /* BB */
  G_UNICODE_BREAK_BEFORE_AND_AFTER,   /* B2 */
  G_UNICODE_BREAK_HYPHEN,             /* HY */
  G_UNICODE_BREAK_NON_STARTER,        /* NS */
  G_UNICODE_BREAK_OPEN_PUNCTUATION,   /* OP */
  G_UNICODE_BREAK_CLOSE_PUNCTUATION,  /* CL */
  G_UNICODE_BREAK_QUOTATION,          /* QU */
  G_UNICODE_BREAK_EXCLAMATION,        /* EX */
  G_UNICODE_BREAK_IDEOGRAPHIC,        /* ID */
  G_UNICODE_BREAK_NUMERIC,            /* NU */
  G_UNICODE_BREAK_INFIX_SEPARATOR,    /* IS */
  G_UNICODE_BREAK_SYMBOL,             /* SY */
  G_UNICODE_BREAK_ALPHABETIC,         /* AL */
  G_UNICODE_BREAK_PREFIX,             /* PR */
  G_UNICODE_BREAK_POSTFIX,            /* PO */
  G_UNICODE_BREAK_COMPLEX_CONTEXT,    /* SA */
  G_UNICODE_BREAK_AMBIGUOUS,          /* AI */
  G_UNICODE_BREAK_UNKNOWN             /* XX */
} GUnicodeBreakType;
|
||||
|
||||
/* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is
|
||||
* not null, sets *CHARSET to the name of the current locale's
|
||||
* charset. This value is statically allocated.
|
||||
@ -102,6 +141,8 @@ gint g_unichar_xdigit_value (gunichar c) G_GNUC_CONST;
|
||||
/* Return the Unicode character type of a given character. */
|
||||
GUnicodeType g_unichar_type (gunichar c) G_GNUC_CONST;
|
||||
|
||||
/* Return the line break property for a given character */
|
||||
GUnicodeBreakType g_unichar_break_type (gunichar c) G_GNUC_CONST;
|
||||
|
||||
|
||||
/* Compute canonical ordering of a string in-place. This rearranges
|
||||
|
@ -35,7 +35,7 @@
|
||||
: (combining_class_table[Page][Char]))
|
||||
|
||||
#define COMBINING_CLASS(Char) \
|
||||
(((Char) > (UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
|
||||
(((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
|
||||
|
||||
/* Compute the canonical ordering of a string in-place. */
|
||||
void
|
||||
|
2674
gunidecomp.h
2674
gunidecomp.h
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user