Add Unicode script support

This commit is contained in:
Matthias Clasen 2006-10-09 04:23:58 +00:00
parent 9c19905b0e
commit 68e78574db
9 changed files with 3352 additions and 1 deletions

View File

@ -1,3 +1,21 @@
2006-10-08 Matthias Clasen <mclasen@redhat.com>
Add a way to obtain Unicode script information. (#348348,
Marco Barisione)
* glib/glib.symbols:
* glib/gunicode.h: Add GUnicodeScript enumeration and
g_unichar_get_script.
* glib/guniprop.c: Implement g_unichar_get_script.
* glib/gscripttable.h: Generated private header containing
script tables.
* glib/gen-script-table.pl: Script to generate gscripttable.h.
* glib/Makefile.am: Update
2006-10-08 Matthias Clasen <mclasen@redhat.com>
* tests/run-markup-tests.sh: Small portability fix. (#347944,

View File

@ -1,5 +1,10 @@
2006-10-08 Matthias Clasen <mclasen@redhat.com>
* glib/glib-sections.txt: Add g_unichar_get_script() and
GUnicodeScript.
* glib/tmpl/unicode.sgml: Document GUnicodeScript
* gobject/tmpl/enumerations_flags.sgml: Add a hint about
the requirement that enum and flags values must be static.

View File

@ -2275,6 +2275,8 @@ g_unichar_break_type
g_unicode_canonical_ordering
g_unicode_canonical_decomposition
g_unichar_get_mirror_char
GUnicodeScript
g_unichar_get_script
<SUBSECTION>
g_utf8_next_char

View File

@ -302,7 +302,6 @@ Applications should be ready to handle unknown values.
They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
>http://www.unicode.org/unicode/reports/tr14/</ulink>.
</para>
@G_UNICODE_BREAK_MANDATORY:
@ -380,6 +379,99 @@ See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
@Returns:
<!-- ##### ENUM GUnicodeScript ##### -->
<para>
The #GUnicodeScript enumeration identifies different writing
systems. The values correspond to the names as defined in the
Unicode standard. The enumeration has been added in GLib 2.14.
Note that new values may be added in the future. Applications
should be ready to handle unknown values.
See <ulink
url="http://www.unicode.org/reports/tr24/">Unicode Standard Annex
#24: Script names</ulink>.
</para>
@G_UNICODE_SCRIPT_INVALID_CODE: a value never returned from g_unichar_get_script()
@G_UNICODE_SCRIPT_COMMON: a character used by multiple different scripts
@G_UNICODE_SCRIPT_INHERITED: a mark glyph that takes its script from the
base glyph to which it is attached
@G_UNICODE_SCRIPT_ARABIC: Arabic
@G_UNICODE_SCRIPT_ARMENIAN: Armenian
@G_UNICODE_SCRIPT_BENGALI: Bengali
@G_UNICODE_SCRIPT_BOPOMOFO: Bopomofo
@G_UNICODE_SCRIPT_CHEROKEE: Cherokee
@G_UNICODE_SCRIPT_COPTIC: Coptic
@G_UNICODE_SCRIPT_CYRILLIC: Cyrillic
@G_UNICODE_SCRIPT_DESERET: Deseret
@G_UNICODE_SCRIPT_DEVANAGARI: Devanagari
@G_UNICODE_SCRIPT_ETHIOPIC: Ethiopic
@G_UNICODE_SCRIPT_GEORGIAN: Georgian
@G_UNICODE_SCRIPT_GOTHIC: Gothic
@G_UNICODE_SCRIPT_GREEK: Greek
@G_UNICODE_SCRIPT_GUJARATI: Gujarati
@G_UNICODE_SCRIPT_GURMUKHI: Gurmukhi
@G_UNICODE_SCRIPT_HAN: Han
@G_UNICODE_SCRIPT_HANGUL: Hangul
@G_UNICODE_SCRIPT_HEBREW: Hebrew
@G_UNICODE_SCRIPT_HIRAGANA: Hiragana
@G_UNICODE_SCRIPT_KANNADA: Kannada
@G_UNICODE_SCRIPT_KATAKANA: Katakana
@G_UNICODE_SCRIPT_KHMER: Khmer
@G_UNICODE_SCRIPT_LAO: Lao
@G_UNICODE_SCRIPT_LATIN: Latin
@G_UNICODE_SCRIPT_MALAYALAM: Malayalam
@G_UNICODE_SCRIPT_MONGOLIAN: Mongolian
@G_UNICODE_SCRIPT_MYANMAR: Myanmar
@G_UNICODE_SCRIPT_OGHAM: Ogham
@G_UNICODE_SCRIPT_OLD_ITALIC: Old Italic
@G_UNICODE_SCRIPT_ORIYA: Oriya
@G_UNICODE_SCRIPT_RUNIC: Runic
@G_UNICODE_SCRIPT_SINHALA: Sinhala
@G_UNICODE_SCRIPT_SYRIAC: Syriac
@G_UNICODE_SCRIPT_TAMIL: Tamil
@G_UNICODE_SCRIPT_TELUGU: Telugu
@G_UNICODE_SCRIPT_THAANA: Thaana
@G_UNICODE_SCRIPT_THAI: Thai
@G_UNICODE_SCRIPT_TIBETAN: Tibetan
@G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL:
Canadian Aboriginal
@G_UNICODE_SCRIPT_YI: Yi
@G_UNICODE_SCRIPT_TAGALOG: Tagalog
@G_UNICODE_SCRIPT_HANUNOO: Hanunoo
@G_UNICODE_SCRIPT_BUHID: Buhid
@G_UNICODE_SCRIPT_TAGBANWA: Tagbanwa
@G_UNICODE_SCRIPT_BRAILLE: Braille
@G_UNICODE_SCRIPT_CYPRIOT: Cypriot
@G_UNICODE_SCRIPT_LIMBU: Limbu
@G_UNICODE_SCRIPT_OSMANYA: Osmanya
@G_UNICODE_SCRIPT_SHAVIAN: Shavian
@G_UNICODE_SCRIPT_LINEAR_B: Linear B
@G_UNICODE_SCRIPT_TAI_LE: Tai Le
@G_UNICODE_SCRIPT_UGARITIC: Ugaritic
@G_UNICODE_SCRIPT_NEW_TAI_LUE: New Tai Lue
@G_UNICODE_SCRIPT_BUGINESE: Buginese
@G_UNICODE_SCRIPT_GLAGOLITIC: Glagolitic
@G_UNICODE_SCRIPT_TIFINAGH: Tifinagh
@G_UNICODE_SCRIPT_SYLOTI_NAGRI: Syloti Nagri
@G_UNICODE_SCRIPT_OLD_PERSIAN: Old Persian
@G_UNICODE_SCRIPT_KHAROSHTHI: Kharoshthi
@G_UNICODE_SCRIPT_UNKNOWN: an unassigned code point
@G_UNICODE_SCRIPT_BALINESE: Balinese
@G_UNICODE_SCRIPT_CUNEIFORM: Cuneiform
@G_UNICODE_SCRIPT_PHOENICIAN: Phoenician
@G_UNICODE_SCRIPT_PHAGS_PA: Phags-pa
@G_UNICODE_SCRIPT_NKO: N'Ko
<!-- ##### FUNCTION g_unichar_get_script ##### -->
<para>
</para>
@ch:
@Returns:
<!-- ##### MACRO g_utf8_next_char ##### -->
<para>
Skips to the next character in a UTF-8 string. The string must be

119
glib/gen-script-table.pl Executable file
View File

@ -0,0 +1,119 @@
#!/usr/bin/perl -w
#
# Script to convert http://www.unicode.org/Public/UNIDATA/Scripts.txt
# into a machine-readable table.
#
######################################################################
use strict;
use warnings;

# Reads the Unicode Scripts.txt data file named on the command line and
# prints a C header (gscripttable.h) on standard output containing:
#
#   * g_script_easy_table[]: one script value per code point for every
#     code point below G_EASY_SCRIPTS_RANGE, for direct indexing;
#   * g_script_table[]: sorted { start, chars, script } runs covering the
#     remaining code points, for binary search.
#
# Script identifiers are emitted with the G_UNICODE_SCRIPT_ prefix so that
# they name members of the GUnicodeScript enumeration.

if (@ARGV != 1) {
    die "Usage: gen-script-table.pl Scripts.txt > gscripttable.h\n";
}

my $input = $ARGV[0];

# Three-argument open with low-precedence "or": with "||" the die would
# bind to the file name rather than to open's result and never fire.
open my $in, '<', $input or die "Cannot open $input: $!\n";

my @ranges;    # each element: [ first code point, last code point, SCRIPT ]
my $file;      # versioned data file name taken from the header comment

while (my $line = <$in>) {
    # The first comment line of Scripts.txt names the versioned file,
    # e.g. "# Scripts-5.0.0.txt".
    if ($line =~ /^\#\s+(Scripts-.*\.txt)/) {
        $file = $1;
    }

    # Drop comments, then skip lines that are empty as a result.
    $line =~ s/#.*//;
    next if $line =~ /^\s*$/;

    # Data lines look like "0041..005A ; Latin" or "00AA ; Latin".
    if ($line !~ /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) {
        die "Cannot parse line: '$line'\n";
    }

    my $first = hex $1;
    my $last  = defined $2 ? hex $2 : $first;
    push @ranges, [ $first, $last, uc $3 ];
}

close $in;

# Fall back to the input path when the data file carries no version header,
# so the generated comment never interpolates an undefined value.
$file = $input unless defined $file;

@ranges = sort { $a->[0] <=> $b->[0] } @ranges;

my $date = gmtime;

print <<"EOT";
/* gscripttable.h: Generated by gen-script-table.pl
*
* Date: $date
* Source: $file
*
* Do not edit.
*/
EOT

my $easy_range = 0x2000;

print <<"EOT";
#define G_EASY_SCRIPTS_RANGE $easy_range
static const guchar g_script_easy_table[$easy_range] = {
EOT

# Walk the sorted ranges in step with the code points below $easy_range.
# $next is the index of the first range that has not been loaded yet.
my $next = 0;
my ($start, $end, $script) = (0, -1, undef);

for my $c (0 .. $easy_range - 1) {
    print "\n " if $c % 3 == 0;

    # Load the following range once the current one is exhausted; the
    # bound check keeps us from reading past the end of @ranges.
    if ($c > $end && $next <= $#ranges) {
        ($start, $end, $script) = @{ $ranges[$next] };
        $next++;
    }

    # Code points in a gap between ranges (or after the last range) have
    # no script assigned.
    if ($c < $start || $c > $end) {
        print " G_UNICODE_SCRIPT_UNKNOWN,";
    }
    else {
        printf " G_UNICODE_SCRIPT_%s,", $script;
    }
}

# A range straddling the boundary is only partly covered by the easy
# table: trim its start and hand the remainder to the search table.
if ($end >= $easy_range) {
    $next--;
    $ranges[$next]->[0] = $easy_range;
}

print <<"EOT";
};
static const struct {
gunichar start;
guint16 chars;
guint16 script;
} g_script_table[] = {
EOT

my $i = $next;
while ($i <= $#ranges) {
    ($start, $end, $script) = @{ $ranges[$i] };

    # Coalesce directly adjacent ranges that share the same script.
    while ($i < $#ranges
        && $ranges[ $i + 1 ]->[0] == $end + 1
        && $ranges[ $i + 1 ]->[2] eq $script)
    {
        $i++;
        $end = $ranges[$i]->[1];
    }

    # The "chars" field of the emitted struct is a guint16.
    my $count = $end - $start + 1;
    if ($count > 0xFFFF) {
        die sprintf("Range starting at %#06x has %d characters, "
                  . "too many for a guint16\n", $start, $count);
    }

    printf " { %#06x, %5d, G_UNICODE_SCRIPT_%s },\n", $start, $count, $script;
    $i++;
}

print "};\n";

View File

@ -1256,6 +1256,7 @@ g_unichar_tolower G_GNUC_CONST
g_unichar_totitle G_GNUC_CONST
g_unichar_toupper G_GNUC_CONST
g_unichar_get_mirror_char
g_unichar_get_script
g_unichar_digit_value G_GNUC_CONST
g_unichar_xdigit_value G_GNUC_CONST
g_unichar_type G_GNUC_CONST

2982
glib/gscripttable.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -112,6 +112,84 @@ typedef enum
G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE
} GUnicodeBreakType;
/* GUnicodeScript identifies the writing system a character belongs to;
 * the enumerator names follow the script names of the Unicode standard
 * (Unicode Standard Annex #24). The trailing comment on each enumerator
 * gives the corresponding ISO 15924 four-letter code.
 *
 * G_UNICODE_SCRIPT_INVALID_CODE (-1) is never returned by
 * g_unichar_get_script(); G_UNICODE_SCRIPT_UNKNOWN denotes an unassigned
 * code point.
 *
 * Only the first two enumerators carry explicit values; every other value
 * is implied by its position, so inserting or reordering enumerators
 * changes the numeric value of all that follow. Note that
 * G_UNICODE_SCRIPT_UNKNOWN sits between the Unicode-4.1 and Unicode-5.0
 * groups rather than at the end.
 */
typedef enum
{ /* ISO 15924 code */
G_UNICODE_SCRIPT_INVALID_CODE = -1,
G_UNICODE_SCRIPT_COMMON = 0, /* Zyyy */
G_UNICODE_SCRIPT_INHERITED, /* Qaai */
G_UNICODE_SCRIPT_ARABIC, /* Arab */
G_UNICODE_SCRIPT_ARMENIAN, /* Armn */
G_UNICODE_SCRIPT_BENGALI, /* Beng */
G_UNICODE_SCRIPT_BOPOMOFO, /* Bopo */
G_UNICODE_SCRIPT_CHEROKEE, /* Cher */
G_UNICODE_SCRIPT_COPTIC, /* Qaac */
G_UNICODE_SCRIPT_CYRILLIC, /* Cyrl (Cyrs) */
G_UNICODE_SCRIPT_DESERET, /* Dsrt */
G_UNICODE_SCRIPT_DEVANAGARI, /* Deva */
G_UNICODE_SCRIPT_ETHIOPIC, /* Ethi */
G_UNICODE_SCRIPT_GEORGIAN, /* Geor (Geon, Geoa) */
G_UNICODE_SCRIPT_GOTHIC, /* Goth */
G_UNICODE_SCRIPT_GREEK, /* Grek */
G_UNICODE_SCRIPT_GUJARATI, /* Gujr */
G_UNICODE_SCRIPT_GURMUKHI, /* Guru */
G_UNICODE_SCRIPT_HAN, /* Hani */
G_UNICODE_SCRIPT_HANGUL, /* Hang */
G_UNICODE_SCRIPT_HEBREW, /* Hebr */
G_UNICODE_SCRIPT_HIRAGANA, /* Hira */
G_UNICODE_SCRIPT_KANNADA, /* Knda */
G_UNICODE_SCRIPT_KATAKANA, /* Kana */
G_UNICODE_SCRIPT_KHMER, /* Khmr */
G_UNICODE_SCRIPT_LAO, /* Laoo */
G_UNICODE_SCRIPT_LATIN, /* Latn (Latf, Latg) */
G_UNICODE_SCRIPT_MALAYALAM, /* Mlym */
G_UNICODE_SCRIPT_MONGOLIAN, /* Mong */
G_UNICODE_SCRIPT_MYANMAR, /* Mymr */
G_UNICODE_SCRIPT_OGHAM, /* Ogam */
G_UNICODE_SCRIPT_OLD_ITALIC, /* Ital */
G_UNICODE_SCRIPT_ORIYA, /* Orya */
G_UNICODE_SCRIPT_RUNIC, /* Runr */
G_UNICODE_SCRIPT_SINHALA, /* Sinh */
G_UNICODE_SCRIPT_SYRIAC, /* Syrc (Syrj, Syrn, Syre) */
G_UNICODE_SCRIPT_TAMIL, /* Taml */
G_UNICODE_SCRIPT_TELUGU, /* Telu */
G_UNICODE_SCRIPT_THAANA, /* Thaa */
G_UNICODE_SCRIPT_THAI, /* Thai */
G_UNICODE_SCRIPT_TIBETAN, /* Tibt */
G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL, /* Cans */
G_UNICODE_SCRIPT_YI, /* Yiii */
G_UNICODE_SCRIPT_TAGALOG, /* Tglg */
G_UNICODE_SCRIPT_HANUNOO, /* Hano */
G_UNICODE_SCRIPT_BUHID, /* Buhd */
G_UNICODE_SCRIPT_TAGBANWA, /* Tagb */
/* Unicode-4.0 additions */
G_UNICODE_SCRIPT_BRAILLE, /* Brai */
G_UNICODE_SCRIPT_CYPRIOT, /* Cprt */
G_UNICODE_SCRIPT_LIMBU, /* Limb */
G_UNICODE_SCRIPT_OSMANYA, /* Osma */
G_UNICODE_SCRIPT_SHAVIAN, /* Shaw */
G_UNICODE_SCRIPT_LINEAR_B, /* Linb */
G_UNICODE_SCRIPT_TAI_LE, /* Tale */
G_UNICODE_SCRIPT_UGARITIC, /* Ugar */
/* Unicode-4.1 additions */
G_UNICODE_SCRIPT_NEW_TAI_LUE, /* Talu */
G_UNICODE_SCRIPT_BUGINESE, /* Bugi */
G_UNICODE_SCRIPT_GLAGOLITIC, /* Glag */
G_UNICODE_SCRIPT_TIFINAGH, /* Tfng */
G_UNICODE_SCRIPT_SYLOTI_NAGRI, /* Sylo */
G_UNICODE_SCRIPT_OLD_PERSIAN, /* Xpeo */
G_UNICODE_SCRIPT_KHAROSHTHI, /* Khar */
/* Unicode-5.0 additions */
G_UNICODE_SCRIPT_UNKNOWN, /* Zzzz */
G_UNICODE_SCRIPT_BALINESE, /* Bali */
G_UNICODE_SCRIPT_CUNEIFORM, /* Xsux */
G_UNICODE_SCRIPT_PHOENICIAN, /* Phnx */
G_UNICODE_SCRIPT_PHAGS_PA, /* Phag */
G_UNICODE_SCRIPT_NKO /* Nkoo */
} GUnicodeScript;
/* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is
* not null, sets *CHARSET to the name of the current locale's
* charset. This value is statically allocated, and should be copied
@ -292,6 +370,9 @@ gchar *g_utf8_collate_key_for_filename (const gchar *str,
gboolean g_unichar_get_mirror_char (gunichar ch,
gunichar *mirrored_ch);
GUnicodeScript g_unichar_get_script (gunichar ch);
/* private */
gchar *_g_utf8_make_valid (const gchar *name);

View File

@ -29,6 +29,7 @@
#include "glib.h"
#include "gunichartables.h"
#include "gmirroringtable.h"
#include "gscripttable.h"
#include "gunicodeprivate.h"
#include "galias.h"
@ -1183,5 +1184,55 @@ g_unichar_get_mirror_char (gunichar ch,
}
#define G_SCRIPT_TABLE_MIDPOINT (G_N_ELEMENTS (g_script_table) / 2)
/* Binary search of g_script_table for the run containing @ch.
 *
 * Each table entry covers the code points from .start up to, but not
 * including, .start + .chars. The index of the most recent hit is kept in
 * a function-local static and used as the first probe of the next call.
 *
 * Returns the script of the matching run, or G_UNICODE_SCRIPT_UNKNOWN when
 * no run contains @ch.
 */
static inline GUnicodeScript
g_unichar_get_script_bsearch (gunichar ch)
{
  static int saved_mid = G_SCRIPT_TABLE_MIDPOINT;

  int lo = 0;
  int hi = G_N_ELEMENTS (g_script_table) - 1;
  int probe = saved_mid;

  for (;;)
    {
      if (ch < g_script_table[probe].start)
        {
          /* @ch lies before this run: continue in the lower half. */
          hi = probe - 1;
        }
      else if (ch >= g_script_table[probe].start + g_script_table[probe].chars)
        {
          /* @ch lies after this run: continue in the upper half. */
          lo = probe + 1;
        }
      else
        {
          /* Hit: remember where we found it for the next lookup. */
          saved_mid = probe;
          return g_script_table[probe].script;
        }

      if (lo > hi)
        break;

      probe = (lo + hi) / 2;
    }

  return G_UNICODE_SCRIPT_UNKNOWN;
}
/**
 * g_unichar_get_script:
 * @ch: a Unicode character
 *
 * Determines which #GUnicodeScript a character belongs to, following
 * Unicode Standard Annex #24. The function does not verify that @ch is
 * a valid Unicode character; if you pass in an invalid character, the
 * result is undefined.
 *
 * Return value: the #GUnicodeScript for the character.
 *
 * Since: 2.14
 */
GUnicodeScript
g_unichar_get_script (gunichar ch)
{
  /* Code points at or above G_EASY_SCRIPTS_RANGE are not in the directly
   * indexed table and have to be located by binary search. */
  if (ch >= G_EASY_SCRIPTS_RANGE)
    return g_unichar_get_script_bsearch (ch);

  return g_script_easy_table[ch];
}
#define __G_UNIPROP_C__
#include "galiasdef.c"