mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-01-26 22:16:16 +01:00
Add Unicode script support
This commit is contained in:
parent
9c19905b0e
commit
68e78574db
18
ChangeLog
18
ChangeLog
@ -1,3 +1,21 @@
|
||||
2006-10-08 Matthias Clasen <mclasen@redhat.com>
|
||||
|
||||
Add a way to obtain Unicode script information. (#348348,
|
||||
Marco Barisione)
|
||||
|
||||
* glib/glib.symbols:
|
||||
* glib/gunicode.h: Add GUnicodeScript enumeration and
|
||||
g_unichar_get_script.
|
||||
|
||||
* glib/guniprop.c: Implement g_unichar_get_script.
|
||||
|
||||
* glib/gscripttable.h: Generated private header containing
|
||||
script tables.
|
||||
|
||||
* glib/gen-script-table.pl: Script to generate gscripttable.h.
|
||||
|
||||
* glib/Makefile.am: Update
|
||||
|
||||
2006-10-08 Matthias Clasen <mclasen@redhat.com>
|
||||
|
||||
* tests/run-markup-tests.sh: Small portability fix. (#347944,
|
||||
|
@ -1,5 +1,10 @@
|
||||
2006-10-08 Matthias Clasen <mclasen@redhat.com>
|
||||
|
||||
* glib/glib-sections.txt: Add g_unichar_get_script() and
|
||||
GUnicodeScript.
|
||||
|
||||
* glib/tmpl/unicode.sgml: Document GUnicodeScript
|
||||
|
||||
* gobject/tmpl/enumerations_flags.sgml: Add a hint about
|
||||
the requirement that enum and flags values must be static.
|
||||
|
||||
|
@ -2275,6 +2275,8 @@ g_unichar_break_type
|
||||
g_unicode_canonical_ordering
|
||||
g_unicode_canonical_decomposition
|
||||
g_unichar_get_mirror_char
|
||||
GUnicodeScript
|
||||
g_unichar_get_script
|
||||
|
||||
<SUBSECTION>
|
||||
g_utf8_next_char
|
||||
|
@ -302,7 +302,6 @@ Applications should be ready to handle unknown values.
|
||||
They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
|
||||
See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
|
||||
>http://www.unicode.org/unicode/reports/tr14/</ulink>.
|
||||
|
||||
</para>
|
||||
|
||||
@G_UNICODE_BREAK_MANDATORY:
|
||||
@ -380,6 +379,99 @@ See <ulink url="http://www.unicode.org/unicode/reports/tr14/"
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### ENUM GUnicodeScript ##### -->
|
||||
<para>
|
||||
The #GUnicodeScript enumeration identifies different writing
|
||||
systems. The values correspond to the names as defined in the
|
||||
Unicode standard. The enumeration has been added in GLib 2.14.
|
||||
Note that new script values may be added in the future. Applications
|
||||
should be ready to handle unknown values.
|
||||
See <ulink
|
||||
url="http://www.unicode.org/reports/tr24/">Unicode Standard Annex
|
||||
#24: Script names</ulink>.
|
||||
</para>
|
||||
|
||||
@G_UNICODE_SCRIPT_INVALID_CODE: a value never returned from g_unichar_get_script()
|
||||
@G_UNICODE_SCRIPT_COMMON: a character used by multiple different scripts
|
||||
@G_UNICODE_SCRIPT_INHERITED: a mark glyph that takes its script from the
|
||||
base glyph to which it is attached
|
||||
@G_UNICODE_SCRIPT_ARABIC: Arabic
|
||||
@G_UNICODE_SCRIPT_ARMENIAN: Armenian
|
||||
@G_UNICODE_SCRIPT_BENGALI: Bengali
|
||||
@G_UNICODE_SCRIPT_BOPOMOFO: Bopomofo
|
||||
@G_UNICODE_SCRIPT_CHEROKEE: Cherokee
|
||||
@G_UNICODE_SCRIPT_COPTIC: Coptic
|
||||
@G_UNICODE_SCRIPT_CYRILLIC: Cyrillic
|
||||
@G_UNICODE_SCRIPT_DESERET: Deseret
|
||||
@G_UNICODE_SCRIPT_DEVANAGARI: Devanagari
|
||||
@G_UNICODE_SCRIPT_ETHIOPIC: Ethiopic
|
||||
@G_UNICODE_SCRIPT_GEORGIAN: Georgian
|
||||
@G_UNICODE_SCRIPT_GOTHIC: Gothic
|
||||
@G_UNICODE_SCRIPT_GREEK: Greek
|
||||
@G_UNICODE_SCRIPT_GUJARATI: Gujarati
|
||||
@G_UNICODE_SCRIPT_GURMUKHI: Gurmukhi
|
||||
@G_UNICODE_SCRIPT_HAN: Han
|
||||
@G_UNICODE_SCRIPT_HANGUL: Hangul
|
||||
@G_UNICODE_SCRIPT_HEBREW: Hebrew
|
||||
@G_UNICODE_SCRIPT_HIRAGANA: Hiragana
|
||||
@G_UNICODE_SCRIPT_KANNADA: Kannada
|
||||
@G_UNICODE_SCRIPT_KATAKANA: Katakana
|
||||
@G_UNICODE_SCRIPT_KHMER: Khmer
|
||||
@G_UNICODE_SCRIPT_LAO: Lao
|
||||
@G_UNICODE_SCRIPT_LATIN: Latin
|
||||
@G_UNICODE_SCRIPT_MALAYALAM: Malayalam
|
||||
@G_UNICODE_SCRIPT_MONGOLIAN: Mongolian
|
||||
@G_UNICODE_SCRIPT_MYANMAR: Myanmar
|
||||
@G_UNICODE_SCRIPT_OGHAM: Ogham
|
||||
@G_UNICODE_SCRIPT_OLD_ITALIC: Old Italic
|
||||
@G_UNICODE_SCRIPT_ORIYA: Oriya
|
||||
@G_UNICODE_SCRIPT_RUNIC: Runic
|
||||
@G_UNICODE_SCRIPT_SINHALA: Sinhala
|
||||
@G_UNICODE_SCRIPT_SYRIAC: Syriac
|
||||
@G_UNICODE_SCRIPT_TAMIL: Tamil
|
||||
@G_UNICODE_SCRIPT_TELUGU: Telugu
|
||||
@G_UNICODE_SCRIPT_THAANA: Thaana
|
||||
@G_UNICODE_SCRIPT_THAI: Thai
|
||||
@G_UNICODE_SCRIPT_TIBETAN: Tibetan
|
||||
@G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL:
|
||||
Canadian Aboriginal
|
||||
@G_UNICODE_SCRIPT_YI: Yi
|
||||
@G_UNICODE_SCRIPT_TAGALOG: Tagalog
|
||||
@G_UNICODE_SCRIPT_HANUNOO: Hanunoo
|
||||
@G_UNICODE_SCRIPT_BUHID: Buhid
|
||||
@G_UNICODE_SCRIPT_TAGBANWA: Tagbanwa
|
||||
@G_UNICODE_SCRIPT_BRAILLE: Braille
|
||||
@G_UNICODE_SCRIPT_CYPRIOT: Cypriot
|
||||
@G_UNICODE_SCRIPT_LIMBU: Limbu
|
||||
@G_UNICODE_SCRIPT_OSMANYA: Osmanya
|
||||
@G_UNICODE_SCRIPT_SHAVIAN: Shavian
|
||||
@G_UNICODE_SCRIPT_LINEAR_B: Linear B
|
||||
@G_UNICODE_SCRIPT_TAI_LE: Tai Le
|
||||
@G_UNICODE_SCRIPT_UGARITIC: Ugaritic
|
||||
@G_UNICODE_SCRIPT_NEW_TAI_LUE: New Tai Lue
|
||||
@G_UNICODE_SCRIPT_BUGINESE: Buginese
|
||||
@G_UNICODE_SCRIPT_GLAGOLITIC: Glagolitic
|
||||
@G_UNICODE_SCRIPT_TIFINAGH: Tifinagh
|
||||
@G_UNICODE_SCRIPT_SYLOTI_NAGRI: Syloti Nagri
|
||||
@G_UNICODE_SCRIPT_OLD_PERSIAN: Old Persian
|
||||
@G_UNICODE_SCRIPT_KHAROSHTHI: Kharoshthi
|
||||
@G_UNICODE_SCRIPT_UNKNOWN: an unassigned code point
|
||||
@G_UNICODE_SCRIPT_BALINESE: Balinese
|
||||
@G_UNICODE_SCRIPT_CUNEIFORM: Cuneiform
|
||||
@G_UNICODE_SCRIPT_PHOENICIAN: Phoenician
|
||||
@G_UNICODE_SCRIPT_PHAGS_PA: Phags-pa
|
||||
@G_UNICODE_SCRIPT_NKO: N'Ko
|
||||
|
||||
|
||||
<!-- ##### FUNCTION g_unichar_get_script ##### -->
|
||||
<para>
|
||||
|
||||
</para>
|
||||
|
||||
@ch:
|
||||
@Returns:
|
||||
|
||||
|
||||
<!-- ##### MACRO g_utf8_next_char ##### -->
|
||||
<para>
|
||||
Skips to the next character in a UTF-8 string. The string must be
|
||||
|
119
glib/gen-script-table.pl
Executable file
119
glib/gen-script-table.pl
Executable file
@ -0,0 +1,119 @@
|
||||
#!/usr/bin/perl -w
#
# Script to convert http://www.unicode.org/Public/UNIDATA/Scripts.txt
# into a machine-readable table.
#
# Usage: gen-script-table.pl Scripts.txt > gscripttable.h
#
# The generated header contains two tables:
#   g_script_easy_table[]  one script value per code point below
#                          G_EASY_SCRIPTS_RANGE, indexed directly by
#                          code point;
#   g_script_table[]       { start, chars, script } records, sorted by
#                          start, covering the remaining code points.
#
######################################################################

if (@ARGV != 1) {
    die "Usage: gen-script-table.pl Scripts.txt > gscripttable.h\n";
}

# "or" is required here: "||" binds tighter than the list operator and
# would test $ARGV[0] instead of the result of open, so a failed open
# would never be reported.
open(IN, '<', $ARGV[0]) or die "Cannot open $ARGV[0]: $!\n";

my @ranges;
my $file;
my $easy_range;
my $i;
my $start;
my $end;
my $script;
my $date;

while (<IN>) {
    # Remember the file name announced in a "# Scripts-*.txt" header
    # comment; it is echoed in the banner of the generated file.
    if (/^\#\s+(Scripts-.*\.txt)/) {
        $file = $1;
    }

    # Strip comments and skip lines that are empty afterwards.
    s/#.*//;
    next if /^\s*$/;

    # Data lines look like "XXXX ; Name" or "XXXX..YYYY ; Name".
    if (!/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)\s*$/) {
        die "Cannot parse line: '$_'\n";
    }

    # Each entry is [ first code point, last code point, SCRIPT_NAME ].
    if (defined $2) {
        push @ranges, [ hex $1, hex $2, uc $3 ];
    } else {
        push @ranges, [ hex $1, hex $1, uc $3 ];
    }
}

close IN;

# The table loops below index into @ranges unconditionally.
die "No script ranges found in $ARGV[0]\n" unless @ranges;

@ranges = sort { $a->[0] <=> $b->[0] } @ranges;
$date = gmtime;

print <<"EOT";
/* gscripttable.h: Generated by gen-script-table.pl
 *
 * Date: $date
 * Source: $file
 *
 * Do not edit.
 */

EOT

$easy_range = 0x2000;

print <<"EOT";
#define G_EASY_SCRIPTS_RANGE $easy_range

static const guchar g_script_easy_table[$easy_range] = {
EOT

$i = 0;
$end = -1;

# Emit one entry per code point below $easy_range, three per line.
for (my $c = 0; $c < $easy_range; $c++) {

    if ($c % 3 == 0) {
        printf "\n ";
    }

    # Advance to the next range once the current one is exhausted.
    if ($c > $end) {
        $start = $ranges[$i]->[0];
        $end = $ranges[$i]->[1];
        $script = $ranges[$i]->[2];
        $i++;
    }

    # Code points that fall in a gap before the current range have no
    # script assigned.  The emitted names use the G_UNICODE_SCRIPT_
    # prefix so that they match the GUnicodeScript enumerators.
    if ($c < $start) {
        printf " G_UNICODE_SCRIPT_UNKNOWN,";
    } else {
        printf " G_UNICODE_SCRIPT_%s,", $script;
    }
}

# If the last range consumed extends past the easy table, step back and
# clip its start so that the remainder is emitted in g_script_table.
if ($end >= $easy_range) {
    $i--;
    $ranges[$i]->[0] = $easy_range;
}


print <<"EOT";

};

static const struct {
    gunichar start;
    guint16 chars;
    guint16 script;
} g_script_table[] = {
EOT

for (; $i <= $#ranges; $i++) {
    $start = $ranges[$i]->[0];
    $end = $ranges[$i]->[1];
    $script = $ranges[$i]->[2];

    # Merge directly adjacent ranges that belong to the same script.
    while ($i <= $#ranges - 1 &&
           $ranges[$i + 1]->[0] == $end + 1 &&
           $ranges[$i + 1]->[2] eq $script) {
        $i++;
        $end = $ranges[$i]->[1];
    }

    printf " { %#06x, %5d, G_UNICODE_SCRIPT_%s },\n", $start, $end - $start + 1, $script;
}

printf "};\n";
|
||||
|
@ -1256,6 +1256,7 @@ g_unichar_tolower G_GNUC_CONST
|
||||
g_unichar_totitle G_GNUC_CONST
|
||||
g_unichar_toupper G_GNUC_CONST
|
||||
g_unichar_get_mirror_char
|
||||
g_unichar_get_script
|
||||
g_unichar_digit_value G_GNUC_CONST
|
||||
g_unichar_xdigit_value G_GNUC_CONST
|
||||
g_unichar_type G_GNUC_CONST
|
||||
|
2982
glib/gscripttable.h
Normal file
2982
glib/gscripttable.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -112,6 +112,84 @@ typedef enum
|
||||
G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE
|
||||
} GUnicodeBreakType;
|
||||
|
||||
/* Identifies the writing system (Unicode script) of a character.
 *
 * The trailing comment on each enumerator gives the corresponding
 * ISO 15924 code.  G_UNICODE_SCRIPT_INVALID_CODE is the only negative
 * value; all other enumerators are numbered consecutively starting at
 * G_UNICODE_SCRIPT_COMMON = 0.  Enumerators are grouped by the Unicode
 * version that introduced them, with newer ones appended at the end.
 *
 * NOTE(review): the numeric values are presumably relied upon by the
 * generated script tables and by callers, so existing enumerators
 * should not be reordered -- confirm before changing the order.
 */
typedef enum
{                         /* ISO 15924 code */
  G_UNICODE_SCRIPT_INVALID_CODE = -1,
  G_UNICODE_SCRIPT_COMMON       = 0,   /* Zyyy */
  G_UNICODE_SCRIPT_INHERITED,          /* Qaai */
  G_UNICODE_SCRIPT_ARABIC,             /* Arab */
  G_UNICODE_SCRIPT_ARMENIAN,           /* Armn */
  G_UNICODE_SCRIPT_BENGALI,            /* Beng */
  G_UNICODE_SCRIPT_BOPOMOFO,           /* Bopo */
  G_UNICODE_SCRIPT_CHEROKEE,           /* Cher */
  G_UNICODE_SCRIPT_COPTIC,             /* Qaac */
  G_UNICODE_SCRIPT_CYRILLIC,           /* Cyrl (Cyrs) */
  G_UNICODE_SCRIPT_DESERET,            /* Dsrt */
  G_UNICODE_SCRIPT_DEVANAGARI,         /* Deva */
  G_UNICODE_SCRIPT_ETHIOPIC,           /* Ethi */
  G_UNICODE_SCRIPT_GEORGIAN,           /* Geor (Geon, Geoa) */
  G_UNICODE_SCRIPT_GOTHIC,             /* Goth */
  G_UNICODE_SCRIPT_GREEK,              /* Grek */
  G_UNICODE_SCRIPT_GUJARATI,           /* Gujr */
  G_UNICODE_SCRIPT_GURMUKHI,           /* Guru */
  G_UNICODE_SCRIPT_HAN,                /* Hani */
  G_UNICODE_SCRIPT_HANGUL,             /* Hang */
  G_UNICODE_SCRIPT_HEBREW,             /* Hebr */
  G_UNICODE_SCRIPT_HIRAGANA,           /* Hira */
  G_UNICODE_SCRIPT_KANNADA,            /* Knda */
  G_UNICODE_SCRIPT_KATAKANA,           /* Kana */
  G_UNICODE_SCRIPT_KHMER,              /* Khmr */
  G_UNICODE_SCRIPT_LAO,                /* Laoo */
  G_UNICODE_SCRIPT_LATIN,              /* Latn (Latf, Latg) */
  G_UNICODE_SCRIPT_MALAYALAM,          /* Mlym */
  G_UNICODE_SCRIPT_MONGOLIAN,          /* Mong */
  G_UNICODE_SCRIPT_MYANMAR,            /* Mymr */
  G_UNICODE_SCRIPT_OGHAM,              /* Ogam */
  G_UNICODE_SCRIPT_OLD_ITALIC,         /* Ital */
  G_UNICODE_SCRIPT_ORIYA,              /* Orya */
  G_UNICODE_SCRIPT_RUNIC,              /* Runr */
  G_UNICODE_SCRIPT_SINHALA,            /* Sinh */
  G_UNICODE_SCRIPT_SYRIAC,             /* Syrc (Syrj, Syrn, Syre) */
  G_UNICODE_SCRIPT_TAMIL,              /* Taml */
  G_UNICODE_SCRIPT_TELUGU,             /* Telu */
  G_UNICODE_SCRIPT_THAANA,             /* Thaa */
  G_UNICODE_SCRIPT_THAI,               /* Thai */
  G_UNICODE_SCRIPT_TIBETAN,            /* Tibt */
  G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL, /* Cans */
  G_UNICODE_SCRIPT_YI,                 /* Yiii */
  G_UNICODE_SCRIPT_TAGALOG,            /* Tglg */
  G_UNICODE_SCRIPT_HANUNOO,            /* Hano */
  G_UNICODE_SCRIPT_BUHID,              /* Buhd */
  G_UNICODE_SCRIPT_TAGBANWA,           /* Tagb */

  /* Unicode-4.0 additions */
  G_UNICODE_SCRIPT_BRAILLE,            /* Brai */
  G_UNICODE_SCRIPT_CYPRIOT,            /* Cprt */
  G_UNICODE_SCRIPT_LIMBU,              /* Limb */
  G_UNICODE_SCRIPT_OSMANYA,            /* Osma */
  G_UNICODE_SCRIPT_SHAVIAN,            /* Shaw */
  G_UNICODE_SCRIPT_LINEAR_B,           /* Linb */
  G_UNICODE_SCRIPT_TAI_LE,             /* Tale */
  G_UNICODE_SCRIPT_UGARITIC,           /* Ugar */

  /* Unicode-4.1 additions */
  G_UNICODE_SCRIPT_NEW_TAI_LUE,        /* Talu */
  G_UNICODE_SCRIPT_BUGINESE,           /* Bugi */
  G_UNICODE_SCRIPT_GLAGOLITIC,         /* Glag */
  G_UNICODE_SCRIPT_TIFINAGH,           /* Tfng */
  G_UNICODE_SCRIPT_SYLOTI_NAGRI,       /* Sylo */
  G_UNICODE_SCRIPT_OLD_PERSIAN,        /* Xpeo */
  G_UNICODE_SCRIPT_KHAROSHTHI,         /* Khar */

  /* Unicode-5.0 additions */
  G_UNICODE_SCRIPT_UNKNOWN,            /* Zzzz */
  G_UNICODE_SCRIPT_BALINESE,           /* Bali */
  G_UNICODE_SCRIPT_CUNEIFORM,          /* Xsux */
  G_UNICODE_SCRIPT_PHOENICIAN,         /* Phnx */
  G_UNICODE_SCRIPT_PHAGS_PA,           /* Phag */
  G_UNICODE_SCRIPT_NKO                 /* Nkoo */
} GUnicodeScript;
|
||||
|
||||
/* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is
|
||||
* not null, sets *CHARSET to the name of the current locale's
|
||||
* charset. This value is statically allocated, and should be copied
|
||||
@ -292,6 +370,9 @@ gchar *g_utf8_collate_key_for_filename (const gchar *str,
|
||||
gboolean g_unichar_get_mirror_char (gunichar ch,
|
||||
gunichar *mirrored_ch);
|
||||
|
||||
GUnicodeScript g_unichar_get_script (gunichar ch);
|
||||
|
||||
|
||||
/* private */
|
||||
|
||||
gchar *_g_utf8_make_valid (const gchar *name);
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "glib.h"
|
||||
#include "gunichartables.h"
|
||||
#include "gmirroringtable.h"
|
||||
#include "gscripttable.h"
|
||||
#include "gunicodeprivate.h"
|
||||
#include "galias.h"
|
||||
|
||||
@ -1183,5 +1184,55 @@ g_unichar_get_mirror_char (gunichar ch,
|
||||
|
||||
}
|
||||
|
||||
#define G_SCRIPT_TABLE_MIDPOINT (G_N_ELEMENTS (g_script_table) / 2)
|
||||
|
||||
static inline GUnicodeScript
|
||||
g_unichar_get_script_bsearch (gunichar ch)
|
||||
{
|
||||
int lower = 0;
|
||||
int upper = G_N_ELEMENTS (g_script_table) - 1;
|
||||
static int saved_mid = G_SCRIPT_TABLE_MIDPOINT;
|
||||
int mid = saved_mid;
|
||||
|
||||
|
||||
do
|
||||
{
|
||||
if (ch < g_script_table[mid].start)
|
||||
upper = mid - 1;
|
||||
else if (ch >= g_script_table[mid].start + g_script_table[mid].chars)
|
||||
lower = mid + 1;
|
||||
else
|
||||
return g_script_table[saved_mid = mid].script;
|
||||
|
||||
mid = (lower + upper) / 2;
|
||||
}
|
||||
while (lower <= upper);
|
||||
|
||||
return G_UNICODE_SCRIPT_UNKNOWN;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_unichar_get_script:
|
||||
* @ch: a Unicode character
|
||||
*
|
||||
* Looks up the #GUnicodeScript for a particular character (as defined
|
||||
* by Unicode Standard Annex #24). No check is made for @ch being a
|
||||
* valid Unicode character; if you pass in invalid character, the
|
||||
* result is undefined.
|
||||
*
|
||||
* Return value: the #GUnicodeScript for the character.
|
||||
*
|
||||
* Since: 2.14
|
||||
*/
|
||||
GUnicodeScript
|
||||
g_unichar_get_script (gunichar ch)
|
||||
{
|
||||
if (ch < G_EASY_SCRIPTS_RANGE)
|
||||
return g_script_easy_table[ch];
|
||||
else
|
||||
return g_unichar_get_script_bsearch (ch);
|
||||
}
|
||||
|
||||
|
||||
#define __G_UNIPROP_C__
|
||||
#include "galiasdef.c"
|
||||
|
Loading…
Reference in New Issue
Block a user