mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-03-03 14:42:10 +01:00
Unicode 4.0 special casing. (#114681)
2003-09-10 Noah Levitt <nlevitt@columbia.edu> * glib/gunicodeprivate.h: * glib/gunicollate.c: * glib/gunidecomp.c: * glib/guniprop.c: * tests/casemap.txt: * tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681) * glib/gunicodeprivate.h: Use a private header instead of extern function declarations (_g_utf8_normalize_wc, _g_unichar_combining_class).
This commit is contained in:
parent
896d38706b
commit
8d91ba8c58
13
ChangeLog
13
ChangeLog
@ -1,3 +1,16 @@
|
||||
2003-09-10 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gunicodeprivate.h:
|
||||
* glib/gunicollate.c:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/guniprop.c:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
|
||||
|
||||
* glib/gunicodeprivate.h: Use a private header instead of extern
|
||||
function declarations (_g_utf8_normalize_wc,
|
||||
_g_unichar_combining_class).
|
||||
|
||||
Mon Sep 8 00:31:10 2003 Stefan Westerfeld <stefan@space.twc.de>
|
||||
|
||||
* glib/gbsearcharray.h: inserted casts for C++.
|
||||
|
@ -1,3 +1,16 @@
|
||||
2003-09-10 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gunicodeprivate.h:
|
||||
* glib/gunicollate.c:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/guniprop.c:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
|
||||
|
||||
* glib/gunicodeprivate.h: Use a private header instead of extern
|
||||
function declarations (_g_utf8_normalize_wc,
|
||||
_g_unichar_combining_class).
|
||||
|
||||
Mon Sep 8 00:31:10 2003 Stefan Westerfeld <stefan@space.twc.de>
|
||||
|
||||
* glib/gbsearcharray.h: inserted casts for C++.
|
||||
|
@ -1,3 +1,16 @@
|
||||
2003-09-10 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gunicodeprivate.h:
|
||||
* glib/gunicollate.c:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/guniprop.c:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
|
||||
|
||||
* glib/gunicodeprivate.h: Use a private header instead of extern
|
||||
function declarations (_g_utf8_normalize_wc,
|
||||
_g_unichar_combining_class).
|
||||
|
||||
Mon Sep 8 00:31:10 2003 Stefan Westerfeld <stefan@space.twc.de>
|
||||
|
||||
* glib/gbsearcharray.h: inserted casts for C++.
|
||||
|
@ -1,3 +1,16 @@
|
||||
2003-09-10 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gunicodeprivate.h:
|
||||
* glib/gunicollate.c:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/guniprop.c:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
|
||||
|
||||
* glib/gunicodeprivate.h: Use a private header instead of extern
|
||||
function declarations (_g_utf8_normalize_wc,
|
||||
_g_unichar_combining_class).
|
||||
|
||||
Mon Sep 8 00:31:10 2003 Stefan Westerfeld <stefan@space.twc.de>
|
||||
|
||||
* glib/gbsearcharray.h: inserted casts for C++.
|
||||
|
@ -1,3 +1,16 @@
|
||||
2003-09-10 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gunicodeprivate.h:
|
||||
* glib/gunicollate.c:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/guniprop.c:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
|
||||
|
||||
* glib/gunicodeprivate.h: Use a private header instead of extern
|
||||
function declarations (_g_utf8_normalize_wc,
|
||||
_g_unichar_combining_class).
|
||||
|
||||
Mon Sep 8 00:31:10 2003 Stefan Westerfeld <stefan@space.twc.de>
|
||||
|
||||
* glib/gbsearcharray.h: inserted casts for C++.
|
||||
|
@ -1,3 +1,16 @@
|
||||
2003-09-10 Noah Levitt <nlevitt@columbia.edu>
|
||||
|
||||
* glib/gunicodeprivate.h:
|
||||
* glib/gunicollate.c:
|
||||
* glib/gunidecomp.c:
|
||||
* glib/guniprop.c:
|
||||
* tests/casemap.txt:
|
||||
* tests/gen-casemap-txt.pl: Unicode 4.0 special casing. (#114681)
|
||||
|
||||
* glib/gunicodeprivate.h: Use a private header instead of extern
|
||||
function declarations (_g_utf8_normalize_wc,
|
||||
_g_unichar_combining_class).
|
||||
|
||||
Mon Sep 8 00:31:10 2003 Stefan Westerfeld <stefan@space.twc.de>
|
||||
|
||||
* glib/gbsearcharray.h: inserted casts for C++.
|
||||
|
35
glib/gunicodeprivate.h
Normal file
35
glib/gunicodeprivate.h
Normal file
@ -0,0 +1,35 @@
|
||||
/* gunicodeprivate.h
|
||||
*
|
||||
* Copyright (C) 2003 Noah Levitt
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef __G_UNICODE_PRIVATE_H__
|
||||
#define __G_UNICODE_PRIVATE_H__
|
||||
|
||||
#include "glib.h"
|
||||
|
||||
G_BEGIN_DECLS
|
||||
|
||||
gunichar *_g_utf8_normalize_wc (const gchar *str,
|
||||
gssize max_len,
|
||||
GNormalizeMode mode);
|
||||
gint _g_unichar_combining_class (gunichar uc);
|
||||
|
||||
G_END_DECLS
|
||||
|
||||
#endif /* __G_UNICODE_PRIVATE_H__ */
|
@ -27,10 +27,7 @@
|
||||
#endif
|
||||
|
||||
#include "glib.h"
|
||||
|
||||
extern gunichar *_g_utf8_normalize_wc (const gchar *str,
|
||||
gssize max_len,
|
||||
GNormalizeMode mode);
|
||||
#include "gunicodeprivate.h"
|
||||
|
||||
/**
|
||||
* g_utf8_collate:
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "glib.h"
|
||||
#include "gunidecomp.h"
|
||||
#include "gunicomp.h"
|
||||
#include "gunicodeprivate.h"
|
||||
|
||||
|
||||
#define CC_PART1(Page, Char) \
|
||||
@ -45,6 +46,12 @@
|
||||
? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
|
||||
: 0))
|
||||
|
||||
gint
|
||||
_g_unichar_combining_class (gunichar uc)
|
||||
{
|
||||
return COMBINING_CLASS (uc);
|
||||
}
|
||||
|
||||
/**
|
||||
* g_unicode_canonical_ordering:
|
||||
* @string: a UCS-4 encoded string.
|
||||
|
@ -27,6 +27,7 @@
|
||||
|
||||
#include "glib.h"
|
||||
#include "gunichartables.h"
|
||||
#include "gunicodeprivate.h"
|
||||
|
||||
#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
|
||||
? attr_table_part1[Page] \
|
||||
@ -737,6 +738,28 @@ g_utf8_strup (const gchar *str,
|
||||
return result;
|
||||
}
|
||||
|
||||
/* traverses the string checking for characters with combining class == 230
|
||||
* until a base character is found */
|
||||
static gboolean
|
||||
has_more_above (gchar *str)
|
||||
{
|
||||
gchar *p = str;
|
||||
gint combining_class;
|
||||
|
||||
while (*p)
|
||||
{
|
||||
combining_class = _g_unichar_combining_class (g_utf8_get_char (p));
|
||||
if (combining_class == 230)
|
||||
return TRUE;
|
||||
else if (combining_class == 0)
|
||||
break;
|
||||
|
||||
p = g_utf8_next_char (p);
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
static gsize
|
||||
real_tolower (const gchar *str,
|
||||
gssize max_len,
|
||||
@ -757,10 +780,47 @@ real_tolower (const gchar *str,
|
||||
p = g_utf8_next_char (p);
|
||||
|
||||
if (locale_type == LOCALE_TURKIC && c == 'I')
|
||||
{
|
||||
if (g_utf8_get_char (p) == 0x0307)
|
||||
{
|
||||
/* I + COMBINING DOT ABOVE => i (U+0069) */
|
||||
len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
|
||||
p = g_utf8_next_char (p);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* I => LATIN SMALL LETTER DOTLESS I */
|
||||
len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
|
||||
}
|
||||
}
|
||||
/* Introduce an explicit dot above when lowercasing capital I's and J's
|
||||
* whenever there are more accents above. [SpecialCasing.txt] */
|
||||
else if (locale_type == LOCALE_LITHUANIAN &&
|
||||
(c == 0x00cc || c == 0x00cd || c == 0x0128))
|
||||
{
|
||||
len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
|
||||
len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case 0x00cc:
|
||||
len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL);
|
||||
break;
|
||||
case 0x00cd:
|
||||
len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL);
|
||||
break;
|
||||
case 0x0128:
|
||||
len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (locale_type == LOCALE_LITHUANIAN &&
|
||||
(c == 'I' || c == 'J' || c == 0x012e) &&
|
||||
has_more_above (p))
|
||||
{
|
||||
len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL);
|
||||
len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
|
||||
}
|
||||
else if (c == 0x03A3) /* GREEK CAPITAL LETTER SIGMA */
|
||||
{
|
||||
if ((max_len < 0 || p < str + max_len) && *p)
|
||||
|
@ -5,6 +5,10 @@
|
||||
#
|
||||
tr_TR i i İ İ # i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR I ı I I # I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR İ i İ İ # I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR.UTF-8 i i İ İ # i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR.UTF-8 I ı I I # I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR.UTF-8 İ i İ İ # I => LATIN SMALL LETTER DOTLESS I
|
||||
# Test reordering of YPOGEGRAMMENI across other accents
|
||||
ᾁ ᾁ ᾉ ἉΙ
|
||||
ᾁ ᾁ ᾉ ἉΙ
|
||||
@ -16,6 +20,26 @@ tr_TR I ı I I # I => LATIN SMALL LETTER DOTLESS I
|
||||
# about the titlecase part here
|
||||
lt_LT iė iė Ie IE
|
||||
lt_LT iė iė Ie IE
|
||||
lt_LT Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT Í i̇́ Í Í # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT Í i̇́ Í Í # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT J́ j̇́ J́ J́ # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
lt_LT.UTF-8 iė iė Ie IE
|
||||
lt_LT.UTF-8 iė iė Ie IE
|
||||
lt_LT.UTF-8 Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT.UTF-8 Í i̇́ Í Í # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT.UTF-8 Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT.UTF-8 Í i̇́ Í Í # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT.UTF-8 Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT.UTF-8 Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT.UTF-8 Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT.UTF-8 J́ j̇́ J́ J́ # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT.UTF-8 Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
# Special case not at initial position
|
||||
affl affl Affl AFFL # FB04
|
||||
#
|
||||
|
@ -148,6 +148,10 @@ print <<EOT;
|
||||
#
|
||||
tr_TR\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I + COMBINING DOT ABOVE => LATIN SMALL LETTER I
|
||||
tr_TR.UTF-8\ti\ti\t\x{0130}\t\x{0130}\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
tr_TR.UTF-8\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
tr_TR.UTF-8\tI\x{0307}\ti\tI\x{0307}\tI\x{0307}\t# I + COMBINING DOT ABOVE => LATIN SMALL LETTER I
|
||||
# Test reordering of YPOGEGRAMMENI across other accents
|
||||
\t\x{03b1}\x{0345}\x{0314}\t\x{03b1}\x{0345}\x{314}\t\x{0391}\x{0345}\x{0314}\t\x{0391}\x{0314}\x{0399}\t
|
||||
\t\x{03b1}\x{0314}\x{0345}\t\x{03b1}\x{314}\x{0345}\t\x{0391}\x{0314}\x{0345}\t\x{0391}\x{0314}\x{0399}\t
|
||||
@ -159,6 +163,26 @@ tr_TR\tI\t\x{0131}\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
|
||||
# about the titlecase part here
|
||||
lt_LT\ti\x{117}\ti\x{117}\tIe\tIE\t
|
||||
lt_LT\tie\x{307}\tie\x{307}\tIe\tIE\t
|
||||
lt_LT\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
lt_LT.UTF-8\ti\x{117}\ti\x{117}\tIe\tIE\t
|
||||
lt_LT.UTF-8\tie\x{307}\tie\x{307}\tIe\tIE\t
|
||||
lt_LT.UTF-8\t\x{00cc}\ti\x{0307}\x{0300}\t\x{00cc}\t\x{00cc}\t # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
lt_LT.UTF-8\t\x{00CD}\ti\x{0307}\x{0301}\t\x{00CD}\t\x{00CD}\t # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
lt_LT.UTF-8\t\x{0128}\ti\x{0307}\x{0303}\t\x{0128}\t\x{0128}\t # LATIN CAPITAL LETTER I WITH TILDE
|
||||
lt_LT.UTF-8\tI\x{0301}\ti\x{0307}\x{0301}\tI\x{0301}\tI\x{0301}\t # LATIN CAPITAL LETTER I (with acute accent)
|
||||
lt_LT.UTF-8\tI\x{0300}\ti\x{0307}\x{0300}\tI\x{0300}\tI\x{0300}\t # LATIN CAPITAL LETTER I (with grave accent)
|
||||
lt_LT.UTF-8\tI\x{0303}\ti\x{0307}\x{0303}\tI\x{0303}\tI\x{0303}\t # LATIN CAPITAL LETTER I (with tilde above)
|
||||
lt_LT.UTF-8\tI\x{0328}\x{0301}\ti\x{0307}\x{0328}\x{0301}\tI\x{0328}\x{0301}\tI\x{0328}\x{0301}\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
|
||||
lt_LT.UTF-8\tJ\x{0301}\tj\x{0307}\x{0301}\tJ\x{0301}\tJ\x{0301}\t # LATIN CAPITAL LETTER J (with acute accent)
|
||||
lt_LT.UTF-8\t\x{012e}\x{0301}\t\x{012f}\x{0307}\x{0301}\t\x{012e}\x{0301}\t\x{012e}\x{0301}\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
|
||||
# Special case not at initial position
|
||||
\ta\x{fb04}\ta\x{fb04}\tAffl\tAFFL\t# FB04
|
||||
#
|
||||
|
Loading…
x
Reference in New Issue
Block a user