From b9a4897900cc3a39df0aac0e40fe0600fc7d123a Mon Sep 17 00:00:00 2001 From: Kjell Ahlstedt Date: Mon, 8 Feb 2021 16:32:41 +0100 Subject: [PATCH 1/2] guniprop: Fix g_utf8_strdown() for Turkish locale In the Turkish locale the lowercase equivalent of a capital I with dot above is a normal lowercase i with a dot above. Fixes part of issue #390 --- glib/guniprop.c | 16 ++++++++++------ glib/tests/unicode.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/glib/guniprop.c b/glib/guniprop.c index 619b39908..fdae9f0fc 100644 --- a/glib/guniprop.c +++ b/glib/guniprop.c @@ -987,14 +987,18 @@ real_tolower (const gchar *str, last = p; p = g_utf8_next_char (p); - if (locale_type == LOCALE_TURKIC && (c == 'I' || + if (locale_type == LOCALE_TURKIC && (c == 'I' || c == 0x130 || c == G_UNICHAR_FULLWIDTH_I)) - { - if (g_utf8_get_char (p) == 0x0307) + { + gboolean combining_dot = (c == 'I' || c == G_UNICHAR_FULLWIDTH_I) && + g_utf8_get_char (p) == 0x0307; + if (combining_dot || c == 0x130) { - /* I + COMBINING DOT ABOVE => i (U+0069) */ - len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); - p = g_utf8_next_char (p); + /* I + COMBINING DOT ABOVE => i (U+0069) + * LATIN CAPITAL LETTER I WITH DOT ABOVE => i (U+0069) */ + len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); + if (combining_dot) + p = g_utf8_next_char (p); } else { diff --git a/glib/tests/unicode.c b/glib/tests/unicode.c index fa8bd1fa1..089630fc7 100644 --- a/glib/tests/unicode.c +++ b/glib/tests/unicode.c @@ -464,6 +464,47 @@ test_strdown (void) g_free (str_down); } +/* Test that g_utf8_strup() and g_utf8_strdown() return the correct + * value for Turkish 'i' with and without dot above. */ +static void +test_turkish_strupdown (void) +{ + char *str_up = NULL; + char *str_down = NULL; + const char *str = "iII" + "\xcc\x87" /* COMBINING DOT ABOVE (U+307) */ + "\xc4\xb1" /* LATIN SMALL LETTER DOTLESS I (U+131) */ + "\xc4\xb0"; /* LATIN CAPITAL LETTER I WITH DOT ABOVE (U+130) */ + + char *oldlocale = g_strdup (setlocale (LC_ALL, "tr_TR")); + + if (oldlocale == NULL) + { + g_test_skip ("locale tr_TR not available"); + return; + } + + str_up = g_utf8_strup (str, strlen (str)); + str_down = g_utf8_strdown (str, strlen (str)); + /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE, + * I => I, + * I + COMBINING DOT ABOVE => I + COMBINING DOT ABOVE, + * LATIN SMALL LETTER DOTLESS I => I, + * LATIN CAPITAL LETTER I WITH DOT ABOVE => LATIN CAPITAL LETTER I WITH DOT ABOVE */ + g_assert_cmpstr (str_up, ==, "\xc4\xb0II\xcc\x87I\xc4\xb0"); + /* i => i, + * I => LATIN SMALL LETTER DOTLESS I, + * I + COMBINING DOT ABOVE => i, + * LATIN SMALL LETTER DOTLESS I => LATIN SMALL LETTER DOTLESS I, + * LATIN CAPITAL LETTER I WITH DOT ABOVE => i */ + g_assert_cmpstr (str_down, ==, "i\xc4\xb1i\xc4\xb1i"); + g_free (str_up); + g_free (str_down); + + setlocale (LC_ALL, oldlocale); + g_free (oldlocale); +} + /* Test that g_utf8_casefold() returns the correct value for various * ASCII and Unicode alphabetic, numeric, and other, codepoints. */ static void @@ -1644,6 +1685,7 @@ main (int argc, g_test_add_func ("/unicode/space", test_space); g_test_add_func ("/unicode/strdown", test_strdown); g_test_add_func ("/unicode/strup", test_strup); + g_test_add_func ("/unicode/turkish-strupdown", test_turkish_strupdown); g_test_add_func ("/unicode/title", test_title); g_test_add_func ("/unicode/upper", test_upper); g_test_add_func ("/unicode/validate", test_unichar_validate); From e008301cf8cb66871cfa05a61ee46fef57781e55 Mon Sep 17 00:00:00 2001 From: Kjell Ahlstedt Date: Tue, 9 Feb 2021 15:27:26 +0100 Subject: [PATCH 2/2] guniprop, glib/tests/unicode: Fix style issues --- glib/guniprop.c | 2 +- glib/tests/unicode.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/glib/guniprop.c b/glib/guniprop.c index fdae9f0fc..2903367d4 100644 --- a/glib/guniprop.c +++ b/glib/guniprop.c @@ -991,7 +991,7 @@ real_tolower (const gchar *str, c == G_UNICHAR_FULLWIDTH_I)) { gboolean combining_dot = (c == 'I' || c == G_UNICHAR_FULLWIDTH_I) && - g_utf8_get_char (p) == 0x0307; + g_utf8_get_char (p) == 0x0307; if (combining_dot || c == 0x130) { /* I + COMBINING DOT ABOVE => i (U+0069) diff --git a/glib/tests/unicode.c b/glib/tests/unicode.c index 089630fc7..bf1ad52ab 100644 --- a/glib/tests/unicode.c +++ b/glib/tests/unicode.c @@ -472,9 +472,9 @@ test_turkish_strupdown (void) char *str_up = NULL; char *str_down = NULL; const char *str = "iII" - "\xcc\x87" /* COMBINING DOT ABOVE (U+307) */ - "\xc4\xb1" /* LATIN SMALL LETTER DOTLESS I (U+131) */ - "\xc4\xb0"; /* LATIN CAPITAL LETTER I WITH DOT ABOVE (U+130) */ + "\xcc\x87" /* COMBINING DOT ABOVE (U+307) */ + "\xc4\xb1" /* LATIN SMALL LETTER DOTLESS I (U+131) */ + "\xc4\xb0"; /* LATIN CAPITAL LETTER I WITH DOT ABOVE (U+130) */ char *oldlocale = g_strdup (setlocale (LC_ALL, "tr_TR"));