regex: Fix unicode othercasing

The old _pcre_ucp_othercase() function was wrong in returning
NOTACHAR (0xffffffff) for characters that aren't changed by upper-
and lower-casing. This led to PCRE internally using incorrect (or
at least inefficient) character classes when using G_REGEX_CASELESS.

E.g. [Z-\x{100}] turned into:

[Z\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{39c}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{fffe}\x{178}z-\x{101}]

instead of the expected and efficient

[Z\x{39c}\x{178}z-\x{101}]

https://bugzilla.gnome.org/show_bug.cgi?id=678273
This commit is contained in:
Christian Persch 2012-06-17 22:51:44 +02:00 committed by Matthias Clasen
parent 22e9f72a8e
commit 53b48dfd3b

View File

@@ -584,20 +584,17 @@ const ucp_type_table PRIV(utt)[] = {
/* Number of entries in the PRIV(utt) table: the total size of the array
 * divided by the size of one ucp_type_table element. */
const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
/*
 * _pcre_ucp_othercase: return the other-case counterpart of code point c.
 *
 * NOTE(review): this listing looks like a diff hunk whose +/- markers were
 * lost, so removed (pre-commit) and added (post-commit) lines appear
 * interleaved; as shown it is not compilable (the return type is doubled and
 * statements follow unconditional returns). The removed/added attribution in
 * the comments below is inferred from the accompanying commit message and
 * should be confirmed against the real diff.
 *
 * Per the commit message, the old implementation returned NOTACHAR
 * (0xffffffff) for characters that are not changed by upper- or
 * lower-casing; the new implementation returns c itself in that case.
 */
unsigned int
unsigned int
_pcre_ucp_othercase(const unsigned int c)
{
/* Presumably removed: accumulator for the old result, NOTACHAR by default. */
int other_case = NOTACHAR;
/* Presumably added: holds the candidate other-case value. */
unsigned int oc;
/* Presumably removed: old logic chose the conversion from the character's
 * reported case (lower -> toupper, upper -> tolower). */
if (g_unichar_islower(c))
other_case = g_unichar_toupper(c);
else if (g_unichar_isupper(c))
other_case = g_unichar_tolower(c);
/* Presumably added: try lower-casing first; if that yields a different code
 * point, it is the other case. */
if ((oc = g_unichar_tolower(c)) != c)
return oc;
/* Presumably added: otherwise try upper-casing the same way. */
if ((oc = g_unichar_toupper(c)) != c)
return oc;
/* Presumably removed: old code mapped "conversion returned c" back to
 * NOTACHAR and returned the accumulator. */
if (other_case == c)
other_case = NOTACHAR;
return other_case;
/* Presumably added: neither conversion changed c, so c is returned as-is. */
return c;
}
#endif /* SUPPORT_UTF */