mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-02-28 05:02:10 +01:00
Unrolled implementation of g_utf8_to_ucs4_fast()
Unrolling the branches and expressions for all expected cases of UTF-8 sequences facilitates the work of both an optimizing compiler and the branch prediction logic in the CPU. This speeds up decoding noticeably on text composed primarily of longer sequences. https://bugzilla.gnome.org/show_bug.cgi?id=738504
This commit is contained in:
parent
3188b8ee79
commit
b963565125
71
glib/gutf8.c
71
glib/gutf8.c
@ -677,6 +677,8 @@ g_utf8_get_char_validated (const gchar *p,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Consume one UTF-8 continuation byte: evaluates to the low six bits of the
 * byte at p and post-increments p by one. p must be a modifiable pointer
 * lvalue and is expanded exactly once. No check is made that the byte really
 * has the 10xxxxxx continuation form. The argument is parenthesized so that
 * lvalue expressions such as *cursor advance the intended pointer. */
#define CONT_BYTE_FAST(p) ((guchar)*(p)++ & 0x3f)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* g_utf8_to_ucs4_fast:
|
* g_utf8_to_ucs4_fast:
|
||||||
* @str: a UTF-8 encoded string
|
* @str: a UTF-8 encoded string
|
||||||
@ -729,39 +731,52 @@ g_utf8_to_ucs4_fast (const gchar *str,
|
|||||||
p = str;
|
p = str;
|
||||||
for (i=0; i < n_chars; i++)
|
for (i=0; i < n_chars; i++)
|
||||||
{
|
{
|
||||||
gunichar wc = (guchar)*p++;
|
guchar first = (guchar)*p++;
|
||||||
|
gunichar wc;
|
||||||
|
|
||||||
if (wc < 0x80)
|
if (first < 0xc0)
|
||||||
{
|
{
|
||||||
result[i] = wc;
|
/* We really hope first < 0x80, but we don't want to test an
|
||||||
|
* extra branch for invalid input, which this function
|
||||||
|
* does not care about. Handling unexpected continuation bytes
|
||||||
|
* here will do the least damage. */
|
||||||
|
wc = first;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
gunichar mask = 0x40;
|
gunichar c1 = CONT_BYTE_FAST(p);
|
||||||
|
if (first < 0xe0)
|
||||||
if (G_UNLIKELY ((wc & mask) == 0))
|
{
|
||||||
{
|
wc = ((first & 0x1f) << 6) | c1;
|
||||||
/* It's an out-of-sequence 10xxxxxx byte.
|
}
|
||||||
* Rather than making an ugly hash of this and the next byte
|
else
|
||||||
* and overrunning the buffer, it's more useful to treat it
|
{
|
||||||
* with a replacement character
|
gunichar c2 = CONT_BYTE_FAST(p);
|
||||||
*/
|
if (first < 0xf0)
|
||||||
result[i] = 0xfffd;
|
{
|
||||||
continue;
|
wc = ((first & 0x0f) << 12) | (c1 << 6) | c2;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
do
|
{
|
||||||
{
|
gunichar c3 = CONT_BYTE_FAST(p);
|
||||||
wc <<= 6;
|
wc = ((first & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
|
||||||
wc |= (guchar)(*p++) & 0x3f;
|
if (G_UNLIKELY (first >= 0xf8))
|
||||||
mask <<= 5;
|
{
|
||||||
}
|
/* This can't be valid UTF-8, but g_utf8_next_char()
|
||||||
while((wc & mask) != 0);
|
* and company allow out-of-range sequences */
|
||||||
|
gunichar mask = 1 << 20;
|
||||||
wc &= mask - 1;
|
while ((wc & mask) != 0)
|
||||||
|
{
|
||||||
result[i] = wc;
|
wc <<= 6;
|
||||||
|
wc |= CONT_BYTE_FAST(p);
|
||||||
|
mask <<= 5;
|
||||||
|
}
|
||||||
|
wc &= mask - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
result[i] = wc;
|
||||||
}
|
}
|
||||||
result[i] = 0;
|
result[i] = 0;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user