Unrolled implementation of g_utf8_to_ucs4_fast()

Unrolling the branches and expressions for all expected cases
of UTF-8 sequences facilitates the work of both an optimizing compiler
and the branch prediction logic in the CPU. This speeds up decoding
noticeably on text composed primarily of longer sequences.

https://bugzilla.gnome.org/show_bug.cgi?id=738504
This commit is contained in:
Mikhail Zabaluev 2014-10-13 21:31:02 +03:00 committed by Matthias Clasen
parent 3188b8ee79
commit b963565125

View File

@ -677,6 +677,8 @@ g_utf8_get_char_validated (const gchar *p,
return result; return result;
} }
#define CONT_BYTE_FAST(p) ((guchar)*p++ & 0x3f)
/** /**
* g_utf8_to_ucs4_fast: * g_utf8_to_ucs4_fast:
* @str: a UTF-8 encoded string * @str: a UTF-8 encoded string
@ -729,39 +731,52 @@ g_utf8_to_ucs4_fast (const gchar *str,
p = str; p = str;
for (i=0; i < n_chars; i++) for (i=0; i < n_chars; i++)
{ {
gunichar wc = (guchar)*p++; guchar first = (guchar)*p++;
gunichar wc;
if (wc < 0x80) if (first < 0xc0)
{ {
result[i] = wc; /* We really hope first < 0x80, but we don't want to test an
* extra branch for invalid input, which this function
* does not care about. Handling unexpected continuation bytes
* here will do the least damage. */
wc = first;
} }
else else
{ {
gunichar mask = 0x40; gunichar c1 = CONT_BYTE_FAST(p);
if (first < 0xe0)
if (G_UNLIKELY ((wc & mask) == 0)) {
{ wc = ((first & 0x1f) << 6) | c1;
/* It's an out-of-sequence 10xxxxxxx byte. }
* Rather than making an ugly hash of this and the next byte else
* and overrunning the buffer, it's more useful to treat it {
* with a replacement character gunichar c2 = CONT_BYTE_FAST(p);
*/ if (first < 0xf0)
result[i] = 0xfffd; {
continue; wc = ((first & 0x0f) << 12) | (c1 << 6) | c2;
} }
else
do {
{ gunichar c3 = CONT_BYTE_FAST(p);
wc <<= 6; wc = ((first & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
wc |= (guchar)(*p++) & 0x3f; if (G_UNLIKELY (first >= 0xf8))
mask <<= 5; {
} /* This can't be valid UTF-8, but g_utf8_next_char()
while((wc & mask) != 0); * and company allow out-of-range sequences */
gunichar mask = 1 << 20;
wc &= mask - 1; while ((wc & mask) != 0)
{
result[i] = wc; wc <<= 6;
wc |= CONT_BYTE_FAST(p);
mask <<= 5;
}
wc &= mask - 1;
}
}
}
} }
result[i] = wc;
} }
result[i] = 0; result[i] = 0;