mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-01-15 16:56:14 +01:00
Optimized branching in g_utf8_validate()
The number of branches and logical operations can be reduced by never assembling the decoded wide character value just to check its range. Instead, each byte in the sequence is validated directly, with the permitted range chosen by the branch taken on the preceding bytes. This follows the UTF-8 byte sequence syntax given in RFC 3629, Section 4. https://bugzilla.gnome.org/show_bug.cgi?id=738504
This commit is contained in:
parent
5644ee5083
commit
3188b8ee79
125
glib/gutf8.c
125
glib/gutf8.c
@ -1442,20 +1442,18 @@ g_ucs4_to_utf16 (const gunichar *str,
|
||||
return result;
|
||||
}
|
||||
|
||||
#define CONTINUATION_CHAR \
|
||||
#define VALIDATE_BYTE(mask, expect) \
|
||||
G_STMT_START { \
|
||||
if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
|
||||
if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \
|
||||
goto error; \
|
||||
val <<= 6; \
|
||||
val |= (*(guchar *)p) & 0x3f; \
|
||||
} G_STMT_END
|
||||
|
||||
/* see IETF RFC 3629 Section 4 */
|
||||
|
||||
static const gchar *
|
||||
fast_validate (const char *str)
|
||||
|
||||
{
|
||||
gunichar val = 0;
|
||||
gunichar min = 0;
|
||||
const gchar *p;
|
||||
|
||||
for (p = str; *p; p++)
|
||||
@ -1467,45 +1465,52 @@ fast_validate (const char *str)
|
||||
const gchar *last;
|
||||
|
||||
last = p;
|
||||
if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
|
||||
if (*(guchar *)p < 0xe0) /* 110xxxxx */
|
||||
{
|
||||
if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
|
||||
goto error;
|
||||
p++;
|
||||
if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
|
||||
goto error;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
|
||||
if (*(guchar *)p < 0xf0) /* 1110xxxx */
|
||||
{
|
||||
min = (1 << 11);
|
||||
val = *(guchar *)p & 0x0f;
|
||||
goto TWO_REMAINING;
|
||||
switch (*(guchar *)p++ & 0x0f)
|
||||
{
|
||||
case 0:
|
||||
VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
|
||||
break;
|
||||
case 0x0d:
|
||||
VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
|
||||
break;
|
||||
default:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
|
||||
}
|
||||
else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
|
||||
{
|
||||
min = (1 << 16);
|
||||
val = *(guchar *)p & 0x07;
|
||||
switch (*(guchar *)p++ & 0x07)
|
||||
{
|
||||
case 0:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
|
||||
goto error;
|
||||
break;
|
||||
case 4:
|
||||
VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
|
||||
break;
|
||||
default:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
p++;
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
else
|
||||
goto error;
|
||||
|
||||
p++;
|
||||
CONTINUATION_CHAR;
|
||||
TWO_REMAINING:
|
||||
p++;
|
||||
CONTINUATION_CHAR;
|
||||
p++;
|
||||
CONTINUATION_CHAR;
|
||||
|
||||
if (G_UNLIKELY (val < min))
|
||||
goto error;
|
||||
|
||||
if (G_UNLIKELY (!UNICODE_VALID(val)))
|
||||
goto error;
|
||||
}
|
||||
|
||||
p++;
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
|
||||
continue;
|
||||
|
||||
error:
|
||||
@ -1521,8 +1526,6 @@ fast_validate_len (const char *str,
|
||||
gssize max_len)
|
||||
|
||||
{
|
||||
gunichar val = 0;
|
||||
gunichar min = 0;
|
||||
const gchar *p;
|
||||
|
||||
g_assert (max_len >= 0);
|
||||
@ -1536,53 +1539,61 @@ fast_validate_len (const char *str,
|
||||
const gchar *last;
|
||||
|
||||
last = p;
|
||||
if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
|
||||
if (*(guchar *)p < 0xe0) /* 110xxxxx */
|
||||
{
|
||||
if (G_UNLIKELY (max_len - (p - str) < 2))
|
||||
goto error;
|
||||
|
||||
if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
|
||||
goto error;
|
||||
p++;
|
||||
if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
|
||||
goto error;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
|
||||
if (*(guchar *)p < 0xf0) /* 1110xxxx */
|
||||
{
|
||||
if (G_UNLIKELY (max_len - (p - str) < 3))
|
||||
goto error;
|
||||
|
||||
min = (1 << 11);
|
||||
val = *(guchar *)p & 0x0f;
|
||||
goto TWO_REMAINING;
|
||||
switch (*(guchar *)p++ & 0x0f)
|
||||
{
|
||||
case 0:
|
||||
VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
|
||||
break;
|
||||
case 0x0d:
|
||||
VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
|
||||
break;
|
||||
default:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
|
||||
}
|
||||
else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
|
||||
{
|
||||
if (G_UNLIKELY (max_len - (p - str) < 4))
|
||||
goto error;
|
||||
|
||||
min = (1 << 16);
|
||||
val = *(guchar *)p & 0x07;
|
||||
switch (*(guchar *)p++ & 0x07)
|
||||
{
|
||||
case 0:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
|
||||
goto error;
|
||||
break;
|
||||
case 4:
|
||||
VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
|
||||
break;
|
||||
default:
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
p++;
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
}
|
||||
else
|
||||
goto error;
|
||||
|
||||
p++;
|
||||
CONTINUATION_CHAR;
|
||||
TWO_REMAINING:
|
||||
p++;
|
||||
CONTINUATION_CHAR;
|
||||
p++;
|
||||
CONTINUATION_CHAR;
|
||||
|
||||
if (G_UNLIKELY (val < min))
|
||||
goto error;
|
||||
if (G_UNLIKELY (!UNICODE_VALID(val)))
|
||||
goto error;
|
||||
}
|
||||
|
||||
p++;
|
||||
VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
|
||||
|
||||
continue;
|
||||
|
||||
error:
|
||||
|
Loading…
Reference in New Issue
Block a user