gmarkup: Optimize g_markup_escape_text()

This commit is contained in:
Mohammed Sadiq 2019-01-30 14:36:14 +00:00 committed by Philip Withnall
parent 930883609c
commit 2934dfa2bd
2 changed files with 68 additions and 15 deletions

View File

@ -2167,62 +2167,109 @@ g_markup_parse_context_pop (GMarkupParseContext *context)
return user_data; return user_data;
} }
/*
 * APPEND_TEXT_AND_SEEK:
 * Append the bytes in [seg_start, cursor) to gstr with
 * g_string_append_len() when that run is non-empty, then step cursor
 * forward by one and restart the run at the new cursor position.
 *
 * Both pointer arguments are assigned to and evaluated more than once,
 * so they must be modifiable lvalues without side effects.
 */
#define APPEND_TEXT_AND_SEEK(gstr, seg_start, cursor)                     \
  G_STMT_START {                                                          \
    if ((cursor) > (seg_start))                                           \
      g_string_append_len ((gstr), (seg_start), (cursor) - (seg_start));  \
    ++(cursor);                                                           \
    (seg_start) = (cursor);                                               \
  } G_STMT_END
/*
* https://www.w3.org/TR/REC-xml/ defines the set of valid
* characters as:
* #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
*
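* That is, outside the ASCII range, only the two-byte UTF-8 sequences
* 0xC280 - 0xC284 and 0xC286 - 0xC29F have to be escaped (excluding the
* surrogate blocks). The corresponding Unicode code points are [0x80-0x84]
* and [0x86-0x9F]; 0x7F is a single ASCII byte and is escaped together with
* the other ASCII control characters.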
*
* So instead of using costly g_utf8_next_char or similar UTF8 functions, it's
* better to read each byte, and make an exception for 0xC2XX.
*/
static void static void
append_escaped_text (GString *str, append_escaped_text (GString *str,
const gchar *text, const gchar *text,
gssize length) gssize length)
{ {
const gchar *p; const gchar *p, *pending;
const gchar *end; const gchar *end;
gunichar c;
p = text; p = pending = text;
end = text + length; end = text + length;
while (p < end) while (p < end && pending < end)
{ {
const gchar *next; guchar c = (guchar) *pending;
next = g_utf8_next_char (p);
switch (*p) switch (c)
{ {
case '&': case '&':
APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&amp;"); g_string_append (str, "&amp;");
break; break;
case '<': case '<':
APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&lt;"); g_string_append (str, "&lt;");
break; break;
case '>': case '>':
APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&gt;"); g_string_append (str, "&gt;");
break; break;
case '\'': case '\'':
APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&apos;"); g_string_append (str, "&apos;");
break; break;
case '"': case '"':
APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append (str, "&quot;"); g_string_append (str, "&quot;");
break; break;
default: default:
c = g_utf8_get_char (p);
if ((0x1 <= c && c <= 0x8) || if ((0x1 <= c && c <= 0x8) ||
(0xb <= c && c <= 0xc) || (0xb <= c && c <= 0xc) ||
(0xe <= c && c <= 0x1f) || (0xe <= c && c <= 0x1f) ||
(0x7f <= c && c <= 0x84) || (c == 0x7f))
(0x86 <= c && c <= 0x9f)) {
APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append_printf (str, "&#x%x;", c); g_string_append_printf (str, "&#x%x;", c);
}
/* The UTF-8 encoded control characters to escape begin with a 0xc2 byte */
else if (c == 0xc2)
{
gunichar u = g_utf8_get_char (pending);
if ((0x7f < u && u <= 0x84) ||
(0x86 <= u && u <= 0x9f))
{
APPEND_TEXT_AND_SEEK (str, p, pending);
g_string_append_printf (str, "&#x%x;", u);
/*
* The escaped character is two bytes long, but the macro above
* only stepped past its leading 0xc2 byte. Advance p once more
* so the continuation byte is not copied to the output later.
*/
p++;
}
else else
g_string_append_len (str, p, next - p); pending++;
}
else
pending++;
break; break;
} }
p = next;
} }
if (pending > p)
g_string_append_len (str, p, pending - p);
} }
#undef APPEND_TEXT_AND_SEEK
/** /**
* g_markup_escape_text: * g_markup_escape_text:
* @text: some valid UTF-8 text * @text: some valid UTF-8 text

View File

@ -20,6 +20,8 @@ static EscapeTest escape_tests[] =
{ ">", "&gt;" }, { ">", "&gt;" },
{ "'", "&apos;" }, { "'", "&apos;" },
{ "\"", "&quot;" }, { "\"", "&quot;" },
{ "\"\"", "&quot;&quot;" },
{ "\"അ\"", "&quot;അ&quot;" },
{ "", "" }, { "", "" },
{ "A", "A" }, { "A", "A" },
{ "A&", "A&amp;" }, { "A&", "A&amp;" },
@ -30,7 +32,11 @@ static EscapeTest escape_tests[] =
{ "A&&A", "A&amp;&amp;A" }, { "A&&A", "A&amp;&amp;A" },
{ "A&A&A", "A&amp;A&amp;A" }, { "A&A&A", "A&amp;A&amp;A" },
{ "A&#23;A", "A&amp;#23;A" }, { "A&#23;A", "A&amp;#23;A" },
{ "A&#xa;A", "A&amp;#xa;A" } { "A&#xa;A", "A&amp;#xa;A" },
{ "N\x2N", "N&#x2;N" },
{ "N\xc2\x80N", "N&#x80;N" },
{ "N\xc2\x79N", "N\xc2\x79N" },
{ "N\xc2\x9fN", "N&#x9f;N" },
}; };
static void static void