mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-02-25 11:42:10 +01:00
gmarkup: Optimize g_markup_escape_text()
This commit is contained in:
parent
930883609c
commit
2934dfa2bd
@ -2167,62 +2167,109 @@ g_markup_parse_context_pop (GMarkupParseContext *context)
|
||||
return user_data;
|
||||
}
|
||||
|
||||
#define APPEND_TEXT_AND_SEEK(_str, _start, _end) \
|
||||
G_STMT_START { \
|
||||
if (_end > _start) \
|
||||
g_string_append_len (_str, _start, _end - _start); \
|
||||
_start = ++_end; \
|
||||
} G_STMT_END
|
||||
|
||||
/*
|
||||
* https://www.w3.org/TR/REC-xml/ defines the set of valid
|
||||
* characters as:
|
||||
* #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
||||
*
|
||||
* That is, from non-ASCII UTF-8 character set, only 0xC280 - 0xC284 and
|
||||
* 0xC286 - 0xC29F have to be escaped (excluding the surrogate blocks).
|
||||
* Corresponding Unicode code points are [0x80-0x84] and [0x86-0x9F].
|
||||
*
|
||||
* So instead of using costly g_utf8_next_char or similar UTF8 functions, it's
|
||||
* better to read each byte, and make an exception for 0xC2XX.
|
||||
*/
|
||||
static void
|
||||
append_escaped_text (GString *str,
|
||||
const gchar *text,
|
||||
gssize length)
|
||||
{
|
||||
const gchar *p;
|
||||
const gchar *p, *pending;
|
||||
const gchar *end;
|
||||
gunichar c;
|
||||
|
||||
p = text;
|
||||
p = pending = text;
|
||||
end = text + length;
|
||||
|
||||
while (p < end)
|
||||
while (p < end && pending < end)
|
||||
{
|
||||
const gchar *next;
|
||||
next = g_utf8_next_char (p);
|
||||
guchar c = (guchar) *pending;
|
||||
|
||||
switch (*p)
|
||||
switch (c)
|
||||
{
|
||||
case '&':
|
||||
APPEND_TEXT_AND_SEEK (str, p, pending);
|
||||
g_string_append (str, "&");
|
||||
break;
|
||||
|
||||
case '<':
|
||||
APPEND_TEXT_AND_SEEK (str, p, pending);
|
||||
g_string_append (str, "<");
|
||||
break;
|
||||
|
||||
case '>':
|
||||
APPEND_TEXT_AND_SEEK (str, p, pending);
|
||||
g_string_append (str, ">");
|
||||
break;
|
||||
|
||||
case '\'':
|
||||
APPEND_TEXT_AND_SEEK (str, p, pending);
|
||||
g_string_append (str, "'");
|
||||
break;
|
||||
|
||||
case '"':
|
||||
APPEND_TEXT_AND_SEEK (str, p, pending);
|
||||
g_string_append (str, """);
|
||||
break;
|
||||
|
||||
default:
|
||||
c = g_utf8_get_char (p);
|
||||
if ((0x1 <= c && c <= 0x8) ||
|
||||
(0xb <= c && c <= 0xc) ||
|
||||
(0xe <= c && c <= 0x1f) ||
|
||||
(0x7f <= c && c <= 0x84) ||
|
||||
(0x86 <= c && c <= 0x9f))
|
||||
g_string_append_printf (str, "&#x%x;", c);
|
||||
(c == 0x7f))
|
||||
{
|
||||
APPEND_TEXT_AND_SEEK (str, p, pending);
|
||||
g_string_append_printf (str, "&#x%x;", c);
|
||||
}
|
||||
/* The non-ASCII control characters to escape all begin with a 0xc2 byte in UTF-8 */
|
||||
else if (c == 0xc2)
|
||||
{
|
||||
gunichar u = g_utf8_get_char (pending);
|
||||
|
||||
if ((0x7f < u && u <= 0x84) ||
|
||||
(0x86 <= u && u <= 0x9f))
|
||||
{
|
||||
APPEND_TEXT_AND_SEEK (str, p, pending);
|
||||
g_string_append_printf (str, "&#x%x;", u);
|
||||
|
||||
/*
|
||||
* We have appended a two byte character above, which
|
||||
* is one byte ahead of what we read on every loop.
|
||||
* Increment to skip 0xc2 and point to the right location.
|
||||
*/
|
||||
p++;
|
||||
}
|
||||
else
|
||||
pending++;
|
||||
}
|
||||
else
|
||||
g_string_append_len (str, p, next - p);
|
||||
pending++;
|
||||
break;
|
||||
}
|
||||
|
||||
p = next;
|
||||
}
|
||||
|
||||
if (pending > p)
|
||||
g_string_append_len (str, p, pending - p);
|
||||
}
|
||||
|
||||
#undef APPEND_TEXT_AND_SEEK
|
||||
|
||||
/**
|
||||
* g_markup_escape_text:
|
||||
* @text: some valid UTF-8 text
|
||||
|
@ -20,6 +20,8 @@ static EscapeTest escape_tests[] =
|
||||
{ ">", ">" },
|
||||
{ "'", "'" },
|
||||
{ "\"", """ },
|
||||
{ "\"\"", """" },
|
||||
{ "\"അ\"", ""അ"" },
|
||||
{ "", "" },
|
||||
{ "A", "A" },
|
||||
{ "A&", "A&" },
|
||||
@ -30,7 +32,11 @@ static EscapeTest escape_tests[] =
|
||||
{ "A&&A", "A&&A" },
|
||||
{ "A&A&A", "A&A&A" },
|
||||
{ "AA", "A&#23;A" },
|
||||
{ "A
A", "A&#xa;A" }
|
||||
{ "A
A", "A&#xa;A" },
|
||||
{ "N\x2N", "NN" },
|
||||
{ "N\xc2\x80N", "N€N" },
|
||||
{ "N\xc2\x79N", "N\xc2\x79N" },
|
||||
{ "N\xc2\x9fN", "NŸN" },
|
||||
};
|
||||
|
||||
static void
|
||||
|
Loading…
x
Reference in New Issue
Block a user