Optimizations; don't scan the entire text in find_current_text_end(), split unescape_text() into multiple functions.

2004-11-28 Matthias Clasen <mclasen@redhat.com> * glib/gmarkup.c: Optimizations; don't scan the entire text in find_current_text_end(), split unescape_text() into multiple functions. (#159001, Havoc Pennington)
2025-11-19 00:48:23 +01:00 · 2004-11-28 05:40:10 +00:00
parent c77ae3942c
commit 86c78552ec
6 changed files with 410 additions and 297 deletions
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
 2004-11-28  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gmarkup.c: Optimizations; don't scan the entire text
 	in find_current_text_end(), split unescape_text() into multiple
 	functions.  (#159001, Havoc Pennington)
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/ChangeLog.pre-2-10
+++ b/ChangeLog.pre-2-10
@@ -1,3 +1,9 @@
 2004-11-28  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gmarkup.c: Optimizations; don't scan the entire text
 	in find_current_text_end(), split unescape_text() into multiple
 	functions.  (#159001, Havoc Pennington)
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/ChangeLog.pre-2-12
+++ b/ChangeLog.pre-2-12
@@ -1,3 +1,9 @@
 2004-11-28  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gmarkup.c: Optimizations; don't scan the entire text
 	in find_current_text_end(), split unescape_text() into multiple
 	functions.  (#159001, Havoc Pennington)
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/ChangeLog.pre-2-6
+++ b/ChangeLog.pre-2-6
@@ -1,3 +1,9 @@
 2004-11-28  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gmarkup.c: Optimizations; don't scan the entire text
 	in find_current_text_end(), split unescape_text() into multiple
 	functions.  (#159001, Havoc Pennington)
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/ChangeLog.pre-2-8
+++ b/ChangeLog.pre-2-8
@@ -1,3 +1,9 @@
 2004-11-28  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gmarkup.c: Optimizations; don't scan the entire text
 	in find_current_text_end(), split unescape_text() into multiple
 	functions.  (#159001, Havoc Pennington)
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>
 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/glib/gmarkup.c
+++ b/glib/gmarkup.c
@@ -235,12 +235,24 @@ set_error (GMarkupParseContext *context,
  g_propagate_error (error, tmp_error);
 }
 /* To make these faster, we first use the ascii-only tests, then check
 * for the usual non-alnum name-end chars, and only then call the
 * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
 * names, so this is a reasonable hack that virtually always avoids
 * the guniprop call.
 */
 #define IS_COMMON_NAME_END_CHAR(c) \
  ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
 static gboolean
 is_name_start_char (gunichar c)
 {
-  if (g_unichar_isalpha (c) ||
+  if (g_ascii_isalpha (c) ||
      (!IS_COMMON_NAME_END_CHAR (c) &&
       (g_unichar_isalpha (c) ||
        c == '_' ||
-      c == ':')
+        c == ':')))
    return TRUE;
  else
    return FALSE;
@@ -249,11 +261,13 @@ is_name_start_char (gunichar c)
 static gboolean
 is_name_char (gunichar c)
 {
-  if (g_unichar_isalnum (c) ||
+  if (g_ascii_isalnum (c) ||
      (!IS_COMMON_NAME_END_CHAR (c) &&
       (g_unichar_isalnum (c) ||
        c == '.' ||
        c == '-' ||
        c == '_' ||
-      c == ':')
+        c == ':')))
    return TRUE;
  else
    return FALSE;
@@ -326,54 +340,51 @@ typedef enum
  USTATE_AFTER_CHARREF_HASH
 } UnescapeState;
-static gboolean
+typedef struct
-unescape_text (GMarkupParseContext *context,
+{
-               const gchar         *text,
+  GMarkupParseContext *context;
-               const gchar         *text_end,
+  GString *str;
-               gchar              **unescaped,
+  UnescapeState state;
  const gchar *text;
  const gchar *text_end;
  const gchar *entity_start;
 } UnescapeContext;
 static const gchar*
 unescape_text_state_inside_text (UnescapeContext *ucontext,
                                 const gchar     *p,
                                 GError         **error)
 {
 #define MAX_ENT_LEN 5
  GString *str;
  const gchar *p;
  UnescapeState state;
  const gchar *start;
  gboolean normalize_attribute;
-  str = g_string_new (NULL);
+  if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
-
+      ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
  if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
      context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
    normalize_attribute = TRUE;
  else
    normalize_attribute = FALSE;
  state = USTATE_INSIDE_TEXT;
  p = text;
  start = p;
  while (p != text_end && context->state != STATE_ERROR)
    {
      g_assert (p < text_end);
-      switch (state)
+  while (p != ucontext->text_end)
    {
-        case USTATE_INSIDE_TEXT:
+      if (*p == '&')
        {
-            while (p != text_end && *p != '&')
+          break;
        }
      else if (normalize_attribute && (*p == '\t' || *p == '\n'))
        {
-		if ((*p == '\t' || *p == '\n') && normalize_attribute)
+          g_string_append_len (ucontext->str, start, p - start);
-		  {
+          g_string_append_c (ucontext->str, ' ');
 		    g_string_append_len (str, start, p - start);
 		    g_string_append_c (str, ' ');
          p = g_utf8_next_char (p);
          start = p;
        }
      else if (*p == '\r')
        {
-		    g_string_append_len (str, start, p - start);
+          g_string_append_len (ucontext->str, start, p - start);
-		    g_string_append_c (str, normalize_attribute ? ' ' : '\n');
+          g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
          p = g_utf8_next_char (p);
-		    if (*p == '\n')
+          if (p != ucontext->text_end && *p == '\n')
            p = g_utf8_next_char (p);
          start = p;
        }
@@ -382,35 +393,38 @@ unescape_text (GMarkupParseContext *context,
    }
  if (p != start)
-              {
+    g_string_append_len (ucontext->str, start, p - start);
                g_string_append_len (str, start, p - start);
-                start = NULL;
+  if (p != ucontext->text_end && *p == '&')
              }
            if (p != text_end && *p == '&')
    {
      p = g_utf8_next_char (p);
-                state = USTATE_AFTER_AMPERSAND;
+      ucontext->state = USTATE_AFTER_AMPERSAND;
    }
          }
          break;
-        case USTATE_AFTER_AMPERSAND:
+  return p;
-          {
+}
 __attribute__ ((noinline))
 static const gchar*
 unescape_text_state_after_ampersand (UnescapeContext *ucontext,
                                     const gchar     *p,
                                     GError         **error)
 {
  ucontext->entity_start = NULL;
  if (*p == '#')
    {
      p = g_utf8_next_char (p);
-                start = p;
+      ucontext->entity_start = p;
-                state = USTATE_AFTER_CHARREF_HASH;
+      ucontext->state = USTATE_AFTER_CHARREF_HASH;
    }
  else if (!is_name_start_char (g_utf8_get_char (p)))
    {
      if (*p == ';')
        {
-                    set_unescape_error (context, error,
+          set_unescape_error (ucontext->context, error,
-                                        p, text_end,
+                              p, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Empty entity '&;' seen; valid "
                                "entities are: &amp; &quot; &lt; &gt; &apos;"));
@@ -419,8 +433,8 @@ unescape_text (GMarkupParseContext *context,
        {
          gchar buf[7];
-                    set_unescape_error (context, error,
+          set_unescape_error (ucontext->context, error,
-                                        p, text_end,
+                              p, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Character '%s' is not valid at "
                                "the start of an entity name; "
@@ -433,21 +447,26 @@ unescape_text (GMarkupParseContext *context,
    }
  else
    {
-                start = p;
+      ucontext->entity_start = p;
-                state = USTATE_INSIDE_ENTITY_NAME;
+      ucontext->state = USTATE_INSIDE_ENTITY_NAME;
    }
          }
          break;
  return p;
 }
-        case USTATE_INSIDE_ENTITY_NAME:
+__attribute__ ((noinline))
-          {
+static const gchar*
 unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
                                        const gchar     *p,
                                        GError         **error)
 {
 #define MAX_ENT_LEN 5
  gchar buf[MAX_ENT_LEN+1] = {
    '\0', '\0', '\0', '\0', '\0', '\0'
  };
  gchar *dest;
-            while (p != text_end)
+  while (p != ucontext->text_end)
    {
      if (*p == ';')
        break;
@@ -455,8 +474,8 @@ unescape_text (GMarkupParseContext *context,
        {
          gchar ubuf[7];
-                    set_unescape_error (context, error,
+          set_unescape_error (ucontext->context, error,
-                                        p, text_end,
+                              p, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Character '%s' is not valid "
                                "inside an entity name"),
@@ -467,13 +486,13 @@ unescape_text (GMarkupParseContext *context,
      p = g_utf8_next_char (p);
    }
-            if (context->state != STATE_ERROR)
+  if (ucontext->context->state != STATE_ERROR)
    {
-                if (p != text_end)
+      if (p != ucontext->text_end)
        {
          const gchar *src;
-                    src = start;
+          src = ucontext->entity_start;
          dest = buf;
          while (src != p)
            {
@@ -484,23 +503,22 @@ unescape_text (GMarkupParseContext *context,
          /* move to after semicolon */
          p = g_utf8_next_char (p);
-                    start = p;
+          ucontext->state = USTATE_INSIDE_TEXT;
                    state = USTATE_INSIDE_TEXT;
          if (strcmp (buf, "lt") == 0)
-                      g_string_append_c (str, '<');
+            g_string_append_c (ucontext->str, '<');
          else if (strcmp (buf, "gt") == 0)
-                      g_string_append_c (str, '>');
+            g_string_append_c (ucontext->str, '>');
          else if (strcmp (buf, "amp") == 0)
-                      g_string_append_c (str, '&');
+            g_string_append_c (ucontext->str, '&');
          else if (strcmp (buf, "quot") == 0)
-                      g_string_append_c (str, '"');
+            g_string_append_c (ucontext->str, '"');
          else if (strcmp (buf, "apos") == 0)
-                      g_string_append_c (str, '\'');
+            g_string_append_c (ucontext->str, '\'');
          else
            {
-                        set_unescape_error (context, error,
+              set_unescape_error (ucontext->context, error,
-                                            p, text_end,
+                                  p, ucontext->text_end,
                                  G_MARKUP_ERROR_PARSE,
                                  _("Entity name '%s' is not known"),
                                  buf);
@@ -508,9 +526,9 @@ unescape_text (GMarkupParseContext *context,
        }
      else
        {
-                    set_unescape_error (context, error,
+          set_unescape_error (ucontext->context, error,
                              /* give line number of the & */
-                                        start, text_end,
+                              ucontext->entity_start, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Entity did not end with a semicolon; "
                                "most likely you used an ampersand "
@@ -518,12 +536,22 @@ unescape_text (GMarkupParseContext *context,
                                "an entity - escape ampersand as &amp;"));
        }
    }
-          }
+#undef MAX_ENT_LEN
          break;
-        case USTATE_AFTER_CHARREF_HASH:
+  return p;
-          {
+}
 __attribute__ ((noinline))
 static const gchar*
 unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
                                        const gchar     *p,
                                        GError         **error)
 {
  gboolean is_hex = FALSE;
  const char *start;
  start = ucontext->entity_start;
  if (*p == 'x')
    {
      is_hex = TRUE;
@@ -531,10 +559,10 @@ unescape_text (GMarkupParseContext *context,
      start = p;
    }
-            while (p != text_end && *p != ';')
+  while (p != ucontext->text_end && *p != ';')
    p = g_utf8_next_char (p);
-            if (p != text_end)
+  if (p != ucontext->text_end)
    {
      g_assert (*p == ';');
@@ -555,8 +583,8 @@ unescape_text (GMarkupParseContext *context,
          if (end != digit_end || errno != 0)
            {
-                        set_unescape_error (context, error,
+              set_unescape_error (ucontext->context, error,
-                                            start, text_end,
+                                  start, ucontext->text_end,
                                  G_MARKUP_ERROR_PARSE,
                                  _("Failed to parse '%s', which "
                                    "should have been a digit "
@@ -576,12 +604,12 @@ unescape_text (GMarkupParseContext *context,
                  (l >= 0x10000 && l <= 0x10FFFF))
                {
                  gchar buf[7];
-                            g_string_append (str, char_str (l, buf));
+                  g_string_append (ucontext->str, char_str (l, buf));
                }
              else
                {
-                            set_unescape_error (context, error,
+                  set_unescape_error (ucontext->context, error,
-                                                start, text_end,
+                                      start, ucontext->text_end,
                                      G_MARKUP_ERROR_PARSE,
                                      _("Character reference '%s' does not encode a permitted character"),
                                      digit);
@@ -592,13 +620,12 @@ unescape_text (GMarkupParseContext *context,
          /* Move to next state */
          p = g_utf8_next_char (p); /* past semicolon */
-                    start = p;
+          ucontext->state = USTATE_INSIDE_TEXT;
                    state = USTATE_INSIDE_TEXT;
        }
      else
        {
-                    set_unescape_error (context, error,
+          set_unescape_error (ucontext->context, error,
-                                        start, text_end,
+                              start, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Empty character reference; "
                                "should include a digit such as "
@@ -607,8 +634,8 @@ unescape_text (GMarkupParseContext *context,
    }
  else
    {
-                set_unescape_error (context, error,
+      set_unescape_error (ucontext->context, error,
-                                    start, text_end,
+                          start, ucontext->text_end,
                          G_MARKUP_ERROR_PARSE,
                          _("Character reference did not end with a "
                            "semicolon; "
@@ -616,6 +643,66 @@ unescape_text (GMarkupParseContext *context,
                            "character without intending to start "
                            "an entity - escape ampersand as &amp;"));
    }
  return p;
 }
 static gboolean
 unescape_text (GMarkupParseContext *context,
               const gchar         *text,
               const gchar         *text_end,
               GString            **unescaped,
               GError             **error)
 {
  UnescapeContext ucontext;
  const gchar *p;
  ucontext.context = context;
  ucontext.text = text;
  ucontext.text_end = text_end;
  ucontext.entity_start = NULL;
  ucontext.str = g_string_sized_new (text_end - text);
  ucontext.state = USTATE_INSIDE_TEXT;
  p = text;
  while (p != text_end && context->state != STATE_ERROR)
    {
      g_assert (p < text_end);
      switch (ucontext.state)
        {
        case USTATE_INSIDE_TEXT:
          {
            p = unescape_text_state_inside_text (&ucontext,
                                                 p,
                                                 error);
          }
          break;
        case USTATE_AFTER_AMPERSAND:
          {
            p = unescape_text_state_after_ampersand (&ucontext,
                                                     p,
                                                     error);
          }
          break;
        case USTATE_INSIDE_ENTITY_NAME:
          {
            p = unescape_text_state_inside_entity_name (&ucontext,
                                                        p,
                                                        error);
          }
          break;
        case USTATE_AFTER_CHARREF_HASH:
          {
            p = unescape_text_state_after_charref_hash (&ucontext,
                                                        p,
                                                        error);
          }
          break;
@@ -627,7 +714,7 @@ unescape_text (GMarkupParseContext *context,
  if (context->state != STATE_ERROR) 
    {
-      switch (state) 
+      switch (ucontext.state) 
 	{
 	case USTATE_INSIDE_TEXT:
 	  break;
@@ -649,31 +736,28 @@ unescape_text (GMarkupParseContext *context,
  if (context->state == STATE_ERROR)
    {
-      g_string_free (str, TRUE);
+      g_string_free (ucontext.str, TRUE);
      *unescaped = NULL;
      return FALSE;
    }
  else
    {
-      *unescaped = g_string_free (str, FALSE);
+      *unescaped = ucontext.str;
      return TRUE;
    }
 #undef MAX_ENT_LEN
 }
-static gboolean
+static inline gboolean
 advance_char (GMarkupParseContext *context)
 {  
  g_return_val_if_fail (context->iter != context->current_text_end, FALSE);
  context->iter = g_utf8_next_char (context->iter);
  context->char_number += 1;
  if (context->iter == context->current_text_end)
    {
      return FALSE;
-
+    }
-  if (*context->iter == '\n')
+  else if (*context->iter == '\n')
    {
      context->line_number += 1;
      context->char_number = 1;
@@ -682,7 +766,7 @@ advance_char (GMarkupParseContext *context)
  return TRUE;
 }
-static gboolean
+static inline gboolean
 xml_isspace (char c)
 {
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
@@ -716,7 +800,7 @@ add_to_partial (GMarkupParseContext *context,
                const gchar         *text_end)
 {
  if (context->partial_chunk == NULL)
-    context->partial_chunk = g_string_new (NULL);
+    context->partial_chunk = g_string_sized_new (text_end - text_start);
  if (text_start != text_end)
    g_string_append_len (context->partial_chunk, text_start,
@@ -750,23 +834,18 @@ current_attribute (GMarkupParseContext *context)
 static void
 find_current_text_end (GMarkupParseContext *context)
 {
-  /* This function must be safe (non-segfaulting) on invalid UTF8 */
+  /* This function must be safe (non-segfaulting) on invalid UTF8.
   * It assumes the string starts with a character start
   */
  const gchar *end = context->current_text + context->current_text_len;
  const gchar *p;
  const gchar *next;
  g_assert (context->current_text_len > 0);
-  p = context->current_text;
+  p = g_utf8_find_prev_char (context->current_text, end);
  next = g_utf8_find_next_char (p, end);
-  while (next && *next)
+  g_assert (p != NULL); /* since current_text was a char start */
    {
      if (p == next)
 	next++;
      p = next;
      next = g_utf8_find_next_char (p, end);
    }
  /* p is now the start of the last character or character portion. */
  g_assert (p != end);
@@ -1352,6 +1431,8 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
               * with the partial chunk if any; set it for the current
               * attribute.
               */
              GString *unescaped;
              add_to_partial (context, context->start, context->iter);
              g_assert (context->cur_attr >= 0);
@@ -1360,10 +1441,11 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
                                 context->partial_chunk->str,
                                 context->partial_chunk->str +
                                 context->partial_chunk->len,
-                                 &context->attr_values[context->cur_attr],
+                                 &unescaped,
                                 error))
                {
                  /* success, advance past quote and set state. */
                  context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
                  advance_char (context);
                  context->state = STATE_BETWEEN_ATTRIBUTES;
                  context->start = NULL;
@@ -1390,7 +1472,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
          if (context->iter != context->current_text_end)
            {
-              gchar *unescaped = NULL;
+              GString *unescaped = NULL;
              /* The text has ended at the open angle. Call the text
               * callback.
@@ -1407,12 +1489,12 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
                  if (context->parser->text)
                    (*context->parser->text) (context,
-                                              unescaped,
+                                              unescaped->str,
-                                              strlen (unescaped),
+                                              unescaped->len,
                                              context->user_data,
                                              &tmp_error);
-                  g_free (unescaped);
+                  g_string_free (unescaped, TRUE);
                  if (tmp_error == NULL)
                    {
@@ -1869,7 +1951,8 @@ g_markup_escape_text (const gchar *text,
  if (length < 0)
    length = strlen (text);
-  str = g_string_new (NULL);
+  /* prealloc at least as long as original text */
  str = g_string_sized_new (length);
  append_escaped_text (str, text, length);
  return g_string_free (str, FALSE);