Optimizations; don't scan the entire text in find_current_text_end(), split unescape_text() into multiple functions.

2004-11-28 Matthias Clasen <mclasen@redhat.com> * glib/gmarkup.c: Optimizations; don't scan the entire text in find_current_text_end(), split unescape_text() into multiple functions. (#159001, Havoc Pennington)
2025-11-23 10:49:02 +01:00 · 2004-11-28 05:40:10 +00:00
parent c77ae3942c
commit 86c78552ec
6 changed files with 410 additions and 297 deletions
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2004-11-28  Matthias Clasen  <mclasen@redhat.com>
+
+	* glib/gmarkup.c: Optimizations; don't scan the entire text
+	in find_current_text_end(), split unescape_text() into multiple
+	functions.  (#159001, Havoc Pennington)
+
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>

 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/ChangeLog.pre-2-10
+++ b/ChangeLog.pre-2-10
@@ -1,3 +1,9 @@
+2004-11-28  Matthias Clasen  <mclasen@redhat.com>
+
+	* glib/gmarkup.c: Optimizations; don't scan the entire text
+	in find_current_text_end(), split unescape_text() into multiple
+	functions.  (#159001, Havoc Pennington)
+
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>

 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/ChangeLog.pre-2-12
+++ b/ChangeLog.pre-2-12
@@ -1,3 +1,9 @@
+2004-11-28  Matthias Clasen  <mclasen@redhat.com>
+
+	* glib/gmarkup.c: Optimizations; don't scan the entire text
+	in find_current_text_end(), split unescape_text() into multiple
+	functions.  (#159001, Havoc Pennington)
+
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>

 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/ChangeLog.pre-2-6
+++ b/ChangeLog.pre-2-6
@@ -1,3 +1,9 @@
+2004-11-28  Matthias Clasen  <mclasen@redhat.com>
+
+	* glib/gmarkup.c: Optimizations; don't scan the entire text
+	in find_current_text_end(), split unescape_text() into multiple
+	functions.  (#159001, Havoc Pennington)
+
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>

 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/ChangeLog.pre-2-8
+++ b/ChangeLog.pre-2-8
@@ -1,3 +1,9 @@
+2004-11-28  Matthias Clasen  <mclasen@redhat.com>
+
+	* glib/gmarkup.c: Optimizations; don't scan the entire text
+	in find_current_text_end(), split unescape_text() into multiple
+	functions.  (#159001, Havoc Pennington)
+
 2004-11-27  Matthias Clasen  <mclasen@redhat.com>

 	* glib/gspawn.c (g_spawn_async_with_pipes): Update the @flags
--- a/glib/gmarkup.c
+++ b/glib/gmarkup.c
@@ -235,12 +235,24 @@ set_error (GMarkupParseContext *context,
  g_propagate_error (error, tmp_error);
 }

+
+/* To make these faster, we first use the ascii-only tests, then check
+ * for the usual non-alnum name-end chars, and only then call the
+ * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute
+ * names, so this is a reasonable hack that virtually always avoids
+ * the guniprop call.
+ */
+#define IS_COMMON_NAME_END_CHAR(c) \
+  ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
+
 static gboolean
 is_name_start_char (gunichar c)
 {
-  if (g_unichar_isalpha (c) ||
+  if (g_ascii_isalpha (c) ||
+      (!IS_COMMON_NAME_END_CHAR (c) &&
+       (g_unichar_isalpha (c) ||
        c == '_' ||
-      c == ':')
+        c == ':')))
    return TRUE;
  else
    return FALSE;
@@ -249,11 +261,13 @@ is_name_start_char (gunichar c)
 static gboolean
 is_name_char (gunichar c)
 {
-  if (g_unichar_isalnum (c) ||
+  if (g_ascii_isalnum (c) ||
+      (!IS_COMMON_NAME_END_CHAR (c) &&
+       (g_unichar_isalnum (c) ||
        c == '.' ||
        c == '-' ||
        c == '_' ||
-      c == ':')
+        c == ':')))
    return TRUE;
  else
    return FALSE;
@@ -326,54 +340,51 @@ typedef enum
  USTATE_AFTER_CHARREF_HASH
 } UnescapeState;

-static gboolean
-unescape_text (GMarkupParseContext *context,
-               const gchar         *text,
-               const gchar         *text_end,
-               gchar              **unescaped,
+typedef struct
+{
+  GMarkupParseContext *context;
+  GString *str;
+  UnescapeState state;
+  const gchar *text;
+  const gchar *text_end;
+  const gchar *entity_start;
+} UnescapeContext;
+
+static const gchar*
+unescape_text_state_inside_text (UnescapeContext *ucontext,
+                                 const gchar     *p,
                                 GError         **error)
 {
-#define MAX_ENT_LEN 5
-  GString *str;
-  const gchar *p;
-  UnescapeState state;
  const gchar *start;
  gboolean normalize_attribute;

-  str = g_string_new (NULL);
-
-  if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
-      context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
+  if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
+      ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
    normalize_attribute = TRUE;
  else
    normalize_attribute = FALSE;

-  state = USTATE_INSIDE_TEXT;
-  p = text;
  start = p;
-  while (p != text_end && context->state != STATE_ERROR)
-    {
-      g_assert (p < text_end);
  
-      switch (state)
+  while (p != ucontext->text_end)
    {
-        case USTATE_INSIDE_TEXT:
+      if (*p == '&')
        {
-            while (p != text_end && *p != '&')
+          break;
+        }
+      else if (normalize_attribute && (*p == '\t' || *p == '\n'))
        {
-		if ((*p == '\t' || *p == '\n') && normalize_attribute)
-		  {
-		    g_string_append_len (str, start, p - start);
-		    g_string_append_c (str, ' ');
+          g_string_append_len (ucontext->str, start, p - start);
+          g_string_append_c (ucontext->str, ' ');
          p = g_utf8_next_char (p);
          start = p;
        }
      else if (*p == '\r')
        {
-		    g_string_append_len (str, start, p - start);
-		    g_string_append_c (str, normalize_attribute ? ' ' : '\n');
+          g_string_append_len (ucontext->str, start, p - start);
+          g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
          p = g_utf8_next_char (p);
-		    if (*p == '\n')
+          if (p != ucontext->text_end && *p == '\n')
            p = g_utf8_next_char (p);
          start = p;
        }
@@ -382,35 +393,38 @@ unescape_text (GMarkupParseContext *context,
    }
  
  if (p != start)
-              {
-                g_string_append_len (str, start, p - start);
+    g_string_append_len (ucontext->str, start, p - start);
  
-                start = NULL;
-              }
-            
-            if (p != text_end && *p == '&')
+  if (p != ucontext->text_end && *p == '&')
    {
      p = g_utf8_next_char (p);
-                state = USTATE_AFTER_AMPERSAND;
+      ucontext->state = USTATE_AFTER_AMPERSAND;
    }
-          }
-          break;

-        case USTATE_AFTER_AMPERSAND:
+  return p;
+}
+
+__attribute__ ((noinline))
+static const gchar*
+unescape_text_state_after_ampersand (UnescapeContext *ucontext,
+                                     const gchar     *p,
+                                     GError         **error)
 {
+  ucontext->entity_start = NULL;
+  
  if (*p == '#')
    {
      p = g_utf8_next_char (p);

-                start = p;
-                state = USTATE_AFTER_CHARREF_HASH;
+      ucontext->entity_start = p;
+      ucontext->state = USTATE_AFTER_CHARREF_HASH;
    }
  else if (!is_name_start_char (g_utf8_get_char (p)))
    {
      if (*p == ';')
        {
-                    set_unescape_error (context, error,
-                                        p, text_end,
+          set_unescape_error (ucontext->context, error,
+                              p, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Empty entity '&;' seen; valid "
                                "entities are: &amp; &quot; &lt; &gt; &apos;"));
@@ -419,8 +433,8 @@ unescape_text (GMarkupParseContext *context,
        {
          gchar buf[7];

-                    set_unescape_error (context, error,
-                                        p, text_end,
+          set_unescape_error (ucontext->context, error,
+                              p, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Character '%s' is not valid at "
                                "the start of an entity name; "
@@ -433,21 +447,26 @@ unescape_text (GMarkupParseContext *context,
    }
  else
    {
-                start = p;
-                state = USTATE_INSIDE_ENTITY_NAME;
+      ucontext->entity_start = p;
+      ucontext->state = USTATE_INSIDE_ENTITY_NAME;
    }
+
+  return p;
 }
-          break;

-
-        case USTATE_INSIDE_ENTITY_NAME:
+__attribute__ ((noinline))
+static const gchar*
+unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
+                                        const gchar     *p,
+                                        GError         **error)
 {
+#define MAX_ENT_LEN 5
  gchar buf[MAX_ENT_LEN+1] = {
    '\0', '\0', '\0', '\0', '\0', '\0'
  };
  gchar *dest;

-            while (p != text_end)
+  while (p != ucontext->text_end)
    {
      if (*p == ';')
        break;
@@ -455,8 +474,8 @@ unescape_text (GMarkupParseContext *context,
        {
          gchar ubuf[7];

-                    set_unescape_error (context, error,
-                                        p, text_end,
+          set_unescape_error (ucontext->context, error,
+                              p, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Character '%s' is not valid "
                                "inside an entity name"),
@@ -467,13 +486,13 @@ unescape_text (GMarkupParseContext *context,
      p = g_utf8_next_char (p);
    }

-            if (context->state != STATE_ERROR)
+  if (ucontext->context->state != STATE_ERROR)
    {
-                if (p != text_end)
+      if (p != ucontext->text_end)
        {
          const gchar *src;
                
-                    src = start;
+          src = ucontext->entity_start;
          dest = buf;
          while (src != p)
            {
@@ -484,23 +503,22 @@ unescape_text (GMarkupParseContext *context,

          /* move to after semicolon */
          p = g_utf8_next_char (p);
-                    start = p;
-                    state = USTATE_INSIDE_TEXT;
+          ucontext->state = USTATE_INSIDE_TEXT;

          if (strcmp (buf, "lt") == 0)
-                      g_string_append_c (str, '<');
+            g_string_append_c (ucontext->str, '<');
          else if (strcmp (buf, "gt") == 0)
-                      g_string_append_c (str, '>');
+            g_string_append_c (ucontext->str, '>');
          else if (strcmp (buf, "amp") == 0)
-                      g_string_append_c (str, '&');
+            g_string_append_c (ucontext->str, '&');
          else if (strcmp (buf, "quot") == 0)
-                      g_string_append_c (str, '"');
+            g_string_append_c (ucontext->str, '"');
          else if (strcmp (buf, "apos") == 0)
-                      g_string_append_c (str, '\'');
+            g_string_append_c (ucontext->str, '\'');
          else
            {
-                        set_unescape_error (context, error,
-                                            p, text_end,
+              set_unescape_error (ucontext->context, error,
+                                  p, ucontext->text_end,
                                  G_MARKUP_ERROR_PARSE,
                                  _("Entity name '%s' is not known"),
                                  buf);
@@ -508,9 +526,9 @@ unescape_text (GMarkupParseContext *context,
        }
      else
        {
-                    set_unescape_error (context, error,
+          set_unescape_error (ucontext->context, error,
                              /* give line number of the & */
-                                        start, text_end,
+                              ucontext->entity_start, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Entity did not end with a semicolon; "
                                "most likely you used an ampersand "
@@ -518,12 +536,22 @@ unescape_text (GMarkupParseContext *context,
                                "an entity - escape ampersand as &amp;"));
        }
    }
-          }
-          break;
+#undef MAX_ENT_LEN

-        case USTATE_AFTER_CHARREF_HASH:
+  return p;
+}
+
+__attribute__ ((noinline))
+static const gchar*
+unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
+                                        const gchar     *p,
+                                        GError         **error)
 {
  gboolean is_hex = FALSE;
+  const char *start;
+
+  start = ucontext->entity_start;
+
  if (*p == 'x')
    {
      is_hex = TRUE;
@@ -531,10 +559,10 @@ unescape_text (GMarkupParseContext *context,
      start = p;
    }

-            while (p != text_end && *p != ';')
+  while (p != ucontext->text_end && *p != ';')
    p = g_utf8_next_char (p);

-            if (p != text_end)
+  if (p != ucontext->text_end)
    {
      g_assert (*p == ';');

@@ -555,8 +583,8 @@ unescape_text (GMarkupParseContext *context,

          if (end != digit_end || errno != 0)
            {
-                        set_unescape_error (context, error,
-                                            start, text_end,
+              set_unescape_error (ucontext->context, error,
+                                  start, ucontext->text_end,
                                  G_MARKUP_ERROR_PARSE,
                                  _("Failed to parse '%s', which "
                                    "should have been a digit "
@@ -576,12 +604,12 @@ unescape_text (GMarkupParseContext *context,
                  (l >= 0x10000 && l <= 0x10FFFF))
                {
                  gchar buf[7];
-                            g_string_append (str, char_str (l, buf));
+                  g_string_append (ucontext->str, char_str (l, buf));
                }
              else
                {
-                            set_unescape_error (context, error,
-                                                start, text_end,
+                  set_unescape_error (ucontext->context, error,
+                                      start, ucontext->text_end,
                                      G_MARKUP_ERROR_PARSE,
                                      _("Character reference '%s' does not encode a permitted character"),
                                      digit);
@@ -592,13 +620,12 @@ unescape_text (GMarkupParseContext *context,

          /* Move to next state */
          p = g_utf8_next_char (p); /* past semicolon */
-                    start = p;
-                    state = USTATE_INSIDE_TEXT;
+          ucontext->state = USTATE_INSIDE_TEXT;
        }
      else
        {
-                    set_unescape_error (context, error,
-                                        start, text_end,
+          set_unescape_error (ucontext->context, error,
+                              start, ucontext->text_end,
                              G_MARKUP_ERROR_PARSE,
                              _("Empty character reference; "
                                "should include a digit such as "
@@ -607,8 +634,8 @@ unescape_text (GMarkupParseContext *context,
    }
  else
    {
-                set_unescape_error (context, error,
-                                    start, text_end,
+      set_unescape_error (ucontext->context, error,
+                          start, ucontext->text_end,
                          G_MARKUP_ERROR_PARSE,
                          _("Character reference did not end with a "
                            "semicolon; "
@@ -616,6 +643,66 @@ unescape_text (GMarkupParseContext *context,
                            "character without intending to start "
                            "an entity - escape ampersand as &amp;"));
    }
+
+  return p;
+}
+
+static gboolean
+unescape_text (GMarkupParseContext *context,
+               const gchar         *text,
+               const gchar         *text_end,
+               GString            **unescaped,
+               GError             **error)
+{
+  UnescapeContext ucontext;
+  const gchar *p;
+
+  ucontext.context = context;
+  ucontext.text = text;
+  ucontext.text_end = text_end;
+  ucontext.entity_start = NULL;
+  
+  ucontext.str = g_string_sized_new (text_end - text);
+
+  ucontext.state = USTATE_INSIDE_TEXT;
+  p = text;
+
+  while (p != text_end && context->state != STATE_ERROR)
+    {
+      g_assert (p < text_end);
+      
+      switch (ucontext.state)
+        {
+        case USTATE_INSIDE_TEXT:
+          {
+            p = unescape_text_state_inside_text (&ucontext,
+                                                 p,
+                                                 error);
+          }
+          break;
+
+        case USTATE_AFTER_AMPERSAND:
+          {
+            p = unescape_text_state_after_ampersand (&ucontext,
+                                                     p,
+                                                     error);
+          }
+          break;
+
+
+        case USTATE_INSIDE_ENTITY_NAME:
+          {
+            p = unescape_text_state_inside_entity_name (&ucontext,
+                                                        p,
+                                                        error);
+          }
+          break;
+
+        case USTATE_AFTER_CHARREF_HASH:
+          {
+            p = unescape_text_state_after_charref_hash (&ucontext,
+                                                        p,
+                                                        error);
          }
          break;

@@ -627,7 +714,7 @@ unescape_text (GMarkupParseContext *context,

  if (context->state != STATE_ERROR) 
    {
-      switch (state) 
+      switch (ucontext.state) 
 	{
 	case USTATE_INSIDE_TEXT:
 	  break;
@@ -649,31 +736,28 @@ unescape_text (GMarkupParseContext *context,

  if (context->state == STATE_ERROR)
    {
-      g_string_free (str, TRUE);
+      g_string_free (ucontext.str, TRUE);
      *unescaped = NULL;
      return FALSE;
    }
  else
    {
-      *unescaped = g_string_free (str, FALSE);
+      *unescaped = ucontext.str;
      return TRUE;
    }
-
-#undef MAX_ENT_LEN
 }

-static gboolean
+static inline gboolean
 advance_char (GMarkupParseContext *context)
 {  
-  g_return_val_if_fail (context->iter != context->current_text_end, FALSE);
-
  context->iter = g_utf8_next_char (context->iter);
  context->char_number += 1;

  if (context->iter == context->current_text_end)
+    {
      return FALSE;
-
-  if (*context->iter == '\n')
+    }
+  else if (*context->iter == '\n')
    {
      context->line_number += 1;
      context->char_number = 1;
@@ -682,7 +766,7 @@ advance_char (GMarkupParseContext *context)
  return TRUE;
 }

-static gboolean
+static inline gboolean
 xml_isspace (char c)
 {
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
@@ -716,7 +800,7 @@ add_to_partial (GMarkupParseContext *context,
                const gchar         *text_end)
 {
  if (context->partial_chunk == NULL)
-    context->partial_chunk = g_string_new (NULL);
+    context->partial_chunk = g_string_sized_new (text_end - text_start);

  if (text_start != text_end)
    g_string_append_len (context->partial_chunk, text_start,
@@ -750,23 +834,18 @@ current_attribute (GMarkupParseContext *context)
 static void
 find_current_text_end (GMarkupParseContext *context)
 {
-  /* This function must be safe (non-segfaulting) on invalid UTF8 */
+  /* This function must be safe (non-segfaulting) on invalid UTF8.
+   * It assumes the string starts with a character start
+   */
  const gchar *end = context->current_text + context->current_text_len;
  const gchar *p;
  const gchar *next;

  g_assert (context->current_text_len > 0);

-  p = context->current_text;
-  next = g_utf8_find_next_char (p, end);
+  p = g_utf8_find_prev_char (context->current_text, end);

-  while (next && *next)
-    {
-      if (p == next)
-	next++;
-      p = next;
-      next = g_utf8_find_next_char (p, end);
-    }
+  g_assert (p != NULL); /* since current_text was a char start */

  /* p is now the start of the last character or character portion. */
  g_assert (p != end);
@@ -1352,6 +1431,8 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
               * with the partial chunk if any; set it for the current
               * attribute.
               */
+              GString *unescaped;
+              
              add_to_partial (context, context->start, context->iter);

              g_assert (context->cur_attr >= 0);
@@ -1360,10 +1441,11 @@ g_markup_parse_context_parse (GMarkupParseContext *context,
                                 context->partial_chunk->str,
                                 context->partial_chunk->str +
                                 context->partial_chunk->len,
-                                 &context->attr_values[context->cur_attr],
+                                 &unescaped,
                                 error))
                {
                  /* success, advance past quote and set state. */
+                  context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
                  advance_char (context);
                  context->state = STATE_BETWEEN_ATTRIBUTES;
                  context->start = NULL;
@@ -1390,7 +1472,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context,

          if (context->iter != context->current_text_end)
            {
-              gchar *unescaped = NULL;
+              GString *unescaped = NULL;

              /* The text has ended at the open angle. Call the text
               * callback.
@@ -1407,12 +1489,12 @@ g_markup_parse_context_parse (GMarkupParseContext *context,

                  if (context->parser->text)
                    (*context->parser->text) (context,
-                                              unescaped,
-                                              strlen (unescaped),
+                                              unescaped->str,
+                                              unescaped->len,
                                              context->user_data,
                                              &tmp_error);
                  
-                  g_free (unescaped);
+                  g_string_free (unescaped, TRUE);

                  if (tmp_error == NULL)
                    {
@@ -1869,7 +1951,8 @@ g_markup_escape_text (const gchar *text,
  if (length < 0)
    length = strlen (text);

-  str = g_string_new (NULL);
+  /* prealloc at least as long as original text */
+  str = g_string_sized_new (length);
  append_escaped_text (str, text, length);

  return g_string_free (str, FALSE);