guri: Normalize uri segments if they are encoded

This changes the decoder so that an encoded segment is normalized at parse time, which ensures it is valid and makes it easier to compare with other URIs.
2025-07-07 19:19:39 +02:00 · 2020-10-14 14:22:58 -05:00 · 2020-10-14 14:22:58 -05:00 · 482e10d3bb
commit 482e10d3bb
parent 63dfceedd2
2 changed files with 61 additions and 17 deletions
--- a/glib/guri.c
+++ b/glib/guri.c
@ -289,15 +289,16 @@ uri_decoder (gchar       **out,
             GUriError     parse_error,
             GError      **error)
 {
-  gchar *decoded, *d, c;
+  gchar c;
  GString *decoded;
  const gchar *invalid, *s, *end;
  gssize len;
  if (!(flags & G_URI_FLAGS_ENCODED))
    just_normalize = FALSE;
-  decoded = g_malloc (length + 1);
+  decoded = g_string_sized_new (length + 1);
-  for (s = start, end = s + length, d = decoded; s < end; s++)
+  for (s = start, end = s + length; s < end; s++)
    {
      if (*s == '%')
        {
@ -311,7 +312,7 @@ uri_decoder (gchar       **out,
                  g_set_error_literal (error, G_URI_ERROR, parse_error,
                                       /* xgettext: no-c-format */
                                       _("Invalid %-encoding in URI"));
-                  g_free (decoded);
+                  g_string_free (decoded, TRUE);
                  return -1;
                }
@ -319,7 +320,7 @@ uri_decoder (gchar       **out,
               * fix it to "%25", since that might change the way that
               * the URI's owner would interpret it.
               */
-              *d++ = *s;
+              g_string_append_c (decoded, *s);
              continue;
            }
@ -328,43 +329,50 @@ uri_decoder (gchar       **out,
            {
              g_set_error_literal (error, G_URI_ERROR, parse_error,
                                   _("Illegal character in URI"));
-              g_free (decoded);
+              g_string_free (decoded, TRUE);
              return -1;
            }
          if (just_normalize && !g_uri_char_is_unreserved (c))
            {
-              /* Leave the % sequence there. */
+              /* Leave the % sequence there but normalize it. */
-              *d++ = *s;
+              g_string_append_c (decoded, *s);
              g_string_append_c (decoded, g_ascii_toupper (s[1]));
              g_string_append_c (decoded, g_ascii_toupper (s[2]));
              s += 2;
            }
          else
            {
-              *d++ = c;
+              g_string_append_c (decoded, c);
              s += 2;
            }
        }
      else if (www_form && *s == '+')
-        *d++ = ' ';
+        g_string_append_c (decoded, ' ');
      /* Normalize any illegal characters */
      else if (just_normalize && (!g_ascii_isgraph (*s) ||
                                  (illegal_chars && strchr (illegal_chars, *s))))
        g_string_append_printf (decoded, "%%%02X", (guchar)*s);
      else
-        *d++ = *s;
+        g_string_append_c (decoded, *s);
    }
  *d = '\0';
-  len = d - decoded;
+  len = decoded->len;
  g_assert (len >= 0);
  if (!(flags & G_URI_FLAGS_ENCODED) &&
-      !g_utf8_validate (decoded, len, &invalid))
+      !g_utf8_validate (decoded->str, len, &invalid))
    {
      g_set_error_literal (error, G_URI_ERROR, parse_error,
                           _("Non-UTF-8 characters in URI"));
-      g_free (decoded);
+      g_string_free (decoded, TRUE);
      return -1;
    }
  if (out)
-    *out = g_steal_pointer (&decoded);
+    *out = g_string_free (decoded, FALSE);
  else
    g_string_free (decoded, TRUE);
  g_free (decoded);
  return len;
 }
--- a/glib/tests/uri.c
+++ b/glib/tests/uri.c
@ -1708,6 +1708,41 @@ test_uri_join_split_round_trip (void)
    }
 }
 /* Test vectors for test_uri_normalize(): each entry's uri is parsed with
  * g_uri_parse() using the given flags, and the path of the parsed URI is
  * compared against the expected, normalized path. */
 static const struct
 {
  /* Inputs */
  const gchar *uri;   /* URI string handed to g_uri_parse() */
  GUriFlags flags;    /* parse flags handed to g_uri_parse() */
  /* Outputs */
  const gchar *path;  /* expected result of g_uri_get_path() */
 } normalize_tests[] =
  {
    /* Literal spaces in an encoded URI are expected back as %20 */
    { "http://foo/path with spaces", G_URI_FLAGS_ENCODED,
      "/path%20with%20spaces" },
    /* Same expectation when only the path is flagged as encoded */
    { "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH,
      "/path%20with%20spaces%202" },
    /* Lowercase hex digits in a %-escape are expected back in uppercase */
    { "http://foo/%aa", G_URI_FLAGS_ENCODED,
      "/%AA" },
    /* Raw non-ASCII bytes (here UTF-8 for U+00E4) are expected back
     * %-encoded byte by byte */
    { "http://foo/p\xc3\xa4th/", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED,
      "/p%C3%A4th/" },
  };
 /* Walk the normalize_tests table: parse every input URI with its flags,
  * require that parsing succeeds, and check that the path reported by the
  * parsed URI matches the expected normalized path. */
 static void
 test_uri_normalize (void)
 {
  gsize i = 0;

  while (i < G_N_ELEMENTS (normalize_tests))
    {
      GUri *uri;

      /* No GError is requested; a parse failure shows up as a NULL result. */
      uri = g_uri_parse (normalize_tests[i].uri, normalize_tests[i].flags, NULL);
      g_assert_nonnull (uri);

      g_assert_cmpstr (g_uri_get_path (uri), ==, normalize_tests[i].path);

      g_uri_unref (uri);
      i++;
    }
 }
 int
 main (int   argc,
      char *argv[])
@ -1733,6 +1768,7 @@ main (int   argc,
  g_test_add_func ("/uri/to-string", test_uri_to_string);
  g_test_add_func ("/uri/join", test_uri_join);
  g_test_add_func ("/uri/join-split-round-trip", test_uri_join_split_round_trip);
  g_test_add_func ("/uri/normalize", test_uri_normalize);
  g_test_add_data_func ("/uri/iter-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_iter_params);
  g_test_add_data_func ("/uri/iter-params/length", GINT_TO_POINTER (FALSE), test_uri_iter_params);
  g_test_add_data_func ("/uri/parse-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_parse_params);