Merge branch 'wip/tingping/guri-normalize' into 'master'

guri: Normalize URI segments if they are encoded and add a flag to do scheme-based normalization. See merge request GNOME/glib!1716
2025-01-12 07:26:15 +01:00 · 2020-11-09 11:43:42 +00:00 · 2020-11-09 11:43:42 +00:00 · b8927cc6ad
commit b8927cc6ad
parent c1a11c02e5 9da213ea34
3 changed files with 188 additions and 23 deletions
--- a/glib/guri.c
+++ b/glib/guri.c
@ -132,11 +132,12 @@
 *
 * Note that there is no `g_uri_equal ()` function, because comparing
 * URIs usefully requires scheme-specific knowledge that #GUri does
- * not have. For example, `http://example.com/` and
+ * not have. #GUri can help with normalization if you use the various
- * `http://EXAMPLE.COM:80` have exactly the same meaning according
+ * encoded #GUriFlags as well as %G_URI_FLAGS_SCHEME_NORMALIZE; however,
- * to the HTTP specification, and `data:,foo` and
+ * it is not comprehensive.
- * `data:;base64,Zm9v` resolve to the same thing according to the
+ * For example, `data:,foo` and `data:;base64,Zm9v` resolve to the same
- * `data:` URI specification.
+ * thing according to the `data:` URI specification which GLib does not
 * handle.
 *
 * Since: 2.66
 */
@ -289,15 +290,16 @@ uri_decoder (gchar       **out,
             GUriError     parse_error,
             GError      **error)
 {
-  gchar *decoded, *d, c;
+  gchar c;
  GString *decoded;
  const gchar *invalid, *s, *end;
  gssize len;
  if (!(flags & G_URI_FLAGS_ENCODED))
    just_normalize = FALSE;
-  decoded = g_malloc (length + 1);
+  decoded = g_string_sized_new (length + 1);
-  for (s = start, end = s + length, d = decoded; s < end; s++)
+  for (s = start, end = s + length; s < end; s++)
    {
      if (*s == '%')
        {
@ -311,7 +313,7 @@ uri_decoder (gchar       **out,
                  g_set_error_literal (error, G_URI_ERROR, parse_error,
                                       /* xgettext: no-c-format */
                                       _("Invalid %-encoding in URI"));
-                  g_free (decoded);
+                  g_string_free (decoded, TRUE);
                  return -1;
                }
@ -319,7 +321,7 @@ uri_decoder (gchar       **out,
               * fix it to "%25", since that might change the way that
               * the URI's owner would interpret it.
               */
-              *d++ = *s;
+              g_string_append_c (decoded, *s);
              continue;
            }
@ -328,43 +330,49 @@ uri_decoder (gchar       **out,
            {
              g_set_error_literal (error, G_URI_ERROR, parse_error,
                                   _("Illegal character in URI"));
-              g_free (decoded);
+              g_string_free (decoded, TRUE);
              return -1;
            }
          if (just_normalize && !g_uri_char_is_unreserved (c))
            {
-              /* Leave the % sequence there. */
+              /* Leave the % sequence there but normalize it. */
-              *d++ = *s;
+              g_string_append_c (decoded, *s);
              g_string_append_c (decoded, g_ascii_toupper (s[1]));
              g_string_append_c (decoded, g_ascii_toupper (s[2]));
              s += 2;
            }
          else
            {
-              *d++ = c;
+              g_string_append_c (decoded, c);
              s += 2;
            }
        }
      else if (www_form && *s == '+')
-        *d++ = ' ';
+        g_string_append_c (decoded, ' ');
      /* Normalize any illegal characters. */
      else if (just_normalize && (!g_ascii_isgraph (*s)))
        g_string_append_printf (decoded, "%%%02X", (guchar)*s);
      else
-        *d++ = *s;
+        g_string_append_c (decoded, *s);
    }
-  *d = '\0';
-  len = d - decoded;
+  len = decoded->len;
  g_assert (len >= 0);
  if (!(flags & G_URI_FLAGS_ENCODED) &&
-      !g_utf8_validate (decoded, len, &invalid))
+      !g_utf8_validate (decoded->str, len, &invalid))
    {
      g_set_error_literal (error, G_URI_ERROR, parse_error,
                           _("Non-UTF-8 characters in URI"));
-      g_free (decoded);
+      g_string_free (decoded, TRUE);
      return -1;
    }
  if (out)
-    *out = g_steal_pointer (&decoded);
+    *out = g_string_free (decoded, FALSE);
  else
    g_string_free (decoded, TRUE);
-  g_free (decoded);
  return len;
 }
@ -740,6 +748,52 @@ uri_cleanup (const gchar *uri_string)
  return g_string_free (copy, FALSE);
 }
 /* Returns TRUE when @scheme is exactly one of "https", "http", "wss" or
  * "ws" (compared case-sensitively with strcmp), FALSE otherwise.
  *
  * @scheme must not be NULL: it is passed straight to strcmp().
  */
 static gboolean
 should_normalize_empty_path (const char *scheme)
 {
  /* NULL-terminated so the list can be walked without an element count. */
  const char * const slash_schemes[] = { "https", "http", "wss", "ws", NULL };
  const char * const *candidate;

  for (candidate = slash_schemes; *candidate != NULL; candidate++)
    {
      if (strcmp (*candidate, scheme) == 0)
        return TRUE;
    }

  return FALSE;
 }
 /* Returns -1 when @port is the default port recorded below for @scheme
  * (ftp: 21, http and ws: 80, https and wss: 443); otherwise returns
  * @port unchanged.
  *
  * @scheme is only compared (case-sensitively) when @port is one of the
  * known default ports, so any other port is returned without looking at
  * @scheme at all.
  */
 static int
 normalize_port (const char *scheme,
                 int         port)
 {
  /* One row per (scheme, default port) pair; the NULL name ends the table. */
  static const struct
  {
    const char *name;
    int default_port;
  } known_defaults[] = {
    { "ftp", 21 },
    { "http", 80 },
    { "ws", 80 },
    { "https", 443 },
    { "wss", 443 },
    { NULL, 0 }
  };
  int idx;

  for (idx = 0; known_defaults[idx].name != NULL; idx++)
    {
      /* Check the port first so @scheme is never touched for other ports. */
      if (known_defaults[idx].default_port == port &&
          strcmp (scheme, known_defaults[idx].name) == 0)
        return -1;
    }

  return port;
 }
 static gboolean
 g_uri_split_internal (const gchar  *uri_string,
                      GUriFlags     flags,
@ -758,6 +812,7 @@ g_uri_split_internal (const gchar  *uri_string,
  const gchar *end, *colon, *at, *path_start, *semi, *question;
  const gchar *p, *bracket, *hostend;
  gchar *cleaned_uri_string = NULL;
  gchar *normalized_scheme = NULL;
  if (scheme)
    *scheme = NULL;
@ -795,8 +850,9 @@ g_uri_split_internal (const gchar  *uri_string,
  if (p > uri_string && *p == ':')
    {
      normalized_scheme = g_ascii_strdown (uri_string, p - uri_string);
      if (scheme)
-        *scheme = g_ascii_strdown (uri_string, p - uri_string);
+        *scheme = g_steal_pointer (&normalized_scheme);
      p++;
    }
  else
@ -922,6 +978,22 @@ g_uri_split_internal (const gchar  *uri_string,
                      G_URI_ERROR_BAD_PATH, error))
    goto fail;
  /* Scheme-based normalization */
  if (flags & G_URI_FLAGS_SCHEME_NORMALIZE && ((scheme && *scheme) || normalized_scheme))
    {
      const char *scheme_str = scheme && *scheme ? *scheme : normalized_scheme;
      if (should_normalize_empty_path (scheme_str) && path && !**path)
        {
          g_free (*path);
          *path = g_strdup ("/");
        }
      if (port && *port != -1)
        *port = normalize_port (scheme_str, *port);
    }
  g_free (normalized_scheme);
  g_free (cleaned_uri_string);
  return TRUE;
@ -941,6 +1013,7 @@ g_uri_split_internal (const gchar  *uri_string,
  if (fragment)
    g_clear_pointer (fragment, g_free);
  g_free (normalized_scheme);
  g_free (cleaned_uri_string);
  return FALSE;
 }
@ -1394,6 +1467,19 @@ g_uri_parse_relative (GUri         *base_uri,
              uri->port = base_uri->port;
            }
        }
      /* Scheme normalization couldn't have been done earlier
       * as the relative URI may not have had a scheme */
      if (flags & G_URI_FLAGS_SCHEME_NORMALIZE)
        {
          if (should_normalize_empty_path (uri->scheme) && !*uri->path)
            {
              g_free (uri->path);
              uri->path = g_strdup ("/");
            }
          uri->port = normalize_port (uri->scheme, uri->port);
        }
    }
  return g_steal_pointer (&uri);
--- a/glib/guri.h
+++ b/glib/guri.h
@ -62,6 +62,10 @@ void         g_uri_unref            (GUri *uri);
 * @G_URI_FLAGS_ENCODED_PATH: Same as %G_URI_FLAGS_ENCODED, for the path only.
 * @G_URI_FLAGS_ENCODED_FRAGMENT: Same as %G_URI_FLAGS_ENCODED, for the
 *     fragment only.
 * @G_URI_FLAGS_SCHEME_NORMALIZE: Applies scheme-based normalization to the
 *     parsed URI. For example, when parsing an HTTP URI, empty paths are
 *     changed to `/` and port `80` is changed to `-1`. Only a subset of
 *     known schemes is supported. (Since: 2.68)
 *
 * Flags that describe a URI.
 *
@ -83,6 +87,7 @@ typedef enum {
  G_URI_FLAGS_ENCODED_QUERY   = 1 << 5,
  G_URI_FLAGS_ENCODED_PATH    = 1 << 6,
  G_URI_FLAGS_ENCODED_FRAGMENT = 1 << 7,
  G_URI_FLAGS_SCHEME_NORMALIZE = 1 << 8,
 } GUriFlags;
 GLIB_AVAILABLE_IN_2_66
--- a/glib/tests/uri.c
+++ b/glib/tests/uri.c
@ -1708,6 +1708,79 @@ test_uri_join_split_round_trip (void)
    }
 }
 /* Table of cases for test_uri_normalize(). Each entry gives a URI
  * reference (optionally resolved against a base URI), the flags used for
  * parsing, and the path and port expected on the resulting GUri.
  */
 static const struct
 {
  /* Inputs */
  const gchar *base;   /* base URI to resolve against, or NULL for none */
  const gchar *uri;    /* URI (or relative reference) under test */
  GUriFlags flags;     /* flags used for parsing both base and uri */
  /* Outputs */
  const gchar *path;   /* expected g_uri_get_path() result */
  int port;            /* expected g_uri_get_port() result (-1 = no port) */
 } normalize_tests[] =
  {
    /* Encoded flags: spaces and non-ASCII bytes in the path come back
     * percent-encoded, and lowercase hex digits in an existing escape
     * come back uppercased. */
    { NULL, "http://foo/path with spaces", G_URI_FLAGS_ENCODED,
      "/path%20with%20spaces", -1 },
    { NULL, "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH,
      "/path%20with%20spaces%202", -1 },
    { NULL, "http://foo/%aa", G_URI_FLAGS_ENCODED,
      "/%AA", -1 },
    { NULL, "http://foo/p\xc3\xa4th/", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED,
      "/p%C3%A4th/", -1 },
    /* Scheme normalization on absolute URIs: an empty path becomes "/"
     * for http/https, and a scheme's default port is reported as -1.
     * ftp only gets the port treatment; an unknown scheme is untouched. */
    { NULL, "http://foo", G_URI_FLAGS_SCHEME_NORMALIZE,
      "/", -1 },
    { NULL, "nothttp://foo", G_URI_FLAGS_SCHEME_NORMALIZE,
      "", -1 },
    { NULL, "http://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE,
      "/", -1 },
    { NULL, "https://foo:443", G_URI_FLAGS_SCHEME_NORMALIZE,
      "/", -1 },
    { NULL, "ftp://foo:21", G_URI_FLAGS_SCHEME_NORMALIZE,
      "", -1 },
    { NULL, "nothttp://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE,
      "", 80 },
    /* Scheme-less references resolved against a base: the expected
     * results follow the scheme supplied by the base URI. */
    { "http://foo", "//bar", G_URI_FLAGS_SCHEME_NORMALIZE,
      "/", -1 },
    { "http://foo", "//bar:80", G_URI_FLAGS_SCHEME_NORMALIZE,
      "/", -1 },
    { "nothttp://foo", "//bar:80", G_URI_FLAGS_SCHEME_NORMALIZE,
      "", 80 },
    /* Without the flag, the empty path is left as-is. */
    { "http://foo", "//bar", 0,
      "", -1 },
  };
 /* Runs every entry of normalize_tests: parses the entry's URI (relative to
  * its base when one is given) with the entry's flags and checks the
  * resulting path and port. Finishes with a one-off g_uri_split() check.
  */
 static void
 test_uri_normalize (void)
 {
  gsize i;
  int port;
  for (i = 0; i < G_N_ELEMENTS (normalize_tests); ++i)
    {
      GUri *uri, *base = NULL;
      /* The base, when present, is parsed with the same flags as the URI
       * under test. */
      if (normalize_tests[i].base)
        base = g_uri_parse (normalize_tests[i].base, normalize_tests[i].flags, NULL);
      /* base stays NULL for entries without one. */
      uri = g_uri_parse_relative (base,
                                  normalize_tests[i].uri,
                                  normalize_tests[i].flags,
                                  NULL);
      g_assert_nonnull (uri);
      g_assert_cmpstr (g_uri_get_path (uri), ==, normalize_tests[i].path);
      g_assert_cmpint (g_uri_get_port (uri), ==, normalize_tests[i].port);
      g_uri_unref (uri);
      if (base)
        g_uri_unref (base);
    }
  /* One off testing a codepath where scheme is NULL but internally we still normalize it. */
  /* Only the port out-argument is requested; the uppercase "HTTP" scheme
   * is still expected to have its default port 80 reported as -1. */
  g_assert_true (g_uri_split ("HTTP://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE,
                              NULL, NULL, NULL, &port, NULL, NULL, NULL, NULL));
  g_assert_cmpint (port, ==, -1);
 }
 int
 main (int   argc,
      char *argv[])
@ -1733,6 +1806,7 @@ main (int   argc,
  g_test_add_func ("/uri/to-string", test_uri_to_string);
  g_test_add_func ("/uri/join", test_uri_join);
  g_test_add_func ("/uri/join-split-round-trip", test_uri_join_split_round_trip);
  g_test_add_func ("/uri/normalize", test_uri_normalize);
  g_test_add_data_func ("/uri/iter-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_iter_params);
  g_test_add_data_func ("/uri/iter-params/length", GINT_TO_POINTER (FALSE), test_uri_iter_params);
  g_test_add_data_func ("/uri/parse-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_parse_params);