From 482e10d3bbbdef4300144f3e82bba448140b520d Mon Sep 17 00:00:00 2001
From: Patrick Griffis <pgriffis@igalia.com>
Date: Wed, 14 Oct 2020 14:22:58 -0500
Subject: [PATCH 1/3] guri: Normalize uri segments if they are encoded

This changes it so when a segment is encoded it will be
normalized at parse time which ensures its valid and
it can more easily be compared with other uris.
---
 glib/guri.c      | 42 +++++++++++++++++++++++++-----------------
 glib/tests/uri.c | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 17 deletions(-)

diff --git a/glib/guri.c b/glib/guri.c
index 2f42a496d..0bc6c33b1 100644
--- a/glib/guri.c
+++ b/glib/guri.c
@@ -289,15 +289,16 @@ uri_decoder (gchar       **out,
              GUriError     parse_error,
              GError      **error)
 {
-  gchar *decoded, *d, c;
+  gchar c;
+  GString *decoded;
   const gchar *invalid, *s, *end;
   gssize len;
 
   if (!(flags & G_URI_FLAGS_ENCODED))
     just_normalize = FALSE;
 
-  decoded = g_malloc (length + 1);
-  for (s = start, end = s + length, d = decoded; s < end; s++)
+  decoded = g_string_sized_new (length + 1);
+  for (s = start, end = s + length; s < end; s++)
     {
       if (*s == '%')
         {
@@ -311,7 +312,7 @@ uri_decoder (gchar       **out,
                   g_set_error_literal (error, G_URI_ERROR, parse_error,
                                        /* xgettext: no-c-format */
                                        _("Invalid %-encoding in URI"));
-                  g_free (decoded);
+                  g_string_free (decoded, TRUE);
                   return -1;
                 }
 
@@ -319,7 +320,7 @@ uri_decoder (gchar       **out,
                * fix it to "%25", since that might change the way that
                * the URI's owner would interpret it.
                */
-              *d++ = *s;
+              g_string_append_c (decoded, *s);
               continue;
             }
 
@@ -328,43 +329,50 @@ uri_decoder (gchar       **out,
             {
               g_set_error_literal (error, G_URI_ERROR, parse_error,
                                    _("Illegal character in URI"));
-              g_free (decoded);
+              g_string_free (decoded, TRUE);
               return -1;
             }
           if (just_normalize && !g_uri_char_is_unreserved (c))
             {
-              /* Leave the % sequence there. */
-              *d++ = *s;
+              /* Leave the % sequence there but normalize it. */
+              g_string_append_c (decoded, *s);
+              g_string_append_c (decoded, g_ascii_toupper (s[1]));
+              g_string_append_c (decoded, g_ascii_toupper (s[2]));
+              s += 2;
             }
           else
             {
-              *d++ = c;
+              g_string_append_c (decoded, c);
               s += 2;
             }
         }
       else if (www_form && *s == '+')
-        *d++ = ' ';
+        g_string_append_c (decoded, ' ');
+      /* Normalize any illegal characters */
+      else if (just_normalize && (!g_ascii_isgraph (*s) ||
+                                  (illegal_chars && strchr (illegal_chars, *s))))
+        g_string_append_printf (decoded, "%%%02X", (guchar)*s);
       else
-        *d++ = *s;
+        g_string_append_c (decoded, *s);
     }
-  *d = '\0';
 
-  len = d - decoded;
+  len = decoded->len;
   g_assert (len >= 0);
 
   if (!(flags & G_URI_FLAGS_ENCODED) &&
-      !g_utf8_validate (decoded, len, &invalid))
+      !g_utf8_validate (decoded->str, len, &invalid))
     {
       g_set_error_literal (error, G_URI_ERROR, parse_error,
                            _("Non-UTF-8 characters in URI"));
-      g_free (decoded);
+      g_string_free (decoded, TRUE);
       return -1;
     }
 
   if (out)
-    *out = g_steal_pointer (&decoded);
+    *out = g_string_free (decoded, FALSE);
+  else
+    g_string_free (decoded, TRUE);
 
-  g_free (decoded);
   return len;
 }
 
diff --git a/glib/tests/uri.c b/glib/tests/uri.c
index b8a0c6a47..895bd4d5d 100644
--- a/glib/tests/uri.c
+++ b/glib/tests/uri.c
@@ -1708,6 +1708,41 @@ test_uri_join_split_round_trip (void)
     }
 }
 
+static const struct
+{
+  /* Inputs */
+  const gchar *uri;
+  GUriFlags flags;
+  /* Outputs */
+  const gchar *path;
+} normalize_tests[] =
+  {
+    { "http://foo/path with spaces", G_URI_FLAGS_ENCODED,
+      "/path%20with%20spaces" },
+    { "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH,
+      "/path%20with%20spaces%202" },
+    { "http://foo/%aa", G_URI_FLAGS_ENCODED,
+      "/%AA" },
+    { "http://foo/p\xc3\xa4th/", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED,
+      "/p%C3%A4th/" },
+  };
+
+static void
+test_uri_normalize (void)
+{
+  gsize i;
+
+  for (i = 0; i < G_N_ELEMENTS (normalize_tests); ++i)
+    {
+      GUri *uri = g_uri_parse (normalize_tests[i].uri,
+                               normalize_tests[i].flags,
+                               NULL);
+      g_assert_nonnull (uri);
+      g_assert_cmpstr (g_uri_get_path (uri), ==, normalize_tests[i].path);
+      g_uri_unref (uri);
+    }
+}
+
 int
 main (int   argc,
       char *argv[])
@@ -1733,6 +1768,7 @@ main (int   argc,
   g_test_add_func ("/uri/to-string", test_uri_to_string);
   g_test_add_func ("/uri/join", test_uri_join);
   g_test_add_func ("/uri/join-split-round-trip", test_uri_join_split_round_trip);
+  g_test_add_func ("/uri/normalize", test_uri_normalize);
   g_test_add_data_func ("/uri/iter-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_iter_params);
   g_test_add_data_func ("/uri/iter-params/length", GINT_TO_POINTER (FALSE), test_uri_iter_params);
   g_test_add_data_func ("/uri/parse-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_parse_params);

From 64f478dca367cc263332e46051a10096cab3f501 Mon Sep 17 00:00:00 2001
From: Patrick Griffis <pgriffis@igalia.com>
Date: Fri, 23 Oct 2020 14:36:54 -0500
Subject: [PATCH 2/3] guri: Add G_URI_FLAGS_SCHEME_NORMALIZE This flag enables
 optional scheme-defined normalization during parsing of a URI.

---
 glib/guri.c      | 85 +++++++++++++++++++++++++++++++++++++++++++++---
 glib/guri.h      |  5 +++
 glib/tests/uri.c | 60 +++++++++++++++++++++++++++-------
 3 files changed, 135 insertions(+), 15 deletions(-)

diff --git a/glib/guri.c b/glib/guri.c
index 0bc6c33b1..c5399d7f3 100644
--- a/glib/guri.c
+++ b/glib/guri.c
@@ -348,9 +348,8 @@ uri_decoder (gchar       **out,
         }
       else if (www_form && *s == '+')
         g_string_append_c (decoded, ' ');
-      /* Normalize any illegal characters */
-      else if (just_normalize && (!g_ascii_isgraph (*s) ||
-                                  (illegal_chars && strchr (illegal_chars, *s))))
+      /* Normalize any illegal characters. */
+      else if (just_normalize && (!g_ascii_isgraph (*s)))
         g_string_append_printf (decoded, "%%%02X", (guchar)*s);
       else
         g_string_append_c (decoded, *s);
@@ -748,6 +747,52 @@ uri_cleanup (const gchar *uri_string)
   return g_string_free (copy, FALSE);
 }
 
+static gboolean
+should_normalize_empty_path (const char *scheme)
+{
+  const char * const schemes[] = { "https", "http", "wss", "ws" };
+  int i;
+  for (i = 0; i < G_N_ELEMENTS (schemes); ++i)
+    {
+      if (!strcmp (schemes[i], scheme))
+        return TRUE;
+    }
+  return FALSE;
+}
+
+static int
+normalize_port (const char *scheme,
+                int         port)
+{
+  const char *default_schemes[3] = { NULL };
+  int i;
+
+  switch (port)
+    {
+    case 21:
+      default_schemes[0] = "ftp";
+      break;
+    case 80:
+      default_schemes[0] = "http";
+      default_schemes[1] = "ws";
+      break;
+    case 443:
+      default_schemes[0] = "https";
+      default_schemes[1] = "wss";
+      break;
+    default:
+      break;
+    }
+
+  for (i = 0; default_schemes[i]; ++i)
+    {
+      if (!strcmp (scheme, default_schemes[i]))
+        return -1;
+    }
+
+  return port;
+}
+
 static gboolean
 g_uri_split_internal (const gchar  *uri_string,
                       GUriFlags     flags,
@@ -766,6 +811,7 @@ g_uri_split_internal (const gchar  *uri_string,
   const gchar *end, *colon, *at, *path_start, *semi, *question;
   const gchar *p, *bracket, *hostend;
   gchar *cleaned_uri_string = NULL;
+  gchar *normalized_scheme = NULL;
 
   if (scheme)
     *scheme = NULL;
@@ -803,8 +849,9 @@ g_uri_split_internal (const gchar  *uri_string,
 
   if (p > uri_string && *p == ':')
     {
+      normalized_scheme = g_ascii_strdown (uri_string, p - uri_string);
       if (scheme)
-        *scheme = g_ascii_strdown (uri_string, p - uri_string);
+        *scheme = g_steal_pointer (&normalized_scheme);
       p++;
     }
   else
@@ -930,6 +977,22 @@ g_uri_split_internal (const gchar  *uri_string,
                       G_URI_ERROR_BAD_PATH, error))
     goto fail;
 
+  /* Scheme-based normalization */
+  if (flags & G_URI_FLAGS_SCHEME_NORMALIZE && ((scheme && *scheme) || normalized_scheme))
+    {
+      const char *scheme_str = scheme && *scheme ? *scheme : normalized_scheme;
+
+      if (should_normalize_empty_path (scheme_str) && path && !**path)
+        {
+          g_free (*path);
+          *path = g_strdup ("/");
+        }
+
+      if (port && *port != -1)
+        *port = normalize_port (scheme_str, *port);
+    }
+
+  g_free (normalized_scheme);
   g_free (cleaned_uri_string);
   return TRUE;
 
@@ -949,6 +1012,7 @@ g_uri_split_internal (const gchar  *uri_string,
   if (fragment)
     g_clear_pointer (fragment, g_free);
 
+  g_free (normalized_scheme);
   g_free (cleaned_uri_string);
   return FALSE;
 }
@@ -1402,6 +1466,19 @@ g_uri_parse_relative (GUri         *base_uri,
               uri->port = base_uri->port;
             }
         }
+
+      /* Scheme normalization couldn't have been done earlier
+       * as the relative URI may not have had a scheme */
+      if (flags & G_URI_FLAGS_SCHEME_NORMALIZE)
+        {
+          if (should_normalize_empty_path (uri->scheme) && !*uri->path)
+            {
+              g_free (uri->path);
+              uri->path = g_strdup ("/");
+            }
+
+          uri->port = normalize_port (uri->scheme, uri->port);
+        }
     }
 
   return g_steal_pointer (&uri);
diff --git a/glib/guri.h b/glib/guri.h
index 3a7bb5c0e..fecbfed8e 100644
--- a/glib/guri.h
+++ b/glib/guri.h
@@ -62,6 +62,10 @@ void         g_uri_unref            (GUri *uri);
  * @G_URI_FLAGS_ENCODED_PATH: Same as %G_URI_FLAGS_ENCODED, for the path only.
  * @G_URI_FLAGS_ENCODED_FRAGMENT: Same as %G_URI_FLAGS_ENCODED, for the
  *     fragment only.
+ * @G_URI_FLAGS_SCHEME_NORMALIZE: Applies scheme-based normalization to the
+ *     parsed URI. For example when parsing an HTTP URI changing empty paths
+ *     to `/` and changing port `80` to `-1`. This only supports a subset
+ *     of known schemes. (Since: 2.68)
  *
  * Flags that describe a URI.
  *
@@ -83,6 +87,7 @@ typedef enum {
   G_URI_FLAGS_ENCODED_QUERY   = 1 << 5,
   G_URI_FLAGS_ENCODED_PATH    = 1 << 6,
   G_URI_FLAGS_ENCODED_FRAGMENT = 1 << 7,
+  G_URI_FLAGS_SCHEME_NORMALIZE = 1 << 8,
 } GUriFlags;
 
 GLIB_AVAILABLE_IN_2_66
diff --git a/glib/tests/uri.c b/glib/tests/uri.c
index 895bd4d5d..77aa95604 100644
--- a/glib/tests/uri.c
+++ b/glib/tests/uri.c
@@ -1711,36 +1711,74 @@ test_uri_join_split_round_trip (void)
 static const struct
 {
   /* Inputs */
+  const gchar *base;
   const gchar *uri;
   GUriFlags flags;
   /* Outputs */
   const gchar *path;
+  int port;
 } normalize_tests[] =
   {
-    { "http://foo/path with spaces", G_URI_FLAGS_ENCODED,
-      "/path%20with%20spaces" },
-    { "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH,
-      "/path%20with%20spaces%202" },
-    { "http://foo/%aa", G_URI_FLAGS_ENCODED,
-      "/%AA" },
-    { "http://foo/p\xc3\xa4th/", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED,
-      "/p%C3%A4th/" },
+    { NULL, "http://foo/path with spaces", G_URI_FLAGS_ENCODED,
+      "/path%20with%20spaces", -1 },
+    { NULL, "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH,
+      "/path%20with%20spaces%202", -1 },
+    { NULL, "http://foo/%aa", G_URI_FLAGS_ENCODED,
+      "/%AA", -1 },
+    { NULL, "http://foo/p\xc3\xa4th/", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED,
+      "/p%C3%A4th/", -1 },
+    { NULL, "http://foo", G_URI_FLAGS_SCHEME_NORMALIZE,
+      "/", -1 },
+    { NULL, "nothttp://foo", G_URI_FLAGS_SCHEME_NORMALIZE,
+      "", -1 },
+    { NULL, "http://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+      "/", -1 },
+    { NULL, "https://foo:443", G_URI_FLAGS_SCHEME_NORMALIZE,
+      "/", -1 },
+    { NULL, "ftp://foo:21", G_URI_FLAGS_SCHEME_NORMALIZE,
+      "", -1 },
+    { NULL, "nothttp://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+      "", 80 },
+    { "http://foo", "//bar", G_URI_FLAGS_SCHEME_NORMALIZE,
+      "/", -1 },
+    { "http://foo", "//bar:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+      "/", -1 },
+    { "nothttp://foo", "//bar:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+      "", 80 },
+    { "http://foo", "//bar", 0,
+      "", -1 },
   };
 
 static void
 test_uri_normalize (void)
 {
   gsize i;
+  int port;
 
   for (i = 0; i < G_N_ELEMENTS (normalize_tests); ++i)
     {
-      GUri *uri = g_uri_parse (normalize_tests[i].uri,
-                               normalize_tests[i].flags,
-                               NULL);
+      GUri *uri, *base = NULL;
+      if (normalize_tests[i].base)
+        base = g_uri_parse (normalize_tests[i].base, normalize_tests[i].flags, NULL);
+
+      uri = g_uri_parse_relative (base,
+                                  normalize_tests[i].uri,
+                                  normalize_tests[i].flags,
+                                  NULL);
+
       g_assert_nonnull (uri);
       g_assert_cmpstr (g_uri_get_path (uri), ==, normalize_tests[i].path);
+      g_assert_cmpint (g_uri_get_port (uri), ==, normalize_tests[i].port);
+
       g_uri_unref (uri);
+      if (base)
+        g_uri_unref (base);
     }
+
+  /* One off testing a codepath where scheme is NULL but internally we still normalize it. */
+  g_assert_true (g_uri_split ("HTTP://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+                              NULL, NULL, NULL, &port, NULL, NULL, NULL, NULL));
+  g_assert_cmpint (port, ==, -1);
 }
 
 int

From 9da213ea34b95d0c4387f593bf277a4a6b9a68d1 Mon Sep 17 00:00:00 2001
From: Patrick Griffis <pgriffis@igalia.com>
Date: Mon, 26 Oct 2020 14:33:44 -0500
Subject: [PATCH 3/3] docs: Add note about uri normalization for equality

---
 glib/guri.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/glib/guri.c b/glib/guri.c
index c5399d7f3..ceed7133d 100644
--- a/glib/guri.c
+++ b/glib/guri.c
@@ -132,11 +132,12 @@
  *
  * Note that there is no `g_uri_equal ()` function, because comparing
  * URIs usefully requires scheme-specific knowledge that #GUri does
- * not have. For example, `http://example.com/` and
- * `http://EXAMPLE.COM:80` have exactly the same meaning according
- * to the HTTP specification, and `data:,foo` and
- * `data:;base64,Zm9v` resolve to the same thing according to the
- * `data:` URI specification.
+ * not have. #GUri can help with normalization if you use the various
+ * encoded #GUriFlags as well as %G_URI_FLAGS_SCHEME_NORMALIZE however
+ * it is not comprehensive.
+ * For example, `data:,foo` and `data:;base64,Zm9v` resolve to the same
+ * thing according to the `data:` URI specification which GLib does not
+ * handle.
  *
  * Since: 2.66
  */