From 482e10d3bbbdef4300144f3e82bba448140b520d Mon Sep 17 00:00:00 2001 From: Patrick Griffis Date: Wed, 14 Oct 2020 14:22:58 -0500 Subject: [PATCH 1/3] guri: Normalize uri segments if they are encoded This changes it so when a segment is encoded it will be normalized at parse time which ensures its valid and it can more easily be compared with other uris. --- glib/guri.c | 42 +++++++++++++++++++++++++----------------- glib/tests/uri.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 17 deletions(-) diff --git a/glib/guri.c b/glib/guri.c index 2f42a496d..0bc6c33b1 100644 --- a/glib/guri.c +++ b/glib/guri.c @@ -289,15 +289,16 @@ uri_decoder (gchar **out, GUriError parse_error, GError **error) { - gchar *decoded, *d, c; + gchar c; + GString *decoded; const gchar *invalid, *s, *end; gssize len; if (!(flags & G_URI_FLAGS_ENCODED)) just_normalize = FALSE; - decoded = g_malloc (length + 1); - for (s = start, end = s + length, d = decoded; s < end; s++) + decoded = g_string_sized_new (length + 1); + for (s = start, end = s + length; s < end; s++) { if (*s == '%') { @@ -311,7 +312,7 @@ uri_decoder (gchar **out, g_set_error_literal (error, G_URI_ERROR, parse_error, /* xgettext: no-c-format */ _("Invalid %-encoding in URI")); - g_free (decoded); + g_string_free (decoded, TRUE); return -1; } @@ -319,7 +320,7 @@ uri_decoder (gchar **out, * fix it to "%25", since that might change the way that * the URI's owner would interpret it. */ - *d++ = *s; + g_string_append_c (decoded, *s); continue; } @@ -328,43 +329,50 @@ uri_decoder (gchar **out, { g_set_error_literal (error, G_URI_ERROR, parse_error, _("Illegal character in URI")); - g_free (decoded); + g_string_free (decoded, TRUE); return -1; } if (just_normalize && !g_uri_char_is_unreserved (c)) { - /* Leave the % sequence there. */ - *d++ = *s; + /* Leave the % sequence there but normalize it. */ + g_string_append_c (decoded, *s); + g_string_append_c (decoded, g_ascii_toupper (s[1])); + g_string_append_c (decoded, g_ascii_toupper (s[2])); + s += 2; } else { - *d++ = c; + g_string_append_c (decoded, c); s += 2; } } else if (www_form && *s == '+') - *d++ = ' '; + g_string_append_c (decoded, ' '); + /* Normalize any illegal characters */ + else if (just_normalize && (!g_ascii_isgraph (*s) || + (illegal_chars && strchr (illegal_chars, *s)))) + g_string_append_printf (decoded, "%%%02X", (guchar)*s); else - *d++ = *s; + g_string_append_c (decoded, *s); } - *d = '\0'; - len = d - decoded; + len = decoded->len; g_assert (len >= 0); if (!(flags & G_URI_FLAGS_ENCODED) && - !g_utf8_validate (decoded, len, &invalid)) + !g_utf8_validate (decoded->str, len, &invalid)) { g_set_error_literal (error, G_URI_ERROR, parse_error, _("Non-UTF-8 characters in URI")); - g_free (decoded); + g_string_free (decoded, TRUE); return -1; } if (out) - *out = g_steal_pointer (&decoded); + *out = g_string_free (decoded, FALSE); + else + g_string_free (decoded, TRUE); - g_free (decoded); return len; } diff --git a/glib/tests/uri.c b/glib/tests/uri.c index b8a0c6a47..895bd4d5d 100644 --- a/glib/tests/uri.c +++ b/glib/tests/uri.c @@ -1708,6 +1708,41 @@ test_uri_join_split_round_trip (void) } } +static const struct +{ + /* Inputs */ + const gchar *uri; + GUriFlags flags; + /* Outputs */ + const gchar *path; +} normalize_tests[] = + { + { "http://foo/path with spaces", G_URI_FLAGS_ENCODED, + "/path%20with%20spaces" }, + { "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH, + "/path%20with%20spaces%202" }, + { "http://foo/%aa", G_URI_FLAGS_ENCODED, + "/%AA" }, + { "http://foo/p\xc3\xa4th/", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED, + "/p%C3%A4th/" }, + }; + +static void +test_uri_normalize (void) +{ + gsize i; + + for (i = 0; i < G_N_ELEMENTS (normalize_tests); ++i) + { + GUri *uri = g_uri_parse (normalize_tests[i].uri, + normalize_tests[i].flags, + NULL); + g_assert_nonnull (uri); + g_assert_cmpstr (g_uri_get_path (uri), ==, normalize_tests[i].path); + g_uri_unref (uri); + } +} + int main (int argc, char *argv[]) @@ -1733,6 +1768,7 @@ main (int argc, g_test_add_func ("/uri/to-string", test_uri_to_string); g_test_add_func ("/uri/join", test_uri_join); g_test_add_func ("/uri/join-split-round-trip", test_uri_join_split_round_trip); + g_test_add_func ("/uri/normalize", test_uri_normalize); g_test_add_data_func ("/uri/iter-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_iter_params); g_test_add_data_func ("/uri/iter-params/length", GINT_TO_POINTER (FALSE), test_uri_iter_params); g_test_add_data_func ("/uri/parse-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_parse_params); From 64f478dca367cc263332e46051a10096cab3f501 Mon Sep 17 00:00:00 2001 From: Patrick Griffis Date: Fri, 23 Oct 2020 14:36:54 -0500 Subject: [PATCH 2/3] guri: Add G_URI_FLAGS_SCHEME_NORMALIZE This flag enables optional scheme-defined normalization during parsing of a URI. --- glib/guri.c | 85 +++++++++++++++++++++++++++++++++++++++++++++--- glib/guri.h | 5 +++ glib/tests/uri.c | 60 +++++++++++++++++++++++++++------- 3 files changed, 135 insertions(+), 15 deletions(-) diff --git a/glib/guri.c b/glib/guri.c index 0bc6c33b1..c5399d7f3 100644 --- a/glib/guri.c +++ b/glib/guri.c @@ -348,9 +348,8 @@ uri_decoder (gchar **out, } else if (www_form && *s == '+') g_string_append_c (decoded, ' '); - /* Normalize any illegal characters */ - else if (just_normalize && (!g_ascii_isgraph (*s) || - (illegal_chars && strchr (illegal_chars, *s)))) + /* Normalize any illegal characters. */ + else if (just_normalize && (!g_ascii_isgraph (*s))) g_string_append_printf (decoded, "%%%02X", (guchar)*s); else g_string_append_c (decoded, *s); @@ -748,6 +747,52 @@ uri_cleanup (const gchar *uri_string) return g_string_free (copy, FALSE); } +static gboolean +should_normalize_empty_path (const char *scheme) +{ + const char * const schemes[] = { "https", "http", "wss", "ws" }; + int i; + for (i = 0; i < G_N_ELEMENTS (schemes); ++i) + { + if (!strcmp (schemes[i], scheme)) + return TRUE; + } + return FALSE; +} + +static int +normalize_port (const char *scheme, + int port) +{ + const char *default_schemes[3] = { NULL }; + int i; + + switch (port) + { + case 21: + default_schemes[0] = "ftp"; + break; + case 80: + default_schemes[0] = "http"; + default_schemes[1] = "ws"; + break; + case 443: + default_schemes[0] = "https"; + default_schemes[1] = "wss"; + break; + default: + break; + } + + for (i = 0; default_schemes[i]; ++i) + { + if (!strcmp (scheme, default_schemes[i])) + return -1; + } + + return port; +} + static gboolean g_uri_split_internal (const gchar *uri_string, GUriFlags flags, @@ -766,6 +811,7 @@ g_uri_split_internal (const gchar *uri_string, const gchar *end, *colon, *at, *path_start, *semi, *question; const gchar *p, *bracket, *hostend; gchar *cleaned_uri_string = NULL; + gchar *normalized_scheme = NULL; if (scheme) *scheme = NULL; @@ -803,8 +849,9 @@ g_uri_split_internal (const gchar *uri_string, if (p > uri_string && *p == ':') { + normalized_scheme = g_ascii_strdown (uri_string, p - uri_string); if (scheme) - *scheme = g_ascii_strdown (uri_string, p - uri_string); + *scheme = g_steal_pointer (&normalized_scheme); p++; } else @@ -930,6 +977,22 @@ g_uri_split_internal (const gchar *uri_string, G_URI_ERROR_BAD_PATH, error)) goto fail; + /* Scheme-based normalization */ + if (flags & G_URI_FLAGS_SCHEME_NORMALIZE && ((scheme && *scheme) || normalized_scheme)) + { + const char *scheme_str = scheme && *scheme ? *scheme : normalized_scheme; + + if (should_normalize_empty_path (scheme_str) && path && !**path) + { + g_free (*path); + *path = g_strdup ("/"); + } + + if (port && *port != -1) + *port = normalize_port (scheme_str, *port); + } + + g_free (normalized_scheme); g_free (cleaned_uri_string); return TRUE; @@ -949,6 +1012,7 @@ g_uri_split_internal (const gchar *uri_string, if (fragment) g_clear_pointer (fragment, g_free); + g_free (normalized_scheme); g_free (cleaned_uri_string); return FALSE; } @@ -1402,6 +1466,19 @@ g_uri_parse_relative (GUri *base_uri, uri->port = base_uri->port; } } + + /* Scheme normalization couldn't have been done earlier + * as the relative URI may not have had a scheme */ + if (flags & G_URI_FLAGS_SCHEME_NORMALIZE) + { + if (should_normalize_empty_path (uri->scheme) && !*uri->path) + { + g_free (uri->path); + uri->path = g_strdup ("/"); + } + + uri->port = normalize_port (uri->scheme, uri->port); + } } return g_steal_pointer (&uri); diff --git a/glib/guri.h b/glib/guri.h index 3a7bb5c0e..fecbfed8e 100644 --- a/glib/guri.h +++ b/glib/guri.h @@ -62,6 +62,10 @@ void g_uri_unref (GUri *uri); * @G_URI_FLAGS_ENCODED_PATH: Same as %G_URI_FLAGS_ENCODED, for the path only. * @G_URI_FLAGS_ENCODED_FRAGMENT: Same as %G_URI_FLAGS_ENCODED, for the * fragment only. + * @G_URI_FLAGS_SCHEME_NORMALIZE: Applies scheme-based normalization to the + * parsed URI. For example when parsing an HTTP URI changing empty paths + * to `/` and changing port `80` to `-1`. This only supports a subset + * of known schemes. (Since: 2.68) * * Flags that describe a URI. * @@ -83,6 +87,7 @@ typedef enum { G_URI_FLAGS_ENCODED_QUERY = 1 << 5, G_URI_FLAGS_ENCODED_PATH = 1 << 6, G_URI_FLAGS_ENCODED_FRAGMENT = 1 << 7, + G_URI_FLAGS_SCHEME_NORMALIZE = 1 << 8, } GUriFlags; GLIB_AVAILABLE_IN_2_66 diff --git a/glib/tests/uri.c b/glib/tests/uri.c index 895bd4d5d..77aa95604 100644 --- a/glib/tests/uri.c +++ b/glib/tests/uri.c @@ -1711,36 +1711,74 @@ test_uri_join_split_round_trip (void) static const struct { /* Inputs */ + const gchar *base; const gchar *uri; GUriFlags flags; /* Outputs */ const gchar *path; + int port; } normalize_tests[] = { - { "http://foo/path with spaces", G_URI_FLAGS_ENCODED, - "/path%20with%20spaces" }, - { "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH, - "/path%20with%20spaces%202" }, - { "http://foo/%aa", G_URI_FLAGS_ENCODED, - "/%AA" }, - { "http://foo/p\xc3\xa4th/", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED, - "/p%C3%A4th/" }, + { NULL, "http://foo/path with spaces", G_URI_FLAGS_ENCODED, + "/path%20with%20spaces", -1 }, + { NULL, "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH, + "/path%20with%20spaces%202", -1 }, + { NULL, "http://foo/%aa", G_URI_FLAGS_ENCODED, + "/%AA", -1 }, + { NULL, "http://foo/p\xc3\xa4th/", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED, + "/p%C3%A4th/", -1 }, + { NULL, "http://foo", G_URI_FLAGS_SCHEME_NORMALIZE, + "/", -1 }, + { NULL, "nothttp://foo", G_URI_FLAGS_SCHEME_NORMALIZE, + "", -1 }, + { NULL, "http://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE, + "/", -1 }, + { NULL, "https://foo:443", G_URI_FLAGS_SCHEME_NORMALIZE, + "/", -1 }, + { NULL, "ftp://foo:21", G_URI_FLAGS_SCHEME_NORMALIZE, + "", -1 }, + { NULL, "nothttp://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE, + "", 80 }, + { "http://foo", "//bar", G_URI_FLAGS_SCHEME_NORMALIZE, + "/", -1 }, + { "http://foo", "//bar:80", G_URI_FLAGS_SCHEME_NORMALIZE, + "/", -1 }, + { "nothttp://foo", "//bar:80", G_URI_FLAGS_SCHEME_NORMALIZE, + "", 80 }, + { "http://foo", "//bar", 0, + "", -1 }, }; static void test_uri_normalize (void) { gsize i; + int port; for (i = 0; i < G_N_ELEMENTS (normalize_tests); ++i) { - GUri *uri = g_uri_parse (normalize_tests[i].uri, - normalize_tests[i].flags, - NULL); + GUri *uri, *base = NULL; + if (normalize_tests[i].base) + base = g_uri_parse (normalize_tests[i].base, normalize_tests[i].flags, NULL); + + uri = g_uri_parse_relative (base, + normalize_tests[i].uri, + normalize_tests[i].flags, + NULL); + g_assert_nonnull (uri); g_assert_cmpstr (g_uri_get_path (uri), ==, normalize_tests[i].path); + g_assert_cmpint (g_uri_get_port (uri), ==, normalize_tests[i].port); + g_uri_unref (uri); + if (base) + g_uri_unref (base); } + + /* One off testing a codepath where scheme is NULL but internally we still normalize it. */ + g_assert_true (g_uri_split ("HTTP://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE, + NULL, NULL, NULL, &port, NULL, NULL, NULL, NULL)); + g_assert_cmpint (port, ==, -1); } int From 9da213ea34b95d0c4387f593bf277a4a6b9a68d1 Mon Sep 17 00:00:00 2001 From: Patrick Griffis Date: Mon, 26 Oct 2020 14:33:44 -0500 Subject: [PATCH 3/3] docs: Add note about uri normalization for equality --- glib/guri.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/glib/guri.c b/glib/guri.c index c5399d7f3..ceed7133d 100644 --- a/glib/guri.c +++ b/glib/guri.c @@ -132,11 +132,12 @@ * * Note that there is no `g_uri_equal ()` function, because comparing * URIs usefully requires scheme-specific knowledge that #GUri does - * not have. For example, `http://example.com/` and - * `http://EXAMPLE.COM:80` have exactly the same meaning according - * to the HTTP specification, and `data:,foo` and - * `data:;base64,Zm9v` resolve to the same thing according to the - * `data:` URI specification. + * not have. #GUri can help with normalization if you use the various + * encoded #GUriFlags as well as %G_URI_FLAGS_SCHEME_NORMALIZE however + * it is not comprehensive. + * For example, `data:,foo` and `data:;base64,Zm9v` resolve to the same + * thing according to the `data:` URI specification which GLib does not + * handle. * * Since: 2.66 */