14
0
forked from pool/python-w3lib

- update to 2.1.2:

  * Fix test failures on Python 3.11.4+ (#212, #213).
  * Fix an incorrect type hint (#211).
  * Add project URLs to setup.py (#215).
  * Dropped Python 3.6 support, and made Python 3.11 support
    official.
  * :func:`~w3lib.url.safe_url_string` now generates safer
    URLs.
  * :func:`~w3lib.url.canonicalize_url` now strips spaces from
    the input URL, to be more in line with the `URL living standard`_.
  * :func:`~w3lib.html.get_base_url` now ignores HTML comments.
  * Fixed :func:`~w3lib.url.safe_url_string` re-encoding
    percent signs on the URL username and password even when
    they were being used as part of an escape sequence. (#187, #196)
  * Fixed :func:`~w3lib.http.basic_auth_header` using the wrong
    flavor of base64 encoding, which could prevent authentication
    in rare cases.
  * Python 2 is no longer supported;
  * The ``w3lib.form`` module is removed.
  * The ``w3lib.html.remove_entities`` function is removed.
  * The ``w3lib.url.urljoin_rfc`` function is removed.
  * The following functions are deprecated, and will be removed
    in future releases.
  * Type annotations are added (#172, #184).
  * Added support for Python 3.9 and 3.10 (#168, #176).
  * Fixed :func:`w3lib.html.get_meta_refresh` for ``<meta>`` tags
    where ``http-equiv`` is written after ``content`` (#179).
  * Fixed :func:`w3lib.url.safe_url_string` for IDNA domains with
    ports (#174).
  * :func:`w3lib.url.url_query_cleaner` no longer adds an
    unneeded ``#`` when ``keep_fragments=True`` is passed, and
    the URL doesn't have a fragment

OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-w3lib?expand=0&rev=22
This commit is contained in:
2024-01-12 08:31:14 +00:00
committed by Git OBS Bridge
parent 7702237bf2
commit 75982c34f4
6 changed files with 50 additions and 734 deletions

View File

@@ -1,80 +0,0 @@
From 34c62eb265cdb75b748d8aca43a2f8b9581dbd6a Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta <eugenio.lacuesta@gmail.com>
Date: Wed, 10 Mar 2021 12:20:24 -0300
Subject: [PATCH 1/8] [CI] Run tests on GitHub actions
---
tests/test_url.py | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
delete mode 100644 .github/workflows/build.yml
create mode 100644 .github/workflows/tests.yml
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -2,11 +2,14 @@
from __future__ import absolute_import
import os
import unittest
+
+import pytest
+from six.moves.urllib.parse import urlparse
+
from w3lib.url import (is_url, safe_url_string, safe_download_url,
url_query_parameter, add_or_replace_parameter, url_query_cleaner,
file_uri_to_path, parse_data_uri, path_to_file_uri, any_to_uri,
urljoin_rfc, canonicalize_url, parse_url, add_or_replace_parameters)
-from six.moves.urllib.parse import urlparse
class UrlTests(unittest.TestCase):
@@ -76,17 +79,16 @@ class UrlTests(unittest.TestCase):
def test_safe_url_string_unsafe_chars(self):
safeurl = safe_url_string(r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|")
self.assertEqual(safeurl, r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|")
-
+
def test_safe_url_string_quote_path(self):
safeurl = safe_url_string(u'http://google.com/"hello"', quote_path=True)
self.assertEqual(safeurl, u'http://google.com/%22hello%22')
-
+
safeurl = safe_url_string(u'http://google.com/"hello"', quote_path=False)
self.assertEqual(safeurl, u'http://google.com/"hello"')
-
+
safeurl = safe_url_string(u'http://google.com/"hello"')
self.assertEqual(safeurl, u'http://google.com/%22hello%22')
-
def test_safe_url_string_with_query(self):
safeurl = safe_url_string(u"http://www.example.com/£?unit=µ")
@@ -299,10 +301,6 @@ class UrlTests(unittest.TestCase):
self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3'),
'http://domain/test?arg1=v1&arg2=v2&arg3=nv3')
- url = 'http://domain/test?arg1=v1;arg2=v2'
- self.assertEqual(add_or_replace_parameter(url, 'arg1', 'v3'),
- 'http://domain/test?arg1=v3&arg2=v2')
-
self.assertEqual(add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20'),
'http://domain/moreInfo.asp?prodID=20')
url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60'
@@ -327,6 +325,13 @@ class UrlTests(unittest.TestCase):
self.assertEqual(add_or_replace_parameter(url, 'arg1', 'v3'),
'http://domain/test?arg1=v3&arg2=v2')
+ @pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164")
+ def test_add_or_replace_parameter_fail(self):
+ self.assertEqual(
+ add_or_replace_parameter('http://domain/test?arg1=v1;arg2=v2', 'arg1', 'v3'),
+ 'http://domain/test?arg1=v3&arg2=v2'
+ )
+
def test_add_or_replace_parameters(self):
url = 'http://domain/test'
self.assertEqual(add_or_replace_parameters(url, {'arg': 'v'}),
@@ -741,4 +746,3 @@ class DataURITests(unittest.TestCase):
if __name__ == "__main__":
unittest.main()
-

View File

@@ -1,641 +0,0 @@
Index: w3lib-1.22.0/setup.py
===================================================================
--- w3lib-1.22.0.orig/setup.py
+++ w3lib-1.22.0/setup.py
@@ -29,5 +29,4 @@ setup(
'Programming Language :: Python :: Implementation :: PyPy',
'Topic :: Internet :: WWW/HTTP',
],
- install_requires=['six >= 1.4.1'],
)
Index: w3lib-1.22.0/tests/test_encoding.py
===================================================================
--- w3lib-1.22.0.orig/tests/test_encoding.py
+++ w3lib-1.22.0/tests/test_encoding.py
@@ -1,7 +1,14 @@
-import unittest, codecs
-import six
-from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
- http_content_type_encoding, resolve_encoding, html_to_unicode)
+import codecs
+import unittest
+
+from w3lib.encoding import (
+ html_body_declared_encoding,
+ http_content_type_encoding,
+ html_to_unicode,
+ read_bom,
+ resolve_encoding,
+ to_unicode,
+)
class RequestEncodingTests(unittest.TestCase):
utf8_fragments = [
@@ -107,18 +114,18 @@ class HtmlConversionTests(unittest.TestC
original_string = unicode_string.encode('cp1251')
encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
# check body_as_unicode
- self.assertTrue(isinstance(body_unicode, six.text_type))
+ self.assertTrue(isinstance(body_unicode, str))
self.assertEqual(body_unicode, unicode_string)
def _assert_encoding(self, content_type, body, expected_encoding,
expected_unicode):
- assert not isinstance(body, six.text_type)
+ assert not isinstance(body, str)
encoding, body_unicode = html_to_unicode(ct(content_type), body)
- self.assertTrue(isinstance(body_unicode, six.text_type))
+ self.assertTrue(isinstance(body_unicode, str))
self.assertEqual(norm_encoding(encoding),
norm_encoding(expected_encoding))
- if isinstance(expected_unicode, six.string_types):
+ if isinstance(expected_unicode, str):
self.assertEqual(body_unicode, expected_unicode)
else:
self.assertTrue(
@@ -177,9 +184,9 @@ class HtmlConversionTests(unittest.TestC
def _assert_encoding_detected(self, content_type, expected_encoding, body,
**kwargs):
- assert not isinstance(body, six.text_type)
+ assert not isinstance(body, str)
encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
- self.assertTrue(isinstance(body_unicode, six.text_type))
+ self.assertTrue(isinstance(body_unicode, str))
self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
def test_BOM(self):
Index: w3lib-1.22.0/tests/test_html.py
===================================================================
--- w3lib-1.22.0.orig/tests/test_html.py
+++ w3lib-1.22.0/tests/test_html.py
@@ -1,18 +1,25 @@
-# -*- coding: utf-8 -*-
import unittest
-import six
-from w3lib.html import (replace_entities, replace_tags, remove_comments,
- remove_tags_with_content, replace_escape_chars, remove_tags, unquote_markup,
- get_base_url, get_meta_refresh)
+
+from w3lib.html import (
+ get_base_url,
+ get_meta_refresh,
+ remove_comments,
+ remove_tags,
+ remove_tags_with_content,
+ replace_entities,
+ replace_escape_chars,
+ replace_tags,
+ unquote_markup,
+)
class RemoveEntitiesTest(unittest.TestCase):
def test_returns_unicode(self):
# make sure it always return uncode
- assert isinstance(replace_entities(b'no entities'), six.text_type)
- assert isinstance(replace_entities(b'Price: &pound;100!'), six.text_type)
- assert isinstance(replace_entities(u'no entities'), six.text_type)
- assert isinstance(replace_entities(u'Price: &pound;100!'), six.text_type)
+ assert isinstance(replace_entities(b'no entities'), str)
+ assert isinstance(replace_entities(b'Price: &pound;100!'), str)
+ assert isinstance(replace_entities(u'no entities'), str)
+ assert isinstance(replace_entities(u'Price: &pound;100!'), str)
def test_regular(self):
# regular conversions
@@ -71,8 +78,8 @@ class RemoveEntitiesTest(unittest.TestCa
class ReplaceTagsTest(unittest.TestCase):
def test_returns_unicode(self):
# make sure it always return uncode
- assert isinstance(replace_tags(b'no entities'), six.text_type)
- assert isinstance(replace_tags('no entities'), six.text_type)
+ assert isinstance(replace_tags(b'no entities'), str)
+ assert isinstance(replace_tags('no entities'), str)
def test_replace_tags(self):
self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
@@ -88,10 +95,10 @@ class ReplaceTagsTest(unittest.TestCase)
class RemoveCommentsTest(unittest.TestCase):
def test_returns_unicode(self):
# make sure it always return unicode
- assert isinstance(remove_comments(b'without comments'), six.text_type)
- assert isinstance(remove_comments(b'<!-- with comments -->'), six.text_type)
- assert isinstance(remove_comments(u'without comments'), six.text_type)
- assert isinstance(remove_comments(u'<!-- with comments -->'), six.text_type)
+ assert isinstance(remove_comments(b'without comments'), str)
+ assert isinstance(remove_comments(b'<!-- with comments -->'), str)
+ assert isinstance(remove_comments(u'without comments'), str)
+ assert isinstance(remove_comments(u'<!-- with comments -->'), str)
def test_no_comments(self):
# text without comments
@@ -112,16 +119,16 @@ class RemoveCommentsTest(unittest.TestCa
class RemoveTagsTest(unittest.TestCase):
def test_returns_unicode(self):
# make sure it always return unicode
- assert isinstance(remove_tags(b'no tags'), six.text_type)
- assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type)
- assert isinstance(remove_tags(b'<p>one tag</p>'), six.text_type)
- assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), six.text_type)
- assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), six.text_type)
- assert isinstance(remove_tags(u'no tags'), six.text_type)
- assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type)
- assert isinstance(remove_tags(u'<p>one tag</p>'), six.text_type)
- assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), six.text_type)
- assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), six.text_type)
+ assert isinstance(remove_tags(b'no tags'), str)
+ assert isinstance(remove_tags(b'no tags', which_ones=('p',)), str)
+ assert isinstance(remove_tags(b'<p>one tag</p>'), str)
+ assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), str)
+ assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), str)
+ assert isinstance(remove_tags(u'no tags'), str)
+ assert isinstance(remove_tags(u'no tags', which_ones=('p',)), str)
+ assert isinstance(remove_tags(u'<p>one tag</p>'), str)
+ assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), str)
+ assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), str)
def test_remove_tags_without_tags(self):
# text without tags
@@ -160,14 +167,14 @@ class RemoveTagsTest(unittest.TestCase):
class RemoveTagsWithContentTest(unittest.TestCase):
def test_returns_unicode(self):
# make sure it always return unicode
- assert isinstance(remove_tags_with_content(b'no tags'), six.text_type)
- assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), six.text_type)
- assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), six.text_type)
- assert isinstance(remove_tags_with_content(b'<a>link</a>', which_ones=('b',)), six.text_type)
- assert isinstance(remove_tags_with_content(u'no tags'), six.text_type)
- assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), six.text_type)
- assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), six.text_type)
- assert isinstance(remove_tags_with_content(u'<a>link</a>', which_ones=('b',)), six.text_type)
+ assert isinstance(remove_tags_with_content(b'no tags'), str)
+ assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), str)
+ assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), str)
+ assert isinstance(remove_tags_with_content(b'<a>link</a>', which_ones=('b',)), str)
+ assert isinstance(remove_tags_with_content(u'no tags'), str)
+ assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), str)
+ assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), str)
+ assert isinstance(remove_tags_with_content(u'<a>link</a>', which_ones=('b',)), str)
def test_without_tags(self):
# text without tags
@@ -194,13 +201,13 @@ class RemoveTagsWithContentTest(unittest
class ReplaceEscapeCharsTest(unittest.TestCase):
def test_returns_unicode(self):
# make sure it always return unicode
- assert isinstance(replace_escape_chars(b'no ec'), six.text_type)
- assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), six.text_type)
- assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), six.text_type)
- assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), six.text_type)
- assert isinstance(replace_escape_chars(u'no ec'), six.text_type)
- assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), six.text_type)
- assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), six.text_type)
+ assert isinstance(replace_escape_chars(b'no ec'), str)
+ assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), str)
+ assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), str)
+ assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), str)
+ assert isinstance(replace_escape_chars(u'no ec'), str)
+ assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), str)
+ assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), str)
def test_without_escape_chars(self):
# text without escape chars
@@ -226,8 +233,8 @@ class UnquoteMarkupTest(unittest.TestCas
def test_returns_unicode(self):
# make sure it always return unicode
- assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), six.text_type)
- assert isinstance(unquote_markup(self.sample_txt2), six.text_type)
+ assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), str)
+ assert isinstance(unquote_markup(self.sample_txt2), str)
def test_unquote_markup(self):
self.assertEqual(unquote_markup(self.sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
Index: w3lib-1.22.0/tests/test_url.py
===================================================================
--- w3lib-1.22.0.orig/tests/test_url.py
+++ w3lib-1.22.0/tests/test_url.py
@@ -1,15 +1,25 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
import os
import unittest
+from urllib.parse import urlparse
import pytest
-from six.moves.urllib.parse import urlparse
-from w3lib.url import (is_url, safe_url_string, safe_download_url,
- url_query_parameter, add_or_replace_parameter, url_query_cleaner,
- file_uri_to_path, parse_data_uri, path_to_file_uri, any_to_uri,
- urljoin_rfc, canonicalize_url, parse_url, add_or_replace_parameters)
+from w3lib.url import (
+ add_or_replace_parameter,
+ add_or_replace_parameters,
+ any_to_uri,
+ canonicalize_url,
+ file_uri_to_path,
+ is_url,
+ parse_data_uri,
+ parse_url,
+ path_to_file_uri,
+ safe_download_url,
+ safe_url_string,
+ url_query_parameter,
+ url_query_cleaner,
+ urljoin_rfc,
+)
class UrlTests(unittest.TestCase):
Index: w3lib-1.22.0/w3lib/form.py
===================================================================
--- w3lib-1.22.0.orig/w3lib/form.py
+++ w3lib-1.22.0/w3lib/form.py
@@ -1,9 +1,6 @@
import warnings
-import six
-if six.PY2:
- from cStringIO import StringIO as BytesIO
-else:
- from io import BytesIO
+from io import BytesIO
+
from w3lib.util import unicode_to_str
Index: w3lib-1.22.0/w3lib/html.py
===================================================================
--- w3lib-1.22.0.orig/w3lib/html.py
+++ w3lib-1.22.0/w3lib/html.py
@@ -5,16 +5,16 @@ Functions for dealing with markup text
import warnings
import re
-import six
-from six import moves
+from html.entities import name2codepoint
+from urllib.parse import urljoin
from w3lib.util import to_bytes, to_unicode
from w3lib.url import safe_url_string
_ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
-_baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
-_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
+_baseurl_re = re.compile(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I)
+_meta_refresh_re = re.compile(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)', re.DOTALL | re.IGNORECASE)
_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
HTML5_WHITESPACE = ' \t\n\r\x0c'
@@ -77,8 +77,10 @@ def replace_entities(text, keep=(), remo
if entity_name.lower() in keep:
return m.group(0)
else:
- number = (moves.html_entities.name2codepoint.get(entity_name) or
- moves.html_entities.name2codepoint.get(entity_name.lower()))
+ number = (
+ name2codepoint.get(entity_name)
+ or name2codepoint.get(entity_name.lower())
+ )
if number is not None:
# Numeric character references in the 80-9F range are typically
# interpreted by browsers as representing the characters mapped
@@ -86,9 +88,9 @@ def replace_entities(text, keep=(), remo
# see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
try:
if 0x80 <= number <= 0x9f:
- return six.int2byte(number).decode('cp1252')
+ return bytes((number,)).decode('cp1252')
else:
- return six.unichr(number)
+ return chr(number)
except ValueError:
pass
@@ -265,7 +267,7 @@ def unquote_markup(text, keep=(), remove
text = to_unicode(text, encoding)
ret_text = u''
for fragment in _get_fragments(text, _cdata_re):
- if isinstance(fragment, six.string_types):
+ if isinstance(fragment, str):
# it's not a CDATA (so we try to remove its entities)
ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
else:
@@ -284,7 +286,7 @@ def get_base_url(text, baseurl='', encod
text = to_unicode(text, encoding)
m = _baseurl_re.search(text)
if m:
- return moves.urllib.parse.urljoin(
+ return urljoin(
safe_url_string(baseurl),
safe_url_string(m.group(1), encoding=encoding)
)
@@ -301,8 +303,6 @@ def get_meta_refresh(text, baseurl='', e
"""
- if six.PY2:
- baseurl = to_bytes(baseurl, encoding)
try:
text = to_unicode(text, encoding)
except UnicodeDecodeError:
@@ -314,7 +314,7 @@ def get_meta_refresh(text, baseurl='', e
if m:
interval = float(m.group('int'))
url = safe_url_string(m.group('url').strip(' "\''), encoding)
- url = moves.urllib.parse.urljoin(baseurl, url)
+ url = urljoin(baseurl, url)
return interval, url
else:
return None, None
Index: w3lib-1.22.0/w3lib/url.py
===================================================================
--- w3lib-1.22.0.orig/w3lib/url.py
+++ w3lib-1.22.0/w3lib/url.py
@@ -5,17 +5,28 @@ library.
import base64
import codecs
import os
-import re
import posixpath
-import warnings
+import re
import string
+import warnings
from collections import namedtuple
-import six
-from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
- urldefrag, urlencode, urlparse,
- quote, parse_qs, parse_qsl,
- ParseResult, unquote, urlunparse)
-from six.moves.urllib.request import pathname2url, url2pathname
+from urllib.parse import (
+ _coerce_args,
+ parse_qs,
+ parse_qsl,
+ ParseResult,
+ quote,
+ unquote,
+ unquote_to_bytes,
+ urldefrag,
+ urlencode,
+ urljoin,
+ urlparse,
+ urlsplit,
+ urlunparse,
+ urlunsplit,
+)
+from urllib.request import pathname2url, url2pathname
from w3lib.util import to_bytes, to_native_str, to_unicode
@@ -184,7 +195,7 @@ def url_query_cleaner(url, parameterlist
"""
- if isinstance(parameterlist, (six.text_type, bytes)):
+ if isinstance(parameterlist, (str, bytes)):
parameterlist = [parameterlist]
url, fragment = urldefrag(url)
base, _, query = url.partition('?')
@@ -346,10 +357,7 @@ def parse_data_uri(uri):
# delimiters, but it makes parsing easier and should not affect
# well-formed URIs, as the delimiters used in this URI scheme are not
# allowed, percent-encoded or not, in tokens.
- if six.PY2:
- uri = unquote(uri)
- else:
- uri = unquote_to_bytes(uri)
+ uri = unquote_to_bytes(uri)
media_type = "text/plain"
media_type_params = {}
@@ -469,33 +477,32 @@ def canonicalize_url(url, keep_blank_val
# 1. decode query-string as UTF-8 (or keep raw bytes),
# sort values,
# and percent-encode them back
- if six.PY2:
- keyvals = parse_qsl(query, keep_blank_values)
- else:
- # Python3's urllib.parse.parse_qsl does not work as wanted
- # for percent-encoded characters that do not match passed encoding,
- # they get lost.
- #
- # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
- # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
- # instead of \xa3 that you get with Python2's parse_qsl)
- #
- # what we want here is to keep raw bytes, and percent encode them
- # so as to preserve whatever encoding what originally used.
- #
- # See https://tools.ietf.org/html/rfc3987#section-6.4:
- #
- # For example, it is possible to have a URI reference of
- # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
- # document name is encoded in iso-8859-1 based on server settings, but
- # where the fragment identifier is encoded in UTF-8 according to
- # [XPointer]. The IRI corresponding to the above URI would be (in XML
- # notation)
- # "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
- # Similar considerations apply to query parts. The functionality of
- # IRIs (namely, to be able to include non-ASCII characters) can only be
- # used if the query part is encoded in UTF-8.
- keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+
+ # Python's urllib.parse.parse_qsl does not work as wanted
+ # for percent-encoded characters that do not match passed encoding,
+ # they get lost.
+ #
+ # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
+ # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
+ # instead of \xa3 that you get with Python2's parse_qsl)
+ #
+ # what we want here is to keep raw bytes, and percent encode them
+ # so as to preserve whatever encoding what originally used.
+ #
+ # See https://tools.ietf.org/html/rfc3987#section-6.4:
+ #
+ # For example, it is possible to have a URI reference of
+ # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
+ # document name is encoded in iso-8859-1 based on server settings, but
+ # where the fragment identifier is encoded in UTF-8 according to
+ # [XPointer]. The IRI corresponding to the above URI would be (in XML
+ # notation)
+ # "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
+ # Similar considerations apply to query parts. The functionality of
+ # IRIs (namely, to be able to include non-ASCII characters) can only be
+ # used if the query part is encoded in UTF-8.
+ keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+
keyvals.sort()
query = urlencode(keyvals)
@@ -519,17 +526,12 @@ def _unquotepath(path):
for reserved in ('2f', '2F', '3f', '3F'):
path = path.replace('%' + reserved, '%25' + reserved.upper())
- if six.PY2:
- # in Python 2, '%a3' becomes '\xa3', which is what we want
- return unquote(path)
- else:
- # in Python 3,
- # standard lib's unquote() does not work for non-UTF-8
- # percent-escaped characters, they get lost.
- # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
- #
- # unquote_to_bytes() returns raw bytes instead
- return unquote_to_bytes(path)
+ # standard lib's unquote() does not work for non-UTF-8
+ # percent-escaped characters, they get lost.
+ # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
+ #
+ # unquote_to_bytes() returns raw bytes instead
+ return unquote_to_bytes(path)
def parse_url(url, encoding=None):
@@ -541,51 +543,48 @@ def parse_url(url, encoding=None):
return urlparse(to_unicode(url, encoding))
-if not six.PY2:
- from urllib.parse import _coerce_args, unquote_to_bytes
+def parse_qsl_to_bytes(qs, keep_blank_values=False):
+ """Parse a query given as a string argument.
+
+ Data are returned as a list of name, value pairs as bytes.
- def parse_qsl_to_bytes(qs, keep_blank_values=False):
- """Parse a query given as a string argument.
+ Arguments:
- Data are returned as a list of name, value pairs as bytes.
+ qs: percent-encoded query string to be parsed
- Arguments:
-
- qs: percent-encoded query string to be parsed
-
- keep_blank_values: flag indicating whether blank values in
- percent-encoded queries should be treated as blank strings. A
- true value indicates that blanks should be retained as blank
- strings. The default false value indicates that blank values
- are to be ignored and treated as if they were not included.
-
- """
- # This code is the same as Python3's parse_qsl()
- # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
- # except for the unquote(s, encoding, errors) calls replaced
- # with unquote_to_bytes(s)
- qs, _coerce_result = _coerce_args(qs)
- pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
- r = []
- for name_value in pairs:
- if not name_value:
+ keep_blank_values: flag indicating whether blank values in
+ percent-encoded queries should be treated as blank strings. A
+ true value indicates that blanks should be retained as blank
+ strings. The default false value indicates that blank values
+ are to be ignored and treated as if they were not included.
+
+ """
+ # This code is the same as Python3's parse_qsl()
+ # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
+ # except for the unquote(s, encoding, errors) calls replaced
+ # with unquote_to_bytes(s)
+ qs, _coerce_result = _coerce_args(qs)
+ pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+ r = []
+ for name_value in pairs:
+ if not name_value:
+ continue
+ nv = name_value.split('=', 1)
+ if len(nv) != 2:
+ # Handle case of a control-name with no equal sign
+ if keep_blank_values:
+ nv.append('')
+ else:
continue
- nv = name_value.split('=', 1)
- if len(nv) != 2:
- # Handle case of a control-name with no equal sign
- if keep_blank_values:
- nv.append('')
- else:
- continue
- if len(nv[1]) or keep_blank_values:
- name = nv[0].replace('+', ' ')
- name = unquote_to_bytes(name)
- name = _coerce_result(name)
- value = nv[1].replace('+', ' ')
- value = unquote_to_bytes(value)
- value = _coerce_result(value)
- r.append((name, value))
- return r
+ if len(nv[1]) or keep_blank_values:
+ name = nv[0].replace('+', ' ')
+ name = unquote_to_bytes(name)
+ name = _coerce_result(name)
+ value = nv[1].replace('+', ' ')
+ value = unquote_to_bytes(value)
+ value = _coerce_result(value)
+ r.append((name, value))
+ return r
def urljoin_rfc(base, ref, encoding='utf-8'):
Index: w3lib-1.22.0/w3lib/util.py
===================================================================
--- w3lib-1.22.0.orig/w3lib/util.py
+++ w3lib-1.22.0/w3lib/util.py
@@ -1,5 +1,3 @@
-import six
-
def str_to_unicode(text, encoding=None, errors='strict'):
if encoding is None:
encoding = 'utf-8'
@@ -10,16 +8,16 @@ def str_to_unicode(text, encoding=None,
def unicode_to_str(text, encoding=None, errors='strict'):
if encoding is None:
encoding = 'utf-8'
- if isinstance(text, six.text_type):
+ if isinstance(text, str):
return text.encode(encoding, errors)
return text
def to_unicode(text, encoding=None, errors='strict'):
"""Return the unicode representation of a bytes object `text`. If `text`
is already an unicode object, return it as-is."""
- if isinstance(text, six.text_type):
+ if isinstance(text, str):
return text
- if not isinstance(text, (bytes, six.text_type)):
+ if not isinstance(text, (bytes, str)):
raise TypeError('to_unicode must receive a bytes, str or unicode '
'object, got %s' % type(text).__name__)
if encoding is None:
@@ -31,7 +29,7 @@ def to_bytes(text, encoding=None, errors
is already a bytes object, return it as-is."""
if isinstance(text, bytes):
return text
- if not isinstance(text, six.string_types):
+ if not isinstance(text, str):
raise TypeError('to_bytes must receive a unicode, str or bytes '
'object, got %s' % type(text).__name__)
if encoding is None:
@@ -39,9 +37,5 @@ def to_bytes(text, encoding=None, errors
return text.encode(encoding, errors)
def to_native_str(text, encoding=None, errors='strict'):
- """ Return str representation of `text`
- (bytes in Python 2.x and unicode in Python 3.x). """
- if six.PY2:
- return to_bytes(text, encoding, errors)
- else:
- return to_unicode(text, encoding, errors)
+ """ Return str representation of `text` """
+ return to_unicode(text, encoding, errors)

View File

@@ -1,3 +1,45 @@
-------------------------------------------------------------------
Fri Jan 12 08:28:28 UTC 2024 - Dirk Müller <dmueller@suse.com>
- update to 2.1.2:
* Fix test failures on Python 3.11.4+ (#212, #213).
* Fix an incorrect type hint (#211).
* Add project URLs to setup.py (#215).
* Dropped Python 3.6 support, and made Python 3.11 support
official.
* :func:`~w3lib.url.safe_url_string` now generates safer
URLs.
* :func:`~w3lib.url.canonicalize_url` now strips spaces from
the input URL, to be more in line with the `URL living standard`_.
* :func:`~w3lib.html.get_base_url` now ignores HTML comments.
* Fixed :func:`~w3lib.url.safe_url_string` re-encoding
percent signs on the URL username and password even when
they were being used as part of an escape sequence. (#187, #196)
* Fixed :func:`~w3lib.http.basic_auth_header` using the wrong
flavor of base64 encoding, which could prevent authentication
in rare cases.
* Python 2 is no longer supported;
* The ``w3lib.form`` module is removed.
* The ``w3lib.html.remove_entities`` function is removed.
* The ``w3lib.url.urljoin_rfc`` function is removed.
* The following functions are deprecated, and will be removed
in future releases
* Type annotations are added (#172, #184).
* Added support for Python 3.9 and 3.10 (#168, #176).
* Fixed :func:`w3lib.html.get_meta_refresh` for ``<meta>`` tags
where ``http-equiv`` is written after ``content`` (#179).
* Fixed :func:`w3lib.url.safe_url_string` for IDNA domains with
ports (#174).
* :func:`w3lib.url.url_query_cleaner` no longer adds an
unneeded ``#`` when ``keep_fragments=True`` is passed, and
the URL doesn't have a fragment
* Removed a workaround for an ancient pathname2url bug (#142)
* CI is migrated to GitHub Actions (#166, #177); other CI
improvements
* The code is formatted using black (#173).
- drop 166-add-xfail-test_add_or_replace_parameter_fail.patch,
python-w3lib-no-six.patch: upstream
-------------------------------------------------------------------
Fri Oct 14 19:10:33 UTC 2022 - pgajdos@suse.com
@@ -38,7 +80,7 @@ Thu Aug 29 13:15:56 UTC 2019 - Marketa Calabkova <mcalabkova@suse.com>
Fri Mar 29 09:53:27 UTC 2019 - pgajdos@suse.com
- version update to 1.20.0
* Fix url_query_cleaner to do not append "?" to urls without a
* Fix url_query_cleaner to do not append "?" to urls without a
query string (issue #109)
* Add support for Python 3.7 and drop Python 3.3 (issue #113)
* Add `w3lib.url.add_or_replace_parameters` helper (issue #117)
@@ -113,7 +155,7 @@ Fri Nov 15 20:36:22 UTC 2013 - p.drouand@gmail.com
- Update to version 1.5
+ No changelog available
- Add python-setuptools BuildRequires; new dependency
- Add python-setuptools BuildRequires; new dependency
-------------------------------------------------------------------
Wed May 23 21:43:14 UTC 2012 - jfunk@funktronics.ca

View File

@@ -1,7 +1,7 @@
#
# spec file for package python-w3lib
#
# Copyright (c) 2022 SUSE LLC
# Copyright (c) 2024 SUSE LLC
#
# All modifications and additions to the file contributed by third parties
# remain the property of their copyright owners, unless otherwise agreed
@@ -16,20 +16,15 @@
#
%{?!python_module:%define python_module() python-%{**} python3-%{**}}
%{?sle15_python_module_pythons}
Name: python-w3lib
Version: 1.22.0
Version: 2.1.2
Release: 0
Summary: Library of Web-Related Functions
License: BSD-3-Clause
Group: Development/Languages/Python
URL: https://github.com/scrapy/w3lib
Source: https://files.pythonhosted.org/packages/source/w/w3lib/w3lib-%{version}.tar.gz
# PATCH-FIX-UPSTREAM 166-add-xfail-test_add_or_replace_parameter_fail.patch mcepl@suse.com
# Allow working with Python fixed CVE-2021-23336
Patch0: 166-add-xfail-test_add_or_replace_parameter_fail.patch
# https://github.com/scrapy/w3lib/commit/c16d7bac3af3148b7018c67ef7922a5da6b3e640
Patch1: python-w3lib-no-six.patch
BuildRequires: %{python_module pytest}
BuildRequires: %{python_module setuptools}
BuildRequires: fdupes

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df
size 39121

3
w3lib-2.1.2.tar.gz Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ed5b74e997eea2abe3c1321f916e344144ee8e9072a6f33463ee8e57f858a4b1
size 48715