diff --git a/166-add-xfail-test_add_or_replace_parameter_fail.patch b/166-add-xfail-test_add_or_replace_parameter_fail.patch deleted file mode 100644 index 094244e..0000000 --- a/166-add-xfail-test_add_or_replace_parameter_fail.patch +++ /dev/null @@ -1,80 +0,0 @@ -From 34c62eb265cdb75b748d8aca43a2f8b9581dbd6a Mon Sep 17 00:00:00 2001 -From: Eugenio Lacuesta -Date: Wed, 10 Mar 2021 12:20:24 -0300 -Subject: [PATCH 1/8] [CI] Run tests on GitHub actions - ---- - tests/test_url.py | 24 ++++++++++++++---------- - 1 file changed, 14 insertions(+), 10 deletions(-) - delete mode 100644 .github/workflows/build.yml - create mode 100644 .github/workflows/tests.yml - ---- a/tests/test_url.py -+++ b/tests/test_url.py -@@ -2,11 +2,14 @@ - from __future__ import absolute_import - import os - import unittest -+ -+import pytest -+from six.moves.urllib.parse import urlparse -+ - from w3lib.url import (is_url, safe_url_string, safe_download_url, - url_query_parameter, add_or_replace_parameter, url_query_cleaner, - file_uri_to_path, parse_data_uri, path_to_file_uri, any_to_uri, - urljoin_rfc, canonicalize_url, parse_url, add_or_replace_parameters) --from six.moves.urllib.parse import urlparse - - - class UrlTests(unittest.TestCase): -@@ -76,17 +79,16 @@ class UrlTests(unittest.TestCase): - def test_safe_url_string_unsafe_chars(self): - safeurl = safe_url_string(r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|") - self.assertEqual(safeurl, r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|") -- -+ - def test_safe_url_string_quote_path(self): - safeurl = safe_url_string(u'http://google.com/"hello"', quote_path=True) - self.assertEqual(safeurl, u'http://google.com/%22hello%22') -- -+ - safeurl = safe_url_string(u'http://google.com/"hello"', quote_path=False) - self.assertEqual(safeurl, u'http://google.com/"hello"') -- -+ - safeurl = safe_url_string(u'http://google.com/"hello"') - self.assertEqual(safeurl, u'http://google.com/%22hello%22') -- - - def test_safe_url_string_with_query(self): - safeurl = safe_url_string(u"http://www.example.com/£?unit=µ") -@@ -299,10 +301,6 @@ class UrlTests(unittest.TestCase): - self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3'), - 'http://domain/test?arg1=v1&arg2=v2&arg3=nv3') - -- url = 'http://domain/test?arg1=v1;arg2=v2' -- self.assertEqual(add_or_replace_parameter(url, 'arg1', 'v3'), -- 'http://domain/test?arg1=v3&arg2=v2') -- - self.assertEqual(add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20'), - 'http://domain/moreInfo.asp?prodID=20') - url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60' -@@ -327,6 +325,13 @@ class UrlTests(unittest.TestCase): - self.assertEqual(add_or_replace_parameter(url, 'arg1', 'v3'), - 'http://domain/test?arg1=v3&arg2=v2') - -+ @pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164") -+ def test_add_or_replace_parameter_fail(self): -+ self.assertEqual( -+ add_or_replace_parameter('http://domain/test?arg1=v1;arg2=v2', 'arg1', 'v3'), -+ 'http://domain/test?arg1=v3&arg2=v2' -+ ) -+ - def test_add_or_replace_parameters(self): - url = 'http://domain/test' - self.assertEqual(add_or_replace_parameters(url, {'arg': 'v'}), -@@ -741,4 +746,3 @@ class DataURITests(unittest.TestCase): - - if __name__ == "__main__": - unittest.main() -- diff --git a/python-w3lib-no-six.patch b/python-w3lib-no-six.patch deleted file mode 100644 index 1cf0b59..0000000 --- a/python-w3lib-no-six.patch +++ /dev/null @@ -1,641 +0,0 @@ -Index: w3lib-1.22.0/setup.py -=================================================================== ---- w3lib-1.22.0.orig/setup.py -+++ w3lib-1.22.0/setup.py -@@ -29,5 +29,4 @@ setup( - 'Programming Language :: Python :: Implementation :: PyPy', - 'Topic :: Internet :: WWW/HTTP', - ], -- install_requires=['six >= 1.4.1'], - ) -Index: w3lib-1.22.0/tests/test_encoding.py -=================================================================== ---- w3lib-1.22.0.orig/tests/test_encoding.py -+++ w3lib-1.22.0/tests/test_encoding.py -@@ -1,7 +1,14 @@ --import unittest, codecs --import six --from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode, -- http_content_type_encoding, resolve_encoding, html_to_unicode) -+import codecs -+import unittest -+ -+from w3lib.encoding import ( -+ html_body_declared_encoding, -+ http_content_type_encoding, -+ html_to_unicode, -+ read_bom, -+ resolve_encoding, -+ to_unicode, -+) - - class RequestEncodingTests(unittest.TestCase): - utf8_fragments = [ -@@ -107,18 +114,18 @@ class HtmlConversionTests(unittest.TestC - original_string = unicode_string.encode('cp1251') - encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string) - # check body_as_unicode -- self.assertTrue(isinstance(body_unicode, six.text_type)) -+ self.assertTrue(isinstance(body_unicode, str)) - self.assertEqual(body_unicode, unicode_string) - - def _assert_encoding(self, content_type, body, expected_encoding, - expected_unicode): -- assert not isinstance(body, six.text_type) -+ assert not isinstance(body, str) - encoding, body_unicode = html_to_unicode(ct(content_type), body) -- self.assertTrue(isinstance(body_unicode, six.text_type)) -+ self.assertTrue(isinstance(body_unicode, str)) - self.assertEqual(norm_encoding(encoding), - norm_encoding(expected_encoding)) - -- if isinstance(expected_unicode, six.string_types): -+ if isinstance(expected_unicode, str): - self.assertEqual(body_unicode, expected_unicode) - else: - self.assertTrue( -@@ -177,9 +184,9 @@ class HtmlConversionTests(unittest.TestC - - def _assert_encoding_detected(self, content_type, expected_encoding, body, - **kwargs): -- assert not isinstance(body, six.text_type) -+ assert not isinstance(body, str) - encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs) -- self.assertTrue(isinstance(body_unicode, six.text_type)) -+ self.assertTrue(isinstance(body_unicode, str)) - self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding)) - - def test_BOM(self): -Index: w3lib-1.22.0/tests/test_html.py -=================================================================== ---- w3lib-1.22.0.orig/tests/test_html.py -+++ w3lib-1.22.0/tests/test_html.py -@@ -1,18 +1,25 @@ --# -*- coding: utf-8 -*- - import unittest --import six --from w3lib.html import (replace_entities, replace_tags, remove_comments, -- remove_tags_with_content, replace_escape_chars, remove_tags, unquote_markup, -- get_base_url, get_meta_refresh) -+ -+from w3lib.html import ( -+ get_base_url, -+ get_meta_refresh, -+ remove_comments, -+ remove_tags, -+ remove_tags_with_content, -+ replace_entities, -+ replace_escape_chars, -+ replace_tags, -+ unquote_markup, -+) - - - class RemoveEntitiesTest(unittest.TestCase): - def test_returns_unicode(self): - # make sure it always return uncode -- assert isinstance(replace_entities(b'no entities'), six.text_type) -- assert isinstance(replace_entities(b'Price: £100!'), six.text_type) -- assert isinstance(replace_entities(u'no entities'), six.text_type) -- assert isinstance(replace_entities(u'Price: £100!'), six.text_type) -+ assert isinstance(replace_entities(b'no entities'), str) -+ assert isinstance(replace_entities(b'Price: £100!'), str) -+ assert isinstance(replace_entities(u'no entities'), str) -+ assert isinstance(replace_entities(u'Price: £100!'), str) - - def test_regular(self): - # regular conversions -@@ -71,8 +78,8 @@ class RemoveEntitiesTest(unittest.TestCa - class ReplaceTagsTest(unittest.TestCase): - def test_returns_unicode(self): - # make sure it always return uncode -- assert isinstance(replace_tags(b'no entities'), six.text_type) -- assert isinstance(replace_tags('no entities'), six.text_type) -+ assert isinstance(replace_tags(b'no entities'), str) -+ assert isinstance(replace_tags('no entities'), str) - - def test_replace_tags(self): - self.assertEqual(replace_tags(u'This text contains some tag'), -@@ -88,10 +95,10 @@ class ReplaceTagsTest(unittest.TestCase) - class RemoveCommentsTest(unittest.TestCase): - def test_returns_unicode(self): - # make sure it always return unicode -- assert isinstance(remove_comments(b'without comments'), six.text_type) -- assert isinstance(remove_comments(b''), six.text_type) -- assert isinstance(remove_comments(u'without comments'), six.text_type) -- assert isinstance(remove_comments(u''), six.text_type) -+ assert isinstance(remove_comments(b'without comments'), str) -+ assert isinstance(remove_comments(b''), str) -+ assert isinstance(remove_comments(u'without comments'), str) -+ assert isinstance(remove_comments(u''), str) - - def test_no_comments(self): - # text without comments -@@ -112,16 +119,16 @@ class RemoveCommentsTest(unittest.TestCa - class RemoveTagsTest(unittest.TestCase): - def test_returns_unicode(self): - # make sure it always return unicode -- assert isinstance(remove_tags(b'no tags'), six.text_type) -- assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type) -- assert isinstance(remove_tags(b'

one tag

'), six.text_type) -- assert isinstance(remove_tags(b'

one tag

', which_ones=('p')), six.text_type) -- assert isinstance(remove_tags(b'link', which_ones=('b',)), six.text_type) -- assert isinstance(remove_tags(u'no tags'), six.text_type) -- assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type) -- assert isinstance(remove_tags(u'

one tag

'), six.text_type) -- assert isinstance(remove_tags(u'

one tag

', which_ones=('p')), six.text_type) -- assert isinstance(remove_tags(u'link', which_ones=('b',)), six.text_type) -+ assert isinstance(remove_tags(b'no tags'), str) -+ assert isinstance(remove_tags(b'no tags', which_ones=('p',)), str) -+ assert isinstance(remove_tags(b'

one tag

'), str) -+ assert isinstance(remove_tags(b'

one tag

', which_ones=('p')), str) -+ assert isinstance(remove_tags(b'link', which_ones=('b',)), str) -+ assert isinstance(remove_tags(u'no tags'), str) -+ assert isinstance(remove_tags(u'no tags', which_ones=('p',)), str) -+ assert isinstance(remove_tags(u'

one tag

'), str) -+ assert isinstance(remove_tags(u'

one tag

', which_ones=('p')), str) -+ assert isinstance(remove_tags(u'link', which_ones=('b',)), str) - - def test_remove_tags_without_tags(self): - # text without tags -@@ -160,14 +167,14 @@ class RemoveTagsTest(unittest.TestCase): - class RemoveTagsWithContentTest(unittest.TestCase): - def test_returns_unicode(self): - # make sure it always return unicode -- assert isinstance(remove_tags_with_content(b'no tags'), six.text_type) -- assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), six.text_type) -- assert isinstance(remove_tags_with_content(b'

one tag

', which_ones=('p',)), six.text_type) -- assert isinstance(remove_tags_with_content(b'link', which_ones=('b',)), six.text_type) -- assert isinstance(remove_tags_with_content(u'no tags'), six.text_type) -- assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), six.text_type) -- assert isinstance(remove_tags_with_content(u'

one tag

', which_ones=('p',)), six.text_type) -- assert isinstance(remove_tags_with_content(u'link', which_ones=('b',)), six.text_type) -+ assert isinstance(remove_tags_with_content(b'no tags'), str) -+ assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), str) -+ assert isinstance(remove_tags_with_content(b'

one tag

', which_ones=('p',)), str) -+ assert isinstance(remove_tags_with_content(b'link', which_ones=('b',)), str) -+ assert isinstance(remove_tags_with_content(u'no tags'), str) -+ assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), str) -+ assert isinstance(remove_tags_with_content(u'

one tag

', which_ones=('p',)), str) -+ assert isinstance(remove_tags_with_content(u'link', which_ones=('b',)), str) - - def test_without_tags(self): - # text without tags -@@ -194,13 +201,13 @@ class RemoveTagsWithContentTest(unittest - class ReplaceEscapeCharsTest(unittest.TestCase): - def test_returns_unicode(self): - # make sure it always return unicode -- assert isinstance(replace_escape_chars(b'no ec'), six.text_type) -- assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), six.text_type) -- assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), six.text_type) -- assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), six.text_type) -- assert isinstance(replace_escape_chars(u'no ec'), six.text_type) -- assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), six.text_type) -- assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), six.text_type) -+ assert isinstance(replace_escape_chars(b'no ec'), str) -+ assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), str) -+ assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), str) -+ assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), str) -+ assert isinstance(replace_escape_chars(u'no ec'), str) -+ assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), str) -+ assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), str) - - def test_without_escape_chars(self): - # text without escape chars -@@ -226,8 +233,8 @@ class UnquoteMarkupTest(unittest.TestCas - - def test_returns_unicode(self): - # make sure it always return unicode -- assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), six.text_type) -- assert isinstance(unquote_markup(self.sample_txt2), six.text_type) -+ assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), str) -+ assert isinstance(unquote_markup(self.sample_txt2), str) - - def test_unquote_markup(self): - self.assertEqual(unquote_markup(self.sample_txt1), u"""hi, this is sample text with entities: & \xa9 -Index: w3lib-1.22.0/tests/test_url.py -=================================================================== ---- w3lib-1.22.0.orig/tests/test_url.py -+++ w3lib-1.22.0/tests/test_url.py -@@ -1,15 +1,25 @@ --# -*- coding: utf-8 -*- --from __future__ import absolute_import - import os - import unittest -+from urllib.parse import urlparse - - import pytest --from six.moves.urllib.parse import urlparse - --from w3lib.url import (is_url, safe_url_string, safe_download_url, -- url_query_parameter, add_or_replace_parameter, url_query_cleaner, -- file_uri_to_path, parse_data_uri, path_to_file_uri, any_to_uri, -- urljoin_rfc, canonicalize_url, parse_url, add_or_replace_parameters) -+from w3lib.url import ( -+ add_or_replace_parameter, -+ add_or_replace_parameters, -+ any_to_uri, -+ canonicalize_url, -+ file_uri_to_path, -+ is_url, -+ parse_data_uri, -+ parse_url, -+ path_to_file_uri, -+ safe_download_url, -+ safe_url_string, -+ url_query_parameter, -+ url_query_cleaner, -+ urljoin_rfc, -+) - - - class UrlTests(unittest.TestCase): -Index: w3lib-1.22.0/w3lib/form.py -=================================================================== ---- w3lib-1.22.0.orig/w3lib/form.py -+++ w3lib-1.22.0/w3lib/form.py -@@ -1,9 +1,6 @@ - import warnings --import six --if six.PY2: -- from cStringIO import StringIO as BytesIO --else: -- from io import BytesIO -+from io import BytesIO -+ - from w3lib.util import unicode_to_str - - -Index: w3lib-1.22.0/w3lib/html.py -=================================================================== ---- w3lib-1.22.0.orig/w3lib/html.py -+++ w3lib-1.22.0/w3lib/html.py -@@ -5,16 +5,16 @@ Functions for dealing with markup text - - import warnings - import re --import six --from six import moves -+from html.entities import name2codepoint -+from urllib.parse import urljoin - - from w3lib.util import to_bytes, to_unicode - from w3lib.url import safe_url_string - - _ent_re = re.compile(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)', re.IGNORECASE) - _tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL) --_baseurl_re = re.compile(six.u(r']*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I) --_meta_refresh_re = re.compile(six.u(r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE) -+_baseurl_re = re.compile(r']*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I) -+_meta_refresh_re = re.compile(r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)', re.DOTALL | re.IGNORECASE) - _cdata_re = re.compile(r'((?P.*?)(?P\]\]>))', re.DOTALL) - - HTML5_WHITESPACE = ' \t\n\r\x0c' -@@ -77,8 +77,10 @@ def replace_entities(text, keep=(), remo - if entity_name.lower() in keep: - return m.group(0) - else: -- number = (moves.html_entities.name2codepoint.get(entity_name) or -- moves.html_entities.name2codepoint.get(entity_name.lower())) -+ number = ( -+ name2codepoint.get(entity_name) -+ or name2codepoint.get(entity_name.lower()) -+ ) - if number is not None: - # Numeric character references in the 80-9F range are typically - # interpreted by browsers as representing the characters mapped -@@ -86,9 +88,9 @@ def replace_entities(text, keep=(), remo - # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML - try: - if 0x80 <= number <= 0x9f: -- return six.int2byte(number).decode('cp1252') -+ return bytes((number,)).decode('cp1252') - else: -- return six.unichr(number) -+ return chr(number) - except ValueError: - pass - -@@ -265,7 +267,7 @@ def unquote_markup(text, keep=(), remove - text = to_unicode(text, encoding) - ret_text = u'' - for fragment in _get_fragments(text, _cdata_re): -- if isinstance(fragment, six.string_types): -+ if isinstance(fragment, str): - # it's not a CDATA (so we try to remove its entities) - ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal) - else: -@@ -284,7 +286,7 @@ def get_base_url(text, baseurl='', encod - text = to_unicode(text, encoding) - m = _baseurl_re.search(text) - if m: -- return moves.urllib.parse.urljoin( -+ return urljoin( - safe_url_string(baseurl), - safe_url_string(m.group(1), encoding=encoding) - ) -@@ -301,8 +303,6 @@ def get_meta_refresh(text, baseurl='', e - - """ - -- if six.PY2: -- baseurl = to_bytes(baseurl, encoding) - try: - text = to_unicode(text, encoding) - except UnicodeDecodeError: -@@ -314,7 +314,7 @@ def get_meta_refresh(text, baseurl='', e - if m: - interval = float(m.group('int')) - url = safe_url_string(m.group('url').strip(' "\''), encoding) -- url = moves.urllib.parse.urljoin(baseurl, url) -+ url = urljoin(baseurl, url) - return interval, url - else: - return None, None -Index: w3lib-1.22.0/w3lib/url.py -=================================================================== ---- w3lib-1.22.0.orig/w3lib/url.py -+++ w3lib-1.22.0/w3lib/url.py -@@ -5,17 +5,28 @@ library. - import base64 - import codecs - import os --import re - import posixpath --import warnings -+import re - import string -+import warnings - from collections import namedtuple --import six --from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit, -- urldefrag, urlencode, urlparse, -- quote, parse_qs, parse_qsl, -- ParseResult, unquote, urlunparse) --from six.moves.urllib.request import pathname2url, url2pathname -+from urllib.parse import ( -+ _coerce_args, -+ parse_qs, -+ parse_qsl, -+ ParseResult, -+ quote, -+ unquote, -+ unquote_to_bytes, -+ urldefrag, -+ urlencode, -+ urljoin, -+ urlparse, -+ urlsplit, -+ urlunparse, -+ urlunsplit, -+) -+from urllib.request import pathname2url, url2pathname - from w3lib.util import to_bytes, to_native_str, to_unicode - - -@@ -184,7 +195,7 @@ def url_query_cleaner(url, parameterlist - - """ - -- if isinstance(parameterlist, (six.text_type, bytes)): -+ if isinstance(parameterlist, (str, bytes)): - parameterlist = [parameterlist] - url, fragment = urldefrag(url) - base, _, query = url.partition('?') -@@ -346,10 +357,7 @@ def parse_data_uri(uri): - # delimiters, but it makes parsing easier and should not affect - # well-formed URIs, as the delimiters used in this URI scheme are not - # allowed, percent-encoded or not, in tokens. -- if six.PY2: -- uri = unquote(uri) -- else: -- uri = unquote_to_bytes(uri) -+ uri = unquote_to_bytes(uri) - - media_type = "text/plain" - media_type_params = {} -@@ -469,33 +477,32 @@ def canonicalize_url(url, keep_blank_val - # 1. decode query-string as UTF-8 (or keep raw bytes), - # sort values, - # and percent-encode them back -- if six.PY2: -- keyvals = parse_qsl(query, keep_blank_values) -- else: -- # Python3's urllib.parse.parse_qsl does not work as wanted -- # for percent-encoded characters that do not match passed encoding, -- # they get lost. -- # -- # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')] -- # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD), -- # instead of \xa3 that you get with Python2's parse_qsl) -- # -- # what we want here is to keep raw bytes, and percent encode them -- # so as to preserve whatever encoding what originally used. -- # -- # See https://tools.ietf.org/html/rfc3987#section-6.4: -- # -- # For example, it is possible to have a URI reference of -- # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the -- # document name is encoded in iso-8859-1 based on server settings, but -- # where the fragment identifier is encoded in UTF-8 according to -- # [XPointer]. The IRI corresponding to the above URI would be (in XML -- # notation) -- # "http://www.example.org/r%E9sum%E9.xml#résumé". -- # Similar considerations apply to query parts. The functionality of -- # IRIs (namely, to be able to include non-ASCII characters) can only be -- # used if the query part is encoded in UTF-8. -- keyvals = parse_qsl_to_bytes(query, keep_blank_values) -+ -+ # Python's urllib.parse.parse_qsl does not work as wanted -+ # for percent-encoded characters that do not match passed encoding, -+ # they get lost. -+ # -+ # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')] -+ # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD), -+ # instead of \xa3 that you get with Python2's parse_qsl) -+ # -+ # what we want here is to keep raw bytes, and percent encode them -+ # so as to preserve whatever encoding what originally used. -+ # -+ # See https://tools.ietf.org/html/rfc3987#section-6.4: -+ # -+ # For example, it is possible to have a URI reference of -+ # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the -+ # document name is encoded in iso-8859-1 based on server settings, but -+ # where the fragment identifier is encoded in UTF-8 according to -+ # [XPointer]. The IRI corresponding to the above URI would be (in XML -+ # notation) -+ # "http://www.example.org/r%E9sum%E9.xml#résumé". -+ # Similar considerations apply to query parts. The functionality of -+ # IRIs (namely, to be able to include non-ASCII characters) can only be -+ # used if the query part is encoded in UTF-8. -+ keyvals = parse_qsl_to_bytes(query, keep_blank_values) -+ - keyvals.sort() - query = urlencode(keyvals) - -@@ -519,17 +526,12 @@ def _unquotepath(path): - for reserved in ('2f', '2F', '3f', '3F'): - path = path.replace('%' + reserved, '%25' + reserved.upper()) - -- if six.PY2: -- # in Python 2, '%a3' becomes '\xa3', which is what we want -- return unquote(path) -- else: -- # in Python 3, -- # standard lib's unquote() does not work for non-UTF-8 -- # percent-escaped characters, they get lost. -- # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD) -- # -- # unquote_to_bytes() returns raw bytes instead -- return unquote_to_bytes(path) -+ # standard lib's unquote() does not work for non-UTF-8 -+ # percent-escaped characters, they get lost. -+ # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD) -+ # -+ # unquote_to_bytes() returns raw bytes instead -+ return unquote_to_bytes(path) - - - def parse_url(url, encoding=None): -@@ -541,51 +543,48 @@ def parse_url(url, encoding=None): - return urlparse(to_unicode(url, encoding)) - - --if not six.PY2: -- from urllib.parse import _coerce_args, unquote_to_bytes -+def parse_qsl_to_bytes(qs, keep_blank_values=False): -+ """Parse a query given as a string argument. -+ -+ Data are returned as a list of name, value pairs as bytes. - -- def parse_qsl_to_bytes(qs, keep_blank_values=False): -- """Parse a query given as a string argument. -+ Arguments: - -- Data are returned as a list of name, value pairs as bytes. -+ qs: percent-encoded query string to be parsed - -- Arguments: -- -- qs: percent-encoded query string to be parsed -- -- keep_blank_values: flag indicating whether blank values in -- percent-encoded queries should be treated as blank strings. A -- true value indicates that blanks should be retained as blank -- strings. The default false value indicates that blank values -- are to be ignored and treated as if they were not included. -- -- """ -- # This code is the same as Python3's parse_qsl() -- # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a) -- # except for the unquote(s, encoding, errors) calls replaced -- # with unquote_to_bytes(s) -- qs, _coerce_result = _coerce_args(qs) -- pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] -- r = [] -- for name_value in pairs: -- if not name_value: -+ keep_blank_values: flag indicating whether blank values in -+ percent-encoded queries should be treated as blank strings. A -+ true value indicates that blanks should be retained as blank -+ strings. The default false value indicates that blank values -+ are to be ignored and treated as if they were not included. -+ -+ """ -+ # This code is the same as Python3's parse_qsl() -+ # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a) -+ # except for the unquote(s, encoding, errors) calls replaced -+ # with unquote_to_bytes(s) -+ qs, _coerce_result = _coerce_args(qs) -+ pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] -+ r = [] -+ for name_value in pairs: -+ if not name_value: -+ continue -+ nv = name_value.split('=', 1) -+ if len(nv) != 2: -+ # Handle case of a control-name with no equal sign -+ if keep_blank_values: -+ nv.append('') -+ else: - continue -- nv = name_value.split('=', 1) -- if len(nv) != 2: -- # Handle case of a control-name with no equal sign -- if keep_blank_values: -- nv.append('') -- else: -- continue -- if len(nv[1]) or keep_blank_values: -- name = nv[0].replace('+', ' ') -- name = unquote_to_bytes(name) -- name = _coerce_result(name) -- value = nv[1].replace('+', ' ') -- value = unquote_to_bytes(value) -- value = _coerce_result(value) -- r.append((name, value)) -- return r -+ if len(nv[1]) or keep_blank_values: -+ name = nv[0].replace('+', ' ') -+ name = unquote_to_bytes(name) -+ name = _coerce_result(name) -+ value = nv[1].replace('+', ' ') -+ value = unquote_to_bytes(value) -+ value = _coerce_result(value) -+ r.append((name, value)) -+ return r - - - def urljoin_rfc(base, ref, encoding='utf-8'): -Index: w3lib-1.22.0/w3lib/util.py -=================================================================== ---- w3lib-1.22.0.orig/w3lib/util.py -+++ w3lib-1.22.0/w3lib/util.py -@@ -1,5 +1,3 @@ --import six -- - def str_to_unicode(text, encoding=None, errors='strict'): - if encoding is None: - encoding = 'utf-8' -@@ -10,16 +8,16 @@ def str_to_unicode(text, encoding=None, - def unicode_to_str(text, encoding=None, errors='strict'): - if encoding is None: - encoding = 'utf-8' -- if isinstance(text, six.text_type): -+ if isinstance(text, str): - return text.encode(encoding, errors) - return text - - def to_unicode(text, encoding=None, errors='strict'): - """Return the unicode representation of a bytes object `text`. If `text` - is already an unicode object, return it as-is.""" -- if isinstance(text, six.text_type): -+ if isinstance(text, str): - return text -- if not isinstance(text, (bytes, six.text_type)): -+ if not isinstance(text, (bytes, str)): - raise TypeError('to_unicode must receive a bytes, str or unicode ' - 'object, got %s' % type(text).__name__) - if encoding is None: -@@ -31,7 +29,7 @@ def to_bytes(text, encoding=None, errors - is already a bytes object, return it as-is.""" - if isinstance(text, bytes): - return text -- if not isinstance(text, six.string_types): -+ if not isinstance(text, str): - raise TypeError('to_bytes must receive a unicode, str or bytes ' - 'object, got %s' % type(text).__name__) - if encoding is None: -@@ -39,9 +37,5 @@ def to_bytes(text, encoding=None, errors - return text.encode(encoding, errors) - - def to_native_str(text, encoding=None, errors='strict'): -- """ Return str representation of `text` -- (bytes in Python 2.x and unicode in Python 3.x). """ -- if six.PY2: -- return to_bytes(text, encoding, errors) -- else: -- return to_unicode(text, encoding, errors) -+ """ Return str representation of `text` """ -+ return to_unicode(text, encoding, errors) diff --git a/python-w3lib.changes b/python-w3lib.changes index a25e5ea..b509e66 100644 --- a/python-w3lib.changes +++ b/python-w3lib.changes @@ -1,3 +1,45 @@ +------------------------------------------------------------------- +Fri Jan 12 08:28:28 UTC 2024 - Dirk Müller + +- update to 2.1.2: + * Fix test failures on Python 3.11.4+ (#212, #213). + * Fix an incorrect type hint (#211). + * Add project URLs to setup.py (#215). + * Dropped Python 3.6 support, and made Python 3.11 support + official. + * :func:`~w3lib.url.safe_url_string` now generates safer + URLs. + * :func:`~w3lib.url.canonicalize_url` now strips spaces from + the input URL, to be more in line with the `URL living standard`_. + * :func:`~w3lib.html.get_base_url` now ignores HTML comments. + * Fixed :func:`~w3lib.url.safe_url_string` re-encoding + percent signs on the URL username and password even when + they were being used as part of an escape sequence. (#187, #196) + * Fixed :func:`~w3lib.http.basic_auth_header` using the wrong + flavor of base64 encoding, which could prevent authentication + in rare cases. + * Python 2 is no longer supported; + * The ``w3lib.form`` module is removed. + * The ``w3lib.html.remove_entities`` function is removed. + * The ``w3lib.url.urljoin_rfc`` function is removed. + * The following functions are deprecated, and will be removed + in future releases + * Type annotations are added (#172, #184). + * Added support for Python 3.9 and 3.10 (#168, #176). + * Fixed :func:`w3lib.html.get_meta_refresh` for ```` tags + where ``http-equiv`` is written after ``content`` (#179). + * Fixed :func:`w3lib.url.safe_url_string` for IDNA domains with + ports (#174). + * :func:`w3lib.url.url_query_cleaner` no longer adds an + unneeded ``#`` when ``keep_fragments=True`` is passed, and + the URL doesn't have a fragment + * Removed a workaround for an ancient pathname2url bug (#142) + * CI is migrated to GitHub Actions (#166, #177); other CI + improvements + * The code is formatted using black (#173). +- drop 166-add-xfail-test_add_or_replace_parameter_fail.patch, + python-w3lib-no-six.patch: upstream + ------------------------------------------------------------------- Fri Oct 14 19:10:33 UTC 2022 - pgajdos@suse.com @@ -38,7 +80,7 @@ Thu Aug 29 13:15:56 UTC 2019 - Marketa Calabkova Fri Mar 29 09:53:27 UTC 2019 - pgajdos@suse.com - version update to 1.20.0 - * Fix url_query_cleaner to do not append "?" to urls without a + * Fix url_query_cleaner to do not append "?" to urls without a query string (issue #109) * Add support for Python 3.7 and drop Python 3.3 (issue #113) * Add `w3lib.url.add_or_replace_parameters` helper (issue #117) @@ -113,7 +155,7 @@ Fri Nov 15 20:36:22 UTC 2013 - p.drouand@gmail.com - Update to version 1.5 + No changelog available -- Add python-setuptools BuildRequires; new dependency +- Add python-setuptools BuildRequires; new dependency ------------------------------------------------------------------- Wed May 23 21:43:14 UTC 2012 - jfunk@funktronics.ca diff --git a/python-w3lib.spec b/python-w3lib.spec index 3cff738..355189f 100644 --- a/python-w3lib.spec +++ b/python-w3lib.spec @@ -1,7 +1,7 @@ # # spec file for package python-w3lib # -# Copyright (c) 2022 SUSE LLC +# Copyright (c) 2024 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -16,20 +16,15 @@ # -%{?!python_module:%define python_module() python-%{**} python3-%{**}} +%{?sle15_python_module_pythons} Name: python-w3lib -Version: 1.22.0 +Version: 2.1.2 Release: 0 Summary: Library of Web-Related Functions License: BSD-3-Clause Group: Development/Languages/Python URL: https://github.com/scrapy/w3lib Source: https://files.pythonhosted.org/packages/source/w/w3lib/w3lib-%{version}.tar.gz -# PATCH-FIX-UPSTREAM 166-add-xfail-test_add_or_replace_parameter_fail.patch mcepl@suse.com -# Allow working with Python fixed CVE-2021-23336 -Patch0: 166-add-xfail-test_add_or_replace_parameter_fail.patch -# https://github.com/scrapy/w3lib/commit/c16d7bac3af3148b7018c67ef7922a5da6b3e640 -Patch1: python-w3lib-no-six.patch BuildRequires: %{python_module pytest} BuildRequires: %{python_module setuptools} BuildRequires: fdupes diff --git a/w3lib-1.22.0.tar.gz b/w3lib-1.22.0.tar.gz deleted file mode 100644 index 32e390f..0000000 --- a/w3lib-1.22.0.tar.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df -size 39121 diff --git a/w3lib-2.1.2.tar.gz b/w3lib-2.1.2.tar.gz new file mode 100644 index 0000000..608daeb --- /dev/null +++ b/w3lib-2.1.2.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed5b74e997eea2abe3c1321f916e344144ee8e9072a6f33463ee8e57f858a4b1 +size 48715