Index: w3lib-1.22.0/setup.py
===================================================================
--- w3lib-1.22.0.orig/setup.py
+++ w3lib-1.22.0/setup.py
@@ -29,5 +29,4 @@ setup(
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Internet :: WWW/HTTP',
     ],
-    install_requires=['six >= 1.4.1'],
 )
Index: w3lib-1.22.0/tests/test_encoding.py
===================================================================
--- w3lib-1.22.0.orig/tests/test_encoding.py
+++ w3lib-1.22.0/tests/test_encoding.py
@@ -1,7 +1,14 @@
-import unittest, codecs
-import six
-from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
-    http_content_type_encoding, resolve_encoding, html_to_unicode)
+import codecs
+import unittest
+
+from w3lib.encoding import (
+    html_body_declared_encoding,
+    http_content_type_encoding,
+    html_to_unicode,
+    read_bom,
+    resolve_encoding,
+    to_unicode,
+)
 
 class RequestEncodingTests(unittest.TestCase):
     utf8_fragments = [
@@ -107,18 +114,18 @@ class HtmlConversionTests(unittest.TestC
         original_string = unicode_string.encode('cp1251')
         encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
         # check body_as_unicode
-        self.assertTrue(isinstance(body_unicode, six.text_type))
+        self.assertTrue(isinstance(body_unicode, str))
         self.assertEqual(body_unicode, unicode_string)
 
     def _assert_encoding(self, content_type, body, expected_encoding,
                          expected_unicode):
-        assert not isinstance(body, six.text_type)
+        assert not isinstance(body, str)
         encoding, body_unicode = html_to_unicode(ct(content_type), body)
-        self.assertTrue(isinstance(body_unicode, six.text_type))
+        self.assertTrue(isinstance(body_unicode, str))
         self.assertEqual(
             norm_encoding(encoding),
             norm_encoding(expected_encoding))
-        if isinstance(expected_unicode, six.string_types):
+        if isinstance(expected_unicode, str):
             self.assertEqual(body_unicode, expected_unicode)
         else:
             self.assertTrue(
@@ -177,9 +184,9 @@ class HtmlConversionTests(unittest.TestC
 
     def _assert_encoding_detected(self, content_type, expected_encoding, body,
                                   **kwargs):
-        assert not isinstance(body, six.text_type)
+        assert not isinstance(body, str)
         encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
-        self.assertTrue(isinstance(body_unicode, six.text_type))
+        self.assertTrue(isinstance(body_unicode, str))
         self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
 
     def test_BOM(self):
Index: w3lib-1.22.0/tests/test_html.py
===================================================================
--- w3lib-1.22.0.orig/tests/test_html.py
+++ w3lib-1.22.0/tests/test_html.py
@@ -1,18 +1,25 @@
-# -*- coding: utf-8 -*-
 import unittest
-import six
-from w3lib.html import (replace_entities, replace_tags, remove_comments,
-    remove_tags_with_content, replace_escape_chars, remove_tags, unquote_markup,
-    get_base_url, get_meta_refresh)
+
+from w3lib.html import (
+    get_base_url,
+    get_meta_refresh,
+    remove_comments,
+    remove_tags,
+    remove_tags_with_content,
+    replace_entities,
+    replace_escape_chars,
+    replace_tags,
+    unquote_markup,
+)
 
 
 class RemoveEntitiesTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return uncode
-        assert isinstance(replace_entities(b'no entities'), six.text_type)
-        assert isinstance(replace_entities(b'Price: &pound;100!'), six.text_type)
-        assert isinstance(replace_entities(u'no entities'), six.text_type)
-        assert isinstance(replace_entities(u'Price: &pound;100!'), six.text_type)
+        assert isinstance(replace_entities(b'no entities'), str)
+        assert isinstance(replace_entities(b'Price: &pound;100!'), str)
+        assert isinstance(replace_entities(u'no entities'), str)
+        assert isinstance(replace_entities(u'Price: &pound;100!'), str)
 
     def test_regular(self):
         # regular conversions
@@ -71,8 +78,8 @@ class RemoveEntitiesTest(unittest.TestCa
 class ReplaceTagsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return uncode
-        assert isinstance(replace_tags(b'no entities'), six.text_type)
-        assert isinstance(replace_tags('no entities'), six.text_type)
+        assert isinstance(replace_tags(b'no entities'), str)
+        assert isinstance(replace_tags('no entities'), str)
 
     def test_replace_tags(self):
         self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
@@ -88,10 +95,10 @@ class ReplaceTagsTest(unittest.TestCase)
 class RemoveCommentsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(remove_comments(b'without comments'), six.text_type)
-        assert isinstance(remove_comments(b'<!-- with comments -->'), six.text_type)
-        assert isinstance(remove_comments(u'without comments'), six.text_type)
-        assert isinstance(remove_comments(u'<!-- with comments -->'), six.text_type)
+        assert isinstance(remove_comments(b'without comments'), str)
+        assert isinstance(remove_comments(b'<!-- with comments -->'), str)
+        assert isinstance(remove_comments(u'without comments'), str)
+        assert isinstance(remove_comments(u'<!-- with comments -->'), str)
 
     def test_no_comments(self):
         # text without comments
@@ -112,16 +119,16 @@ class RemoveCommentsTest(unittest.TestCa
 class RemoveTagsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(remove_tags(b'no tags'), six.text_type)
-        assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags(b'<p>one tag</p>'), six.text_type)
-        assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), six.text_type)
-        assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), six.text_type)
-        assert isinstance(remove_tags(u'no tags'), six.text_type)
-        assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags(u'<p>one tag</p>'), six.text_type)
-        assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), six.text_type)
-        assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), six.text_type)
+        assert isinstance(remove_tags(b'no tags'), str)
+        assert isinstance(remove_tags(b'no tags', which_ones=('p',)), str)
+        assert isinstance(remove_tags(b'<p>one tag</p>'), str)
+        assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), str)
+        assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), str)
+        assert isinstance(remove_tags(u'no tags'), str)
+        assert isinstance(remove_tags(u'no tags', which_ones=('p',)), str)
+        assert isinstance(remove_tags(u'<p>one tag</p>'), str)
+        assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), str)
+        assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), str)
 
     def test_remove_tags_without_tags(self):
         # text without tags
@@ -160,14 +167,14 @@ class RemoveTagsTest(unittest.TestCase):
 class RemoveTagsWithContentTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(remove_tags_with_content(b'no tags'), six.text_type)
-        assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags_with_content(b'<a>link</a>', which_ones=('b',)), six.text_type)
-        assert isinstance(remove_tags_with_content(u'no tags'), six.text_type)
-        assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags_with_content(u'<a>link</a>', which_ones=('b',)), six.text_type)
+        assert isinstance(remove_tags_with_content(b'no tags'), str)
+        assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), str)
+        assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), str)
+        assert isinstance(remove_tags_with_content(b'<a>link</a>', which_ones=('b',)), str)
+        assert isinstance(remove_tags_with_content(u'no tags'), str)
+        assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), str)
+        assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), str)
+        assert isinstance(remove_tags_with_content(u'<a>link</a>', which_ones=('b',)), str)
 
     def test_without_tags(self):
         # text without tags
@@ -194,13 +201,13 @@ class RemoveTagsWithContentTest(unittest
 class ReplaceEscapeCharsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(replace_escape_chars(b'no ec'), six.text_type)
-        assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), six.text_type)
-        assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), six.text_type)
-        assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), six.text_type)
-        assert isinstance(replace_escape_chars(u'no ec'), six.text_type)
-        assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), six.text_type)
-        assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), six.text_type)
+        assert isinstance(replace_escape_chars(b'no ec'), str)
+        assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), str)
+        assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), str)
+        assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), str)
+        assert isinstance(replace_escape_chars(u'no ec'), str)
+        assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), str)
+        assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), str)
 
     def test_without_escape_chars(self):
         # text without escape chars
@@ -226,8 +233,8 @@ class UnquoteMarkupTest(unittest.TestCas
 
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), six.text_type)
-        assert isinstance(unquote_markup(self.sample_txt2), six.text_type)
+        assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), str)
+        assert isinstance(unquote_markup(self.sample_txt2), str)
 
     def test_unquote_markup(self):
         self.assertEqual(unquote_markup(self.sample_txt1), u"""<node1>hi, this is sample text with entities: & ©
Index: w3lib-1.22.0/w3lib/html.py
===================================================================
--- w3lib-1.22.0.orig/w3lib/html.py
+++ w3lib-1.22.0/w3lib/html.py
@@ -4,16 +4,16 @@
 """
 
 import re
-import six
-from six import moves
+from html.entities import name2codepoint
+from urllib.parse import urljoin
 
-from w3lib.util import to_bytes, to_unicode
+from w3lib.util import to_unicode
 from w3lib.url import safe_url_string
 
 _ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
 _tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
-_baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
-_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
+_baseurl_re = re.compile(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I)
+_meta_refresh_re = re.compile(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)', re.DOTALL | re.IGNORECASE)
 _cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
 
 HTML5_WHITESPACE = ' \t\n\r\x0c'
@@ -77,8 +77,10 @@ def replace_entities(text, keep=(), remo
         if entity_name.lower() in keep:
             return m.group(0)
         else:
-            number = (moves.html_entities.name2codepoint.get(entity_name) or
-                      moves.html_entities.name2codepoint.get(entity_name.lower()))
+            number = (
+                name2codepoint.get(entity_name)
+                or name2codepoint.get(entity_name.lower())
+            )
             if number is not None:
                 # Numeric character references in the 80-9F range are typically
                 # interpreted by browsers as representing the characters mapped
@@ -86,9 +88,9 @@ def replace_entities(text, keep=(), remo
                 # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                 try:
                     if 0x80 <= number <= 0x9f:
-                        return six.int2byte(number).decode('cp1252')
+                        return bytes((number,)).decode('cp1252')
                     else:
-                        return six.unichr(number)
+                        return chr(number)
                 except ValueError:
                     pass
 
@@ -265,7 +267,7 @@ def unquote_markup(text, keep=(), remove
     text = to_unicode(text, encoding)
     ret_text = u''
     for fragment in _get_fragments(text, _cdata_re):
-        if isinstance(fragment, six.string_types):
+        if isinstance(fragment, str):
             # it's not a CDATA (so we try to remove its entities)
             ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
         else:
@@ -284,7 +286,7 @@ def get_base_url(text, baseurl='', encod
     text = to_unicode(text, encoding)
     m = _baseurl_re.search(text)
     if m:
-        return moves.urllib.parse.urljoin(
+        return urljoin(
             safe_url_string(baseurl),
             safe_url_string(m.group(1), encoding=encoding)
         )
@@ -301,8 +303,6 @@ def get_meta_refresh(text, baseurl='', e
 
     """
 
-    if six.PY2:
-        baseurl = to_bytes(baseurl, encoding)
     try:
         text = to_unicode(text, encoding)
     except UnicodeDecodeError:
@@ -314,7 +314,7 @@ def get_meta_refresh(text, baseurl='', e
     if m:
         interval = float(m.group('int'))
         url = safe_url_string(m.group('url').strip(' "\''), encoding)
-        url = moves.urllib.parse.urljoin(baseurl, url)
+        url = urljoin(baseurl, url)
         return interval, url
     else:
         return None, None
Index: w3lib-1.22.0/w3lib/url.py
===================================================================
--- w3lib-1.22.0.orig/w3lib/url.py
+++ w3lib-1.22.0/w3lib/url.py
@@ -5,17 +5,28 @@ library.
 import base64
 import codecs
 import os
-import re
 import posixpath
-import warnings
+import re
 import string
+import warnings
 from collections import namedtuple
 
-import six
-from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
-                                    urldefrag, urlencode, urlparse,
-                                    quote, parse_qs, parse_qsl,
-                                    ParseResult, unquote, urlunparse)
-from six.moves.urllib.request import pathname2url, url2pathname
+from urllib.parse import (
+    _coerce_args,
+    parse_qs,
+    parse_qsl,
+    ParseResult,
+    quote,
+    unquote,
+    unquote_to_bytes,
+    urldefrag,
+    urlencode,
+    urljoin,
+    urlparse,
+    urlsplit,
+    urlunparse,
+    urlunsplit,
+)
+from urllib.request import pathname2url, url2pathname
 
 from w3lib.util import to_bytes, to_native_str, to_unicode
@@ -184,7 +195,7 @@ def url_query_cleaner(url, parameterlist
 
     """
 
-    if isinstance(parameterlist, (six.text_type, bytes)):
+    if isinstance(parameterlist, (str, bytes)):
         parameterlist = [parameterlist]
     url, fragment = urldefrag(url)
     base, _, query = url.partition('?')
@@ -346,10 +357,7 @@ def parse_data_uri(uri):
     # delimiters, but it makes parsing easier and should not affect
     # well-formed URIs, as the delimiters used in this URI scheme are not
     # allowed, percent-encoded or not, in tokens.
-    if six.PY2:
-        uri = unquote(uri)
-    else:
-        uri = unquote_to_bytes(uri)
+    uri = unquote_to_bytes(uri)
 
     media_type = "text/plain"
     media_type_params = {}
@@ -469,33 +477,32 @@ def canonicalize_url(url, keep_blank_val
 
     # 1. decode query-string as UTF-8 (or keep raw bytes),
     #    sort values,
     #    and percent-encode them back
-    if six.PY2:
-        keyvals = parse_qsl(query, keep_blank_values)
-    else:
-        # Python3's urllib.parse.parse_qsl does not work as wanted
-        # for percent-encoded characters that do not match passed encoding,
-        # they get lost.
-        #
-        # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
-        # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
-        #      instead of \xa3 that you get with Python2's parse_qsl)
-        #
-        # what we want here is to keep raw bytes, and percent encode them
-        # so as to preserve whatever encoding what originally used.
-        #
-        # See https://tools.ietf.org/html/rfc3987#section-6.4:
-        #
-        # For example, it is possible to have a URI reference of
-        # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
-        # document name is encoded in iso-8859-1 based on server settings, but
-        # where the fragment identifier is encoded in UTF-8 according to
-        # [XPointer]. The IRI corresponding to the above URI would be (in XML
-        # notation)
-        # "http://www.example.org/r%E9sum%E9.xml#résumé".
-        # Similar considerations apply to query parts.  The functionality of
-        # IRIs (namely, to be able to include non-ASCII characters) can only be
-        # used if the query part is encoded in UTF-8.
-        keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+
+    # Python's urllib.parse.parse_qsl does not work as wanted
+    # for percent-encoded characters that do not match passed encoding,
+    # they get lost.
+    #
+    # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
+    # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
+    #      instead of \xa3 that you get with Python2's parse_qsl)
+    #
+    # what we want here is to keep raw bytes, and percent encode them
+    # so as to preserve whatever encoding was originally used.
+    #
+    # See https://tools.ietf.org/html/rfc3987#section-6.4:
+    #
+    # For example, it is possible to have a URI reference of
+    # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
+    # document name is encoded in iso-8859-1 based on server settings, but
+    # where the fragment identifier is encoded in UTF-8 according to
+    # [XPointer]. The IRI corresponding to the above URI would be (in XML
+    # notation)
+    # "http://www.example.org/r%E9sum%E9.xml#résumé".
+    # Similar considerations apply to query parts.  The functionality of
+    # IRIs (namely, to be able to include non-ASCII characters) can only be
+    # used if the query part is encoded in UTF-8.
+    keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+
     keyvals.sort()
     query = urlencode(keyvals)
@@ -519,17 +526,12 @@ def _unquotepath(path):
     for reserved in ('2f', '2F', '3f', '3F'):
         path = path.replace('%' + reserved, '%25' + reserved.upper())
 
-    if six.PY2:
-        # in Python 2, '%a3' becomes '\xa3', which is what we want
-        return unquote(path)
-    else:
-        # in Python 3,
-        # standard lib's unquote() does not work for non-UTF-8
-        # percent-escaped characters, they get lost.
-        # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
-        #
-        # unquote_to_bytes() returns raw bytes instead
-        return unquote_to_bytes(path)
+    # standard lib's unquote() does not work for non-UTF-8
+    # percent-escaped characters, they get lost.
+    # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
+    #
+    # unquote_to_bytes() returns raw bytes instead
+    return unquote_to_bytes(path)
 
 
 def parse_url(url, encoding=None):
@@ -536,56 +538,53 @@ def parse_url(url, encoding=None):
     """Return urlparsed url from the given argument (which could be an already
     parsed url)
     """
     if isinstance(url, ParseResult):
         return url
     return urlparse(to_unicode(url, encoding))
 
 
-if not six.PY2:
-    from urllib.parse import _coerce_args, unquote_to_bytes
+def parse_qsl_to_bytes(qs, keep_blank_values=False):
+    """Parse a query given as a string argument.
+
+    Data are returned as a list of name, value pairs as bytes.
 
-    def parse_qsl_to_bytes(qs, keep_blank_values=False):
-        """Parse a query given as a string argument.
+    Arguments:
 
-        Data are returned as a list of name, value pairs as bytes.
+    qs: percent-encoded query string to be parsed
 
-        Arguments:
-
-        qs: percent-encoded query string to be parsed
-
-        keep_blank_values: flag indicating whether blank values in
-            percent-encoded queries should be treated as blank strings. A
-            true value indicates that blanks should be retained as blank
-            strings. The default false value indicates that blank values
-            are to be ignored and treated as if they were not included.
- - """ - # This code is the same as Python3's parse_qsl() - # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a) - # except for the unquote(s, encoding, errors) calls replaced - # with unquote_to_bytes(s) - qs, _coerce_result = _coerce_args(qs) - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value: + keep_blank_values: flag indicating whether blank values in + percent-encoded queries should be treated as blank strings. A + true value indicates that blanks should be retained as blank + strings. The default false value indicates that blank values + are to be ignored and treated as if they were not included. + + """ + # This code is the same as Python3's parse_qsl() + # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a) + # except for the unquote(s, encoding, errors) calls replaced + # with unquote_to_bytes(s) + qs, _coerce_result = _coerce_args(qs) + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: continue - nv = name_value.split('=', 1) - if len(nv) != 2: - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = unquote_to_bytes(name) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = unquote_to_bytes(value) - value = _coerce_result(value) - r.append((name, value)) - return r + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = unquote_to_bytes(name) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = unquote_to_bytes(value) + value = _coerce_result(value) + r.append((name, value)) + return r def urljoin_rfc(base, ref, encoding='utf-8'): Index: w3lib-1.22.0/w3lib/util.py =================================================================== --- w3lib-1.22.0.orig/w3lib/util.py +++ w3lib-1.22.0/w3lib/util.py @@ -1,5 +1,3 @@ -import six - def str_to_unicode(text, encoding=None, errors='strict'): if encoding is None: encoding = 'utf-8' @@ -10,16 +8,16 @@ def str_to_unicode(text, encoding=None, def unicode_to_str(text, encoding=None, errors='strict'): if encoding is None: encoding = 'utf-8' - if isinstance(text, six.text_type): + if isinstance(text, str): return text.encode(encoding, errors) return text def to_unicode(text, encoding=None, errors='strict'): """Return the unicode representation of a bytes object `text`. 
     `text` is already an unicode object, return it as-is."""
-    if isinstance(text, six.text_type):
+    if isinstance(text, str):
         return text
-    if not isinstance(text, (bytes, six.text_type)):
+    if not isinstance(text, (bytes, str)):
         raise TypeError('to_unicode must receive a bytes, str or unicode '
                         'object, got %s' % type(text).__name__)
     if encoding is None:
@@ -31,7 +29,7 @@ def to_bytes(text, encoding=None, errors
     is already a bytes object, return it as-is."""
     if isinstance(text, bytes):
         return text
-    if not isinstance(text, six.string_types):
+    if not isinstance(text, str):
         raise TypeError('to_bytes must receive a unicode, str or bytes '
                         'object, got %s' % type(text).__name__)
     if encoding is None:
@@ -39,9 +37,5 @@ def to_bytes(text, encoding=None, errors
     return text.encode(encoding, errors)
 
 def to_native_str(text, encoding=None, errors='strict'):
-    """ Return str representation of `text`
-    (bytes in Python 2.x and unicode in Python 3.x). """
-    if six.PY2:
-        return to_bytes(text, encoding, errors)
-    else:
-        return to_unicode(text, encoding, errors)
+    """ Return str representation of `text` """
+    return to_unicode(text, encoding, errors)