-
...
- first item @@ -487,7 +485,7 @@ extracting text elements for example. Example extracting microdata (sample content taken from http://schema.org/Product) with groups of itemscopes and corresponding itemprops:: - >>> doc = u""" + >>> doc = """ ...
- 1
- 2
- 3
- 1
- 2
- 3
- 1
- 2
- 1
- 2
- 1
- 2
- 1
- 2
- 1
- 2
- 3
- 1
- 2
- 3
- 2 ') - self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), u'2') + self.assertEqual(sel.xpath('//ul/li[position()>1]')[0].get(), '
- 2 ') + self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), '2') def test_selector_getall_alias(self): """Test if get() returns extracted value on a Selector""" - body = u'
- 1
- 2
- 3
- 1
- 2
- 3
- 2 ']) - self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), [u'2']) + self.assertListEqual(sel.xpath('//ul/li[position()>1]')[0].getall(), ['
- 2 ']) + self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), ['2']) def test_selectorlist_get_alias(self): """Test if get() returns first element for a selection call""" - body = u'
- 1
- 2
- 3
- 1
- 2
- 3
- 1 ') - self.assertEqual(sel.xpath('//ul/li/text()').get(), u'1') + self.assertEqual(sel.xpath('//ul/li').get(), '
- 1 ') + self.assertEqual(sel.xpath('//ul/li/text()').get(), '1') def test_re_first(self): """Test if re_first() returns first matched element""" - body = u'
- 1
- 2
- 1
- 2
- 1
- 2
- 1
- 2
- one
- two @@ -322,7 +322,7 @@ def test_nested_selectors(self): def test_selectorlist_getall_alias(self): """Nested selector tests using getall()""" - body = u""" + body = """
- one
- two @@ -346,20 +346,20 @@ def test_selectorlist_getall_alias(self): self.assertEqual(divtwo.xpath("./li").getall(), []) def test_mixed_nested_selectors(self): - body = u''' + body = '''
- Name: John
- Age: 10 @@ -519,23 +519,23 @@ def test_re(self): ["10", "20"]) # Test named group, hit and miss - x = self.sscls(text=u'foobar') + x = self.sscls(text='foobar') self.assertEqual(x.re('(?P
- one
- two @@ -723,18 +722,18 @@ class SmartStringsSelector(Selector): # only when smart_strings are on x = self.sscls(text=body) li_text = x.xpath('//li/text()') - self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), li_text))) + self.assertFalse(any([hasattr(e.root, 'getparent') for e in li_text])) div_class = x.xpath('//div/@class') - self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), div_class))) + self.assertFalse(any([hasattr(e.root, 'getparent') for e in div_class])) x = SmartStringsSelector(text=body) li_text = x.xpath('//li/text()') - self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), li_text))) + self.assertTrue(all([hasattr(e.root, 'getparent') for e in li_text])) div_class = x.xpath('//div/@class') - self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), div_class))) + self.assertTrue(all([hasattr(e.root, 'getparent') for e in div_class])) def test_xml_entity_expansion(self): - malicious_xml = u''\ + malicious_xml = ''\ ' ]>
- 1
- 2
- 3
- 1
- 2
- 3
- 1
- 2
- 3
- 1
- 2
- 3
- 1
- 2
- 3
- 1
- 2
- 3
- 1
- 2
- 3
- 1
- 2
- 3
- 1
- 2
- 3
- 1
- 2
- 3
@@ -591,7 +589,7 @@ returns ``True`` for nodes that have all of the specified HTML classes::
... Fourth
... """) ... - >>> sel = Selector(u""" + >>> sel = Selector(""" ... ...Second
... @@ -1111,7 +1109,7 @@ Named variables can be useful when strings need to be escaped for single or double quotes characters. The example below would be a bit tricky to get right (or legible) without a variable reference:: - >>> html = u''' + >>> html = ''' ... ...He said: "I don't know why, but I like mixing single and double quotes!"
... diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py index 747e808..3881736 100644 --- a/parsel/csstranslator.py +++ b/parsel/csstranslator.py @@ -1,7 +1,4 @@ -try: - from functools import lru_cache -except ImportError: - from functools32 import lru_cache +from functools import lru_cache from cssselect import GenericTranslator as OriginalGenericTranslator from cssselect import HTMLTranslator as OriginalHTMLTranslator @@ -23,7 +20,7 @@ def from_xpath(cls, xpath, textnode=False, attribute=None): return x def __str__(self): - path = super(XPathExpr, self).__str__() + path = super().__str__() if self.textnode: if path == '*': path = 'text()' @@ -40,20 +37,20 @@ def __str__(self): return path def join(self, combiner, other): - super(XPathExpr, self).join(combiner, other) + super().join(combiner, other) self.textnode = other.textnode self.attribute = other.attribute return self -class TranslatorMixin(object): +class TranslatorMixin: """This mixin adds support to CSS pseudo elements via dynamic dispatch. Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``. """ def xpath_element(self, selector): - xpath = super(TranslatorMixin, self).xpath_element(selector) + xpath = super().xpath_element(selector) return XPathExpr.from_xpath(xpath) def xpath_pseudo_element(self, xpath, pseudo_element): @@ -98,13 +95,13 @@ def xpath_text_simple_pseudo_element(self, xpath): class GenericTranslator(TranslatorMixin, OriginalGenericTranslator): @lru_cache(maxsize=256) def css_to_xpath(self, css, prefix='descendant-or-self::'): - return super(GenericTranslator, self).css_to_xpath(css, prefix) + return super().css_to_xpath(css, prefix) class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator): @lru_cache(maxsize=256) def css_to_xpath(self, css, prefix='descendant-or-self::'): - return super(HTMLTranslator, self).css_to_xpath(css, prefix) + return super().css_to_xpath(css, prefix) _translator = HTMLTranslator() diff --git a/parsel/selector.py b/parsel/selector.py index 504a4fe..b644e82 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -2,9 +2,6 @@ XPath selectors based on lxml """ -import sys - -import six from lxml import etree, html from .utils import flatten, iflatten, extract_regex, shorten @@ -22,7 +19,7 @@ class CannotRemoveElementWithoutParent(Exception): class SafeXMLParser(etree.XMLParser): def __init__(self, *args, **kwargs): kwargs.setdefault('resolve_entities', False) - super(SafeXMLParser, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) _ctgroup = { @@ -61,13 +58,8 @@ class SelectorList(list): class, which provides a few additional methods. """ - # __getslice__ is deprecated but `list` builtin implements it only in Py2 - def __getslice__(self, i, j): - o = super(SelectorList, self).__getslice__(i, j) - return self.__class__(o) - def __getitem__(self, pos): - o = super(SelectorList, self).__getitem__(pos) + o = super().__getitem__(pos) return self.__class__(o) if isinstance(pos, slice) else o def __getstate__(self): @@ -164,7 +156,7 @@ def remove(self): x.remove() -class Selector(object): +class Selector: """ :class:`Selector` allows you to select parts of an XML or HTML text using CSS or XPath expressions and extract data from it. @@ -204,9 +196,10 @@ def __init__(self, text=None, type=None, namespaces=None, root=None, self._tostring_method = _ctgroup[st]['_tostring_method'] if text is not None: - if not isinstance(text, six.text_type): - msg = "text argument should be of type %s, got %s" % ( - six.text_type, text.__class__) + if not isinstance(text, str): + msg = "text argument should be of type str, got %s" % ( + text.__class__ + ) raise TypeError(msg) root = self._get_root(text, base_url) elif root is None: @@ -255,9 +248,7 @@ def xpath(self, query, namespaces=None, **kwargs): smart_strings=self._lxml_smart_strings, **kwargs) except etree.XPathError as exc: - msg = u"XPath error: %s in %s" % (exc, query) - msg = msg if six.PY3 else msg.encode('unicode_escape') - six.reraise(ValueError, ValueError(msg), sys.exc_info()[2]) + raise ValueError("XPath error: %s in %s" % (exc, query)) if type(result) is not list: result = [result] @@ -324,11 +315,11 @@ def get(self): with_tail=False) except (AttributeError, TypeError): if self.root is True: - return u'1' + return '1' elif self.root is False: - return u'0' + return '0' else: - return six.text_type(self.root) + return str(self.root) extract = get def getall(self): @@ -354,7 +345,7 @@ def remove_namespaces(self): if el.tag.startswith('{'): el.tag = el.tag.split('}', 1)[1] # loop on element attributes also - for an in el.attrib.keys(): + for an in el.attrib: if an.startswith('{'): el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an) # remove namespace declarations diff --git a/parsel/utils.py b/parsel/utils.py index 6914362..6aeff6f 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -1,5 +1,4 @@ import re -import six from w3lib.html import replace_entities as w3lib_replace_entities @@ -50,10 +49,10 @@ def _is_listlike(x): True >>> _is_listlike((x for x in range(3))) True - >>> _is_listlike(six.moves.xrange(5)) + >>> _is_listlike(range(5)) True """ - return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes)) + return hasattr(x, "__iter__") and not isinstance(x, (str, bytes)) def extract_regex(regex, text, replace_entities=True): @@ -62,7 +61,7 @@ def extract_regex(regex, text, replace_entities=True): * if the regex contains multiple numbered groups, all those will be returned (flattened) * if the regex doesn't contain any group the entire regex matching is returned """ - if isinstance(regex, six.string_types): + if isinstance(regex, str): regex = re.compile(regex, re.UNICODE) if 'extract' in regex.groupindex: diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py index 95b07ba..ceb8eaf 100644 --- a/parsel/xpathfuncs.py +++ b/parsel/xpathfuncs.py @@ -1,8 +1,6 @@ import re from lxml import etree -from six import string_types - from w3lib.html import HTML5_WHITESPACE regex = '[{}]+'.format(HTML5_WHITESPACE) @@ -45,7 +43,7 @@ def has_class(context, *classes): raise ValueError( 'XPath error: has-class must have at least 1 argument') for c in classes: - if not isinstance(c, string_types): + if not isinstance(c, str): raise ValueError( 'XPath error: has-class arguments must be strings') context.eval_context['args_checked'] = True diff --git a/setup.py b/setup.py index d14ad0e..ade049f 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,5 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- -import sys - -from pkg_resources import parse_version from setuptools import setup, __version__ as setuptools_version @@ -13,32 +9,6 @@ with open('NEWS') as history_file: history = history_file.read().replace('.. :changelog:', '') -test_requirements = [ -] - -def has_environment_marker_platform_impl_support(): - """Code extracted from 'pytest/setup.py' - https://github.com/pytest-dev/pytest/blob/7538680c/setup.py#L31 - The first known release to support environment marker with range operators - it is 18.5, see: - https://setuptools.readthedocs.io/en/latest/history.html#id235 - """ - return parse_version(setuptools_version) >= parse_version('18.5') - -install_requires = [ - 'w3lib>=1.19.0', - 'lxml', - 'six>=1.6.0', - 'cssselect>=0.9' -] -extras_require = {} - -if not has_environment_marker_platform_impl_support(): - if sys.version_info[0:2] < (3, 0): - install_requires.append("functools32") -else: - extras_require[":python_version<'3.0'"] = ["functools32"] - setup( name='parsel', version='1.6.0', @@ -50,11 +20,16 @@ def has_environment_marker_platform_impl_support(): packages=[ 'parsel', ], - package_dir={'parsel': - 'parsel'}, + package_dir={ + 'parsel': 'parsel', + }, include_package_data=True, - install_requires=install_requires, - extras_require=extras_require, + install_requires=[ + 'cssselect>=0.9', + 'lxml', + 'w3lib>=1.19.0', + ], + python_requires='>=3.6', license="BSD", zip_safe=False, keywords='parsel', @@ -66,13 +41,11 @@ def has_environment_marker_platform_impl_support(): 'Topic :: Text Processing :: Markup', 'Topic :: Text Processing :: Markup :: HTML', 'Topic :: Text Processing :: Markup :: XML', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', ], diff --git a/tests/test_selector.py b/tests/test_selector.py index a5c61f6..f5c60ae 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1,7 +1,5 @@ -# -*- coding: utf-8 -*- import re import weakref -import six import unittest import pickle @@ -17,11 +15,11 @@ class SelectorTestCase(unittest.TestCase): sscls = Selector def test_pickle_selector(self): - sel = self.sscls(text=u'some text
') + sel = self.sscls(text='some text
') self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel) def test_pickle_selector_list(self): - sel = self.sscls(text=u'I'm mixing single and + body = """
I'm mixing single and "double quotes" and I don't care :)
""" sel = self.sscls(text=body) @@ -95,7 +93,7 @@ def test_simple_selection_with_variables_escape_friendly(self): # with XPath variables, escaping is done for you self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)], - [u'a']) + ['a']) lt = """I'm mixing single and "double quotes" and I don't care :)""" # the following gives you something like # ValueError: XPath error: Invalid predicate in //p[normalize-space()='I'm mixing single and "double quotes" and I don't care :)']//@name @@ -103,10 +101,10 @@ def test_simple_selection_with_variables_escape_friendly(self): self.assertEqual([x.extract() for x in sel.xpath("//p[normalize-space()=$lng]//@name", lng=lt)], - [u'a']) + ['a']) def test_accessing_attributes(self): - body = u""" + body = """-
@@ -134,12 +132,10 @@ def test_accessing_attributes(self):
[e.attrib for e in sel.css('li')])
def test_representation_slice(self):
- body = u"".format(50 * 'b')
+ body = "".format(50 * 'b')
sel = self.sscls(text=body)
representation = "
test
' + text = '
test
' assert isinstance(self.sscls(text=text).xpath("//p")[0], self.sscls) assert isinstance(self.sscls(text=text).css("p")[0], self.sscls) def test_boolean_result(self): - body = u"
" + body = "" xs = self.sscls(text=body) - self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1']) - self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0']) + self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), ['1']) + self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), ['0']) def test_differences_parsing_xml_vs_html(self): """Test that XML and HTML Selector's behave differently""" # some text which is parsed differently by XML and HTML flavors - text = u'
Hello

Hello

Hello

Hello

Hello

Hello
1
2
3
1
2
3
3
']) - self.assertEqual(hs.css('p')[1:3].extract(), [u'2
', u'3
']) + self.assertEqual(hs.css('p')[2:3].extract(), ['3
']) + self.assertEqual(hs.css('p')[1:3].extract(), ['2
', '3
']) def test_nested_selectors(self): """Nested selector tests""" - body = u""" + body = """text
fooan Jos\ufffd de
' - self.assertEqual([u'an Jos\ufffd de'], + text = 'an Jos\\ufffd de
' + self.assertEqual(['an Jos\\ufffd de'], self.sscls(text).xpath('//text()').extract()) def test_select_on_unevaluable_nodes(self): - r = self.sscls(text=u'some text') + r = self.sscls(text='some text') # Text node x1 = r.xpath('//text()') - self.assertEqual(x1.extract(), [u'some text']) + self.assertEqual(x1.extract(), ['some text']) self.assertEqual(x1.xpath('.//b').extract(), []) # Tag attribute x1 = r.xpath('//span/@class') - self.assertEqual(x1.extract(), [u'big']) + self.assertEqual(x1.extract(), ['big']) self.assertEqual(x1.xpath('.//text()').extract(), []) def test_select_on_text_nodes(self): - r = self.sscls(text=u'Grainy
' - self.assertEqual(u'Grainy
', + text = '\x00Grainy
' + self.assertEqual('Grainy
', self.sscls(text).extract()) def test_remove_selector_list(self): - sel = self.sscls(text=u'