15
0
forked from pool/python-parsel
Files
python-parsel/python-parsel-drop-python-2.patch

1519 lines
63 KiB
Diff

diff --git a/README.rst b/README.rst
index c1674f1..7bd8204 100644
--- a/README.rst
+++ b/README.rst
@@ -26,7 +26,7 @@ Example (`open online demo`_):
.. code-block:: python
>>> from parsel import Selector
- >>> selector = Selector(text=u"""<html>
+ >>> selector = Selector(text="""<html>
<body>
<h1>Hello, Parsel!</h1>
<ul>
diff --git a/docs/conf.py b/docs/conf.py
index 27eef0e..f3736de 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python
-# -*- coding: utf-8 -*-
import os
import sys
@@ -38,8 +37,8 @@
master_doc = 'index'
# General information about the project.
-project = u'Parsel'
-copyright = u'2015, Scrapy Project'
+project = 'Parsel'
+copyright = '2015, Scrapy Project'
# The version info for the project you're documenting, acts as replacement
# for |version| and |release|, also used in various other places throughout
@@ -83,8 +82,8 @@
# [howto/manual]).
latex_documents = [
('index', 'parsel.tex',
- u'Parsel Documentation',
- u'Scrapy Project', 'manual'),
+ 'Parsel Documentation',
+ 'Scrapy Project', 'manual'),
]
@@ -94,8 +93,8 @@
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'parsel',
- u'Parsel Documentation',
- [u'Scrapy Project'], 1)
+ 'Parsel Documentation',
+ ['Scrapy Project'], 1)
]
@@ -106,8 +105,8 @@
# dir menu entry, description, category)
texinfo_documents = [
('index', 'parsel',
- u'Parsel Documentation',
- u'Scrapy Project',
+ 'Parsel Documentation',
+ 'Scrapy Project',
'parsel',
'One line description of project.',
'Miscellaneous'),
diff --git a/docs/usage.rst b/docs/usage.rst
index f5950a8..55e6a31 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -8,11 +8,9 @@ Create a :class:`~parsel.selector.Selector` object for the HTML or XML text
that you want to parse::
>>> from parsel import Selector
- >>> text = u"<html><body><h1>Hello, Parsel!</h1></body></html>"
+ >>> text = "<html><body><h1>Hello, Parsel!</h1></body></html>"
>>> selector = Selector(text=text)
-.. note:: In Python 2, the ``text`` argument must be a ``unicode`` string.
-
Then use `CSS`_ or `XPath`_ expressions to select elements::
>>> selector.css('h1')
@@ -412,7 +410,7 @@ classes.
Example removing an ad from a blog post:
>>> from parsel import Selector
- >>> doc = u"""
+ >>> doc = """
... <article>
... <div class="row">Content paragraph...</div>
... <div class="row">
@@ -455,7 +453,7 @@ The ``test()`` function, for example, can prove quite useful when XPath's
Example selecting links in list item with a "class" attribute ending with a digit::
>>> from parsel import Selector
- >>> doc = u"""
+ >>> doc = """
... <div>
... <ul>
... <li class="item-0"><a href="link1.html">first item</a></li>
@@ -487,7 +485,7 @@ extracting text elements for example.
Example extracting microdata (sample content taken from http://schema.org/Product)
with groups of itemscopes and corresponding itemprops::
- >>> doc = u"""
+ >>> doc = """
... <div itemscope itemtype="http://schema.org/Product">
... <span itemprop="name">Kenmore White 17" Microwave</span>
... <img src="kenmore-microwave-17in.jpg" alt='Kenmore 17" Microwave' />
@@ -591,7 +589,7 @@ returns ``True`` for nodes that have all of the specified HTML classes::
... <p>Fourth</p>
... """)
...
- >>> sel = Selector(u"""
+ >>> sel = Selector("""
... <p class="foo bar-baz">First</p>
... <p class="foo">Second</p>
... <p class="bar">Third</p>
@@ -1111,7 +1109,7 @@ Named variables can be useful when strings need to be escaped for single
or double quotes characters. The example below would be a bit tricky to
get right (or legible) without a variable reference::
- >>> html = u'''<html>
+ >>> html = '''<html>
... <body>
... <p>He said: "I don't know why, but I like mixing single and double quotes!"</p>
... </body>
diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py
index 747e808..3881736 100644
--- a/parsel/csstranslator.py
+++ b/parsel/csstranslator.py
@@ -1,7 +1,4 @@
-try:
- from functools import lru_cache
-except ImportError:
- from functools32 import lru_cache
+from functools import lru_cache
from cssselect import GenericTranslator as OriginalGenericTranslator
from cssselect import HTMLTranslator as OriginalHTMLTranslator
@@ -23,7 +20,7 @@ def from_xpath(cls, xpath, textnode=False, attribute=None):
return x
def __str__(self):
- path = super(XPathExpr, self).__str__()
+ path = super().__str__()
if self.textnode:
if path == '*':
path = 'text()'
@@ -40,20 +37,20 @@ def __str__(self):
return path
def join(self, combiner, other):
- super(XPathExpr, self).join(combiner, other)
+ super().join(combiner, other)
self.textnode = other.textnode
self.attribute = other.attribute
return self
-class TranslatorMixin(object):
+class TranslatorMixin:
"""This mixin adds support to CSS pseudo elements via dynamic dispatch.
Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
"""
def xpath_element(self, selector):
- xpath = super(TranslatorMixin, self).xpath_element(selector)
+ xpath = super().xpath_element(selector)
return XPathExpr.from_xpath(xpath)
def xpath_pseudo_element(self, xpath, pseudo_element):
@@ -98,13 +95,13 @@ def xpath_text_simple_pseudo_element(self, xpath):
class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
@lru_cache(maxsize=256)
def css_to_xpath(self, css, prefix='descendant-or-self::'):
- return super(GenericTranslator, self).css_to_xpath(css, prefix)
+ return super().css_to_xpath(css, prefix)
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
@lru_cache(maxsize=256)
def css_to_xpath(self, css, prefix='descendant-or-self::'):
- return super(HTMLTranslator, self).css_to_xpath(css, prefix)
+ return super().css_to_xpath(css, prefix)
_translator = HTMLTranslator()
diff --git a/parsel/selector.py b/parsel/selector.py
index 504a4fe..b644e82 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -2,9 +2,6 @@
XPath selectors based on lxml
"""
-import sys
-
-import six
from lxml import etree, html
from .utils import flatten, iflatten, extract_regex, shorten
@@ -22,7 +19,7 @@ class CannotRemoveElementWithoutParent(Exception):
class SafeXMLParser(etree.XMLParser):
def __init__(self, *args, **kwargs):
kwargs.setdefault('resolve_entities', False)
- super(SafeXMLParser, self).__init__(*args, **kwargs)
+ super().__init__(*args, **kwargs)
_ctgroup = {
@@ -61,13 +58,8 @@ class SelectorList(list):
class, which provides a few additional methods.
"""
- # __getslice__ is deprecated but `list` builtin implements it only in Py2
- def __getslice__(self, i, j):
- o = super(SelectorList, self).__getslice__(i, j)
- return self.__class__(o)
-
def __getitem__(self, pos):
- o = super(SelectorList, self).__getitem__(pos)
+ o = super().__getitem__(pos)
return self.__class__(o) if isinstance(pos, slice) else o
def __getstate__(self):
@@ -164,7 +156,7 @@ def remove(self):
x.remove()
-class Selector(object):
+class Selector:
"""
:class:`Selector` allows you to select parts of an XML or HTML text using CSS
or XPath expressions and extract data from it.
@@ -204,9 +196,10 @@ def __init__(self, text=None, type=None, namespaces=None, root=None,
self._tostring_method = _ctgroup[st]['_tostring_method']
if text is not None:
- if not isinstance(text, six.text_type):
- msg = "text argument should be of type %s, got %s" % (
- six.text_type, text.__class__)
+ if not isinstance(text, str):
+ msg = "text argument should be of type str, got %s" % (
+ text.__class__
+ )
raise TypeError(msg)
root = self._get_root(text, base_url)
elif root is None:
@@ -255,9 +248,7 @@ def xpath(self, query, namespaces=None, **kwargs):
smart_strings=self._lxml_smart_strings,
**kwargs)
except etree.XPathError as exc:
- msg = u"XPath error: %s in %s" % (exc, query)
- msg = msg if six.PY3 else msg.encode('unicode_escape')
- six.reraise(ValueError, ValueError(msg), sys.exc_info()[2])
+ raise ValueError("XPath error: %s in %s" % (exc, query))
if type(result) is not list:
result = [result]
@@ -324,11 +315,11 @@ def get(self):
with_tail=False)
except (AttributeError, TypeError):
if self.root is True:
- return u'1'
+ return '1'
elif self.root is False:
- return u'0'
+ return '0'
else:
- return six.text_type(self.root)
+ return str(self.root)
extract = get
def getall(self):
@@ -354,7 +345,7 @@ def remove_namespaces(self):
if el.tag.startswith('{'):
el.tag = el.tag.split('}', 1)[1]
# loop on element attributes also
- for an in el.attrib.keys():
+ for an in el.attrib:
if an.startswith('{'):
el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an)
# remove namespace declarations
diff --git a/parsel/utils.py b/parsel/utils.py
index 6914362..6aeff6f 100644
--- a/parsel/utils.py
+++ b/parsel/utils.py
@@ -1,5 +1,4 @@
import re
-import six
from w3lib.html import replace_entities as w3lib_replace_entities
@@ -50,10 +49,10 @@ def _is_listlike(x):
True
>>> _is_listlike((x for x in range(3)))
True
- >>> _is_listlike(six.moves.xrange(5))
+ >>> _is_listlike(range(5))
True
"""
- return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
+ return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))
def extract_regex(regex, text, replace_entities=True):
@@ -62,7 +61,7 @@ def extract_regex(regex, text, replace_entities=True):
* if the regex contains multiple numbered groups, all those will be returned (flattened)
* if the regex doesn't contain any group the entire regex matching is returned
"""
- if isinstance(regex, six.string_types):
+ if isinstance(regex, str):
regex = re.compile(regex, re.UNICODE)
if 'extract' in regex.groupindex:
diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py
index 95b07ba..ceb8eaf 100644
--- a/parsel/xpathfuncs.py
+++ b/parsel/xpathfuncs.py
@@ -1,8 +1,6 @@
import re
from lxml import etree
-from six import string_types
-
from w3lib.html import HTML5_WHITESPACE
regex = '[{}]+'.format(HTML5_WHITESPACE)
@@ -45,7 +43,7 @@ def has_class(context, *classes):
raise ValueError(
'XPath error: has-class must have at least 1 argument')
for c in classes:
- if not isinstance(c, string_types):
+ if not isinstance(c, str):
raise ValueError(
'XPath error: has-class arguments must be strings')
context.eval_context['args_checked'] = True
diff --git a/setup.py b/setup.py
index d14ad0e..ade049f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,9 +1,5 @@
#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-import sys
-
-from pkg_resources import parse_version
from setuptools import setup, __version__ as setuptools_version
@@ -13,32 +9,6 @@
with open('NEWS') as history_file:
history = history_file.read().replace('.. :changelog:', '')
-test_requirements = [
-]
-
-def has_environment_marker_platform_impl_support():
- """Code extracted from 'pytest/setup.py'
- https://github.com/pytest-dev/pytest/blob/7538680c/setup.py#L31
- The first known release to support environment marker with range operators
- it is 18.5, see:
- https://setuptools.readthedocs.io/en/latest/history.html#id235
- """
- return parse_version(setuptools_version) >= parse_version('18.5')
-
-install_requires = [
- 'w3lib>=1.19.0',
- 'lxml',
- 'six>=1.6.0',
- 'cssselect>=0.9'
-]
-extras_require = {}
-
-if not has_environment_marker_platform_impl_support():
- if sys.version_info[0:2] < (3, 0):
- install_requires.append("functools32")
-else:
- extras_require[":python_version<'3.0'"] = ["functools32"]
-
setup(
name='parsel',
version='1.6.0',
@@ -50,11 +20,16 @@ def has_environment_marker_platform_impl_support():
packages=[
'parsel',
],
- package_dir={'parsel':
- 'parsel'},
+ package_dir={
+ 'parsel': 'parsel',
+ },
include_package_data=True,
- install_requires=install_requires,
- extras_require=extras_require,
+ install_requires=[
+ 'cssselect>=0.9',
+ 'lxml',
+ 'w3lib>=1.19.0',
+ ],
+ python_requires='>=3.6',
license="BSD",
zip_safe=False,
keywords='parsel',
@@ -66,13 +41,11 @@ def has_environment_marker_platform_impl_support():
'Topic :: Text Processing :: Markup',
'Topic :: Text Processing :: Markup :: HTML',
'Topic :: Text Processing :: Markup :: XML',
- 'Programming Language :: Python :: 2',
- 'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: Implementation :: CPython',
'Programming Language :: Python :: Implementation :: PyPy',
],
diff --git a/tests/test_selector.py b/tests/test_selector.py
index a5c61f6..f5c60ae 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -1,7 +1,5 @@
-# -*- coding: utf-8 -*-
import re
import weakref
-import six
import unittest
import pickle
@@ -17,11 +15,11 @@ class SelectorTestCase(unittest.TestCase):
sscls = Selector
def test_pickle_selector(self):
- sel = self.sscls(text=u'<html><body><p>some text</p></body></html>')
+ sel = self.sscls(text='<html><body><p>some text</p></body></html>')
self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel)
def test_pickle_selector_list(self):
- sel = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
+ sel = self.sscls(text='<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
sel_list = sel.css('li')
empty_sel_list = sel.css('p')
self.assertIsInstance(sel_list, self.sscls.selectorlist_cls)
@@ -31,7 +29,7 @@ def test_pickle_selector_list(self):
def test_simple_selection(self):
"""Simple selector tests"""
- body = u"<p><input name='a'value='1'/><input name='b'value='2'/></p>"
+ body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
sel = self.sscls(text=body)
xl = sel.xpath('//input')
@@ -43,48 +41,48 @@ def test_simple_selection(self):
[x.extract() for x in sel.xpath('//input')])
self.assertEqual([x.extract() for x in sel.xpath("//input[@name='a']/@name")],
- [u'a'])
+ ['a'])
self.assertEqual([x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
- [u'12.0'])
+ ['12.0'])
self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(),
- [u'xpathrules'])
+ ['xpathrules'])
self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
- [u'12'])
+ ['12'])
def test_simple_selection_with_variables(self):
"""Using XPath variables"""
- body = u"<p><input name='a' value='1'/><input name='b' value='2'/></p>"
+ body = "<p><input name='a' value='1'/><input name='b' value='2'/></p>"
sel = self.sscls(text=body)
self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)],
- [u'a'])
+ ['a'])
self.assertEqual([x.extract() for x in sel.xpath("//input[@name=$letter]/@value", letter='b')],
- [u'2'])
+ ['2'])
self.assertEqual(sel.xpath("count(//input[@value=$number or @name=$letter])",
number=2, letter='a').extract(),
- [u'2.0'])
+ ['2.0'])
# you can also pass booleans
self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
cnt=2, test=True).extract(),
- [u'1'])
+ ['1'])
self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
cnt=4, test=True).extract(),
- [u'0'])
+ ['0'])
self.assertEqual(sel.xpath("boolean(count(//input)=$cnt)=$test",
cnt=4, test=False).extract(),
- [u'1'])
+ ['1'])
# for named nodes, you need to use "name()=node_name"
self.assertEqual(sel.xpath("boolean(count(//*[name()=$tag])=$cnt)=$test",
tag="input", cnt=2, test=True).extract(),
- [u'1'])
+ ['1'])
def test_simple_selection_with_variables_escape_friendly(self):
"""Using XPath variables with quotes that would need escaping with string formatting"""
- body = u"""<p>I'm mixing single and <input name='a' value='I say "Yeah!"'/>
+ body = """<p>I'm mixing single and <input name='a' value='I say "Yeah!"'/>
"double quotes" and I don't care :)</p>"""
sel = self.sscls(text=body)
@@ -95,7 +93,7 @@ def test_simple_selection_with_variables_escape_friendly(self):
# with XPath variables, escaping is done for you
self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)],
- [u'a'])
+ ['a'])
lt = """I'm mixing single and "double quotes" and I don't care :)"""
# the following gives you something like
# ValueError: XPath error: Invalid predicate in //p[normalize-space()='I'm mixing single and "double quotes" and I don't care :)']//@name
@@ -103,10 +101,10 @@ def test_simple_selection_with_variables_escape_friendly(self):
self.assertEqual([x.extract() for x in sel.xpath("//p[normalize-space()=$lng]//@name",
lng=lt)],
- [u'a'])
+ ['a'])
def test_accessing_attributes(self):
- body = u"""
+ body = """
<html lang="en" version="1.0">
<body>
<ul id="some-list" class="list-cls" class="list-cls">
@@ -134,12 +132,10 @@ def test_accessing_attributes(self):
[e.attrib for e in sel.css('li')])
def test_representation_slice(self):
- body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
+ body = "<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
sel = self.sscls(text=body)
representation = "<Selector xpath='//input/@name' data='{}...'>".format(37 * 'b')
- if six.PY2:
- representation = "<Selector xpath='//input/@name' data=u'{}...'>".format(37 * 'b')
self.assertEqual(
[repr(it) for it in sel.xpath('//input/@name')],
@@ -147,25 +143,27 @@ def test_representation_slice(self):
)
def test_representation_unicode_query(self):
- body = u"<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
+ body = "<p><input name='{}' value='\xa9'/></p>".format(50 * 'b')
representation = '<Selector xpath=\'//input[@value="©"]/@value\' data=\'©\'>'
- if six.PY2:
- representation = "<Selector xpath=u'//input[@value=\"\\xa9\"]/@value' data=u'\\xa9'>"
sel = self.sscls(text=body)
self.assertEqual(
- [repr(it) for it in sel.xpath(u'//input[@value="\xa9"]/@value')],
+ [repr(it) for it in sel.xpath('//input[@value="\xa9"]/@value')],
[representation]
)
def test_check_text_argument_type(self):
- self.assertRaisesRegexp(TypeError, 'text argument should be of type',
- self.sscls, b'<html/>')
+ self.assertRaisesRegex(
+ TypeError,
+ 'text argument should be of type',
+ self.sscls,
+ b'<html/>',
+ )
def test_extract_first(self):
"""Test if extract_first() returns first element"""
- body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
+ body = '<ul><li id="1">1</li><li id="2">2</li></ul>'
sel = self.sscls(text=body)
self.assertEqual(sel.xpath('//ul/li/text()').extract_first(),
@@ -181,38 +179,38 @@ def test_extract_first(self):
def test_extract_first_default(self):
"""Test if extract_first() returns default value when no results found"""
- body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
+ body = '<ul><li id="1">1</li><li id="2">2</li></ul>'
sel = self.sscls(text=body)
self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing')
def test_selector_get_alias(self):
"""Test if get() returns extracted value on a Selector"""
- body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
+ body = '<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
sel = self.sscls(text=body)
- self.assertEqual(sel.xpath('//ul/li[position()>1]')[0].get(), u'<li id="2">2</li>')
- self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), u'2')
+ self.assertEqual(sel.xpath('//ul/li[position()>1]')[0].get(), '<li id="2">2</li>')
+ self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), '2')
def test_selector_getall_alias(self):
"""Test if get() returns extracted value on a Selector"""
- body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
+ body = '<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
sel = self.sscls(text=body)
- self.assertListEqual(sel.xpath('//ul/li[position()>1]')[0].getall(), [u'<li id="2">2</li>'])
- self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), [u'2'])
+ self.assertListEqual(sel.xpath('//ul/li[position()>1]')[0].getall(), ['<li id="2">2</li>'])
+ self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), ['2'])
def test_selectorlist_get_alias(self):
"""Test if get() returns first element for a selection call"""
- body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
+ body = '<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
sel = self.sscls(text=body)
- self.assertEqual(sel.xpath('//ul/li').get(), u'<li id="1">1</li>')
- self.assertEqual(sel.xpath('//ul/li/text()').get(), u'1')
+ self.assertEqual(sel.xpath('//ul/li').get(), '<li id="1">1</li>')
+ self.assertEqual(sel.xpath('//ul/li/text()').get(), '1')
def test_re_first(self):
"""Test if re_first() returns first matched element"""
- body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
+ body = '<ul><li id="1">1</li><li id="2">2</li></ul>'
sel = self.sscls(text=body)
self.assertEqual(sel.xpath('//ul/li/text()').re_first(r'\d'),
@@ -233,71 +231,73 @@ def test_re_first(self):
def test_extract_first_re_default(self):
"""Test if re_first() returns default value when no results found"""
- body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
+ body = '<ul><li id="1">1</li><li id="2">2</li></ul>'
sel = self.sscls(text=body)
self.assertEqual(sel.xpath('//div/text()').re_first(r'\w+', default='missing'), 'missing')
self.assertEqual(sel.xpath('/ul/li/text()').re_first(r'\w+', default='missing'), 'missing')
def test_select_unicode_query(self):
- body = u"<p><input name='\xa9' value='1'/></p>"
+ body = "<p><input name='\xa9' value='1'/></p>"
sel = self.sscls(text=body)
- self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
+ self.assertEqual(sel.xpath('//input[@name="\xa9"]/@value').extract(), ['1'])
def test_list_elements_type(self):
"""Test Selector returning the same type in selection methods"""
- text = u'<p>test<p>'
+ text = '<p>test<p>'
assert isinstance(self.sscls(text=text).xpath("//p")[0], self.sscls)
assert isinstance(self.sscls(text=text).css("p")[0], self.sscls)
def test_boolean_result(self):
- body = u"<p><input name='a'value='1'/><input name='b'value='2'/></p>"
+ body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>"
xs = self.sscls(text=body)
- self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1'])
- self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0'])
+ self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), ['1'])
+ self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), ['0'])
def test_differences_parsing_xml_vs_html(self):
"""Test that XML and HTML Selector's behave differently"""
# some text which is parsed differently by XML and HTML flavors
- text = u'<div><img src="a.jpg"><p>Hello</div>'
+ text = '<div><img src="a.jpg"><p>Hello</div>'
hs = self.sscls(text=text, type='html')
self.assertEqual(hs.xpath("//div").extract(),
- [u'<div><img src="a.jpg"><p>Hello</p></div>'])
+ ['<div><img src="a.jpg"><p>Hello</p></div>'])
xs = self.sscls(text=text, type='xml')
self.assertEqual(xs.xpath("//div").extract(),
- [u'<div><img src="a.jpg"><p>Hello</p></img></div>'])
+ ['<div><img src="a.jpg"><p>Hello</p></img></div>'])
def test_error_for_unknown_selector_type(self):
- self.assertRaises(ValueError, self.sscls, text=u'', type='_na_')
+ self.assertRaises(ValueError, self.sscls, text='', type='_na_')
def test_text_or_root_is_required(self):
- self.assertRaisesRegexp(ValueError,
- 'Selector needs either text or root argument',
- self.sscls)
+ self.assertRaisesRegex(
+ ValueError,
+ 'Selector needs either text or root argument',
+ self.sscls,
+ )
def test_bool(self):
- text = u'<a href="" >false</a><a href="nonempty">true</a>'
+ text = '<a href="" >false</a><a href="nonempty">true</a>'
hs = self.sscls(text=text, type='html')
falsish = hs.xpath('//a/@href')[0]
- self.assertEqual(falsish.extract(), u'')
+ self.assertEqual(falsish.extract(), '')
self.assertFalse(falsish)
trueish = hs.xpath('//a/@href')[1]
- self.assertEqual(trueish.extract(), u'nonempty')
+ self.assertEqual(trueish.extract(), 'nonempty')
self.assertTrue(trueish)
def test_slicing(self):
- text = u'<div><p>1</p><p>2</p><p>3</p></div>'
+ text = '<div><p>1</p><p>2</p><p>3</p></div>'
hs = self.sscls(text=text, type='html')
self.assertIsInstance(hs.css('p')[2], self.sscls)
self.assertIsInstance(hs.css('p')[2:3], self.sscls.selectorlist_cls)
self.assertIsInstance(hs.css('p')[:2], self.sscls.selectorlist_cls)
- self.assertEqual(hs.css('p')[2:3].extract(), [u'<p>3</p>'])
- self.assertEqual(hs.css('p')[1:3].extract(), [u'<p>2</p>', u'<p>3</p>'])
+ self.assertEqual(hs.css('p')[2:3].extract(), ['<p>3</p>'])
+ self.assertEqual(hs.css('p')[1:3].extract(), ['<p>2</p>', '<p>3</p>'])
def test_nested_selectors(self):
"""Nested selector tests"""
- body = u"""<body>
+ body = """<body>
<div class='one'>
<ul>
<li>one</li><li>two</li>
@@ -322,7 +322,7 @@ def test_nested_selectors(self):
def test_selectorlist_getall_alias(self):
"""Nested selector tests using getall()"""
- body = u"""<body>
+ body = """<body>
<div class='one'>
<ul>
<li>one</li><li>two</li>
@@ -346,20 +346,20 @@ def test_selectorlist_getall_alias(self):
self.assertEqual(divtwo.xpath("./li").getall(), [])
def test_mixed_nested_selectors(self):
- body = u'''<body>
+ body = '''<body>
<div id=1>not<span>me</span></div>
<div class="dos"><p>text</p><a href='#'>foo</a></div>
</body>'''
sel = self.sscls(text=body)
- self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), [u'me'])
- self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), [u'me'])
+ self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), ['me'])
+ self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), ['me'])
def test_dont_strip(self):
- sel = self.sscls(text=u'<div>fff: <a href="#">zzz</a></div>')
- self.assertEqual(sel.xpath("//text()").extract(), [u'fff: ', u'zzz'])
+ sel = self.sscls(text='<div>fff: <a href="#">zzz</a></div>')
+ self.assertEqual(sel.xpath("//text()").extract(), ['fff: ', 'zzz'])
def test_namespaces_simple(self):
- body = u"""
+ body = """
<test xmlns:somens="http://scrapy.org">
<somens:a id="foo">take this</a>
<a id="bar">found</a>
@@ -370,10 +370,10 @@ def test_namespaces_simple(self):
x.register_namespace("somens", "http://scrapy.org")
self.assertEqual(x.xpath("//somens:a/text()").extract(),
- [u'take this'])
+ ['take this'])
def test_namespaces_adhoc(self):
- body = u"""
+ body = """
<test xmlns:somens="http://scrapy.org">
<somens:a id="foo">take this</a>
<a id="bar">found</a>
@@ -384,10 +384,10 @@ def test_namespaces_adhoc(self):
self.assertEqual(x.xpath("//somens:a/text()",
namespaces={"somens": "http://scrapy.org"}).extract(),
- [u'take this'])
+ ['take this'])
def test_namespaces_adhoc_variables(self):
- body = u"""
+ body = """
<test xmlns:somens="http://scrapy.org">
<somens:a id="foo">take this</a>
<a id="bar">found</a>
@@ -399,10 +399,10 @@ def test_namespaces_adhoc_variables(self):
self.assertEqual(x.xpath("//somens:a/following-sibling::a[@id=$identifier]/text()",
namespaces={"somens": "http://scrapy.org"},
identifier="bar").extract(),
- [u'found'])
+ ['found'])
def test_namespaces_multiple(self):
- body = u"""<?xml version="1.0" encoding="UTF-8"?>
+ body = """<?xml version="1.0" encoding="UTF-8"?>
<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
xmlns:b="http://somens.com"
xmlns:p="http://www.scrapy.org/product" >
@@ -423,7 +423,7 @@ def test_namespaces_multiple(self):
self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron')
def test_namespaces_multiple_adhoc(self):
- body = u"""<?xml version="1.0" encoding="UTF-8"?>
+ body = """<?xml version="1.0" encoding="UTF-8"?>
<BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05"
xmlns:b="http://somens.com"
xmlns:p="http://www.scrapy.org/product" >
@@ -495,13 +495,13 @@ def test_namespaces_multiple_adhoc(self):
self.assertEqual(x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 'iron')
def test_make_links_absolute(self):
- text = u'<a href="file.html">link to file</a>'
+ text = '<a href="file.html">link to file</a>'
sel = Selector(text=text, base_url='http://example.com')
sel.root.make_links_absolute()
- self.assertEqual(u'http://example.com/file.html', sel.xpath('//a/@href').extract_first())
+ self.assertEqual('http://example.com/file.html', sel.xpath('//a/@href').extract_first())
def test_re(self):
- body = u"""<div>Name: Mary
+ body = """<div>Name: Mary
<ul>
<li>Name: John</li>
<li>Age: 10</li>
@@ -519,23 +519,23 @@ def test_re(self):
["10", "20"])
# Test named group, hit and miss
- x = self.sscls(text=u'foobar')
+ x = self.sscls(text='foobar')
self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
self.assertEqual(x.re('(?P<extract>baz)'), [])
# A purposely constructed test for an edge case
- x = self.sscls(text=u'baz')
+ x = self.sscls(text='baz')
self.assertEqual(x.re('(?P<extract>foo)|(?P<bar>baz)'), [])
def test_re_replace_entities(self):
- body = u"""<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>"""
+ body = """<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>"""
x = self.sscls(text=body)
name_re = re.compile('{"foo":(.*)}')
# by default, only &amp; and &lt; are preserved ;
# other entities are converted
- expected = u'"bar &amp; "baz""'
+ expected = '"bar &amp; "baz""'
self.assertEqual(x.xpath("//script/text()").re(name_re), [expected])
self.assertEqual(x.xpath("//script").re(name_re), [expected])
self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected])
@@ -546,7 +546,7 @@ def test_re_replace_entities(self):
self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected)
# switching off replace_entities will preserve &quot; also
- expected = u'"bar &amp; &quot;baz&quot;"'
+ expected = '"bar &amp; &quot;baz&quot;"'
self.assertEqual(x.xpath("//script/text()").re(name_re, replace_entities=False), [expected])
self.assertEqual(x.xpath("//script")[0].re(name_re, replace_entities=False), [expected])
@@ -554,95 +554,94 @@ def test_re_replace_entities(self):
self.assertEqual(x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected)
def test_re_intl(self):
- body = u'<div>Evento: cumplea\xf1os</div>'
+ body = '<div>Evento: cumplea\xf1os</div>'
x = self.sscls(text=body)
- self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), [u'cumplea\xf1os'])
+ self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), ['cumplea\xf1os'])
def test_selector_over_text(self):
- hs = self.sscls(text=u'<root>lala</root>')
- self.assertEqual(hs.extract(), u'<html><body><root>lala</root></body></html>')
- xs = self.sscls(text=u'<root>lala</root>', type='xml')
- self.assertEqual(xs.extract(), u'<root>lala</root>')
- self.assertEqual(xs.xpath('.').extract(), [u'<root>lala</root>'])
+ hs = self.sscls(text='<root>lala</root>')
+ self.assertEqual(hs.extract(), '<html><body><root>lala</root></body></html>')
+ xs = self.sscls(text='<root>lala</root>', type='xml')
+ self.assertEqual(xs.extract(), '<root>lala</root>')
+ self.assertEqual(xs.xpath('.').extract(), ['<root>lala</root>'])
def test_invalid_xpath(self):
"Test invalid xpath raises ValueError with the invalid xpath"
- x = self.sscls(text=u"<html></html>")
+ x = self.sscls(text="<html></html>")
xpath = "//test[@foo='bar]"
- self.assertRaisesRegexp(ValueError, re.escape(xpath), x.xpath, xpath)
+ self.assertRaisesRegex(ValueError, re.escape(xpath), x.xpath, xpath)
def test_invalid_xpath_unicode(self):
"Test *Unicode* invalid xpath raises ValueError with the invalid xpath"
- x = self.sscls(text=u"<html></html>")
- xpath = u"//test[@foo='\u0431ar]"
- encoded = xpath if six.PY3 else xpath.encode('unicode_escape')
- self.assertRaisesRegexp(ValueError, re.escape(encoded), x.xpath, xpath)
+ x = self.sscls(text="<html></html>")
+ xpath = "//test[@foo='\\u0431ar]"
+ self.assertRaisesRegex(ValueError, re.escape(xpath), x.xpath, xpath)
def test_http_header_encoding_precedence(self):
- # u'\xa3' = pound symbol in unicode
- # u'\xc2\xa3' = pound symbol in utf-8
- # u'\xa3' = pound symbol in latin-1 (iso-8859-1)
+ # '\xa3' = pound symbol in unicode
+ # '\xc2\xa3' = pound symbol in utf-8
+ # '\xa3' = pound symbol in latin-1 (iso-8859-1)
- text = u'''<html>
+ text = '''<html>
<head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></head>
<body><span id="blank">\xa3</span></body></html>'''
x = self.sscls(text=text)
self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(),
- [u'\xa3'])
+ ['\xa3'])
def test_empty_bodies_shouldnt_raise_errors(self):
- self.sscls(text=u'').xpath('//text()').extract()
+ self.sscls(text='').xpath('//text()').extract()
def test_bodies_with_comments_only(self):
- sel = self.sscls(text=u'<!-- hello world -->', base_url='http://example.com')
- self.assertEqual(u'http://example.com', sel.root.base)
+ sel = self.sscls(text='<!-- hello world -->', base_url='http://example.com')
+ self.assertEqual('http://example.com', sel.root.base)
def test_null_bytes_shouldnt_raise_errors(self):
- text = u'<root>pre\x00post</root>'
+ text = '<root>pre\x00post</root>'
self.sscls(text).xpath('//text()').extract()
def test_replacement_char_from_badly_encoded_body(self):
# \xe9 alone isn't valid utf8 sequence
- text = u'<html><p>an Jos\ufffd de</p><html>'
- self.assertEqual([u'an Jos\ufffd de'],
+ text = '<html><p>an Jos\ufffd de</p><html>'
+ self.assertEqual(['an Jos\ufffd de'],
self.sscls(text).xpath('//text()').extract())
def test_select_on_unevaluable_nodes(self):
- r = self.sscls(text=u'<span class="big">some text</span>')
+ r = self.sscls(text='<span class="big">some text</span>')
# Text node
x1 = r.xpath('//text()')
- self.assertEqual(x1.extract(), [u'some text'])
+ self.assertEqual(x1.extract(), ['some text'])
self.assertEqual(x1.xpath('.//b').extract(), [])
# Tag attribute
x1 = r.xpath('//span/@class')
- self.assertEqual(x1.extract(), [u'big'])
+ self.assertEqual(x1.extract(), ['big'])
self.assertEqual(x1.xpath('.//text()').extract(), [])
def test_select_on_text_nodes(self):
- r = self.sscls(text=u'<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>')
+ r = self.sscls(text='<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>')
x1 = r.xpath("//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]")
- self.assertEqual(x1.extract(), [u'opt1'])
+ self.assertEqual(x1.extract(), ['opt1'])
x1 = r.xpath("//div/descendant::text()/preceding-sibling::b[contains(text(), 'Options')]")
- self.assertEqual(x1.extract(), [u'<b>Options:</b>'])
+ self.assertEqual(x1.extract(), ['<b>Options:</b>'])
@unittest.skip("Text nodes lost parent node reference in lxml")
def test_nested_select_on_text_nodes(self):
# FIXME: does not work with lxml backend [upstream]
- r = self.sscls(text=u'<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>')
+ r = self.sscls(text='<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>')
x1 = r.xpath("//div/descendant::text()")
x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]")
- self.assertEqual(x2.extract(), [u'<b>Options:</b>'])
+ self.assertEqual(x2.extract(), ['<b>Options:</b>'])
def test_weakref_slots(self):
"""Check that classes are using slots and are weak-referenceable"""
- x = self.sscls(text=u'')
+ x = self.sscls(text='')
weakref.ref(x)
assert not hasattr(x, '__dict__'), "%s does not use __slots__" % \
x.__class__.__name__
def test_remove_namespaces(self):
- xml = u"""<?xml version="1.0" encoding="UTF-8"?>
+ xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
<link type="text/html"/>
<entry>
@@ -659,7 +658,7 @@ def test_remove_namespaces(self):
self.assertEqual(len(sel.xpath("./namespace::*")), 1)
def test_remove_namespaces_embedded(self):
- xml = u"""
+ xml = """
<feed xmlns="http://www.w3.org/2005/Atom">
<link type="text/html"/>
<entry>
@@ -686,7 +685,7 @@ def test_remove_namespaces_embedded(self):
self.assertEqual(len(sel.xpath("./namespace::*")), 1)
def test_remove_attributes_namespaces(self):
- xml = u"""<?xml version="1.0" encoding="UTF-8"?>
+ xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/">
<link atom:type="text/html"/>
<entry>
@@ -706,7 +705,7 @@ def test_smart_strings(self):
class SmartStringsSelector(Selector):
_lxml_smart_strings = True
- body = u"""<body>
+ body = """<body>
<div class='one'>
<ul>
<li>one</li><li>two</li>
@@ -723,18 +722,18 @@ class SmartStringsSelector(Selector):
# only when smart_strings are on
x = self.sscls(text=body)
li_text = x.xpath('//li/text()')
- self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), li_text)))
+ self.assertFalse(any(hasattr(e.root, 'getparent') for e in li_text))
div_class = x.xpath('//div/@class')
- self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), div_class)))
+ self.assertFalse(any(hasattr(e.root, 'getparent') for e in div_class))
x = SmartStringsSelector(text=body)
li_text = x.xpath('//li/text()')
- self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), li_text)))
+ self.assertTrue(all(hasattr(e.root, 'getparent') for e in li_text))
div_class = x.xpath('//div/@class')
- self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), div_class)))
+ self.assertTrue(all(hasattr(e.root, 'getparent') for e in div_class))
def test_xml_entity_expansion(self):
- malicious_xml = u'<?xml version="1.0" encoding="ISO-8859-1"?>'\
+ malicious_xml = '<?xml version="1.0" encoding="ISO-8859-1"?>'\
'<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM '\
'"file:///etc/passwd" >]><foo>&xxe;</foo>'
@@ -743,8 +742,8 @@ def test_xml_entity_expansion(self):
self.assertEqual(sel.extract(), '<foo>&xxe;</foo>')
def test_configure_base_url(self):
- sel = self.sscls(text=u'nothing', base_url='http://example.com')
- self.assertEqual(u'http://example.com', sel.root.base)
+ sel = self.sscls(text='nothing', base_url='http://example.com')
+ self.assertEqual('http://example.com', sel.root.base)
def test_extending_selector(self):
class MySelectorList(Selector.selectorlist_cls):
@@ -753,33 +752,33 @@ class MySelectorList(Selector.selectorlist_cls):
class MySelector(Selector):
selectorlist_cls = MySelectorList
- sel = MySelector(text=u'<html><div>foo</div></html>')
+ sel = MySelector(text='<html><div>foo</div></html>')
self.assertIsInstance(sel.xpath('//div'), MySelectorList)
self.assertIsInstance(sel.xpath('//div')[0], MySelector)
self.assertIsInstance(sel.css('div'), MySelectorList)
self.assertIsInstance(sel.css('div')[0], MySelector)
def test_replacement_null_char_from_body(self):
- text = u'<html>\x00<body><p>Grainy</p></body></html>'
- self.assertEqual(u'<html><body><p>Grainy</p></body></html>',
+ text = '<html>\x00<body><p>Grainy</p></body></html>'
+ self.assertEqual('<html><body><p>Grainy</p></body></html>',
self.sscls(text).extract())
def test_remove_selector_list(self):
- sel = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
+ sel = self.sscls(text='<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
sel_list = sel.css('li')
sel_list.remove()
self.assertIsInstance(sel.css('li'), self.sscls.selectorlist_cls)
self.assertEqual(sel.css('li'), [])
def test_remove_selector(self):
- sel = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
+ sel = self.sscls(text='<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
sel_list = sel.css('li')
sel_list[0].remove()
self.assertIsInstance(sel.css('li'), self.sscls.selectorlist_cls)
self.assertEqual(sel.css('li::text').getall(), ['2', '3'])
def test_remove_pseudo_element_selector_list(self):
- sel = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
+ sel = self.sscls(text='<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
sel_list = sel.css('li::text')
self.assertEqual(sel_list.getall(), ['1', '2', '3'])
with self.assertRaises(CannotRemoveElementWithoutRoot):
@@ -789,7 +788,7 @@ def test_remove_pseudo_element_selector_list(self):
self.assertEqual(sel.css('li::text').getall(), ['1', '2', '3'])
def test_remove_pseudo_element_selector(self):
- sel = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
+ sel = self.sscls(text='<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
sel_list = sel.css('li::text')
self.assertEqual(sel_list.getall(), ['1', '2', '3'])
with self.assertRaises(CannotRemoveElementWithoutRoot):
@@ -799,7 +798,7 @@ def test_remove_pseudo_element_selector(self):
self.assertEqual(sel.css('li::text').getall(), ['1', '2', '3'])
def test_remove_root_element_selector(self):
- sel = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
+ sel = self.sscls(text='<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>')
sel_list = sel.css('li::text')
self.assertEqual(sel_list.getall(), ['1', '2', '3'])
with self.assertRaises(CannotRemoveElementWithoutParent):
@@ -821,7 +820,7 @@ class ExsltTestCase(unittest.TestCase):
def test_regexp(self):
"""EXSLT regular expression tests"""
- body = u"""
+ body = """
<p><input name='a' value='1'/><input name='b' value='2'/></p>
<div class="links">
<a href="/first.html">first link</a>
@@ -840,43 +839,43 @@ def test_regexp(self):
[x.extract()
for x in sel.xpath(
r'//a[re:test(@href, "\.html$")]/text()')],
- [u'first link', u'second link'])
+ ['first link', 'second link'])
self.assertEqual(
[x.extract()
for x in sel.xpath(
'//a[re:test(@href, "first")]/text()')],
- [u'first link'])
+ ['first link'])
self.assertEqual(
[x.extract()
for x in sel.xpath(
'//a[re:test(@href, "second")]/text()')],
- [u'second link'])
+ ['second link'])
# re:match() is rather special: it returns a node-set of <match> nodes
- # [u'<match>http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml</match>',
- # u'<match>http</match>',
- # u'<match>www.bayes.co.uk</match>',
- # u'<match></match>',
- # u'<match>/xml/index.xml?/xml/utils/rechecker.xml</match>']
+ # ['<match>http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml</match>',
+ # '<match>http</match>',
+ # '<match>www.bayes.co.uk</match>',
+ # '<match></match>',
+ # '<match>/xml/index.xml?/xml/utils/rechecker.xml</match>']
self.assertEqual(
sel.xpath(r're:match(//a[re:test(@href, "\.xml$")]/@href,'
r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()').extract(),
- [u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml',
- u'http',
- u'www.bayes.co.uk',
- u'',
- u'/xml/index.xml?/xml/utils/rechecker.xml'])
+ ['http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml',
+ 'http',
+ 'www.bayes.co.uk',
+ '',
+ '/xml/index.xml?/xml/utils/rechecker.xml'])
# re:replace()
self.assertEqual(
sel.xpath(r're:replace(//a[re:test(@href, "\.xml$")]/@href,'
r'"(\w+)://(.+)(\.xml)", "","https://\2.html")').extract(),
- [u'https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html'])
+ ['https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html'])
def test_set(self):
"""EXSLT set manipulation tests"""
# microdata example from http://schema.org/Event
- body = u"""
+ body = """
<div itemscope itemtype="http://schema.org/Event">
<a itemprop="url" href="nba-miami-philidelphia-game3.html">
NBA Eastern Conference First Round Playoff Tickets:
@@ -908,17 +907,17 @@ def test_set(self):
self.assertEqual(
sel.xpath('''//div[@itemtype="http://schema.org/Event"]
//@itemprop''').extract(),
- [u'url',
- u'name',
- u'startDate',
- u'location',
- u'url',
- u'address',
- u'addressLocality',
- u'addressRegion',
- u'offers',
- u'lowPrice',
- u'offerCount']
+ ['url',
+ 'name',
+ 'startDate',
+ 'location',
+ 'url',
+ 'address',
+ 'addressLocality',
+ 'addressRegion',
+ 'offers',
+ 'lowPrice',
+ 'offerCount']
)
self.assertEqual(sel.xpath('''
@@ -926,4 +925,4 @@ def test_set(self):
//@itemprop,
//div[@itemtype="http://schema.org/Event"]
//*[@itemscope]/*/@itemprop)''').extract(),
- [u'url', u'name', u'startDate', u'location', u'offers'])
+ ['url', 'name', 'startDate', 'location', 'offers'])
diff --git a/tests/test_selector_csstranslator.py b/tests/test_selector_csstranslator.py
index 83ed066..ae9ffc0 100644
--- a/tests/test_selector_csstranslator.py
+++ b/tests/test_selector_csstranslator.py
@@ -8,7 +8,7 @@
from cssselect.xpath import ExpressionError
-HTMLBODY = u'''
+HTMLBODY = '''
<html>
<body>
<div>
@@ -52,10 +52,10 @@ def setUp(self):
def test_attr_function(self):
cases = [
- ('::attr(name)', u'descendant-or-self::*/@name'),
- ('a::attr(href)', u'descendant-or-self::a/@href'),
- ('a ::attr(img)', u'descendant-or-self::a/descendant-or-self::*/@img'),
- ('a > ::attr(class)', u'descendant-or-self::a/*/@class'),
+ ('::attr(name)', 'descendant-or-self::*/@name'),
+ ('a::attr(href)', 'descendant-or-self::a/@href'),
+ ('a ::attr(img)', 'descendant-or-self::a/descendant-or-self::*/@img'),
+ ('a > ::attr(class)', 'descendant-or-self::a/*/@class'),
]
for css, xpath in cases:
self.assertEqual(self.c2x(css), xpath, css)
@@ -71,17 +71,17 @@ def test_attr_function_exception(self):
def test_text_pseudo_element(self):
cases = [
- ('::text', u'descendant-or-self::text()'),
- ('p::text', u'descendant-or-self::p/text()'),
- ('p ::text', u'descendant-or-self::p/descendant-or-self::text()'),
- ('#id::text', u"descendant-or-self::*[@id = 'id']/text()"),
- ('p#id::text', u"descendant-or-self::p[@id = 'id']/text()"),
- ('p#id ::text', u"descendant-or-self::p[@id = 'id']/descendant-or-self::text()"),
- ('p#id > ::text', u"descendant-or-self::p[@id = 'id']/*/text()"),
- ('p#id ~ ::text', u"descendant-or-self::p[@id = 'id']/following-sibling::*/text()"),
- ('a[href]::text', u'descendant-or-self::a[@href]/text()'),
- ('a[href] ::text', u'descendant-or-self::a[@href]/descendant-or-self::text()'),
- ('p::text, a::text', u"descendant-or-self::p/text() | descendant-or-self::a/text()"),
+ ('::text', 'descendant-or-self::text()'),
+ ('p::text', 'descendant-or-self::p/text()'),
+ ('p ::text', 'descendant-or-self::p/descendant-or-self::text()'),
+ ('#id::text', "descendant-or-self::*[@id = 'id']/text()"),
+ ('p#id::text', "descendant-or-self::p[@id = 'id']/text()"),
+ ('p#id ::text', "descendant-or-self::p[@id = 'id']/descendant-or-self::text()"),
+ ('p#id > ::text', "descendant-or-self::p[@id = 'id']/*/text()"),
+ ('p#id ~ ::text', "descendant-or-self::p[@id = 'id']/following-sibling::*/text()"),
+ ('a[href]::text', 'descendant-or-self::a[@href]/text()'),
+ ('a[href] ::text', 'descendant-or-self::a[@href]/descendant-or-self::text()'),
+ ('p::text, a::text', "descendant-or-self::p/text() | descendant-or-self::a/text()"),
]
for css, xpath in cases:
self.assertEqual(self.c2x(css), xpath, css)
@@ -122,7 +122,7 @@ class GenericTranslatorTest(TranslatorTestMixin, unittest.TestCase):
class UtilCss2XPathTest(unittest.TestCase):
def test_css2xpath(self):
from parsel import css2xpath
- expected_xpath = (u"descendant-or-self::*[@class and contains("
+ expected_xpath = ("descendant-or-self::*[@class and contains("
"concat(' ', normalize-space(@class), ' '), ' some-class ')]")
self.assertEqual(css2xpath('.some-class'), expected_xpath)
@@ -144,22 +144,22 @@ def test_selector_simple(self):
[x.extract() for x in self.sel.css('input')])
def test_text_pseudo_element(self):
- self.assertEqual(self.x('#p-b2'), [u'<b id="p-b2">guy</b>'])
- self.assertEqual(self.x('#p-b2::text'), [u'guy'])
- self.assertEqual(self.x('#p-b2 ::text'), [u'guy'])
- self.assertEqual(self.x('#paragraph::text'), [u'lorem ipsum text'])
- self.assertEqual(self.x('#paragraph ::text'), [u'lorem ipsum text', u'hi', u'there', u'guy'])
- self.assertEqual(self.x('p::text'), [u'lorem ipsum text'])
- self.assertEqual(self.x('p ::text'), [u'lorem ipsum text', u'hi', u'there', u'guy'])
+ self.assertEqual(self.x('#p-b2'), ['<b id="p-b2">guy</b>'])
+ self.assertEqual(self.x('#p-b2::text'), ['guy'])
+ self.assertEqual(self.x('#p-b2 ::text'), ['guy'])
+ self.assertEqual(self.x('#paragraph::text'), ['lorem ipsum text'])
+ self.assertEqual(self.x('#paragraph ::text'), ['lorem ipsum text', 'hi', 'there', 'guy'])
+ self.assertEqual(self.x('p::text'), ['lorem ipsum text'])
+ self.assertEqual(self.x('p ::text'), ['lorem ipsum text', 'hi', 'there', 'guy'])
def test_attribute_function(self):
- self.assertEqual(self.x('#p-b2::attr(id)'), [u'p-b2'])
- self.assertEqual(self.x('.cool-footer::attr(class)'), [u'cool-footer'])
- self.assertEqual(self.x('.cool-footer ::attr(id)'), [u'foobar-div', u'foobar-span'])
- self.assertEqual(self.x('map[name="dummymap"] ::attr(shape)'), [u'circle', u'default'])
+ self.assertEqual(self.x('#p-b2::attr(id)'), ['p-b2'])
+ self.assertEqual(self.x('.cool-footer::attr(class)'), ['cool-footer'])
+ self.assertEqual(self.x('.cool-footer ::attr(id)'), ['foobar-div', 'foobar-span'])
+ self.assertEqual(self.x('map[name="dummymap"] ::attr(shape)'), ['circle', 'default'])
def test_nested_selector(self):
self.assertEqual(self.sel.css('p').css('b::text').extract(),
- [u'hi', u'guy'])
+ ['hi', 'guy'])
self.assertEqual(self.sel.css('div').css('area:last-child').extract(),
- [u'<area shape="default" id="area-nohref">'])
+ ['<area shape="default" id="area-nohref">'])
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 47d44f3..9eede53 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,29 +1,28 @@
from parsel.utils import shorten, extract_regex
from pytest import mark, raises
-import six
@mark.parametrize(
'width,expected',
(
(-1, ValueError),
- (0, u''),
- (1, u'.'),
- (2, u'..'),
- (3, u'...'),
- (4, u'f...'),
- (5, u'fo...'),
- (6, u'foobar'),
- (7, u'foobar'),
+ (0, ''),
+ (1, '.'),
+ (2, '..'),
+ (3, '...'),
+ (4, 'f...'),
+ (5, 'fo...'),
+ (6, 'foobar'),
+ (7, 'foobar'),
)
)
def test_shorten(width, expected):
- if isinstance(expected, six.string_types):
- assert shorten(u'foobar', width) == expected
+ if isinstance(expected, str):
+ assert shorten('foobar', width) == expected
else:
with raises(expected):
- shorten(u'foobar', width)
+ shorten('foobar', width)
@mark.parametrize('regex, text, replace_entities, expected', (
diff --git a/tests/test_xpathfuncs.py b/tests/test_xpathfuncs.py
index cfa2579..8bcabd0 100644
--- a/tests/test_xpathfuncs.py
+++ b/tests/test_xpathfuncs.py
@@ -1,5 +1,3 @@
-# coding: utf-8
-
from parsel import Selector
from parsel.xpathfuncs import set_xpathfunc
import unittest
@@ -7,7 +5,7 @@
class XPathFuncsTestCase(unittest.TestCase):
def test_has_class_simple(self):
- body = u"""
+ body = """
<p class="foo bar-baz">First</p>
<p class="foo">Second</p>
<p class="bar">Third</p>
@@ -16,80 +14,80 @@ def test_has_class_simple(self):
sel = Selector(text=body)
self.assertEqual(
[x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
- [u'First', u'Second'])
+ ['First', 'Second'])
self.assertEqual(
[x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')],
- [u'Third'])
+ ['Third'])
self.assertEqual(
[x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')],
[])
self.assertEqual(
[x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')],
- [u'First'])
+ ['First'])
def test_has_class_error_no_args(self):
- body = u"""
+ body = """
<p CLASS="foo">First</p>
"""
sel = Selector(text=body)
- self.assertRaisesRegexp(
+ self.assertRaisesRegex(
ValueError, 'has-class must have at least 1 argument',
sel.xpath, 'has-class()')
def test_has_class_error_invalid_arg_type(self):
- body = u"""
+ body = """
<p CLASS="foo">First</p>
"""
sel = Selector(text=body)
- self.assertRaisesRegexp(
+ self.assertRaisesRegex(
ValueError, 'has-class arguments must be strings',
sel.xpath, 'has-class(.)')
def test_has_class_error_invalid_unicode(self):
- body = u"""
+ body = """
<p CLASS="foo">First</p>
"""
sel = Selector(text=body)
- self.assertRaisesRegexp(
+ self.assertRaisesRegex(
ValueError, 'All strings must be XML compatible',
- sel.xpath, u'has-class("héllö")'.encode('utf-8'))
+ sel.xpath, 'has-class("héllö")'.encode('utf-8'))
def test_has_class_unicode(self):
- body = u"""
+ body = """
<p CLASS="fóó">First</p>
"""
sel = Selector(text=body)
self.assertEqual(
- [x.extract() for x in sel.xpath(u'//p[has-class("fóó")]/text()')],
- [u'First'])
+ [x.extract() for x in sel.xpath('//p[has-class("fóó")]/text()')],
+ ['First'])
def test_has_class_uppercase(self):
- body = u"""
+ body = """
<p CLASS="foo">First</p>
"""
sel = Selector(text=body)
self.assertEqual(
[x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
- [u'First'])
+ ['First'])
def test_has_class_newline(self):
- body = u"""
+ body = """
<p CLASS="foo
bar">First</p>
"""
sel = Selector(text=body)
self.assertEqual(
- [x.extract() for x in sel.xpath(u'//p[has-class("foo")]/text()')],
- [u'First'])
+ [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
+ ['First'])
def test_has_class_tab(self):
- body = u"""
+ body = """
<p CLASS="foo\tbar">First</p>
"""
sel = Selector(text=body)
self.assertEqual(
- [x.extract() for x in sel.xpath(u'//p[has-class("foo")]/text()')],
- [u'First'])
+ [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
+ ['First'])
def test_set_xpathfunc(self):
@@ -98,11 +96,11 @@ def myfunc(ctx):
myfunc.call_count = 0
- body = u"""
+ body = """
<p CLASS="foo">First</p>
"""
sel = Selector(text=body)
- self.assertRaisesRegexp(
+ self.assertRaisesRegex(
ValueError, 'Unregistered function in myfunc',
sel.xpath, 'myfunc()')
@@ -111,6 +109,6 @@ def myfunc(ctx):
self.assertEqual(myfunc.call_count, 1)
set_xpathfunc('myfunc', None)
- self.assertRaisesRegexp(
+ self.assertRaisesRegex(
ValueError, 'Unregistered function in myfunc',
sel.xpath, 'myfunc()')