forked from pool/python-Scrapy
- Add patch twisted-23.8.0-compat.patch gh#scrapy/scrapy#6064
- Update to 2.11.0:
  - Spiders can now modify settings in their from_crawler methods,
    e.g. based on spider arguments.
  - Periodic logging of stats.
  - Bug fixes.
- 2.10.0:
  - Added Python 3.12 support, dropped Python 3.7 support.
  - The new add-ons framework simplifies configuring 3rd-party
    components that support it.
  - Exceptions to retry can now be configured.
  - Many fixes and improvements for feed exports.
- 2.9.0:
  - Per-domain download settings.
  - Compatibility with new cryptography and new parsel.
  - JMESPath selectors from the new parsel.
  - Bug fixes.
- 2.8.0:
  - This is a maintenance release, with minor features, bug fixes, and
    cleanups.

OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-Scrapy?expand=0&rev=34
Scrapy-2.11.0.tar.gz (new file)
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cbdedce0c3f0e0482d61be2d7458683be7cd7cf14b0ee6adfbaddb80f5b36a5
+size 1171092
Scrapy-2.7.1.tar.gz (deleted)
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:30fa408353d24b1df979df2ea4afbd19b4ae02fb2207f218d246332f1e1cf14e
-size 1131084
python-Scrapy.changes
@@ -1,3 +1,27 @@
+-------------------------------------------------------------------
+Wed Jan 10 07:50:52 UTC 2024 - Daniel Garcia <daniel.garcia@suse.com>
+
+- Add patch twisted-23.8.0-compat.patch gh#scrapy/scrapy#6064
+- Update to 2.11.0:
+  - Spiders can now modify settings in their from_crawler methods,
+    e.g. based on spider arguments.
+  - Periodic logging of stats.
+  - Bug fixes.
+- 2.10.0:
+  - Added Python 3.12 support, dropped Python 3.7 support.
+  - The new add-ons framework simplifies configuring 3rd-party
+    components that support it.
+  - Exceptions to retry can now be configured.
+  - Many fixes and improvements for feed exports.
+- 2.9.0:
+  - Per-domain download settings.
+  - Compatibility with new cryptography and new parsel.
+  - JMESPath selectors from the new parsel.
+  - Bug fixes.
+- 2.8.0:
+  - This is a maintenance release, with minor features, bug fixes, and
+    cleanups.
+
 -------------------------------------------------------------------
 Mon Nov 7 20:35:15 UTC 2022 - Yogalakshmi Arunachalam <yarunachalam@suse.com>

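A note on the 2.11.0 headline item above: per the upstream release notes, from_crawler now runs while crawler.settings is still mutable, so a spider can derive settings from its -a arguments. A minimal sketch, assuming the usual Spider API; the spider name, URL, and delay argument are illustrative and not part of this package:

    import scrapy


    class ThrottledSpider(scrapy.Spider):
        # Hypothetical spider; run as: scrapy crawl throttled -a delay=2.5
        name = "throttled"
        start_urls = ["https://quotes.toscrape.com"]

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            # New in 2.11: settings are not frozen yet at this point, so a
            # spider argument (passed in kwargs) can drive a setting.
            crawler.settings.set("DOWNLOAD_DELAY", float(kwargs.get("delay", 0)))
            return spider

        def parse(self, response):
            yield {"title": response.css("title::text").get()}

The "periodic logging of stats" item in the same entry refers, as we read the release notes, to the new PeriodicLog extension and its PERIODIC_LOG_STATS family of settings.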
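Similarly for the 2.10.0 entry: as far as the release notes go, an add-on is a class exposing an update_settings() method, enabled through the new ADDONS setting, and RETRY_EXCEPTIONS makes the retried exceptions configurable. A sketch under those assumptions; the add-on name and exception list are examples only:

    from scrapy.crawler import CrawlerProcess


    class RetryTuningAddon:
        """Hypothetical add-on that centralizes retry configuration."""

        def update_settings(self, settings):
            # RETRY_EXCEPTIONS (new in 2.10) lists the exceptions that
            # trigger a retry; the values here are illustrative.
            settings.set(
                "RETRY_EXCEPTIONS",
                ["builtins.ConnectionError", "twisted.internet.error.TimeoutError"],
                priority="addon",
            )


    # Enable the add-on for the whole crawl via the ADDONS setting.
    process = CrawlerProcess(settings={"ADDONS": {RetryTuningAddon: 0}})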
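And for the 2.9.0 entry: the JMESPath selectors come from parsel (which Scrapy's Selector wraps), and the per-domain download settings are, to our understanding, the new DOWNLOAD_SLOTS setting. A short illustrative snippet; the domain and numbers are examples, not project defaults:

    from parsel import Selector  # Scrapy selectors are thin wrappers over parsel

    sel = Selector(text='{"user": {"name": "Ada"}}')
    print(sel.jmespath("user.name").get())  # -> "Ada"

    # Per-domain download settings (e.g. in settings.py): one slot per
    # domain, with its own concurrency and delay.
    DOWNLOAD_SLOTS = {
        "quotes.toscrape.com": {"concurrency": 1, "delay": 2},
    }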
python-Scrapy.spec
@@ -1,7 +1,7 @@
 #
 # spec file for package python-Scrapy
 #
-# Copyright (c) 2022 SUSE LLC
+# Copyright (c) 2024 SUSE LLC
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -16,62 +16,63 @@
 #


 %{?!python_module:%define python_module() python3-%{**}}
 %define skip_python2 1
 Name:           python-Scrapy
-Version:        2.7.1
+Version:        2.11.0
 Release:        0
 Summary:        A high-level Python Screen Scraping framework
 License:        BSD-3-Clause
 Group:          Development/Languages/Python
 URL:            https://scrapy.org
 Source:         https://files.pythonhosted.org/packages/source/S/Scrapy/Scrapy-%{version}.tar.gz
+# PATCH-FIX-UPSTREAM twisted-23.8.0-compat.patch gh#scrapy/scrapy#6064
+Patch1:         twisted-23.8.0-compat.patch
 BuildRequires:  %{python_module Pillow}
 BuildRequires:  %{python_module Protego >= 0.1.15}
 BuildRequires:  %{python_module PyDispatcher >= 2.0.5}
-BuildRequires:  %{python_module Twisted >= 17.9.0}
-BuildRequires:  %{python_module botocore}
-BuildRequires:  %{python_module cryptography >= 2.0}
+BuildRequires:  %{python_module Twisted >= 18.9.0}
+BuildRequires:  %{python_module attrs}
+BuildRequires:  %{python_module botocore >= 1.4.87}
+BuildRequires:  %{python_module cryptography >= 36.0.0}
 BuildRequires:  %{python_module cssselect >= 0.9.1}
 BuildRequires:  %{python_module dbm}
 BuildRequires:  %{python_module itemadapter >= 0.1.0}
 BuildRequires:  %{python_module itemloaders >= 1.0.1}
 BuildRequires:  %{python_module jmespath}
-BuildRequires:  %{python_module lxml >= 3.5.0}
+BuildRequires:  %{python_module lxml >= 4.4.1}
 BuildRequires:  %{python_module parsel >= 1.5.0}
-BuildRequires:  %{python_module pyOpenSSL >= 16.2.0}
+BuildRequires:  %{python_module pexpect >= 4.8.1}
+BuildRequires:  %{python_module pyOpenSSL >= 21.0.0}
 BuildRequires:  %{python_module pyftpdlib}
 BuildRequires:  %{python_module pytest-xdist}
 BuildRequires:  %{python_module pytest}
 BuildRequires:  %{python_module queuelib >= 1.4.2}
-BuildRequires:  %{python_module service_identity >= 16.0.0}
+BuildRequires:  %{python_module service_identity >= 18.1.0}
 BuildRequires:  %{python_module setuptools}
 BuildRequires:  %{python_module sybil}
-BuildRequires:  %{python_module testfixtures >= 6.0.0}
+BuildRequires:  %{python_module testfixtures}
 BuildRequires:  %{python_module tldextract}
 BuildRequires:  %{python_module uvloop}
 BuildRequires:  %{python_module w3lib >= 1.17.0}
-BuildRequires:  %{python_module zope.interface >= 4.1.3}
+BuildRequires:  %{python_module zope.interface >= 5.1.0}
 BuildRequires:  fdupes
 BuildRequires:  python-rpm-macros
 BuildRequires:  python3-Sphinx
-BuildRequires:  (python3-dataclasses if python3-base < 3.7)
 Requires:       python-Protego >= 0.1.15
 Requires:       python-PyDispatcher >= 2.0.5
-Requires:       python-Twisted >= 17.9.0
-Requires:       python-cryptography >= 2.0
+Requires:       python-Twisted >= 18.9.0
+Requires:       python-cryptography >= 36.0.0
 Requires:       python-cssselect >= 0.9.1
 Requires:       python-itemadapter >= 0.1.0
 Requires:       python-itemloaders >= 1.0.1
-Requires:       python-lxml >= 3.5.0
+Requires:       python-lxml >= 4.4.1
 Requires:       python-parsel >= 1.5.0
-Requires:       python-pyOpenSSL >= 16.2.0
+Requires:       python-pyOpenSSL >= 21.0.0
 Requires:       python-queuelib >= 1.4.2
-Requires:       python-service_identity >= 16.0.0
+Requires:       python-service_identity >= 18.1.0
 Requires:       python-setuptools
 Requires:       python-tldextract
 Requires:       python-w3lib >= 1.17.2
-Requires:       python-zope.interface >= 4.1.3
+Requires:       python-zope.interface >= 5.1.0
 Requires(post): update-alternatives
 Requires(postun):update-alternatives
 BuildArch:      noarch
@@ -90,8 +91,7 @@ Group: Documentation/HTML
 Provides documentation for %{name}.

 %prep
-%setup -n Scrapy-%{version}
-%autopatch -p1
+%autosetup -p1 -n Scrapy-%{version}

 sed -i -e 's:= python:= python3:g' docs/Makefile

@@ -111,7 +111,7 @@ popd
 skiplist="test_pformat"
 # no online connection to toscrapy.com
 skiplist="$skiplist or CheckCommandTest"
-%{pytest \
+%{pytest -x \
  -k "not (${skiplist})" \
  -W ignore::DeprecationWarning \
  tests}
twisted-23.8.0-compat.patch (new file)
@@ -0,0 +1,254 @@
Index: Scrapy-2.11.0/scrapy/crawler.py
===================================================================
--- Scrapy-2.11.0.orig/scrapy/crawler.py
+++ Scrapy-2.11.0/scrapy/crawler.py
@@ -404,8 +404,8 @@ class CrawlerProcess(CrawlerRunner):
         :param bool stop_after_crawl: stop or not the reactor when all
             crawlers have finished

-        :param bool install_signal_handlers: whether to install the shutdown
-            handlers (default: True)
+        :param bool install_signal_handlers: whether to install the OS signal
+            handlers from Twisted and Scrapy (default: True)
         """
         from twisted.internet import reactor

@@ -416,15 +416,17 @@ class CrawlerProcess(CrawlerRunner):
             return
         d.addBoth(self._stop_reactor)

-        if install_signal_handlers:
-            install_shutdown_handlers(self._signal_shutdown)
         resolver_class = load_object(self.settings["DNS_RESOLVER"])
         resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
         resolver.install_on_reactor()
         tp = reactor.getThreadPool()
         tp.adjustPoolsize(maxthreads=self.settings.getint("REACTOR_THREADPOOL_MAXSIZE"))
         reactor.addSystemEventTrigger("before", "shutdown", self.stop)
-        reactor.run(installSignalHandlers=False)  # blocking call
+        if install_signal_handlers:
+            reactor.addSystemEventTrigger(
+                "after", "startup", install_shutdown_handlers, self._signal_shutdown
+            )
+        reactor.run(installSignalHandlers=install_signal_handlers)  # blocking call

     def _graceful_stop_reactor(self) -> Deferred:
         d = self.stop()
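Reviewer note on the hunk above: this is the core of the Twisted 23.8.0 fix. Scrapy's shutdown handlers are now registered after reactor startup instead of relying on the removed reactor._handleSignals(), and Twisted installs its own handlers through reactor.run(). For callers, the documented start() parameter behaves as before; a minimal sketch (the spider import is hypothetical):

    from scrapy.crawler import CrawlerProcess

    from myproject.spiders import MySpider  # hypothetical import

    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(MySpider)
    # With install_signal_handlers=False neither Scrapy's shutdown handlers
    # nor Twisted's are installed; SIGINT/SIGTERM stay with the host app.
    process.start(install_signal_handlers=False)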
Index: Scrapy-2.11.0/scrapy/utils/ossignal.py
===================================================================
--- Scrapy-2.11.0.orig/scrapy/utils/ossignal.py
+++ Scrapy-2.11.0/scrapy/utils/ossignal.py
@@ -19,13 +19,10 @@ def install_shutdown_handlers(
     function: SignalHandlerT, override_sigint: bool = True
 ) -> None:
     """Install the given function as a signal handler for all common shutdown
-    signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the
-    SIGINT handler won't be install if there is already a handler in place
-    (e.g. Pdb)
+    signals (such as SIGINT, SIGTERM, etc). If ``override_sigint`` is ``False`` the
+    SIGINT handler won't be installed if there is already a handler in place
+    (e.g. Pdb)
     """
-    from twisted.internet import reactor
-
-    reactor._handleSignals()
     signal.signal(signal.SIGTERM, function)
     if signal.getsignal(signal.SIGINT) == signal.default_int_handler or override_sigint:
         signal.signal(signal.SIGINT, function)
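The corrected docstring is easiest to see with an example: install_shutdown_handlers() always hooks SIGTERM, but with override_sigint=False it leaves an existing custom SIGINT handler (e.g. one installed by pdb) in place. A sketch against the signature shown above:

    import signal

    from scrapy.utils.ossignal import install_shutdown_handlers


    def on_shutdown(signum, frame):
        # Illustrative handler; a real one would stop the crawler.
        print(f"got {signal.Signals(signum).name}, shutting down")


    # SIGTERM is always hooked; SIGINT only if it still points at the
    # default handler, since override_sigint=False preserves e.g. pdb's.
    install_shutdown_handlers(on_shutdown, override_sigint=False)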
Index: Scrapy-2.11.0/scrapy/utils/testproc.py
===================================================================
--- Scrapy-2.11.0.orig/scrapy/utils/testproc.py
+++ Scrapy-2.11.0/scrapy/utils/testproc.py
@@ -2,7 +2,7 @@ from __future__ import annotations

 import os
 import sys
-from typing import Iterable, Optional, Tuple, cast
+from typing import Iterable, List, Optional, Tuple, cast

 from twisted.internet.defer import Deferred
 from twisted.internet.error import ProcessTerminated
@@ -26,14 +26,15 @@ class ProcessTest:
         env = os.environ.copy()
         if settings is not None:
             env["SCRAPY_SETTINGS_MODULE"] = settings
+        assert self.command
         cmd = self.prefix + [self.command] + list(args)
         pp = TestProcessProtocol()
-        pp.deferred.addBoth(self._process_finished, cmd, check_code)
+        pp.deferred.addCallback(self._process_finished, cmd, check_code)
         reactor.spawnProcess(pp, cmd[0], cmd, env=env, path=self.cwd)
         return pp.deferred

     def _process_finished(
-        self, pp: TestProcessProtocol, cmd: str, check_code: bool
+        self, pp: TestProcessProtocol, cmd: List[str], check_code: bool
     ) -> Tuple[int, bytes, bytes]:
         if pp.exitcode and check_code:
             msg = f"process {cmd} exit with code {pp.exitcode}"
Index: Scrapy-2.11.0/setup.py
===================================================================
--- Scrapy-2.11.0.orig/setup.py
+++ Scrapy-2.11.0/setup.py
@@ -6,8 +6,7 @@ version = (Path(__file__).parent / "scra


 install_requires = [
-    # 23.8.0 incompatibility: https://github.com/scrapy/scrapy/issues/6024
-    "Twisted>=18.9.0,<23.8.0",
+    "Twisted>=18.9.0",
     "cryptography>=36.0.0",
     "cssselect>=0.9.1",
     "itemloaders>=1.0.1",
Index: Scrapy-2.11.0/tests/CrawlerProcess/sleeping.py
===================================================================
--- /dev/null
+++ Scrapy-2.11.0/tests/CrawlerProcess/sleeping.py
@@ -0,0 +1,24 @@
+from twisted.internet.defer import Deferred
+
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.defer import maybe_deferred_to_future
+
+
+class SleepingSpider(scrapy.Spider):
+    name = "sleeping"
+
+    start_urls = ["data:,;"]
+
+    async def parse(self, response):
+        from twisted.internet import reactor
+
+        d = Deferred()
+        reactor.callLater(3, d.callback, None)
+        await maybe_deferred_to_future(d)
+
+
+process = CrawlerProcess(settings={})
+
+process.crawl(SleepingSpider)
+process.start()
Index: Scrapy-2.11.0/tests/requirements.txt
===================================================================
--- Scrapy-2.11.0.orig/tests/requirements.txt
+++ Scrapy-2.11.0/tests/requirements.txt
@@ -1,5 +1,6 @@
 # Tests requirements
 attrs
+pexpect >= 4.8.0
 # https://github.com/giampaolo/pyftpdlib/issues/560
 pyftpdlib; python_version < "3.12"
 pytest
Index: Scrapy-2.11.0/tests/test_command_shell.py
===================================================================
--- Scrapy-2.11.0.orig/tests/test_command_shell.py
+++ Scrapy-2.11.0/tests/test_command_shell.py
@@ -1,11 +1,15 @@
+import sys
+from io import BytesIO
 from pathlib import Path

+from pexpect.popen_spawn import PopenSpawn
 from twisted.internet import defer
 from twisted.trial import unittest

 from scrapy.utils.testproc import ProcessTest
 from scrapy.utils.testsite import SiteTest
 from tests import NON_EXISTING_RESOLVABLE, tests_datadir
+from tests.mockserver import MockServer


 class ShellTest(ProcessTest, SiteTest, unittest.TestCase):
@@ -133,3 +137,25 @@ class ShellTest(ProcessTest, SiteTest, u
         args = ["-c", code, "--set", f"TWISTED_REACTOR={reactor_path}"]
         _, _, err = yield self.execute(args, check_code=True)
         self.assertNotIn(b"RuntimeError: There is no current event loop in thread", err)
+
+
+class InteractiveShellTest(unittest.TestCase):
+    def test_fetch(self):
+        args = (
+            sys.executable,
+            "-m",
+            "scrapy.cmdline",
+            "shell",
+        )
+        logfile = BytesIO()
+        p = PopenSpawn(args, timeout=5)
+        p.logfile_read = logfile
+        p.expect_exact("Available Scrapy objects")
+        with MockServer() as mockserver:
+            p.sendline(f"fetch('{mockserver.url('/')}')")
+            p.sendline("type(response)")
+            p.expect_exact("HtmlResponse")
+        p.sendeof()
+        p.wait()
+        logfile.seek(0)
+        self.assertNotIn("Traceback", logfile.read().decode())
Index: Scrapy-2.11.0/tests/test_crawler.py
===================================================================
--- Scrapy-2.11.0.orig/tests/test_crawler.py
+++ Scrapy-2.11.0/tests/test_crawler.py
@@ -1,13 +1,16 @@
 import logging
 import os
 import platform
+import signal
 import subprocess
 import sys
 import warnings
 from pathlib import Path
+from typing import List

 import pytest
 from packaging.version import parse as parse_version
+from pexpect.popen_spawn import PopenSpawn
 from pytest import mark, raises
 from twisted.internet import defer
 from twisted.trial import unittest
@@ -289,9 +292,12 @@ class ScriptRunnerMixin:
     script_dir: Path
     cwd = os.getcwd()

-    def run_script(self, script_name: str, *script_args):
+    def get_script_args(self, script_name: str, *script_args: str) -> List[str]:
         script_path = self.script_dir / script_name
-        args = [sys.executable, str(script_path)] + list(script_args)
+        return [sys.executable, str(script_path)] + list(script_args)
+
+    def run_script(self, script_name: str, *script_args: str) -> str:
+        args = self.get_script_args(script_name, *script_args)
         p = subprocess.Popen(
             args,
             env=get_mockserver_env(),
@@ -517,6 +523,29 @@ class CrawlerProcessSubprocess(ScriptRun
         self.assertIn("Spider closed (finished)", log)
         self.assertIn("The value of FOO is 42", log)

+    def test_shutdown_graceful(self):
+        sig = signal.SIGINT if sys.platform != "win32" else signal.SIGBREAK
+        args = self.get_script_args("sleeping.py")
+        p = PopenSpawn(args, timeout=5)
+        p.expect_exact("Spider opened")
+        p.expect_exact("Crawled (200)")
+        p.kill(sig)
+        p.expect_exact("shutting down gracefully")
+        p.expect_exact("Spider closed (shutdown)")
+        p.wait()
+
+    def test_shutdown_forced(self):
+        sig = signal.SIGINT if sys.platform != "win32" else signal.SIGBREAK
+        args = self.get_script_args("sleeping.py")
+        p = PopenSpawn(args, timeout=5)
+        p.expect_exact("Spider opened")
+        p.expect_exact("Crawled (200)")
+        p.kill(sig)
+        p.expect_exact("shutting down gracefully")
+        p.kill(sig)
+        p.expect_exact("forcing unclean shutdown")
+        p.wait()
+

 class CrawlerRunnerSubprocess(ScriptRunnerMixin, unittest.TestCase):
     script_dir = Path(__file__).parent.resolve() / "CrawlerRunner"