From 119328cdce4852f0c88a275538db091ab3c4f61b5a2b37d0af4f4ac54b6953e7 Mon Sep 17 00:00:00 2001
From: Dirk Mueller
Date: Mon, 25 Mar 2024 15:36:37 +0000
Subject: [PATCH] - update to 2.11.1 (bsc#1220514, CVE-2024-1892):

 * Addressed `ReDoS vulnerabilities`_ (bsc#1220514, CVE-2024-1892)
   - ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of
     :func:`~scrapy.utils.iterators.xmliter_lxml`, which
     :class:`~scrapy.spiders.XMLFeedSpider` now uses.

     To minimize the impact of this change on existing code,
     :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
     the node namespace with a prefix in the node name, and big files with
     highly nested trees when using libxml2 2.7+.

   - Fixed regular expressions in the implementation of the
     :func:`~scrapy.utils.response.open_in_browser` function.

   .. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS

 * :setting:`DOWNLOAD_MAXSIZE` and :setting:`DOWNLOAD_WARNSIZE` now also
   apply to the decompressed response body. Please see the `7j7m-v7m3-jqm7
   security advisory`_ for more information.

   .. _7j7m-v7m3-jqm7 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-7j7m-v7m3-jqm7

 * Also in relation to the `7j7m-v7m3-jqm7 security advisory`_, the
   deprecated ``scrapy.downloadermiddlewares.decompression`` module has been
   removed.
 * The ``Authorization`` header is now dropped on redirects to a different
   domain. Please see the `cw9j-q3vf-hrrv security advisory`_ for more
   information.

   .. _cw9j-q3vf-hrrv security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cw9j-q3vf-hrrv

 * The OS signal handling code was refactored to no longer use private
   Twisted functions. (:issue:`6024`, :issue:`6064`, :issue:`6112`)
 * Improved documentation for :class:`~scrapy.crawler.Crawler`
   initialization changes made in the 2.11.0 release. (:issue:`6057`,
   :issue:`6147`)
 * Extended documentation for :attr:`Request.meta`.
 * Fixed the :reqmeta:`dont_merge_cookies` documentation. (:issue:`5936`)
 * Added a link to Zyte's export guides to the :ref:`feed exports`
   documentation.
 * Added a missing note about backward-incompatible changes in
   :class:`~scrapy.exporters.PythonItemExporter` to the 2.11.0 release
   notes.
 * Added a missing note about removing the deprecated
   ``scrapy.utils.boto.is_botocore()`` function to the 2.8.0 release notes.
 * Other documentation improvements. (:issue:`6128`, :issue:`6144`,
   :issue:`6163`, :issue:`6190`, :issue:`6192`)
- drop twisted-23.8.0-compat.patch (upstream)

OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-Scrapy?expand=0&rev=37
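For reviewers, a minimal sketch (not part of the upstream commit) of the two
user-visible changes above: iterating an XML feed with the non-deprecated
:func:`~scrapy.utils.iterators.xmliter_lxml`, and the download-size caps that,
as of 2.11.1, also bound the decompressed response body. The spider name, feed
URL, node name, and size values are illustrative assumptions::

    import scrapy
    from scrapy.utils.iterators import xmliter_lxml


    class ExampleFeedSpider(scrapy.Spider):
        name = "example_feed"  # hypothetical spider
        start_urls = ["https://example.com/feed.xml"]  # hypothetical URL
        custom_settings = {
            # Since 2.11.1 these limits also apply to the decompressed
            # body (GHSA-7j7m-v7m3-jqm7); the values are illustrative.
            "DOWNLOAD_MAXSIZE": 10 * 1024 * 1024,  # hard cap at 10 MiB
            "DOWNLOAD_WARNSIZE": 1024 * 1024,      # warn above 1 MiB
        }

        def parse(self, response):
            # xmliter_lxml replaces the deprecated, ReDoS-prone
            # scrapy.utils.iterators.xmliter and yields one Selector
            # per matching node.
            for node in xmliter_lxml(response, "item"):
                yield {"title": node.xpath("title/text()").get()}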
---
 Scrapy-2.11.0.tar.gz        |   3 -
 Scrapy-2.11.1.tar.gz        |   3 +
 python-Scrapy.changes       | 337 ++++++++++++++++++++----------
 python-Scrapy.spec          |  20 ++-
 twisted-23.8.0-compat.patch | 254 ---------------------------
 5 files changed, 205 insertions(+), 412 deletions(-)
 delete mode 100644 Scrapy-2.11.0.tar.gz
 create mode 100644 Scrapy-2.11.1.tar.gz
 delete mode 100644 twisted-23.8.0-compat.patch

diff --git a/Scrapy-2.11.0.tar.gz b/Scrapy-2.11.0.tar.gz
deleted file mode 100644
index bdff1e4..0000000
--- a/Scrapy-2.11.0.tar.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3cbdedce0c3f0e0482d61be2d7458683be7cd7cf14b0ee6adfbaddb80f5b36a5
-size 1171092
diff --git a/Scrapy-2.11.1.tar.gz b/Scrapy-2.11.1.tar.gz
new file mode 100644
index 0000000..3a26761
--- /dev/null
+++ b/Scrapy-2.11.1.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:733a039c7423e52b69bf2810b5332093d4e42a848460359c07b02ecff8f73ebe
+size 1176726
diff --git a/python-Scrapy.changes b/python-Scrapy.changes
index 6328775..4332df8 100644
--- a/python-Scrapy.changes
+++ b/python-Scrapy.changes
@@ -1,3 +1,48 @@
+-------------------------------------------------------------------
+Mon Mar 25 14:12:20 UTC 2024 - Dirk Müller
+
+- update to 2.11.1 (bsc#1220514, CVE-2024-1892):
+  * Addressed `ReDoS vulnerabilities`_ (bsc#1220514, CVE-2024-1892)
+    - ``scrapy.utils.iterators.xmliter`` is now deprecated in favor of
+      :func:`~scrapy.utils.iterators.xmliter_lxml`, which
+      :class:`~scrapy.spiders.XMLFeedSpider` now uses.
+
+      To minimize the impact of this change on existing code,
+      :func:`~scrapy.utils.iterators.xmliter_lxml` now supports indicating
+      the node namespace with a prefix in the node name, and big files with
+      highly nested trees when using libxml2 2.7+.
+
+    - Fixed regular expressions in the implementation of the
+      :func:`~scrapy.utils.response.open_in_browser` function.
+    .. _ReDoS vulnerabilities: https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
+
+  * :setting:`DOWNLOAD_MAXSIZE` and :setting:`DOWNLOAD_WARNSIZE` now also apply
+    to the decompressed response body. Please see the `7j7m-v7m3-jqm7 security
+    advisory`_ for more information.
+
+  .. _7j7m-v7m3-jqm7 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-7j7m-v7m3-jqm7
+
+  * Also in relation to the `7j7m-v7m3-jqm7 security advisory`_, the
+    deprecated ``scrapy.downloadermiddlewares.decompression`` module has been
+    removed.
+  * The ``Authorization`` header is now dropped on redirects to a different
+    domain. Please see the `cw9j-q3vf-hrrv security advisory`_ for more
+    information.
+  * The OS signal handling code was refactored to no longer use private Twisted
+    functions. (:issue:`6024`, :issue:`6064`, :issue:`6112`)
+  * Improved documentation for :class:`~scrapy.crawler.Crawler` initialization
+    changes made in the 2.11.0 release. (:issue:`6057`, :issue:`6147`)
+  * Extended documentation for :attr:`Request.meta`.
+  * Fixed the :reqmeta:`dont_merge_cookies` documentation. (:issue:`5936`)
+  * Added a link to Zyte's export guides to the :ref:`feed exports` documentation.
+  * Added a missing note about backward-incompatible changes in
+    :class:`~scrapy.exporters.PythonItemExporter` to the 2.11.0 release notes.
+ * Added a missing note about removing the deprecated + ``scrapy.utils.boto.is_botocore()`` function to the 2.8.0 release notes. + * Other documentation improvements. (:issue:`6128`, :issue:`6144`, + :issue:`6163`, :issue:`6190`, :issue:`6192`) +- drop twisted-23.8.0-compat.patch (upstream) + ------------------------------------------------------------------- Wed Jan 10 07:50:52 UTC 2024 - Daniel Garcia @@ -25,7 +70,7 @@ Wed Jan 10 07:50:52 UTC 2024 - Daniel Garcia ------------------------------------------------------------------- Mon Nov 7 20:35:15 UTC 2022 - Yogalakshmi Arunachalam -- Update to v2.7.1 +- Update to v2.7.1 * Relaxed the restriction introduced in 2.6.2 so that the Proxy-Authentication header can again be set explicitly in certain cases, restoring compatibility with scrapy-zyte-smartproxy 2.1.0 and older Bug fixes @@ -34,7 +79,7 @@ Mon Nov 7 20:35:15 UTC 2022 - Yogalakshmi Arunachalam ------------------------------------------------------------------- Thu Oct 27 21:15:45 UTC 2022 - Yogalakshmi Arunachalam -- Update to v2.7.0 +- Update to v2.7.0 Highlights: * Added Python 3.11 support, dropped Python 3.6 support * Improved support for :ref:`asynchronous callbacks ` @@ -51,13 +96,13 @@ Thu Oct 27 21:15:45 UTC 2022 - Yogalakshmi Arunachalam Deprecations - :meth:`ImagesPipeline.thumb_path ` must now accept an item parameter (:issue:`5504`, :issue:`5508`). - The scrapy.downloadermiddlewares.decompression module is now deprecated (:issue:`5546`, :issue:`5547`). - + Complete changelog https://github.com/scrapy/scrapy/blob/2.7/docs/news.rst ------------------------------------------------------------------- Fri Sep 9 15:21:20 UTC 2022 - Yogalakshmi Arunachalam -- Update to v2.6.2 +- Update to v2.6.2 Security bug fix: * When HttpProxyMiddleware processes a request with proxy metadata, and that proxy metadata includes proxy credentials, HttpProxyMiddleware sets the Proxy-Authentication header, but only if that header is not already set. @@ -68,7 +113,7 @@ Fri Sep 9 15:21:20 UTC 2022 - Yogalakshmi Arunachalam but fail to remove the Proxy-Authentication header from the previous value of the proxy metadata, causing the credentials of one proxy to be sent to a different proxy. * To prevent the unintended leaking of proxy credentials, the behavior of HttpProxyMiddleware is now as follows when processing a request: - + If the request being processed defines proxy metadata that includes credentials, the Proxy-Authorization header is always updated + + If the request being processed defines proxy metadata that includes credentials, the Proxy-Authorization header is always updated to feature those credentials. + If the request being processed defines proxy metadata without credentials, the Proxy-Authorization header is removed unless it was originally defined for the same proxy URL. 
@@ -212,7 +257,7 @@ Fri Jul 3 17:05:03 UTC 2020 - Jacob W * dataclass objects and attrs objects are now valid item types * New TextResponse.json method * New bytes_received signal that allows canceling response download - * CookiesMiddleware fixes + * CookiesMiddleware fixes - Update to 2.1.0: * New FEEDS setting to export to multiple feeds @@ -249,7 +294,7 @@ Thu Apr 2 03:38:20 UTC 2020 - Steve Kowalik Thu Jan 16 15:00:50 UTC 2020 - Marketa Calabkova - update to 1.8.0 - * Dropped Python 3.4 support and updated minimum requirements; + * Dropped Python 3.4 support and updated minimum requirements; made Python 3.8 support official * lots of new fixes and features @@ -281,7 +326,7 @@ Wed Jul 24 08:37:28 UTC 2019 - pgajdos@suse.com ------------------------------------------------------------------- Thu May 16 19:33:46 UTC 2019 - ranand@suse.com -- Skip flaky CrawlerTestCase +- Skip flaky CrawlerTestCase ------------------------------------------------------------------- Mon Feb 18 21:00:54 UTC 2019 - Hans-Peter Jansen @@ -501,7 +546,7 @@ Thu Feb 14 11:48:09 UTC 2019 - Hans-Peter Jansen ------------------------------------------------------------------- Tue Feb 13 00:42:45 UTC 2018 - jacobwinski@gmail.com -- Update spec file to singlespec +- Update spec file to singlespec - Update to Scrapy 1.5.0 * Backwards Incompatible Changes + Scrapy 1.5 drops support for Python 3.3. @@ -761,7 +806,7 @@ Tue Feb 13 00:42:45 UTC 2018 - jacobwinski@gmail.com ------------------------------------------------------------------- Wed Mar 29 04:51:03 UTC 2017 - jacobwinski@gmail.com -- Update spec file: change python-pyasn1 to python2-pyasn1 +- Update spec file: change python-pyasn1 to python2-pyasn1 ------------------------------------------------------------------- Sun Jun 5 05:27:31 UTC 2016 - jacobwinski@gmail.com @@ -772,7 +817,7 @@ Sun Jun 5 05:27:31 UTC 2016 - jacobwinski@gmail.com ------------------------------------------------------------------- Thu Jun 2 19:22:04 UTC 2016 - jacobwinski@gmail.com -- Update to 1.1.0 +- Update to 1.1.0 * Most important features and bug fixes: + Scrapy 1.1 has beta Python 3 support (requires Twisted >= 15.5). See Beta Python 3 Support for more details and some limitations. 
+ Hot new features: @@ -924,7 +969,7 @@ Sun Jan 10 06:25:08 UTC 2016 - jacobwinski@gmail.com ------------------------------------------------------------------- Sun Jan 10 06:06:51 UTC 2016 - jacobwinski@gmail.com -- add python-service_identity to build requires +- add python-service_identity to build requires ------------------------------------------------------------------- Tue Aug 11 06:06:23 UTC 2015 - jacobwinski@gmail.com @@ -954,7 +999,7 @@ Thu Jul 16 23:57:56 UTC 2015 - jacobwinski@gmail.com Sat Jul 4 19:38:14 UTC 2015 - jacobwinski@gmail.com - Update to 1.0.1 - * Unquote request path before passing to FTPClient, it already escape paths + * Unquote request path before passing to FTPClient, it already escape paths * include tests/ to source distribution in MANIFEST.in - Update to 1.0.0 * New Features & Enhancements @@ -1000,7 +1045,7 @@ Sat Jul 4 19:38:14 UTC 2015 - jacobwinski@gmail.com + Deleted bin folder from root, fixes #913 + Remove jsonrpc based webservice + Move Test cases under project root dir - + Fix backward incompatibility for relocated paths in settings + + Fix backward incompatibility for relocated paths in settings * Bugfixes + Item multi inheritance fix + ItemLoader.load_item: iterate over copy of fields @@ -1083,15 +1128,15 @@ Mon Mar 2 14:48:29 UTC 2015 - toddrme2178@gmail.com Thu Sep 4 17:05:20 UTC 2014 - toddrme2178@gmail.com Update to 0.24.4 - * pem file is used by mockserver and required by scrapy bench + * pem file is used by mockserver and required by scrapy bench * scrapy bench needs scrapy.tests* - Update to 0.24.3 - * no need to waste travis-ci time on py3 for 0.24 + * no need to waste travis-ci time on py3 for 0.24 * Update installation docs - * There is a trove classifier for Scrapy framework! - * update other places where w3lib version is mentioned + * There is a trove classifier for Scrapy framework! + * update other places where w3lib version is mentioned * Update w3lib requirement to 1.8.0 - * Use w3lib.html.replace_entities() (remove_entities() is + * Use w3lib.html.replace_entities() (remove_entities() is deprecated) * set zip_safe=False * do not ship tests package @@ -1099,42 +1144,42 @@ Thu Sep 4 17:05:20 UTC 2014 - toddrme2178@gmail.com * Modernize setup.py * headers can not handle non-string values * fix ftp test cases - * The sum up of travis-ci builds are taking like 50min to complete + * The sum up of travis-ci builds are taking like 50min to complete * Update shell.rst typo * removes weird indentation in the shell results - * improved explanations, clarified blog post as source, added link + * improved explanations, clarified blog post as source, added link for XPath string functions in the spec - * renamed UserTimeoutError and ServerTimeouterror #583 + * renamed UserTimeoutError and ServerTimeouterror #583 * adding some xpath tips to selectors docs - * fix tests to account for https://github.com/scrapy/w3lib/pull/23 + * fix tests to account for https://github.com/scrapy/w3lib/pull/23 * get_func_args maximum recursion fix #728 - * Updated input/ouput processor example according to #560. + * Updated input/ouput processor example according to #560. * Fixed Python syntax in tutorial. 
* Add test case for tunneling proxy - * Bugfix for leaking Proxy-Authorization header to remote host when + * Bugfix for leaking Proxy-Authorization header to remote host when using tunneling - * Extract links from XHTML documents with MIME-Type + * Extract links from XHTML documents with MIME-Type "application/xml" * Merge pull request #793 from roysc/patch-1 * Fix typo in commands.rst - * better testcase for settings.overrides.setdefault - * Using CRLF as line marker according to http 1.1 definition + * better testcase for settings.overrides.setdefault + * Using CRLF as line marker according to http 1.1 definition - Update to 0.24.2 - * Use a mutable mapping to proxy deprecated settings.overrides and + * Use a mutable mapping to proxy deprecated settings.overrides and settings.defaults attribute * there is not support for python3 yet - * Update python compatible version set to debian packages + * Update python compatible version set to debian packages * DOC fix formatting in release notes - Update to 0.24.1 - * Fix deprecated CrawlerSettings and increase backwards + * Fix deprecated CrawlerSettings and increase backwards compatibility with .defaults attribute - Update to 0.24.0 * Enhancements + Improve Scrapy top-level namespace + Add selector shortcuts to responses - + Add new lxml based LinkExtractor to replace unmantained + + Add new lxml based LinkExtractor to replace unmantained SgmlLinkExtractor - + Cleanup settings API - part of per-spider settings **GSoC + + Cleanup settings API - part of per-spider settings **GSoC project** + Add UTF8 encoding header to templates + Telnet console now binds to 127.0.0.1 by default @@ -1144,31 +1189,31 @@ Thu Sep 4 17:05:20 UTC 2014 - toddrme2178@gmail.com cache middleware + Expose current crawler in Scrapy shell + Improve testsuite comparing CSV and XML exporters - + New `offsite/filtered` and `offsite/domains` stats + + New `offsite/filtered` and `offsite/domains` stats + Support process_links as generator in CrawlSpider - + Verbose logging and new stats counters for DupeFilter + + Verbose logging and new stats counters for DupeFilter + Add a mimetype parameter to `MailSender.send()` + Generalize file pipeline log messages - + Replace unencodeable codepoints with html entities in + + Replace unencodeable codepoints with html entities in SGMLLinkExtractor + Converted SEP documents to rst format - + Tests and docs for clickdata's nr index in FormRequest - + Allow to disable a downloader handler just like any other + + Tests and docs for clickdata's nr index in FormRequest + + Allow to disable a downloader handler just like any other component - + Log when a request is discarded after too many redirections + + Log when a request is discarded after too many redirections + Log error responses if they are not handled by spider callbacks - + Add content-type check to http compression mw + + Add content-type check to http compression mw + Run pypy tests using latest pypi from ppa + Run test suite using pytest instead of trial - + Build docs and check for dead links in tox environment + + Build docs and check for dead links in tox environment + Make scrapy.version_info a tuple of integers + Infer exporter's output format from filename extensions - + Support case-insensitive domains in `url_is_from_any_domain()` - + Remove pep8 warnings in project and spider templates + + Support case-insensitive domains in `url_is_from_any_domain()` + + Remove pep8 warnings in project and spider templates + Tests and docs for `request_fingerprint` function - + Update 
SEP-19 for GSoC project `per-spider settings` + + Update SEP-19 for GSoC project `per-spider settings` + Set exit code to non-zero when contracts fails - + Add a setting to control what class is instanciated as + + Add a setting to control what class is instanciated as Downloader component + Pass response in `item_dropped` signal + Improve `scrapy check` contracts command @@ -1176,142 +1221,142 @@ Thu Sep 4 17:05:20 UTC 2014 - toddrme2178@gmail.com + Document `request_scheduled` signal + Add a note about reporting security issues + Add LevelDB http cache storage backend - + Sort spider list output of `scrapy list` command + + Sort spider list output of `scrapy list` command + Multiple documentation enhancemens and fixes * Bugfixes - + Encode unicode URL value when creating Links in + + Encode unicode URL value when creating Links in RegexLinkExtractor + Ignore None values in ItemLoader processors - + Fix link text when there is an inner tag in SGMLLinkExtractor + + Fix link text when there is an inner tag in SGMLLinkExtractor and HtmlParserLinkExtractor + Fix wrong checks on subclassing of deprecated classes + Handle errors caused by inspect.stack() failures + Fix a reference to unexistent engine attribute + Fix dynamic itemclass example usage of type() + Use lucasdemarchi/codespell to fix typos - + Fix default value of attrs argument in SgmlLinkExtractor to be + + Fix default value of attrs argument in SgmlLinkExtractor to be tuple + Fix XXE flaw in sitemap reader + Fix engine to support filtered start requests - + Fix offsite middleware case on urls with no hostnames + + Fix offsite middleware case on urls with no hostnames + Testsuite doesn't require PIL anymore - Update to 0.22.2 - * fix a reference to unexistent engine.slots. closes #593 - * downloaderMW doc typo (spiderMW doc copy remnant) + * fix a reference to unexistent engine.slots. closes #593 + * downloaderMW doc typo (spiderMW doc copy remnant) * Correct typos - Update to 0.22.1 - * localhost666 can resolve under certain circumstances + * localhost666 can resolve under certain circumstances * test inspect.stack failure * Handle cases when inspect.stack() fails - * Fix wrong checks on subclassing of deprecated classes. closes #581 + * Fix wrong checks on subclassing of deprecated classes. closes #581 * Docs: 4-space indent for final spider example - * Fix HtmlParserLinkExtractor and tests after #485 merge - * BaseSgmlLinkExtractor: Fixed the missing space when the link has + * Fix HtmlParserLinkExtractor and tests after #485 merge + * BaseSgmlLinkExtractor: Fixed the missing space when the link has an inner tag - * BaseSgmlLinkExtractor: Added unit test of a link with an inner tag - * BaseSgmlLinkExtractor: Fixed unknown_endtag() so that it only set - current_link=None when the end tag match the opening tag + * BaseSgmlLinkExtractor: Added unit test of a link with an inner tag + * BaseSgmlLinkExtractor: Fixed unknown_endtag() so that it only set + current_link=None when the end tag match the opening tag * Fix tests for Travis-CI build * replace unencodeable codepoints with html entities. - * RegexLinkExtractor: encode URL unicode value when creating Links - * Updated the tutorial crawl output with latest output. - * Updated shell docs with the crawler reference and fixed the actual + * RegexLinkExtractor: encode URL unicode value when creating Links + * Updated the tutorial crawl output with latest output. + * Updated shell docs with the crawler reference and fixed the actual shell output. * PEP8 minor edits. 
* Expose current crawler in the scrapy shell. * Unused re import and PEP8 minor edits. - * Ignore None's values when using the ItemLoader. - * DOC Fixed HTTPCACHE_STORAGE typo in the default value which is now + * Ignore None's values when using the ItemLoader. + * DOC Fixed HTTPCACHE_STORAGE typo in the default value which is now Filesystem instead Dbm. * show ubuntu setup instructions as literal code * Update Ubuntu installation instructions - * Merge pull request #550 from stray-leone/patch-1 + * Merge pull request #550 from stray-leone/patch-1 * modify the version of scrapy ubuntu package * fix 0.22.0 release date - * fix typos in news.rst and remove (not released yet) header + * fix typos in news.rst and remove (not released yet) header - Update to 0.22.0 * Enhancements - + [**Backwards incompatible**] Switched HTTPCacheMiddleware + + [**Backwards incompatible**] Switched HTTPCacheMiddleware backend to filesystem - To restore old backend set `HTTPCACHE_STORAGE` to + To restore old backend set `HTTPCACHE_STORAGE` to `scrapy.contrib.httpcache.DbmCacheStorage` + Proxy \https:// urls using CONNECT method - + Add a middleware to crawl ajax crawleable pages as defined by + + Add a middleware to crawl ajax crawleable pages as defined by google - + Rename scrapy.spider.BaseSpider to scrapy.spider.Spider + + Rename scrapy.spider.BaseSpider to scrapy.spider.Spider + Selectors register EXSLT namespaces by default + Unify item loaders similar to selectors renaming + Make `RFPDupeFilter` class easily subclassable - + Improve test coverage and forthcoming Python 3 support - + Promote startup info on settings and middleware to INFO level + + Improve test coverage and forthcoming Python 3 support + + Promote startup info on settings and middleware to INFO level + Support partials in `get_func_args` util + Allow running indiviual tests via tox + Update extensions ignored by link extractors - + Add middleware methods to get files/images/thumbs paths + + Add middleware methods to get files/images/thumbs paths + Improve offsite middleware tests - + Add a way to skip default Referer header set by + + Add a way to skip default Referer header set by RefererMiddleware - + Do not send `x-gzip` in default `Accept-Encoding` header - + Support defining http error handling using settings - + Use modern python idioms wherever you find legacies + + Do not send `x-gzip` in default `Accept-Encoding` header + + Support defining http error handling using settings + + Use modern python idioms wherever you find legacies + Improve and correct documentation * Fixes - + Update Selector class imports in CrawlSpider template + + Update Selector class imports in CrawlSpider template + Fix unexistent reference to `engine.slots` - + Do not try to call `body_as_unicode()` on a non-TextResponse + + Do not try to call `body_as_unicode()` on a non-TextResponse instance - + Warn when subclassing XPathItemLoader, previously it only warned + + Warn when subclassing XPathItemLoader, previously it only warned on instantiation. - + Warn when subclassing XPathSelector, previously it only warned + + Warn when subclassing XPathSelector, previously it only warned on instantiation. 
+ Multiple fixes to memory stats - + Fix overriding url in `FormRequest.from_response()` + + Fix overriding url in `FormRequest.from_response()` + Fix tests runner under pip 1.5 + Fix logging error when spider name is unicode - Update to 0.20.2 - * Update CrawlSpider Template with Selector changes + * Update CrawlSpider Template with Selector changes * fix method name in tutorial. closes GH-480 - Update to 0.20.1 - * include_package_data is required to build wheels from published + * include_package_data is required to build wheels from published sources - * process_parallel was leaking the failures on its internal + * process_parallel was leaking the failures on its internal deferreds. - Update to 0.20.0 * Enhancements + New Selector's API including CSS selectors + Request/Response url/body attributes are now immutable (modifying them had been deprecated for a long time) - + :setting:`ITEM_PIPELINES` is now defined as a dict (instead of a + + :setting:`ITEM_PIPELINES` is now defined as a dict (instead of a list) + Sitemap spider can fetch alternate URLs - + `Selector.remove_namespaces()` now remove namespaces from + + `Selector.remove_namespaces()` now remove namespaces from element's attributes. + Paved the road for Python 3.3+ - + New item exporter using native python types with nesting support - + Tune HTTP1.1 pool size so it matches concurrency defined by + + New item exporter using native python types with nesting support + + Tune HTTP1.1 pool size so it matches concurrency defined by settings - + scrapy.mail.MailSender now can connect over TLS or upgrade using + + scrapy.mail.MailSender now can connect over TLS or upgrade using STARTTLS - + New FilesPipeline with functionality factored out from + + New FilesPipeline with functionality factored out from ImagesPipeline - + Recommend Pillow instead of PIL for image handling - + Added debian packages for Ubuntu quantal and raring - + Mock server (used for tests) can listen for HTTPS requests + + Recommend Pillow instead of PIL for image handling + + Added debian packages for Ubuntu quantal and raring + + Mock server (used for tests) can listen for HTTPS requests + Remove multi spider support from multiple core components - + Travis-CI now tests Scrapy changes against development versions + + Travis-CI now tests Scrapy changes against development versions of `w3lib` and `queuelib` python packages. 
- + Add pypy 2.1 to continuous integration tests - + Pylinted, pep8 and removed old-style exceptions from source + + Add pypy 2.1 to continuous integration tests + + Pylinted, pep8 and removed old-style exceptions from source + Use importlib for parametric imports - + Handle a regression introduced in Python 2.7.5 that affects + + Handle a regression introduced in Python 2.7.5 that affects XmlItemExporter + Bugfix crawling shutdown on SIGINT - + Do not submit `reset` type inputs in FormRequest.from_response - + Do not silence download errors when request errback raises an + + Do not submit `reset` type inputs in FormRequest.from_response + + Do not silence download errors when request errback raises an exception * Bugfixes + Fix tests under Django 1.6 - + Lot of bugfixes to retry middleware under disconnections using + + Lot of bugfixes to retry middleware under disconnections using HTTP 1.1 download handler + Fix inconsistencies among Twisted releases + Fix scrapy shell bugs @@ -1325,116 +1370,116 @@ Thu Sep 4 17:05:20 UTC 2014 - toddrme2178@gmail.com * Other + Dropped Python 2.6 support + Add `cssselect`_ python package as install dependency - + Drop libxml2 and multi selector's backend support, `lxml`_ is + + Drop libxml2 and multi selector's backend support, `lxml`_ is required from now on. - + Minimum Twisted version increased to 10.0.0, dropped Twisted 8.0 + + Minimum Twisted version increased to 10.0.0, dropped Twisted 8.0 support. - + Running test suite now requires `mock` python library + + Running test suite now requires `mock` python library - Update to 0.18.4 - * IPython refuses to update the namespace. fix #396 - * Fix AlreadyCalledError replacing a request in shell command. + * IPython refuses to update the namespace. fix #396 + * Fix AlreadyCalledError replacing a request in shell command. * Fix start_requests laziness and early hangs - Update to 0.18.3 - * fix regression on lazy evaluation of start requests + * fix regression on lazy evaluation of start requests * forms: do not submit reset inputs - * increase unittest timeouts to decrease travis false positive + * increase unittest timeouts to decrease travis false positive failures * backport master fixes to json exporter - * Fix permission and set umask before generating sdist tarball + * Fix permission and set umask before generating sdist tarball - Update to 0.18.2 - * Backport `scrapy check` command fixes and backward compatible + * Backport `scrapy check` command fixes and backward compatible multi crawler process - Update to 0.18.1 - * remove extra import added by cherry picked changes + * remove extra import added by cherry picked changes * fix crawling tests under twisted pre 11.0.0 * py26 can not format zero length fields {} - * test PotentiaDataLoss errors on unbound responses - * Treat responses without content-length or Transfer-Encoding as + * test PotentiaDataLoss errors on unbound responses + * Treat responses without content-length or Transfer-Encoding as good responses - * do no include ResponseFailed if http11 handler is not enabled - * New HTTP client wraps connection losts in ResponseFailed + * do no include ResponseFailed if http11 handler is not enabled + * New HTTP client wraps connection losts in ResponseFailed exception. 
* limit travis-ci build matrix - * Merge pull request #375 from peterarenot/patch-1 + * Merge pull request #375 from peterarenot/patch-1 * Fixed so it refers to the correct folder - * added quantal & raring to support ubuntu releases - * fix retry middleware which didn't retry certain connection errors + * added quantal & raring to support ubuntu releases + * fix retry middleware which didn't retry certain connection errors after the upgrade to http1 client, closes GH-373 * fix XmlItemExporter in Python 2.7.4 and 2.7.5 * minor updates to 0.18 release notes * fix contributters list format - Update to 0.18.0 - * Lot of improvements to testsuite run using Tox, including a way to + * Lot of improvements to testsuite run using Tox, including a way to test on pypi * Handle GET parameters for AJAX crawleable urls * Use lxml recover option to parse sitemaps * Bugfix cookie merging by hostname and not by netloc - * Support disabling `HttpCompressionMiddleware` using a flag setting - * Support xml namespaces using `iternodes` parser in `XMLFeedSpider` + * Support disabling `HttpCompressionMiddleware` using a flag setting + * Support xml namespaces using `iternodes` parser in `XMLFeedSpider` * Support `dont_cache` request meta flag - * Bugfix `scrapy.utils.gz.gunzip` broken by changes in python 2.7.4 + * Bugfix `scrapy.utils.gz.gunzip` broken by changes in python 2.7.4 * Bugfix url encoding on `SgmlLinkExtractor` - * Bugfix `TakeFirst` processor shouldn't discard zero (0) value + * Bugfix `TakeFirst` processor shouldn't discard zero (0) value * Support nested items in xml exporter * Improve cookies handling performance * Log dupe filtered requests once - * Split redirection middleware into status and meta based + * Split redirection middleware into status and meta based middlewares * Use HTTP1.1 as default downloader handler - * Support xpath form selection on `FormRequest.from_response` - * Bugfix unicode decoding error on `SgmlLinkExtractor` + * Support xpath form selection on `FormRequest.from_response` + * Bugfix unicode decoding error on `SgmlLinkExtractor` * Bugfix signal dispatching on pypi interpreter * Improve request delay and concurrency handling * Add RFC2616 cache policy to `HttpCacheMiddleware` * Allow customization of messages logged by engine * Multiples improvements to `DjangoItem` - * Extend Scrapy commands using setuptools entry points - * Allow spider `allowed_domains` value to be set/tuple + * Extend Scrapy commands using setuptools entry points + * Allow spider `allowed_domains` value to be set/tuple * Support `settings.getdict` - * Simplify internal `scrapy.core.scraper` slot handling + * Simplify internal `scrapy.core.scraper` slot handling * Added `Item.copy` * Collect idle downloader slots * Add `ftp://` scheme downloader handler - * Added downloader benchmark webserver and spider tools + * Added downloader benchmark webserver and spider tools :ref:`benchmarking` - * Moved persistent (on disk) queues to a separate project + * Moved persistent (on disk) queues to a separate project (queuelib_) which scrapy now depends on * Add scrapy commands using external libraries * Added ``--pdb`` option to ``scrapy`` command line tool - * Added :meth:`XPathSelector.remove_namespaces` which allows to - remove all namespaces from XML documents for convenience (to work + * Added :meth:`XPathSelector.remove_namespaces` which allows to + remove all namespaces from XML documents for convenience (to work with namespace-less XPaths). 
Documented in :ref:`topics-selectors` * Several improvements to spider contracts - * New default middleware named MetaRefreshMiddldeware that handles + * New default middleware named MetaRefreshMiddldeware that handles meta-refresh html tag redirections, - * MetaRefreshMiddldeware and RedirectMiddleware have different + * MetaRefreshMiddldeware and RedirectMiddleware have different priorities to address #62 * added from_crawler method to spiders * added system tests with mock server * more improvements to Mac OS compatibility (thanks Alex Cepoi) - * several more cleanups to singletons and multi-spider support + * several more cleanups to singletons and multi-spider support (thanks Nicolas Ramirez) * support custom download slots * added --spider option to "shell" command. * log overridden settings when scrapy starts - Update to 0.16.5 - * obey request method when scrapy deploy is redirected to a new + * obey request method when scrapy deploy is redirected to a new endpoint - * fix inaccurate downloader middleware documentation. refs #280 - * doc: remove links to diveintopython.org, which is no longer + * fix inaccurate downloader middleware documentation. refs #280 + * doc: remove links to diveintopython.org, which is no longer available. * Find form nodes in invalid html5 documents - * Fix typo labeling attrs type bool instead of list + * Fix typo labeling attrs type bool instead of list - Update to 0.16.4 * fixes spelling errors in documentation - * add doc about disabling an extension. refs #132 - * Fixed error message formatting. log.err() doesn't support cool - formatting and when error occurred, the message was: "ERROR: Error + * add doc about disabling an extension. refs #132 + * Fixed error message formatting. log.err() doesn't support cool + formatting and when error occurred, the message was: "ERROR: Error processing %(item)s" * lint and improve images pipeline error logging * fixed doc typos - * add documentation topics: Broad Crawls & Common Practies - * fix bug in scrapy parse command when spider is not specified + * add documentation topics: Broad Crawls & Common Practies + * fix bug in scrapy parse command when spider is not specified explicitly. 
* Update docs/topics/commands.rst - Update dependencies @@ -1445,7 +1490,7 @@ Mon Nov 4 16:27:37 UTC 2013 - castedo@castedo.com - Upgrade .spec dependencies to work with SLE 11 SP3 * python-twisted 8.0 from standard SLE11 repository not working, force >= 9.0 - * use new "python-pyOpenSSL" name rather than old "python-openssl" + * use new "python-pyOpenSSL" name rather than old "python-openssl" ------------------------------------------------------------------- Mon Jan 21 16:27:40 UTC 2013 - p.drouand@gmail.com diff --git a/python-Scrapy.spec b/python-Scrapy.spec index 64d6a0c..218615d 100644 --- a/python-Scrapy.spec +++ b/python-Scrapy.spec @@ -16,21 +16,21 @@ # +%{?sle15_python_module_pythons} Name: python-Scrapy -Version: 2.11.0 +Version: 2.11.1 Release: 0 Summary: A high-level Python Screen Scraping framework License: BSD-3-Clause Group: Development/Languages/Python URL: https://scrapy.org Source: https://files.pythonhosted.org/packages/source/S/Scrapy/Scrapy-%{version}.tar.gz -# PATCH-FIX-UPSTREAM twisted-23.8.0-compat.patch gh#scrapy/scrapy#6064 -Patch1: twisted-23.8.0-compat.patch BuildRequires: %{python_module Pillow} BuildRequires: %{python_module Protego >= 0.1.15} BuildRequires: %{python_module PyDispatcher >= 2.0.5} BuildRequires: %{python_module Twisted >= 18.9.0} BuildRequires: %{python_module attrs} +BuildRequires: %{python_module base >= 3.8} BuildRequires: %{python_module botocore >= 1.4.87} BuildRequires: %{python_module cryptography >= 36.0.0} BuildRequires: %{python_module cssselect >= 0.9.1} @@ -40,8 +40,9 @@ BuildRequires: %{python_module itemloaders >= 1.0.1} BuildRequires: %{python_module lxml >= 4.4.1} BuildRequires: %{python_module parsel >= 1.5.0} BuildRequires: %{python_module pexpect >= 4.8.1} +BuildRequires: %{python_module pip} BuildRequires: %{python_module pyOpenSSL >= 21.0.0} -BuildRequires: %{python_module pyftpdlib} +BuildRequires: %{python_module pyftpdlib >= 1.5.8} BuildRequires: %{python_module pytest-xdist} BuildRequires: %{python_module pytest} BuildRequires: %{python_module queuelib >= 1.4.2} @@ -52,11 +53,11 @@ BuildRequires: %{python_module testfixtures} BuildRequires: %{python_module tldextract} BuildRequires: %{python_module uvloop} BuildRequires: %{python_module w3lib >= 1.17.0} +BuildRequires: %{python_module wheel} BuildRequires: %{python_module zope.interface >= 5.1.0} BuildRequires: fdupes BuildRequires: python-rpm-macros BuildRequires: python3-Sphinx -BuildRequires: (python3-dataclasses if python3-base < 3.7) Requires: python-Protego >= 0.1.15 Requires: python-PyDispatcher >= 2.0.5 Requires: python-Twisted >= 18.9.0 @@ -65,6 +66,7 @@ Requires: python-cssselect >= 0.9.1 Requires: python-itemadapter >= 0.1.0 Requires: python-itemloaders >= 1.0.1 Requires: python-lxml >= 4.4.1 +Requires: python-packaging Requires: python-parsel >= 1.5.0 Requires: python-pyOpenSSL >= 21.0.0 Requires: python-queuelib >= 1.4.2 @@ -74,7 +76,7 @@ Requires: python-tldextract Requires: python-w3lib >= 1.17.2 Requires: python-zope.interface >= 5.1.0 Requires(post): update-alternatives -Requires(postun):update-alternatives +Requires(postun): update-alternatives BuildArch: noarch %python_subpackages @@ -96,13 +98,13 @@ Provides documentation for %{name}. 
sed -i -e 's:= python:= python3:g' docs/Makefile %build -%python_build +%pyproject_wheel pushd docs %make_build html && rm -r build/html/.buildinfo popd %install -%python_install +%pyproject_install %python_clone -a %{buildroot}%{_bindir}/scrapy %python_expand %fdupes %{buildroot}%{$python_sitelib} @@ -128,7 +130,7 @@ skiplist="$skiplist or test_start_requests_laziness" %license LICENSE %doc AUTHORS README.rst %{python_sitelib}/scrapy -%{python_sitelib}/Scrapy-%{version}*-info +%{python_sitelib}/Scrapy-%{version}.dist-info %python_alternative %{_bindir}/scrapy %files -n %{name}-doc diff --git a/twisted-23.8.0-compat.patch b/twisted-23.8.0-compat.patch deleted file mode 100644 index 0e36f1f..0000000 --- a/twisted-23.8.0-compat.patch +++ /dev/null @@ -1,254 +0,0 @@ -Index: Scrapy-2.11.0/scrapy/crawler.py -=================================================================== ---- Scrapy-2.11.0.orig/scrapy/crawler.py -+++ Scrapy-2.11.0/scrapy/crawler.py -@@ -404,8 +404,8 @@ class CrawlerProcess(CrawlerRunner): - :param bool stop_after_crawl: stop or not the reactor when all - crawlers have finished - -- :param bool install_signal_handlers: whether to install the shutdown -- handlers (default: True) -+ :param bool install_signal_handlers: whether to install the OS signal -+ handlers from Twisted and Scrapy (default: True) - """ - from twisted.internet import reactor - -@@ -416,15 +416,17 @@ class CrawlerProcess(CrawlerRunner): - return - d.addBoth(self._stop_reactor) - -- if install_signal_handlers: -- install_shutdown_handlers(self._signal_shutdown) - resolver_class = load_object(self.settings["DNS_RESOLVER"]) - resolver = create_instance(resolver_class, self.settings, self, reactor=reactor) - resolver.install_on_reactor() - tp = reactor.getThreadPool() - tp.adjustPoolsize(maxthreads=self.settings.getint("REACTOR_THREADPOOL_MAXSIZE")) - reactor.addSystemEventTrigger("before", "shutdown", self.stop) -- reactor.run(installSignalHandlers=False) # blocking call -+ if install_signal_handlers: -+ reactor.addSystemEventTrigger( -+ "after", "startup", install_shutdown_handlers, self._signal_shutdown -+ ) -+ reactor.run(installSignalHandlers=install_signal_handlers) # blocking call - - def _graceful_stop_reactor(self) -> Deferred: - d = self.stop() -Index: Scrapy-2.11.0/scrapy/utils/ossignal.py -=================================================================== ---- Scrapy-2.11.0.orig/scrapy/utils/ossignal.py -+++ Scrapy-2.11.0/scrapy/utils/ossignal.py -@@ -19,13 +19,10 @@ def install_shutdown_handlers( - function: SignalHandlerT, override_sigint: bool = True - ) -> None: - """Install the given function as a signal handler for all common shutdown -- signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the -- SIGINT handler won't be install if there is already a handler in place -- (e.g. Pdb) -+ signals (such as SIGINT, SIGTERM, etc). If ``override_sigint`` is ``False`` the -+ SIGINT handler won't be installed if there is already a handler in place -+ (e.g. 
Pdb) - """ -- from twisted.internet import reactor -- -- reactor._handleSignals() - signal.signal(signal.SIGTERM, function) - if signal.getsignal(signal.SIGINT) == signal.default_int_handler or override_sigint: - signal.signal(signal.SIGINT, function) -Index: Scrapy-2.11.0/scrapy/utils/testproc.py -=================================================================== ---- Scrapy-2.11.0.orig/scrapy/utils/testproc.py -+++ Scrapy-2.11.0/scrapy/utils/testproc.py -@@ -2,7 +2,7 @@ from __future__ import annotations - - import os - import sys --from typing import Iterable, Optional, Tuple, cast -+from typing import Iterable, List, Optional, Tuple, cast - - from twisted.internet.defer import Deferred - from twisted.internet.error import ProcessTerminated -@@ -26,14 +26,15 @@ class ProcessTest: - env = os.environ.copy() - if settings is not None: - env["SCRAPY_SETTINGS_MODULE"] = settings -+ assert self.command - cmd = self.prefix + [self.command] + list(args) - pp = TestProcessProtocol() -- pp.deferred.addBoth(self._process_finished, cmd, check_code) -+ pp.deferred.addCallback(self._process_finished, cmd, check_code) - reactor.spawnProcess(pp, cmd[0], cmd, env=env, path=self.cwd) - return pp.deferred - - def _process_finished( -- self, pp: TestProcessProtocol, cmd: str, check_code: bool -+ self, pp: TestProcessProtocol, cmd: List[str], check_code: bool - ) -> Tuple[int, bytes, bytes]: - if pp.exitcode and check_code: - msg = f"process {cmd} exit with code {pp.exitcode}" -Index: Scrapy-2.11.0/setup.py -=================================================================== ---- Scrapy-2.11.0.orig/setup.py -+++ Scrapy-2.11.0/setup.py -@@ -6,8 +6,7 @@ version = (Path(__file__).parent / "scra - - - install_requires = [ -- # 23.8.0 incompatibility: https://github.com/scrapy/scrapy/issues/6024 -- "Twisted>=18.9.0,<23.8.0", -+ "Twisted>=18.9.0", - "cryptography>=36.0.0", - "cssselect>=0.9.1", - "itemloaders>=1.0.1", -Index: Scrapy-2.11.0/tests/CrawlerProcess/sleeping.py -=================================================================== ---- /dev/null -+++ Scrapy-2.11.0/tests/CrawlerProcess/sleeping.py -@@ -0,0 +1,24 @@ -+from twisted.internet.defer import Deferred -+ -+import scrapy -+from scrapy.crawler import CrawlerProcess -+from scrapy.utils.defer import maybe_deferred_to_future -+ -+ -+class SleepingSpider(scrapy.Spider): -+ name = "sleeping" -+ -+ start_urls = ["data:,;"] -+ -+ async def parse(self, response): -+ from twisted.internet import reactor -+ -+ d = Deferred() -+ reactor.callLater(3, d.callback, None) -+ await maybe_deferred_to_future(d) -+ -+ -+process = CrawlerProcess(settings={}) -+ -+process.crawl(SleepingSpider) -+process.start() -Index: Scrapy-2.11.0/tests/requirements.txt -=================================================================== ---- Scrapy-2.11.0.orig/tests/requirements.txt -+++ Scrapy-2.11.0/tests/requirements.txt -@@ -1,5 +1,6 @@ - # Tests requirements - attrs -+pexpect >= 4.8.0 - # https://github.com/giampaolo/pyftpdlib/issues/560 - pyftpdlib; python_version < "3.12" - pytest -Index: Scrapy-2.11.0/tests/test_command_shell.py -=================================================================== ---- Scrapy-2.11.0.orig/tests/test_command_shell.py -+++ Scrapy-2.11.0/tests/test_command_shell.py -@@ -1,11 +1,15 @@ -+import sys -+from io import BytesIO - from pathlib import Path - -+from pexpect.popen_spawn import PopenSpawn - from twisted.internet import defer - from twisted.trial import unittest - - from scrapy.utils.testproc import ProcessTest - from scrapy.utils.testsite 
import SiteTest - from tests import NON_EXISTING_RESOLVABLE, tests_datadir -+from tests.mockserver import MockServer - - - class ShellTest(ProcessTest, SiteTest, unittest.TestCase): -@@ -133,3 +137,25 @@ class ShellTest(ProcessTest, SiteTest, u - args = ["-c", code, "--set", f"TWISTED_REACTOR={reactor_path}"] - _, _, err = yield self.execute(args, check_code=True) - self.assertNotIn(b"RuntimeError: There is no current event loop in thread", err) -+ -+ -+class InteractiveShellTest(unittest.TestCase): -+ def test_fetch(self): -+ args = ( -+ sys.executable, -+ "-m", -+ "scrapy.cmdline", -+ "shell", -+ ) -+ logfile = BytesIO() -+ p = PopenSpawn(args, timeout=5) -+ p.logfile_read = logfile -+ p.expect_exact("Available Scrapy objects") -+ with MockServer() as mockserver: -+ p.sendline(f"fetch('{mockserver.url('/')}')") -+ p.sendline("type(response)") -+ p.expect_exact("HtmlResponse") -+ p.sendeof() -+ p.wait() -+ logfile.seek(0) -+ self.assertNotIn("Traceback", logfile.read().decode()) -Index: Scrapy-2.11.0/tests/test_crawler.py -=================================================================== ---- Scrapy-2.11.0.orig/tests/test_crawler.py -+++ Scrapy-2.11.0/tests/test_crawler.py -@@ -1,13 +1,16 @@ - import logging - import os - import platform -+import signal - import subprocess - import sys - import warnings - from pathlib import Path -+from typing import List - - import pytest - from packaging.version import parse as parse_version -+from pexpect.popen_spawn import PopenSpawn - from pytest import mark, raises - from twisted.internet import defer - from twisted.trial import unittest -@@ -289,9 +292,12 @@ class ScriptRunnerMixin: - script_dir: Path - cwd = os.getcwd() - -- def run_script(self, script_name: str, *script_args): -+ def get_script_args(self, script_name: str, *script_args: str) -> List[str]: - script_path = self.script_dir / script_name -- args = [sys.executable, str(script_path)] + list(script_args) -+ return [sys.executable, str(script_path)] + list(script_args) -+ -+ def run_script(self, script_name: str, *script_args: str) -> str: -+ args = self.get_script_args(script_name, *script_args) - p = subprocess.Popen( - args, - env=get_mockserver_env(), -@@ -517,6 +523,29 @@ class CrawlerProcessSubprocess(ScriptRun - self.assertIn("Spider closed (finished)", log) - self.assertIn("The value of FOO is 42", log) - -+ def test_shutdown_graceful(self): -+ sig = signal.SIGINT if sys.platform != "win32" else signal.SIGBREAK -+ args = self.get_script_args("sleeping.py") -+ p = PopenSpawn(args, timeout=5) -+ p.expect_exact("Spider opened") -+ p.expect_exact("Crawled (200)") -+ p.kill(sig) -+ p.expect_exact("shutting down gracefully") -+ p.expect_exact("Spider closed (shutdown)") -+ p.wait() -+ -+ def test_shutdown_forced(self): -+ sig = signal.SIGINT if sys.platform != "win32" else signal.SIGBREAK -+ args = self.get_script_args("sleeping.py") -+ p = PopenSpawn(args, timeout=5) -+ p.expect_exact("Spider opened") -+ p.expect_exact("Crawled (200)") -+ p.kill(sig) -+ p.expect_exact("shutting down gracefully") -+ p.kill(sig) -+ p.expect_exact("forcing unclean shutdown") -+ p.wait() -+ - - class CrawlerRunnerSubprocess(ScriptRunnerMixin, unittest.TestCase): - script_dir = Path(__file__).parent.resolve() / "CrawlerRunner"
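Postscript for reviewers: the dropped twisted-23.8.0-compat.patch above is the
upstream fix that stopped calling the private twisted.internet
reactor._handleSignals() and instead installs Scrapy's shutdown handlers after
reactor startup. A minimal sketch of the public behavior it covers; the spider
is hypothetical, while ``install_signal_handlers`` and its default come from
the ``CrawlerProcess.start()`` docstring quoted in the patch::

    import scrapy
    from scrapy.crawler import CrawlerProcess


    class ExampleSpider(scrapy.Spider):
        name = "example"  # hypothetical spider
        start_urls = ["data:,"]

        def parse(self, response):
            return []


    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(ExampleSpider)
    # With install_signal_handlers=True (the default), a first SIGINT asks
    # for a graceful stop ("Spider closed (shutdown)"); a second SIGINT
    # forces an unclean shutdown, as the test_shutdown_graceful and
    # test_shutdown_forced tests above exercise.
    process.start(install_signal_handlers=True)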