Index: scrapy-2.13.3/conftest.py
===================================================================
--- scrapy-2.13.3.orig/conftest.py
+++ scrapy-2.13.3/conftest.py
@@ -116,6 +116,16 @@ def requires_boto3(request):
         pytest.skip("boto3 is not installed")
 
 
+@pytest.fixture(autouse=True)
+def requires_mitmproxy(request):
+    if not request.node.get_closest_marker("requires_mitmproxy"):
+        return
+    try:
+        import mitmproxy  # noqa: F401, PLC0415
+    except ImportError:
+        pytest.skip("mitmproxy is not installed")
+
+
 def pytest_configure(config):
     if config.getoption("--reactor") != "default":
         install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
Index: scrapy-2.13.3/pyproject.toml
===================================================================
--- scrapy-2.13.3.orig/pyproject.toml
+++ scrapy-2.13.3/pyproject.toml
@@ -242,6 +242,7 @@ markers = [
     "requires_uvloop: marks tests as only enabled when uvloop is known to be working",
     "requires_botocore: marks tests that need botocore (but not boto3)",
     "requires_boto3: marks tests that need botocore and boto3",
+    "requires_mitmproxy: marks tests that need mitmproxy",
 ]
 filterwarnings = [
     "ignore::DeprecationWarning:twisted.web.static"
Index: scrapy-2.13.3/scrapy/downloadermiddlewares/httpcompression.py
===================================================================
--- scrapy-2.13.3.orig/scrapy/downloadermiddlewares/httpcompression.py
+++ scrapy-2.13.3/scrapy/downloadermiddlewares/httpcompression.py
@@ -29,14 +29,20 @@ logger = getLogger(__name__)
 ACCEPTED_ENCODINGS: list[bytes] = [b"gzip", b"deflate"]
 
 try:
-    try:
-        import brotli  # noqa: F401
-    except ImportError:
-        import brotlicffi  # noqa: F401
+    import brotli
 except ImportError:
     pass
 else:
-    ACCEPTED_ENCODINGS.append(b"br")
+    try:
+        brotli.Decompressor.can_accept_more_data
+    except AttributeError:  # pragma: no cover
+        warnings.warn(
+            "You have brotli installed, but 'br' encoding support now "
+            "requires brotli >= 1.2.0. Please upgrade brotli so that "
+            "Scrapy can decode 'br'-encoded responses.",
+        )
+    else:
+        ACCEPTED_ENCODINGS.append(b"br")
 
 try:
     import zstandard  # noqa: F401
@@ -98,13 +104,13 @@ class HttpCompressionMiddleware:
                     decoded_body, content_encoding = self._handle_encoding(
                         response.body, content_encoding, max_size
                     )
-                except _DecompressionMaxSizeExceeded:
+                except _DecompressionMaxSizeExceeded as e:
                     raise IgnoreRequest(
                         f"Ignored response {response} because its body "
-                        f"({len(response.body)} B compressed) exceeded "
-                        f"DOWNLOAD_MAXSIZE ({max_size} B) during "
-                        f"decompression."
-                    )
+                        f"({len(response.body)} B compressed, "
+                        f"{e.decompressed_size} B decompressed so far) exceeded "
+                        f"DOWNLOAD_MAXSIZE ({max_size} B) during decompression."
+                    ) from e
                 if len(response.body) < warn_size <= len(decoded_body):
                     logger.warning(
                         f"{response} body size after decompression "
@@ -187,7 +193,7 @@ class HttpCompressionMiddleware:
                 f"from unsupported encoding(s) '{encodings_str}'."
             )
            if b"br" in encodings:
-                msg += " You need to install brotli or brotlicffi to decode 'br'."
+                msg += " You need to install brotli >= 1.2.0 to decode 'br'."
            if b"zstd" in encodings:
                msg += " You need to install zstandard to decode 'zstd'."
            logger.warning(msg)
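An illustrative aside, not part of the patch: the else: branch above feature-detects
the new brotli API by touching Decompressor.can_accept_more_data, which per the
warning text above only ships with brotli >= 1.2.0. A standalone sketch of the same
check (the function name br_supported is ours):

    import brotli

    def br_supported() -> bool:
        # brotli >= 1.2.0 exposes Decompressor.can_accept_more_data;
        # on older versions the attribute lookup fails and Scrapy leaves
        # b"br" out of ACCEPTED_ENCODINGS.
        return hasattr(brotli.Decompressor, "can_accept_more_data")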
Index: scrapy-2.13.3/scrapy/utils/_compression.py
===================================================================
--- scrapy-2.13.3.orig/scrapy/utils/_compression.py
+++ scrapy-2.13.3/scrapy/utils/_compression.py
@@ -1,42 +1,9 @@
 import contextlib
 import zlib
 from io import BytesIO
-from warnings import warn
-
-from scrapy.exceptions import ScrapyDeprecationWarning
-
-try:
-    try:
-        import brotli
-    except ImportError:
-        import brotlicffi as brotli
-except ImportError:
-    pass
-else:
-    try:
-        brotli.Decompressor.process
-    except AttributeError:
-        warn(
-            (
-                "You have brotlipy installed, and Scrapy will use it, but "
-                "Scrapy support for brotlipy is deprecated and will stop "
-                "working in a future version of Scrapy. brotlipy itself is "
-                "deprecated, it has been superseded by brotlicffi. Please, "
-                "uninstall brotlipy and install brotli or brotlicffi instead. "
-                "brotlipy has the same import name as brotli, so keeping both "
-                "installed is strongly discouraged."
-            ),
-            ScrapyDeprecationWarning,
-        )
-
-        def _brotli_decompress(decompressor, data):
-            return decompressor.decompress(data)
-
-    else:
-
-        def _brotli_decompress(decompressor, data):
-            return decompressor.process(data)
-
+
+with contextlib.suppress(ImportError):
+    import brotli
 
 with contextlib.suppress(ImportError):
     import zstandard
@@ -46,62 +13,64 @@ _CHUNK_SIZE = 65536  # 64 KiB
 
 
 class _DecompressionMaxSizeExceeded(ValueError):
-    pass
+    def __init__(self, decompressed_size: int, max_size: int) -> None:
+        self.decompressed_size = decompressed_size
+        self.max_size = max_size
+
+    def __str__(self) -> str:
+        return (
+            f"The number of bytes decompressed so far "
+            f"({self.decompressed_size} B) exceeded the specified maximum "
+            f"({self.max_size} B)."
+        )
+
+
+def _check_max_size(decompressed_size: int, max_size: int) -> None:
+    if max_size and decompressed_size > max_size:
+        raise _DecompressionMaxSizeExceeded(decompressed_size, max_size)
 
 
 def _inflate(data: bytes, *, max_size: int = 0) -> bytes:
     decompressor = zlib.decompressobj()
-    raw_decompressor = zlib.decompressobj(wbits=-15)
-    input_stream = BytesIO(data)
+    try:
+        first_chunk = decompressor.decompress(data, max_length=_CHUNK_SIZE)
+    except zlib.error:
+        # To handle raw deflate content that may be sent by Microsoft servers.
+        decompressor = zlib.decompressobj(wbits=-15)
+        first_chunk = decompressor.decompress(data, max_length=_CHUNK_SIZE)
+    decompressed_size = len(first_chunk)
+    _check_max_size(decompressed_size, max_size)
     output_stream = BytesIO()
-    output_chunk = b"."
-    decompressed_size = 0
-    while output_chunk:
-        input_chunk = input_stream.read(_CHUNK_SIZE)
-        try:
-            output_chunk = decompressor.decompress(input_chunk)
-        except zlib.error:
-            if decompressor != raw_decompressor:
-                # ugly hack to work with raw deflate content that may
-                # be sent by microsoft servers. For more information, see:
-                # http://carsten.codimi.de/gzip.yaws/
-                # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
-                # http://www.gzip.org/zlib/zlib_faq.html#faq38
-                decompressor = raw_decompressor
-                output_chunk = decompressor.decompress(input_chunk)
-            else:
-                raise
+    output_stream.write(first_chunk)
+    while decompressor.unconsumed_tail:
+        output_chunk = decompressor.decompress(
+            decompressor.unconsumed_tail, max_length=_CHUNK_SIZE
+        )
         decompressed_size += len(output_chunk)
-        if max_size and decompressed_size > max_size:
-            raise _DecompressionMaxSizeExceeded(
-                f"The number of bytes decompressed so far "
-                f"({decompressed_size} B) exceed the specified maximum "
-                f"({max_size} B)."
-            )
+        _check_max_size(decompressed_size, max_size)
         output_stream.write(output_chunk)
-    output_stream.seek(0)
-    return output_stream.read()
+    if tail := decompressor.flush():
+        decompressed_size += len(tail)
+        _check_max_size(decompressed_size, max_size)
+        output_stream.write(tail)
+    return output_stream.getvalue()
 
 
 def _unbrotli(data: bytes, *, max_size: int = 0) -> bytes:
     decompressor = brotli.Decompressor()
-    input_stream = BytesIO(data)
+    first_chunk = decompressor.process(data, output_buffer_limit=_CHUNK_SIZE)
+    decompressed_size = len(first_chunk)
+    _check_max_size(decompressed_size, max_size)
     output_stream = BytesIO()
-    output_chunk = b"."
-    decompressed_size = 0
-    while output_chunk:
-        input_chunk = input_stream.read(_CHUNK_SIZE)
-        output_chunk = _brotli_decompress(decompressor, input_chunk)
+    output_stream.write(first_chunk)
+    while not decompressor.is_finished():
+        output_chunk = decompressor.process(b"", output_buffer_limit=_CHUNK_SIZE)
+        if not output_chunk:
+            break
         decompressed_size += len(output_chunk)
-        if max_size and decompressed_size > max_size:
-            raise _DecompressionMaxSizeExceeded(
-                f"The number of bytes decompressed so far "
-                f"({decompressed_size} B) exceed the specified maximum "
-                f"({max_size} B)."
-            )
+        _check_max_size(decompressed_size, max_size)
        output_stream.write(output_chunk)
-    output_stream.seek(0)
-    return output_stream.read()
+    return output_stream.getvalue()
 
 
 def _unzstd(data: bytes, *, max_size: int = 0) -> bytes:
@@ -113,12 +82,6 @@ def _unzstd(data: bytes, *, max_size: in
     while output_chunk:
         output_chunk = stream_reader.read(_CHUNK_SIZE)
         decompressed_size += len(output_chunk)
-        if max_size and decompressed_size > max_size:
-            raise _DecompressionMaxSizeExceeded(
-                f"The number of bytes decompressed so far "
-                f"({decompressed_size} B) exceed the specified maximum "
-                f"({max_size} B)."
-            )
+        _check_max_size(decompressed_size, max_size)
         output_stream.write(output_chunk)
-    output_stream.seek(0)
-    return output_stream.read()
+    return output_stream.getvalue()
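An illustrative aside, not part of the patch: every rewritten helper above follows
the same bounded-streaming pattern; it decompresses at most _CHUNK_SIZE bytes per
step and re-checks the running total against the cap after each step, so a
decompression bomb is aborted early instead of being fully inflated. A
self-contained zlib sketch of that pattern (the names inflate_capped and grow
are ours):

    import zlib

    CHUNK = 65536  # 64 KiB, mirrors _CHUNK_SIZE above

    def inflate_capped(data: bytes, max_size: int) -> bytes:
        decompressor = zlib.decompressobj()
        out = bytearray()

        def grow(piece: bytes) -> None:
            # Enforce the cap on the running decompressed size.
            out.extend(piece)
            if max_size and len(out) > max_size:
                raise ValueError(
                    f"decompressed {len(out)} B, more than the {max_size} B cap"
                )

        # max_length bounds each step's output; unconsumed input is carried
        # forward in unconsumed_tail, exactly as in _inflate() above.
        grow(decompressor.decompress(data, max_length=CHUNK))
        while decompressor.unconsumed_tail:
            grow(decompressor.decompress(decompressor.unconsumed_tail, max_length=CHUNK))
        grow(decompressor.flush())
        return bytes(out)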
Index: scrapy-2.13.3/scrapy/utils/gz.py
===================================================================
--- scrapy-2.13.3.orig/scrapy/utils/gz.py
+++ scrapy-2.13.3/scrapy/utils/gz.py
@@ -5,7 +5,7 @@ from gzip import GzipFile
 from io import BytesIO
 from typing import TYPE_CHECKING
 
-from ._compression import _CHUNK_SIZE, _DecompressionMaxSizeExceeded
+from ._compression import _CHUNK_SIZE, _check_max_size
 
 if TYPE_CHECKING:
     from scrapy.http import Response
@@ -31,15 +31,9 @@ def gunzip(data: bytes, *, max_size: int
                 break
             raise
         decompressed_size += len(chunk)
-        if max_size and decompressed_size > max_size:
-            raise _DecompressionMaxSizeExceeded(
-                f"The number of bytes decompressed so far "
-                f"({decompressed_size} B) exceed the specified maximum "
-                f"({max_size} B)."
-            )
+        _check_max_size(decompressed_size, max_size)
         output_stream.write(chunk)
-    output_stream.seek(0)
-    return output_stream.read()
+    return output_stream.getvalue()
 
 
 def gzip_magic_number(response: Response) -> bool:
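An illustrative aside, not part of the patch: after this change gunzip() signals
overflow through the structured _DecompressionMaxSizeExceeded defined in
scrapy.utils._compression, so callers can read the sizes from exception
attributes instead of parsing a message. A hypothetical caller (note that both
imported names are private Scrapy API, and gunzip_or_none is ours):

    from typing import Optional

    from scrapy.utils._compression import _DecompressionMaxSizeExceeded
    from scrapy.utils.gz import gunzip

    def gunzip_or_none(data: bytes, cap: int) -> Optional[bytes]:
        try:
            return gunzip(data, max_size=cap)
        except _DecompressionMaxSizeExceeded as e:
            # decompressed_size and max_size are the attributes added above.
            print(f"skipped: {e.decompressed_size} B > {e.max_size} B")
            return None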
Index: scrapy-2.13.3/tests/test_downloadermiddleware_httpcompression.py
===================================================================
--- scrapy-2.13.3.orig/tests/test_downloadermiddleware_httpcompression.py
+++ scrapy-2.13.3/tests/test_downloadermiddleware_httpcompression.py
@@ -51,6 +51,22 @@ FORMAT = {
 }
 
 
+def _skip_if_no_br() -> None:
+    try:
+        import brotli  # noqa: PLC0415
+
+        brotli.Decompressor.can_accept_more_data
+    except (ImportError, AttributeError):
+        pytest.skip("no brotli support")
+
+
+def _skip_if_no_zstd() -> None:
+    try:
+        import zstandard  # noqa: F401,PLC0415
+    except ImportError:
+        pytest.skip("no zstd support (zstandard)")
+
+
 class TestHttpCompression:
     def setup_method(self):
         self.crawler = get_crawler(Spider)
@@ -124,13 +140,7 @@ class TestHttpCompression:
         self.assertStatsEqual("httpcompression/response_bytes", 74837)
 
     def test_process_response_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
         response = self._getresponse("br")
         request = response.request
         assert response.headers["Content-Encoding"] == b"br"
@@ -143,14 +153,8 @@ class TestHttpCompression:
 
     def test_process_response_br_unsupported(self):
         try:
-            try:
-                import brotli  # noqa: F401
-
-                raise SkipTest("Requires not having brotli support")
-            except ImportError:
-                import brotlicffi  # noqa: F401
-
-                raise SkipTest("Requires not having brotli support")
+            import brotli  # noqa: F401,PLC0415
+            pytest.skip("Requires not having brotli support")
         except ImportError:
             pass
         response = self._getresponse("br")
@@ -169,7 +173,7 @@ class TestHttpCompression:
                (
                    "HttpCompressionMiddleware cannot decode the response for"
                    " http://scrapytest.org/ from unsupported encoding(s) 'br'."
-                    " You need to install brotli or brotlicffi to decode 'br'."
+                    " You need to install brotli >= 1.2.0 to decode 'br'."
                ),
            ),
        )
@@ -503,24 +507,19 @@ class TestHttpCompression:
         self.assertStatsEqual("httpcompression/response_bytes", None)
 
     def _test_compression_bomb_setting(self, compression_id):
-        settings = {"DOWNLOAD_MAXSIZE": 10_000_000}
+        settings = {"DOWNLOAD_MAXSIZE": 1_000_000}
         crawler = get_crawler(Spider, settings_dict=settings)
         spider = crawler._create_spider("scrapytest.org")
         mw = HttpCompressionMiddleware.from_crawler(crawler)
         mw.open_spider(spider)
 
-        response = self._getresponse(f"bomb-{compression_id}")
-        with pytest.raises(IgnoreRequest):
+        response = self._getresponse(f"bomb-{compression_id}")  # 11_511_612 B
+        with pytest.raises(IgnoreRequest) as exc_info:
             mw.process_response(response.request, response, spider)
+        assert exc_info.value.__cause__.decompressed_size < 1_100_000
 
     def test_compression_bomb_setting_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
         self._test_compression_bomb_setting("br")
 
     def test_compression_bomb_setting_deflate(self):
@@ -538,7 +537,7 @@ class TestHttpCompression:
 
     def _test_compression_bomb_spider_attr(self, compression_id):
         class DownloadMaxSizeSpider(Spider):
-            download_maxsize = 10_000_000
+            download_maxsize = 1_000_000
 
         crawler = get_crawler(DownloadMaxSizeSpider)
         spider = crawler._create_spider("scrapytest.org")
@@ -546,17 +545,12 @@
         mw.open_spider(spider)
 
         response = self._getresponse(f"bomb-{compression_id}")
-        with pytest.raises(IgnoreRequest):
+        with pytest.raises(IgnoreRequest) as exc_info:
             mw.process_response(response.request, response, spider)
+        assert exc_info.value.__cause__.decompressed_size < 1_100_000
 
     def test_compression_bomb_spider_attr_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
         self._test_compression_bomb_spider_attr("br")
 
     def test_compression_bomb_spider_attr_deflate(self):
@@ -579,18 +573,13 @@
         mw.open_spider(spider)
 
         response = self._getresponse(f"bomb-{compression_id}")
-        response.meta["download_maxsize"] = 10_000_000
-        with pytest.raises(IgnoreRequest):
+        response.meta["download_maxsize"] = 1_000_000
+        with pytest.raises(IgnoreRequest) as exc_info:
             mw.process_response(response.request, response, spider)
+        assert exc_info.value.__cause__.decompressed_size < 1_100_000
 
     def test_compression_bomb_request_meta_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
         self._test_compression_bomb_request_meta("br")
 
     def test_compression_bomb_request_meta_deflate(self):
@@ -633,13 +622,7 @@ class TestHttpCompression:
         )
 
     def test_download_warnsize_setting_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
         self._test_download_warnsize_setting("br")
 
     def test_download_warnsize_setting_deflate(self):
@@ -684,13 +667,7 @@ class TestHttpCompression:
         )
 
     def test_download_warnsize_spider_attr_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
         self._test_download_warnsize_spider_attr("br")
 
     def test_download_warnsize_spider_attr_deflate(self):
@@ -733,13 +710,7 @@ class TestHttpCompression:
         )
 
     def test_download_warnsize_request_meta_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
         self._test_download_warnsize_request_meta("br")
 
     def test_download_warnsize_request_meta_deflate(self):
@@ -754,3 +725,34 @@ class TestHttpCompression:
         except ImportError:
             raise SkipTest("no zstd support (zstandard)")
         self._test_download_warnsize_request_meta("zstd")
+
+    def _get_truncated_response(self, compression_id):
+        crawler = get_crawler(Spider)
+        spider = crawler._create_spider("scrapytest.org")
+        mw = HttpCompressionMiddleware.from_crawler(crawler)
+        mw.open_spider(spider)
+        response = self._getresponse(compression_id)
+        truncated_body = response.body[: len(response.body) // 2]
+        response = response.replace(body=truncated_body)
+        return mw.process_response(response.request, response, spider)
+
+    def test_process_truncated_response_br(self):
+        _skip_if_no_br()
+        resp = self._get_truncated_response("br")
+        assert resp.body.startswith(b"
Index: scrapy-2.13.3/tox.ini
===================================================================
--- scrapy-2.13.3.orig/tox.ini
+++ scrapy-2.13.3/tox.ini
@@ ... @@ deps =
+    brotli >= 1.2.0  # optional for HTTP compress downloader middleware tests
     google-cloud-storage
     ipython
     robotexclusionrulesparser
@@ -156,9 +152,7 @@ deps =
     Pillow==8.0.0
     boto3==1.20.0
     bpython==0.7.1
-    brotli==0.5.2; implementation_name != "pypy"
-    brotlicffi==0.8.0; implementation_name == "pypy"
-    brotlipy
+    brotli==1.2.0
     google-cloud-storage==1.29.0
     ipython==2.0.0
     robotexclusionrulesparser==1.6.2
@@ -258,7 +252,7 @@ deps =
     {[testenv]deps}
     botocore>=1.4.87
 commands =
-    pytest {posargs:--cov-config=pyproject.toml --cov=scrapy --cov-report=xml --cov-report= tests --junitxml=botocore.junit.xml -o junit_family=legacy -m requires_botocore}
+    pytest {posargs:--cov-config=pyproject.toml --cov=scrapy --cov-report=xml --cov-report= tests --junitxml=botocore.junit.xml -o junit_family=legacy} -m requires_botocore
 
 [testenv:botocore-pinned]
 basepython = {[pinned]basepython}
@@ -269,4 +263,17 @@ install_command = {[pinned]install_comma
 setenv =
     {[pinned]setenv}
 commands =
-    pytest {posargs:--cov-config=pyproject.toml --cov=scrapy --cov-report=xml --cov-report= tests --junitxml=botocore-pinned.junit.xml -o junit_family=legacy -m requires_botocore}
+    pytest {posargs:--cov-config=pyproject.toml --cov=scrapy --cov-report=xml --cov-report= tests --junitxml=botocore-pinned.junit.xml -o junit_family=legacy} -m requires_botocore
+
+
+# Run the proxy tests that use mitmproxy in a separate env, to avoid
+# installing the numerous mitmproxy deps in other envs (even in extra-deps),
+# where they can conflict with deps we do (or deliberately do not) want
+# installed.
+
+[testenv:mitmproxy]
+deps =
+    {[testenv]deps}
+    # mitmproxy does not support PyPy
+    mitmproxy; implementation_name != "pypy"
+commands =
+    pytest {posargs:--cov-config=pyproject.toml --cov=scrapy --cov-report=xml --cov-report= tests --junitxml=mitmproxy.junit.xml -o junit_family=legacy} -m requires_mitmproxy
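An illustrative aside, not part of the patch: a test opts into the new mitmproxy
handling by carrying the marker registered in pyproject.toml above; the autouse
requires_mitmproxy fixture added in conftest.py then skips it whenever mitmproxy
is not importable. The test name and body below are hypothetical:

    import pytest

    @pytest.mark.requires_mitmproxy
    def test_https_via_mitmproxy():
        # Runs only when mitmproxy is installed; otherwise the autouse
        # requires_mitmproxy fixture calls pytest.skip() before the body runs.
        ...

Such tests are selected by the new tox env, which runs pytest with
"-m requires_mitmproxy", e.g. via "tox -e mitmproxy".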