From e3673d5a42cdd8be95c09982240317af1410fea3 Mon Sep 17 00:00:00 2001
From: Rui Xi <Cycloctane@outlook.com>
Date: Thu, 6 Nov 2025 18:53:35 +0800
Subject: [PATCH 01/18] mitigate brotli decompression bomb

Drop brotlicffi: 'br' decoding now requires brotli >= 1.2.0 and uses its
streaming API (output_buffer_limit) so that decompression can be aborted
as soon as DOWNLOAD_MAXSIZE is exceeded.
---
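Note: the core of this mitigation is to stop handing the whole body to a
single decompress call and instead pull output in bounded chunks
(_CHUNK_SIZE = 64 KiB), checking the running total against the size cap
after every chunk. A ~51 KiB 'br' body that would inflate to 64 GiB is
therefore aborted after roughly DOWNLOAD_MAXSIZE bytes instead of
exhausting memory. Below is a minimal standalone sketch of the technique,
assuming brotli >= 1.2.0; the function names and the max_size handling are
illustrative only, not part of the applied diff:

    import zlib

    import brotli  # >= 1.2.0 for output_buffer_limit / is_finished

    _CHUNK_SIZE = 65536  # 64 KiB, as in scrapy/utils/_compression.py

    def bounded_unbrotli(data: bytes, max_size: int) -> bytes:
        # output_buffer_limit caps how many bytes one process() call may
        # emit, so a tiny input cannot balloon in memory in a single step.
        decompressor = brotli.Decompressor()
        output = bytearray(
            decompressor.process(data, output_buffer_limit=_CHUNK_SIZE)
        )
        while not decompressor.is_finished():
            chunk = decompressor.process(b"", output_buffer_limit=_CHUNK_SIZE)
            if not chunk:
                raise ValueError("Truncated brotli compressed data")
            output += chunk
            if max_size and len(output) > max_size:
                raise ValueError(f"decompressed {len(output)} B > {max_size} B")
        return bytes(output)

    def bounded_inflate(data: bytes, max_size: int) -> bytes:
        # zlib gives the same guarantee through max_length: unprocessed
        # input is kept in unconsumed_tail and fed back chunk by chunk.
        decompressor = zlib.decompressobj()
        output = bytearray(decompressor.decompress(data, _CHUNK_SIZE))
        while decompressor.unconsumed_tail:
            output += decompressor.decompress(
                decompressor.unconsumed_tail, _CHUNK_SIZE
            )
            if max_size and len(output) > max_size:
                raise ValueError(f"decompressed {len(output)} B > {max_size} B")
        output += decompressor.flush()
        return bytes(output)

On brotli < 1.2.0, Decompressor.process() does not accept
output_buffer_limit, which is why the middleware now probes for
Decompressor.can_accept_more_data as a feature test and refuses to
advertise 'br' when it is missing.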
 .../downloadermiddlewares/httpcompression.py  | 10 +--
 scrapy/utils/_compression.py                  | 75 +++++--------------
 scrapy/utils/gz.py                            |  9 +--
 ...st_downloadermiddleware_httpcompression.py | 16 +---
 4 files changed, 29 insertions(+), 81 deletions(-)

Index: scrapy-2.13.3/scrapy/downloadermiddlewares/httpcompression.py
===================================================================
--- scrapy-2.13.3.orig/scrapy/downloadermiddlewares/httpcompression.py
+++ scrapy-2.13.3/scrapy/downloadermiddlewares/httpcompression.py
@@ -29,14 +29,20 @@ logger = getLogger(__name__)
 ACCEPTED_ENCODINGS: list[bytes] = [b"gzip", b"deflate"]

 try:
-    try:
-        import brotli  # noqa: F401
-    except ImportError:
-        import brotlicffi  # noqa: F401
+    import brotli
 except ImportError:
     pass
 else:
-    ACCEPTED_ENCODINGS.append(b"br")
+    try:
+        brotli.Decompressor.can_accept_more_data
+    except AttributeError:
+        warnings.warn(
+            "brotli is installed, but 'br' encoding support now requires "
+            "brotli >= 1.2.0. Please upgrade brotli so that Scrapy can "
+            "decode 'br'-encoded responses.",
+        )
+    else:
+        ACCEPTED_ENCODINGS.append(b"br")

 try:
     import zstandard  # noqa: F401
@@ -98,13 +104,13 @@ class HttpCompressionMiddleware:
             decoded_body, content_encoding = self._handle_encoding(
                 response.body, content_encoding, max_size
             )
-        except _DecompressionMaxSizeExceeded:
+        except _DecompressionMaxSizeExceeded as e:
             raise IgnoreRequest(
                 f"Ignored response {response} because its body "
-                f"({len(response.body)} B compressed) exceeded "
-                f"DOWNLOAD_MAXSIZE ({max_size} B) during "
-                f"decompression."
-            )
+                f"({len(response.body)} B compressed, "
+                f"{e.decompressed_size} B decompressed so far) exceeded "
+                f"DOWNLOAD_MAXSIZE ({max_size} B) during decompression."
+            ) from e
         if len(response.body) < warn_size <= len(decoded_body):
             logger.warning(
                 f"{response} body size after decompression "
@@ -187,7 +193,7 @@
                 f"from unsupported encoding(s) '{encodings_str}'."
             )
             if b"br" in encodings:
-                msg += " You need to install brotli or brotlicffi to decode 'br'."
+                msg += " You need to install brotli >= 1.2.0 to decode 'br'."
             if b"zstd" in encodings:
                 msg += " You need to install zstandard to decode 'zstd'."
             logger.warning(msg)
Index: scrapy-2.13.3/scrapy/utils/_compression.py
===================================================================
--- scrapy-2.13.3.orig/scrapy/utils/_compression.py
+++ scrapy-2.13.3/scrapy/utils/_compression.py
@@ -1,42 +1,9 @@
 import contextlib
 import zlib
 from io import BytesIO
-from warnings import warn
-
-from scrapy.exceptions import ScrapyDeprecationWarning
-
-try:
-    try:
-        import brotli
-    except ImportError:
-        import brotlicffi as brotli
-except ImportError:
-    pass
-else:
-    try:
-        brotli.Decompressor.process
-    except AttributeError:
-        warn(
-            (
-                "You have brotlipy installed, and Scrapy will use it, but "
-                "Scrapy support for brotlipy is deprecated and will stop "
-                "working in a future version of Scrapy. brotlipy itself is "
-                "deprecated, it has been superseded by brotlicffi. Please, "
-                "uninstall brotlipy and install brotli or brotlicffi instead. "
-                "brotlipy has the same import name as brotli, so keeping both "
-                "installed is strongly discouraged."
-            ),
-            ScrapyDeprecationWarning,
-        )
-
-        def _brotli_decompress(decompressor, data):
-            return decompressor.decompress(data)
-
-    else:
-
-        def _brotli_decompress(decompressor, data):
-            return decompressor.process(data)

+with contextlib.suppress(ImportError):
+    import brotli

 with contextlib.suppress(ImportError):
     import zstandard
@@ -46,62 +13,64 @@ _CHUNK_SIZE = 65536  # 64 KiB


 class _DecompressionMaxSizeExceeded(ValueError):
-    pass
+    def __init__(self, decompressed_size: int, max_size: int) -> None:
+        self.decompressed_size = decompressed_size
+        self.max_size = max_size
+
+    def __str__(self) -> str:
+        return (
+            "The number of bytes decompressed so far "
+            f"({self.decompressed_size} B) exceeded the specified maximum "
+            f"({self.max_size} B)."
+        )
+
+
+def _check_max_size(decompressed_size: int, max_size: int) -> None:
+    if max_size and decompressed_size > max_size:
+        raise _DecompressionMaxSizeExceeded(decompressed_size, max_size)


 def _inflate(data: bytes, *, max_size: int = 0) -> bytes:
     decompressor = zlib.decompressobj()
-    raw_decompressor = zlib.decompressobj(wbits=-15)
-    input_stream = BytesIO(data)
+    try:
+        first_chunk = decompressor.decompress(data, max_length=_CHUNK_SIZE)
+    except zlib.error:
+        # to work with raw deflate content that may be sent by microsoft servers.
+        decompressor = zlib.decompressobj(wbits=-15)
+        first_chunk = decompressor.decompress(data, max_length=_CHUNK_SIZE)
+    decompressed_size = len(first_chunk)
+    _check_max_size(decompressed_size, max_size)
     output_stream = BytesIO()
-    output_chunk = b"."
-    decompressed_size = 0
-    while output_chunk:
-        input_chunk = input_stream.read(_CHUNK_SIZE)
-        try:
-            output_chunk = decompressor.decompress(input_chunk)
-        except zlib.error:
-            if decompressor != raw_decompressor:
-                # ugly hack to work with raw deflate content that may
-                # be sent by microsoft servers. For more information, see:
-                # http://carsten.codimi.de/gzip.yaws/
-                # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
-                # http://www.gzip.org/zlib/zlib_faq.html#faq38
-                decompressor = raw_decompressor
-                output_chunk = decompressor.decompress(input_chunk)
-            else:
-                raise
+    output_stream.write(first_chunk)
+    while decompressor.unconsumed_tail:
+        output_chunk = decompressor.decompress(
+            decompressor.unconsumed_tail, max_length=_CHUNK_SIZE
+        )
         decompressed_size += len(output_chunk)
-        if max_size and decompressed_size > max_size:
-            raise _DecompressionMaxSizeExceeded(
-                f"The number of bytes decompressed so far "
-                f"({decompressed_size} B) exceed the specified maximum "
-                f"({max_size} B)."
-            )
+        _check_max_size(decompressed_size, max_size)
         output_stream.write(output_chunk)
-    output_stream.seek(0)
-    return output_stream.read()
+    if tail := decompressor.flush():
+        decompressed_size += len(tail)
+        _check_max_size(decompressed_size, max_size)
+        output_stream.write(tail)
+    return output_stream.getvalue()


 def _unbrotli(data: bytes, *, max_size: int = 0) -> bytes:
     decompressor = brotli.Decompressor()
-    input_stream = BytesIO(data)
+    first_chunk = decompressor.process(data, output_buffer_limit=_CHUNK_SIZE)
+    decompressed_size = len(first_chunk)
+    _check_max_size(decompressed_size, max_size)
     output_stream = BytesIO()
-    output_chunk = b"."
-    decompressed_size = 0
-    while output_chunk:
-        input_chunk = input_stream.read(_CHUNK_SIZE)
-        output_chunk = _brotli_decompress(decompressor, input_chunk)
+    output_stream.write(first_chunk)
+    while not decompressor.is_finished():
+        output_chunk = decompressor.process(b"", output_buffer_limit=_CHUNK_SIZE)
+        if not output_chunk:
+            raise ValueError("Truncated brotli compressed data")
         decompressed_size += len(output_chunk)
-        if max_size and decompressed_size > max_size:
-            raise _DecompressionMaxSizeExceeded(
-                f"The number of bytes decompressed so far "
-                f"({decompressed_size} B) exceed the specified maximum "
-                f"({max_size} B)."
-            )
+        _check_max_size(decompressed_size, max_size)
         output_stream.write(output_chunk)
-    output_stream.seek(0)
-    return output_stream.read()
+    return output_stream.getvalue()


 def _unzstd(data: bytes, *, max_size: int = 0) -> bytes:
@@ -113,12 +82,6 @@ def _unzstd(data: bytes, *, max_size: in
     while output_chunk:
         output_chunk = stream_reader.read(_CHUNK_SIZE)
         decompressed_size += len(output_chunk)
-        if max_size and decompressed_size > max_size:
-            raise _DecompressionMaxSizeExceeded(
-                f"The number of bytes decompressed so far "
-                f"({decompressed_size} B) exceed the specified maximum "
-                f"({max_size} B)."
-            )
+        _check_max_size(decompressed_size, max_size)
         output_stream.write(output_chunk)
-    output_stream.seek(0)
-    return output_stream.read()
+    return output_stream.getvalue()
Index: scrapy-2.13.3/scrapy/utils/gz.py
===================================================================
--- scrapy-2.13.3.orig/scrapy/utils/gz.py
+++ scrapy-2.13.3/scrapy/utils/gz.py
@@ -5,7 +5,7 @@ from gzip import GzipFile
 from io import BytesIO
 from typing import TYPE_CHECKING

-from ._compression import _CHUNK_SIZE, _DecompressionMaxSizeExceeded
+from ._compression import _CHUNK_SIZE, _check_max_size

 if TYPE_CHECKING:
     from scrapy.http import Response
@@ -31,15 +31,9 @@ def gunzip(data: bytes, *, max_size: int
                 break
             raise
         decompressed_size += len(chunk)
-        if max_size and decompressed_size > max_size:
-            raise _DecompressionMaxSizeExceeded(
-                f"The number of bytes decompressed so far "
-                f"({decompressed_size} B) exceed the specified maximum "
-                f"({max_size} B)."
-            )
+        _check_max_size(decompressed_size, max_size)
         output_stream.write(chunk)
-    output_stream.seek(0)
-    return output_stream.read()
+    return output_stream.getvalue()


 def gzip_magic_number(response: Response) -> bool:
Index: scrapy-2.13.3/tests/test_downloadermiddleware_httpcompression.py
===================================================================
--- scrapy-2.13.3.orig/tests/test_downloadermiddleware_httpcompression.py
+++ scrapy-2.13.3/tests/test_downloadermiddleware_httpcompression.py
@@ -2,7 +2,6 @@ from gzip import GzipFile
 from io import BytesIO
 from logging import WARNING
 from pathlib import Path
-from unittest import SkipTest

 import pytest
 from testfixtures import LogCapture
@@ -48,9 +47,26 @@ FORMAT = {
             "zstd",  # 1 096 → 11 511 612
         )
     },
+    "bomb-br-64GiB": ("bomb-br-64GiB.bin", "br"),  # 51K → 64 GiB decompression bomb
 }


+def _skip_if_no_br() -> None:
+    try:
+        import brotli  # noqa: PLC0415
+
+        brotli.Decompressor.can_accept_more_data
+    except (ImportError, AttributeError):
+        pytest.skip("no brotli support")
+
+
+def _skip_if_no_zstd() -> None:
+    try:
+        import zstandard  # noqa: F401,PLC0415
+    except ImportError:
+        pytest.skip("no zstd support (zstandard)")
+
+
 class TestHttpCompression:
     def setup_method(self):
         self.crawler = get_crawler(Spider)
@@ -124,13 +140,8 @@ class TestHttpCompression:
         self.assertStatsEqual("httpcompression/response_bytes", 74837)

     def test_process_response_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
+
         response = self._getresponse("br")
         request = response.request
         assert response.headers["Content-Encoding"] == b"br"
@@ -143,14 +154,9 @@

     def test_process_response_br_unsupported(self):
         try:
-            try:
-                import brotli  # noqa: F401
-
-                raise SkipTest("Requires not having brotli support")
-            except ImportError:
-                import brotlicffi  # noqa: F401
+            import brotli  # noqa: F401,PLC0415

-                raise SkipTest("Requires not having brotli support")
+            pytest.skip("Requires not having brotli support")
         except ImportError:
             pass
         response = self._getresponse("br")
@@ -169,7 +175,7 @@
                 (
                     "HttpCompressionMiddleware cannot decode the response for"
                     " http://scrapytest.org/ from unsupported encoding(s) 'br'."
-                    " You need to install brotli or brotlicffi to decode 'br'."
+                    " You need to install brotli >= 1.2.0 to decode 'br'."
                 ),
             ),
         )
@@ -177,10 +183,8 @@
         assert newresponse.headers.getlist("Content-Encoding") == [b"br"]

     def test_process_response_zstd(self):
-        try:
-            import zstandard  # noqa: F401
-        except ImportError:
-            raise SkipTest("no zstd support (zstandard)")
+        _skip_if_no_zstd()
+
         raw_content = None
         for check_key in FORMAT:
             if not check_key.startswith("zstd-"):
@@ -199,9 +203,9 @@

     def test_process_response_zstd_unsupported(self):
         try:
-            import zstandard  # noqa: F401
+            import zstandard  # noqa: F401,PLC0415

-            raise SkipTest("Requires not having zstandard support")
+            pytest.skip("Requires not having zstandard support")
         except ImportError:
             pass
         response = self._getresponse("zstd-static-content-size")
@@ -503,24 +507,20 @@
         self.assertStatsEqual("httpcompression/response_bytes", None)

     def _test_compression_bomb_setting(self, compression_id):
-        settings = {"DOWNLOAD_MAXSIZE": 10_000_000}
+        settings = {"DOWNLOAD_MAXSIZE": 1_000_000}
         crawler = get_crawler(Spider, settings_dict=settings)
         spider = crawler._create_spider("scrapytest.org")
         mw = HttpCompressionMiddleware.from_crawler(crawler)
         mw.open_spider(spider)

-        response = self._getresponse(f"bomb-{compression_id}")
-        with pytest.raises(IgnoreRequest):
-            mw.process_response(response.request, response, spider)
+        response = self._getresponse(f"bomb-{compression_id}")  # 11_511_612 B
+        with pytest.raises(IgnoreRequest) as exc_info:
+            mw.process_response(response.request, response, self.spider)
+        assert exc_info.value.__cause__.decompressed_size < 1_100_000

     def test_compression_bomb_setting_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
+
         self._test_compression_bomb_setting("br")

     def test_compression_bomb_setting_deflate(self):
@@ -530,15 +530,13 @@
         self._test_compression_bomb_setting("gzip")

     def test_compression_bomb_setting_zstd(self):
-        try:
-            import zstandard  # noqa: F401
-        except ImportError:
-            raise SkipTest("no zstd support (zstandard)")
+        _skip_if_no_zstd()
+
         self._test_compression_bomb_setting("zstd")

     def _test_compression_bomb_spider_attr(self, compression_id):
         class DownloadMaxSizeSpider(Spider):
-            download_maxsize = 10_000_000
+            download_maxsize = 1_000_000

         crawler = get_crawler(DownloadMaxSizeSpider)
         spider = crawler._create_spider("scrapytest.org")
@@ -546,30 +544,28 @@
         mw.open_spider(spider)

         response = self._getresponse(f"bomb-{compression_id}")
-        with pytest.raises(IgnoreRequest):
-            mw.process_response(response.request, response, spider)
+        with pytest.raises(IgnoreRequest) as exc_info:
+            mw.process_response(response.request, response, self.spider)
+        assert exc_info.value.__cause__.decompressed_size < 1_100_000

+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_compression_bomb_spider_attr_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
+
         self._test_compression_bomb_spider_attr("br")

+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_compression_bomb_spider_attr_deflate(self):
         self._test_compression_bomb_spider_attr("deflate")

+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_compression_bomb_spider_attr_gzip(self):
         self._test_compression_bomb_spider_attr("gzip")

+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_compression_bomb_spider_attr_zstd(self):
-        try:
-            import zstandard  # noqa: F401
-        except ImportError:
-            raise SkipTest("no zstd support (zstandard)")
+        _skip_if_no_zstd()
+
         self._test_compression_bomb_spider_attr("zstd")

     def _test_compression_bomb_request_meta(self, compression_id):
@@ -579,18 +575,14 @@
         mw.open_spider(spider)

         response = self._getresponse(f"bomb-{compression_id}")
-        response.meta["download_maxsize"] = 10_000_000
-        with pytest.raises(IgnoreRequest):
-            mw.process_response(response.request, response, spider)
+        response.meta["download_maxsize"] = 1_000_000
+        with pytest.raises(IgnoreRequest) as exc_info:
+            mw.process_response(response.request, response, self.spider)
+        assert exc_info.value.__cause__.decompressed_size < 1_100_000

     def test_compression_bomb_request_meta_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
+
         self._test_compression_bomb_request_meta("br")

     def test_compression_bomb_request_meta_deflate(self):
@@ -600,12 +592,38 @@
         self._test_compression_bomb_request_meta("gzip")

     def test_compression_bomb_request_meta_zstd(self):
-        try:
-            import zstandard  # noqa: F401
-        except ImportError:
-            raise SkipTest("no zstd support (zstandard)")
+        _skip_if_no_zstd()
+
         self._test_compression_bomb_request_meta("zstd")

+    def test_compression_bomb_output_buffer_limit(self):
+        """Test that the 64 GiB brotli decompression bomb is properly handled.
+
+        This test ensures that the output_buffer_limit parameter in the brotli
+        decompressor prevents the decompression bomb attack. The bomb file is
+        approximately 51 KB compressed but would decompress to 64 GiB, which
+        should trigger IgnoreRequest when DOWNLOAD_MAXSIZE is exceeded.
+        """
+        _skip_if_no_br()
+
+        settings = {"DOWNLOAD_MAXSIZE": 10_000_000}  # 10 MB limit
+        crawler = get_crawler(Spider, settings_dict=settings)
+        spider = crawler._create_spider("scrapytest.org")
+        mw = HttpCompressionMiddleware.from_crawler(crawler)
+        mw.open_spider(spider)
+
+        response = self._getresponse("bomb-br-64GiB")
+
+        # Verify the response is properly configured
+        assert response.headers["Content-Encoding"] == b"br"
+
+        # The middleware should raise IgnoreRequest due to exceeding DOWNLOAD_MAXSIZE
+        with pytest.raises(IgnoreRequest) as exc_info:
+            mw.process_response(response.request, response, self.spider)
+
+        # Verify the exception message mentions the download size limits
+        assert "exceeded DOWNLOAD_MAXSIZE (10000000 B)" in str(exc_info.value)
+
     def _test_download_warnsize_setting(self, compression_id):
         settings = {"DOWNLOAD_WARNSIZE": 10_000_000}
         crawler = get_crawler(Spider, settings_dict=settings)
@@ -619,7 +637,7 @@
             propagate=False,
             level=WARNING,
         ) as log:
-            mw.process_response(response.request, response, spider)
+            mw.process_response(response.request, response, self.spider)
         log.check(
             (
                 "scrapy.downloadermiddlewares.httpcompression",
@@ -633,13 +651,8 @@
         )

     def test_download_warnsize_setting_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
+
         self._test_download_warnsize_setting("br")

     def test_download_warnsize_setting_deflate(self):
@@ -649,10 +662,8 @@
         self._test_download_warnsize_setting("gzip")

     def test_download_warnsize_setting_zstd(self):
-        try:
-            import zstandard  # noqa: F401
-        except ImportError:
-            raise SkipTest("no zstd support (zstandard)")
+        _skip_if_no_zstd()
+
         self._test_download_warnsize_setting("zstd")

     def _test_download_warnsize_spider_attr(self, compression_id):
@@ -670,7 +681,7 @@
             propagate=False,
             level=WARNING,
         ) as log:
-            mw.process_response(response.request, response, spider)
+            mw.process_response(response.request, response, self.spider)
         log.check(
             (
                 "scrapy.downloadermiddlewares.httpcompression",
@@ -683,27 +694,24 @@
             ),
         )

+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_download_warnsize_spider_attr_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
+
         self._test_download_warnsize_spider_attr("br")

+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_download_warnsize_spider_attr_deflate(self):
         self._test_download_warnsize_spider_attr("deflate")

+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_download_warnsize_spider_attr_gzip(self):
         self._test_download_warnsize_spider_attr("gzip")

+    @pytest.mark.filterwarnings("ignore::scrapy.exceptions.ScrapyDeprecationWarning")
     def test_download_warnsize_spider_attr_zstd(self):
-        try:
-            import zstandard  # noqa: F401
-        except ImportError:
-            raise SkipTest("no zstd support (zstandard)")
+        _skip_if_no_zstd()
+
         self._test_download_warnsize_spider_attr("zstd")

     def _test_download_warnsize_request_meta(self, compression_id):
@@ -719,7 +727,7 @@
             propagate=False,
             level=WARNING,
         ) as log:
-            mw.process_response(response.request, response, spider)
+            mw.process_response(response.request, response, self.spider)
         log.check(
             (
                 "scrapy.downloadermiddlewares.httpcompression",
@@ -733,13 +741,8 @@
         )

     def test_download_warnsize_request_meta_br(self):
-        try:
-            try:
-                import brotli  # noqa: F401
-            except ImportError:
-                import brotlicffi  # noqa: F401
-        except ImportError:
-            raise SkipTest("no brotli")
+        _skip_if_no_br()
+
         self._test_download_warnsize_request_meta("br")

     def test_download_warnsize_request_meta_deflate(self):
@@ -749,8 +752,6 @@
         self._test_download_warnsize_request_meta("gzip")

     def test_download_warnsize_request_meta_zstd(self):
-        try:
-            import zstandard  # noqa: F401
-        except ImportError:
-            raise SkipTest("no zstd support (zstandard)")
+        _skip_if_no_zstd()
+
         self._test_download_warnsize_request_meta("zstd")
Index: scrapy-2.13.3/tox.ini
===================================================================
--- scrapy-2.13.3.orig/tox.ini
+++ scrapy-2.13.3/tox.ini
@@ -141,8 +141,7 @@ deps =
     Twisted[http2]
     boto3
     bpython  # optional for shell wrapper tests
-    brotli; implementation_name != "pypy"  # optional for HTTP compress downloader middleware tests
-    brotlicffi; implementation_name == "pypy"  # optional for HTTP compress downloader middleware tests
+    brotli >= 1.2.0  # optional for HTTP compress downloader middleware tests
     google-cloud-storage
     ipython
     robotexclusionrulesparser
@@ -156,9 +155,7 @@ deps =
     Pillow==8.0.0
     boto3==1.20.0
     bpython==0.7.1
-    brotli==0.5.2; implementation_name != "pypy"
-    brotlicffi==0.8.0; implementation_name == "pypy"
-    brotlipy
+    brotli==1.2.0
     google-cloud-storage==1.29.0
     ipython==2.0.0
     robotexclusionrulesparser==1.6.2