- Add upstream patch CVE-2025-14009.patch, Secure ZIP extraction to prevent Zip-Slip and RCE in nltk.downloader. (gh#nltk/nltk#3468) OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-nltk?expand=0&rev=59
341 lines
13 KiB
Diff
341 lines
13 KiB
Diff
From 36c38b60a562f2ff8591386f35565c31ddcc569f Mon Sep 17 00:00:00 2001
|
|
From: hyperps <sarvesharmy6809@gmail.com>
|
|
Date: Thu, 20 Nov 2025 06:41:45 +0530
|
|
Subject: [PATCH 01/18] Fix: Secure ZIP extraction to prevent Zip-Slip and RCE
|
|
in nltk.downloader
|
|
|
|
---
|
|
nltk/downloader.py | 236 +++++++++++++++++++++++++++++++++++++++++++--
|
|
1 file changed, 229 insertions(+), 7 deletions(-)
|
|
|
|
Index: nltk-3.9.1/nltk/downloader.py
|
|
===================================================================
|
|
--- nltk-3.9.1.orig/nltk/downloader.py
|
|
+++ nltk-3.9.1/nltk/downloader.py
|
|
@@ -2234,20 +2234,58 @@ def unzip(filename, root, verbose=True):
|
|
|
|
|
|
def _unzip_iter(filename, root, verbose=True):
|
|
+ """
|
|
+ Secure ZIP extraction with minimal behavioural changes.
|
|
+
|
|
+ - Prevents classic Zip-Slip (.., absolute paths, drive letters)
|
|
+ - Prevents writes through pre-existing symlinks
|
|
+ - Preserves original extraction behaviour for valid archives
|
|
+ """
|
|
+
|
|
if verbose:
|
|
sys.stdout.write("Unzipping %s" % os.path.split(filename)[1])
|
|
sys.stdout.flush()
|
|
|
|
try:
|
|
zf = zipfile.ZipFile(filename)
|
|
- except zipfile.BadZipFile:
|
|
- yield ErrorMessage(filename, "Error with downloaded zip file")
|
|
- return
|
|
except Exception as e:
|
|
yield ErrorMessage(filename, e)
|
|
return
|
|
|
|
- zf.extractall(root)
|
|
+ # Canonical root
|
|
+ root_abs = os.path.abspath(root)
|
|
+ root_real = os.path.realpath(root_abs)
|
|
+ root_prefix = root_real.rstrip(os.sep) + os.sep
|
|
+
|
|
+ # Ensure the extraction root directory exists
|
|
+ os.makedirs(root, exist_ok=True)
|
|
+
|
|
+ try:
|
|
+ for member in zf.namelist():
|
|
+
|
|
+ # Construct target path
|
|
+ raw_target = os.path.join(root_abs, member)
|
|
+ target_abs = os.path.abspath(raw_target)
|
|
+
|
|
+ # Zip-Slip check (absolute/traversal/drive-letter cases)
|
|
+ if not target_abs.startswith(root_prefix):
|
|
+ yield ErrorMessage(filename, f"Zip Slip blocked: {member}")
|
|
+ continue
|
|
+
|
|
+ # Symlink-escape check
|
|
+ target_real = os.path.realpath(target_abs)
|
|
+ if not target_real.startswith(root_prefix):
|
|
+ yield ErrorMessage(filename, f"Symlink escape blocked: {member}")
|
|
+ continue
|
|
+
|
|
+ # Safe extraction
|
|
+ try:
|
|
+ zf.extract(member, root)
|
|
+ except Exception as e:
|
|
+ yield ErrorMessage(filename, f"Extraction error for {member}: {e}")
|
|
+ continue
|
|
+ finally:
|
|
+ zf.close()
|
|
|
|
if verbose:
|
|
print()
|
|
Index: nltk-3.9.1/nltk/tag/mapping.py
|
|
===================================================================
|
|
--- nltk-3.9.1.orig/nltk/tag/mapping.py
|
|
+++ nltk-3.9.1/nltk/tag/mapping.py
|
|
@@ -32,7 +32,7 @@ X - other: foreign words, typos, abbrevi
|
|
from collections import defaultdict
|
|
from os.path import join
|
|
|
|
-from nltk.data import load
|
|
+from nltk.data import load, normalize_resource_url
|
|
|
|
_UNIVERSAL_DATA = "taggers/universal_tagset"
|
|
_UNIVERSAL_TAGS = (
|
|
@@ -56,7 +56,10 @@ _MAPPINGS = defaultdict(lambda: defaultd
|
|
|
|
|
|
def _load_universal_map(fileid):
|
|
- contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text")
|
|
+ resource = normalize_resource_url(
|
|
+ f"nltk:{_UNIVERSAL_DATA.rstrip('/')}/{fileid.lstrip('/')}.map"
|
|
+ )
|
|
+ contents = load(resource, format="text")
|
|
|
|
# When mapping to the Universal Tagset,
|
|
# map unknown inputs to 'X' not 'UNK'
|
|
Index: nltk-3.9.1/nltk/test/unit/test_downloader_unzip.py
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ nltk-3.9.1/nltk/test/unit/test_downloader_unzip.py
|
|
@@ -0,0 +1,233 @@
|
|
+import os
|
|
+import sys
|
|
+import zipfile
|
|
+from pathlib import Path
|
|
+
|
|
+import pytest
|
|
+
|
|
+from nltk.downloader import ErrorMessage, _unzip_iter
|
|
+
|
|
+
|
|
+def _make_zip(file_path: Path, members: dict[str, bytes]) -> None:
|
|
+ """
|
|
+ Create a ZIP file at file_path, with the given arcname->content mapping.
|
|
+ """
|
|
+ with zipfile.ZipFile(file_path, "w") as zf:
|
|
+ for arcname, content in members.items():
|
|
+ zf.writestr(arcname, content)
|
|
+
|
|
+
|
|
+def _run_unzip_iter(zip_path: Path, extract_root: Path, verbose: bool = False):
|
|
+ """
|
|
+ Convenience wrapper that runs _unzip_iter and returns the list of yielded
|
|
+ messages (if any).
|
|
+ """
|
|
+ return list(_unzip_iter(str(zip_path), str(extract_root), verbose=verbose))
|
|
+
|
|
+
|
|
+class TestSecureUnzip:
|
|
+ """
|
|
+ Tests for secure ZIP extraction behaviour in nltk.downloader._unzip_iter.
|
|
+
|
|
+ These tests are specifically designed so that:
|
|
+
|
|
+ * On the old implementation (using zf.extractall(root) without checks),
|
|
+ the "Zip-Slip" tests will fail because they rely on the new behaviour
|
|
+ (yielding ErrorMessage for escaping entries, and NOT writing outside
|
|
+ the extraction root).
|
|
+
|
|
+ * On the new implementation, the tests pass, because the new path
|
|
+ validation prevents escapes and emits the expected ErrorMessage.
|
|
+ """
|
|
+
|
|
+ def test_normal_relative_paths_are_extracted(self, tmp_path: Path) -> None:
|
|
+ """
|
|
+ A ZIP with only safe, relative paths should fully extract under the
|
|
+ given root, and should not yield any ErrorMessage.
|
|
+ """
|
|
+ zip_path = tmp_path / "safe.zip"
|
|
+ extract_root = tmp_path / "extract"
|
|
+
|
|
+ members = {
|
|
+ "pkg/file.txt": b"hello",
|
|
+ "pkg/subdir/other.txt": b"world",
|
|
+ }
|
|
+ _make_zip(zip_path, members)
|
|
+
|
|
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
|
|
+
|
|
+ # No ErrorMessage should be yielded for valid archives.
|
|
+ assert not any(isinstance(m, ErrorMessage) for m in messages)
|
|
+
|
|
+ assert (extract_root / "pkg" / "file.txt").read_bytes() == b"hello"
|
|
+ assert (extract_root / "pkg" / "subdir" / "other.txt").read_bytes() == b"world"
|
|
+
|
|
+ def test_zip_slip_with_parent_directory_component_is_blocked(
|
|
+ self, tmp_path: Path
|
|
+ ) -> None:
|
|
+ """
|
|
+ An entry containing ``..`` that would escape the target directory
|
|
+ must not be written outside the extraction root, and must cause
|
|
+ _unzip_iter to yield an ErrorMessage.
|
|
+
|
|
+ On the old implementation (extractall(root)), this test will fail
|
|
+ because the file outside the root is actually created and no
|
|
+ ErrorMessage is yielded.
|
|
+ """
|
|
+ zip_path = tmp_path / "zip_slip_parent.zip"
|
|
+ extract_root = tmp_path / "extract"
|
|
+
|
|
+ # This is the path that would be written if "../outside.txt" is
|
|
+ # extracted with extractall(root).
|
|
+ outside_target = (extract_root / ".." / "outside.txt").resolve()
|
|
+
|
|
+ members = {
|
|
+ "pkg/good.txt": b"ok",
|
|
+ # This would escape extract_root if not validated.
|
|
+ "../outside.txt": b"evil",
|
|
+ }
|
|
+ _make_zip(zip_path, members)
|
|
+
|
|
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
|
|
+
|
|
+ # With the secure implementation, the escaping entry must be blocked
|
|
+ # and reported as an ErrorMessage.
|
|
+ err_msgs = [m for m in messages if isinstance(m, ErrorMessage)]
|
|
+ assert (
|
|
+ err_msgs
|
|
+ ), "Expected an ErrorMessage for a Zip-Slip parent-directory attempt"
|
|
+
|
|
+ # Check that the message text indicates it was blocked. Adjust to the
|
|
+ # exact wording if needed.
|
|
+ combined_messages = " ".join(str(m.message) for m in err_msgs)
|
|
+ assert "Zip Slip" in combined_messages and "blocked" in combined_messages
|
|
+
|
|
+ # The escaping path must not have been written.
|
|
+ assert not outside_target.exists()
|
|
+
|
|
+ # The safe entry should still be extracted under the root.
|
|
+ assert (extract_root / "pkg" / "good.txt").read_bytes() == b"ok"
|
|
+
|
|
+ @pytest.mark.skipif(
|
|
+ sys.platform.startswith("win"),
|
|
+ reason="Absolute POSIX paths are not meaningful on Windows",
|
|
+ )
|
|
+ def test_zip_slip_with_absolute_posix_path_is_blocked(self, tmp_path: Path) -> None:
|
|
+ """
|
|
+ An entry with an absolute POSIX path (e.g. ``/tmp/evil``) must not be
|
|
+ extracted as-is; it should not overwrite arbitrary filesystem paths,
|
|
+ and should result in an ErrorMessage.
|
|
+
|
|
+ On the old implementation (extractall(root)), this test will fail
|
|
+ because the absolute path gets created without any ErrorMessage.
|
|
+ """
|
|
+ zip_path = tmp_path / "zip_slip_abs_posix.zip"
|
|
+ extract_root = tmp_path / "extract"
|
|
+
|
|
+ # Choose a unique location in /tmp to avoid interfering with real files.
|
|
+ absolute_target = Path("/tmp") / f"nltk_zip_slip_test_{os.getpid()}"
|
|
+
|
|
+ try:
|
|
+ members = {
|
|
+ "pkg/good.txt": b"ok",
|
|
+ # This absolute path should never be created by the secure
|
|
+ # implementation.
|
|
+ str(absolute_target): b"evil",
|
|
+ }
|
|
+ _make_zip(zip_path, members)
|
|
+
|
|
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
|
|
+
|
|
+ err_msgs = [m for m in messages if isinstance(m, ErrorMessage)]
|
|
+ assert (
|
|
+ err_msgs
|
|
+ ), "Expected an ErrorMessage for absolute-path Zip-Slip attempt"
|
|
+
|
|
+ combined_messages = " ".join(str(m.message) for m in err_msgs)
|
|
+ assert "Zip Slip" in combined_messages and "blocked" in combined_messages
|
|
+
|
|
+ # Absolute path must not be created by the secure implementation.
|
|
+ assert not absolute_target.exists()
|
|
+
|
|
+ # Safe entry must still be extracted under extract_root.
|
|
+ assert (extract_root / "pkg" / "good.txt").read_bytes() == b"ok"
|
|
+ finally:
|
|
+ # Best-effort cleanup if the implementation under test behaves
|
|
+ # incorrectly and creates the file.
|
|
+ if absolute_target.exists():
|
|
+ try:
|
|
+ absolute_target.unlink()
|
|
+ except OSError:
|
|
+ # Ignore cleanup errors: file may not exist or be locked.
|
|
+ pass
|
|
+
|
|
+ def test_entries_resolved_outside_root_are_blocked_via_symlink(
|
|
+ self, tmp_path: Path
|
|
+ ) -> None:
|
|
+ """
|
|
+ If there is a pre-existing symlink below the extraction root that
|
|
+ points outside the root, writing through that symlink should not
|
|
+ be allowed to escape the root.
|
|
+
|
|
+ This test documents the desired hardening to defend against this
|
|
+ class of attacks.
|
|
+ """
|
|
+ if not hasattr(os, "symlink"):
|
|
+ pytest.skip("Symlinks not supported on this platform")
|
|
+
|
|
+ zip_path = tmp_path / "zip_slip_symlink.zip"
|
|
+ extract_root = tmp_path / "extract"
|
|
+ outside_dir = tmp_path / "outside_dir"
|
|
+ outside_dir.mkdir()
|
|
+ outside_target = outside_dir / "evil.txt"
|
|
+
|
|
+ members = {
|
|
+ "pkg/good.txt": b"ok",
|
|
+ "dir_link/evil.txt": b"evil",
|
|
+ }
|
|
+ _make_zip(zip_path, members)
|
|
+
|
|
+ extract_root.mkdir()
|
|
+ # Pre-existing symlink inside extract_root pointing *outside* it.
|
|
+ os.symlink(outside_dir, extract_root / "dir_link")
|
|
+
|
|
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
|
|
+
|
|
+ # Desired property:
|
|
+ assert not outside_target.exists()
|
|
+ assert (extract_root / "pkg" / "good.txt").read_bytes() == b"ok"
|
|
+ # Should yield an ErrorMessage for the blocked symlink escape
|
|
+ assert any(isinstance(m, ErrorMessage) for m in messages)
|
|
+
|
|
+ def test_bad_zipfile_yields_errormessage(self, tmp_path: Path) -> None:
|
|
+ """
|
|
+ A corrupt or non-zip file should cause _unzip_iter to yield an
|
|
+ ErrorMessage instead of raising an unhandled exception.
|
|
+ """
|
|
+ zip_path = tmp_path / "not_a_zip.txt"
|
|
+ zip_path.write_bytes(b"this is not a zip archive")
|
|
+ extract_root = tmp_path / "extract"
|
|
+
|
|
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
|
|
+
|
|
+ assert any(isinstance(m, ErrorMessage) for m in messages)
|
|
+
|
|
+ # If the implementation chooses to create the root directory at all,
|
|
+ # it should not leave partially extracted content.
|
|
+ if extract_root.exists():
|
|
+ assert not any(extract_root.iterdir())
|
|
+
|
|
+ def test_unzip_iter_verbose_writes_to_stdout(self, capsys, tmp_path: Path) -> None:
|
|
+ """
|
|
+ When verbose=True, _unzip_iter should write a status line to stdout.
|
|
+ This checks that existing user-visible behaviour is preserved.
|
|
+ """
|
|
+ zip_path = tmp_path / "verbose.zip"
|
|
+ extract_root = tmp_path / "extract"
|
|
+
|
|
+ members = {"pkg/file.txt": b"data"}
|
|
+ _make_zip(zip_path, members)
|
|
+
|
|
+ _run_unzip_iter(zip_path, extract_root, verbose=True)
|
|
+ captured = capsys.readouterr()
|
|
+ assert "Unzipping" in captured.out
|