diff --git a/CVE-2026-28348.patch b/CVE-2026-28348.patch new file mode 100644 index 0000000..fcbf82a --- /dev/null +++ b/CVE-2026-28348.patch @@ -0,0 +1,110 @@ +From 2ef732667ddbc74ea59847bcf24b75809aaeed3b Mon Sep 17 00:00:00 2001 +From: Lumir Balhar +Date: Wed, 25 Feb 2026 22:35:58 +0100 +Subject: [PATCH] Implement unicode escape decoding + +Unicode escapes in CSS were not properly decoded before security +checks. This prevents attackers from bypassing filters using +escape sequences. +--- + CHANGES.rst | 7 ++++++ + lxml_html_clean/clean.py | 22 +++++++++++++++++- + tests/test_clean.py | 48 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 76 insertions(+), 1 deletion(-) + +diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py +index 3eeda47..5424d9f 100644 +--- a/lxml_html_clean/clean.py ++++ b/lxml_html_clean/clean.py +@@ -578,6 +578,26 @@ def _remove_javascript_link(self, link): + _comments_re = re.compile(r'/\*.*?\*/', re.S) + _find_comments = _comments_re.finditer + _substitute_comments = _comments_re.sub ++ _css_unicode_escape_re = re.compile(r'\\([0-9a-fA-F]{1,6})\s?') ++ ++ def _decode_css_unicode_escapes(self, style): ++ """ ++ Decode CSS Unicode escape sequences like \\69 or \\000069 to their ++ actual character values. This prevents bypassing security checks ++ using CSS escape sequences. ++ ++ CSS escape syntax: backslash followed by 1-6 hex digits, ++ optionally followed by a whitespace character. ++ """ ++ def replace_escape(match): ++ hex_value = match.group(1) ++ try: ++ return chr(int(hex_value, 16)) ++ except (ValueError, OverflowError): ++ # Invalid unicode codepoint, keep original ++ return match.group(0) ++ ++ return self._css_unicode_escape_re.sub(replace_escape, style) + + def _has_sneaky_javascript(self, style): + """ +@@ -591,7 +611,7 @@ def _has_sneaky_javascript(self, style): + more sneaky attempts. 
+ """ + style = self._substitute_comments('', style) +- style = style.replace('\\', '') ++ style = self._decode_css_unicode_escapes(style) + style = _substitute_whitespace('', style) + style = style.lower() + if _has_javascript_scheme(style): +diff --git a/tests/test_clean.py b/tests/test_clean.py +index 64ad52d..d1ebcb1 100644 +--- a/tests/test_clean.py ++++ b/tests/test_clean.py +@@ -393,3 +393,51 @@ def test_possibly_invalid_url_without_whitelist(self): + self.assertEqual(len(w), 0) + self.assertNotIn("google.com", result) + self.assertNotIn("example.com", result) ++ ++ def test_unicode_escape_in_style(self): ++ # Test that CSS Unicode escapes are properly decoded before security checks ++ # This prevents attackers from bypassing filters using escape sequences ++ # CSS escape syntax: \HHHHHH where H is a hex digit (1-6 digits) ++ ++ # Test inline style attributes (requires safe_attrs_only=False) ++ cleaner = Cleaner(safe_attrs_only=False) ++ inline_style_cases = [ ++ # \6a\61\76\61\73\63\72\69\70\74 = "javascript" ++ ('
test
', '
test
'), ++ # \69 = 'i', so \69mport = "import" ++ ('
test
', '
test
'), ++ # \69 with space after = 'i', space consumed as part of escape ++ ('
test
', '
test
'), ++ # \65\78\70\72\65\73\73\69\6f\6e = "expression" ++ ('
test
', '
test
'), ++ ] ++ ++ for html, expected in inline_style_cases: ++ with self.subTest(html=html): ++ cleaned = cleaner.clean_html(html) ++ self.assertEqual(expected, cleaned) ++ ++ # Test ', ++ # Unicode-escaped "javascript:" without url() ++ '', ++ # Unicode-escaped "expression" ++ '', ++ # Unicode-escaped @import with 'i' ++ '', ++ # Unicode-escaped "data:" scheme ++ '', ++ # Space after escape is consumed: \69 mport = "import" ++ '', ++ # 6-digit escape: \000069 = 'i' ++ '', ++ # 6-digit escape with space ++ '', ++ ] ++ ++ for html in style_tag_cases: ++ with self.subTest(html=html): ++ cleaned = clean_html(html) ++ self.assertEqual('
', cleaned) diff --git a/CVE-2026-28350.patch b/CVE-2026-28350.patch new file mode 100644 index 0000000..a05ba2d --- /dev/null +++ b/CVE-2026-28350.patch @@ -0,0 +1,91 @@ +From 9c5612ca33b941eec4178abf8a5294b103403f34 Mon Sep 17 00:00:00 2001 +From: Lumir Balhar +Date: Wed, 25 Feb 2026 22:57:28 +0100 +Subject: [PATCH] Remove <base> tags to prevent URL hijacking attacks + +<base> tags are now automatically removed whenever <head> is removed to +prevent URL hijacking attacks. According to HTML spec, <base> must be in +<head>, but browsers may interpret misplaced <base> tags, allowing +attackers to redirect all relative URLs to malicious servers. +--- + CHANGES.rst | 5 +++++ + lxml_html_clean/clean.py | 6 +++++ + tests/test_clean.py | 48 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 59 insertions(+) + +diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py +index 5424d9f..6f95b26 100644 +--- a/lxml_html_clean/clean.py ++++ b/lxml_html_clean/clean.py +@@ -422,6 +422,12 @@ def __call__(self, doc): + if self.annoying_tags: + remove_tags.update(('blink', 'marquee')) + ++ # Remove <base> tags whenever <head> is being removed. ++ # According to HTML spec, <base> must be in <head>, but browsers ++ # may interpret it even when misplaced, allowing URL hijacking attacks. ++ if 'head' in kill_tags or 'head' in remove_tags: ++ kill_tags.add('base') ++ + _remove = deque() + _kill = deque() + for el in doc.iter(): +diff --git a/tests/test_clean.py b/tests/test_clean.py +index d1ebcb1..93f6da1 100644 +--- a/tests/test_clean.py ++++ b/tests/test_clean.py +@@ -394,6 +394,54 @@ def test_possibly_invalid_url_without_whitelist(self): + self.assertNotIn("google.com", result) + self.assertNotIn("example.com", result) + ++ def test_base_tag_removed_with_page_structure(self): ++ # Test that <base> tags are removed when page_structure=True (default) ++ # This prevents URL hijacking attacks where <base> redirects all relative URLs ++ ++ test_cases = [ ++ # <base> in proper location (inside <head>) ++ 'link', ++ # <base> outside <head> ++ '
link
', ++ # Multiple <base> tags ++ '
', ++ # <base> with target attribute ++ '
content
', ++ # <base> at various positions ++ 'test', ++ ] ++ ++ for html in test_cases: ++ with self.subTest(html=html): ++ cleaned = clean_html(html) ++ # Verify <base> tag is completely removed ++ self.assertNotIn('base', cleaned.lower()) ++ self.assertNotIn('evil.com', cleaned) ++ self.assertNotIn('evil2.com', cleaned) ++ ++ def test_base_tag_kept_when_page_structure_false(self): ++ # When page_structure=False and head is not removed, <base> should be kept ++ cleaner = Cleaner(page_structure=False) ++ html = 'test' ++ cleaned = cleaner.clean_html(html) ++ self.assertIn('', cleaned) ++ ++ def test_base_tag_removed_when_head_in_remove_tags(self): ++ # Even with page_structure=False, <base> should be removed if head is manually removed ++ cleaner = Cleaner(page_structure=False, remove_tags=['head']) ++ html = 'test' ++ cleaned = cleaner.clean_html(html) ++ self.assertNotIn('base', cleaned.lower()) ++ self.assertNotIn('evil.com', cleaned) ++ ++ def test_base_tag_removed_when_head_in_kill_tags(self): ++ # Even with page_structure=False, <base> should be removed if head is in kill_tags ++ cleaner = Cleaner(page_structure=False, kill_tags=['head']) ++ html = 'test' ++ cleaned = cleaner.clean_html(html) ++ self.assertNotIn('base', cleaned.lower()) ++ self.assertNotIn('evil.com', cleaned) ++ + def test_unicode_escape_in_style(self): + # Test that CSS Unicode escapes are properly decoded before security checks + # This prevents attackers from bypassing filters using escape sequences diff --git a/python-lxml_html_clean.changes b/python-lxml_html_clean.changes index 22d20a2..5da785c 100644 --- a/python-lxml_html_clean.changes +++ b/python-lxml_html_clean.changes @@ -1,3 +1,13 @@ +------------------------------------------------------------------- +Tue Mar 10 12:59:11 UTC 2026 - Nico Krapp + +- CVE-2026-28348: improper keywords checking can allow external CSS loading + (bsc#1259378) + * added CVE-2026-28348.patch +- CVE-2026-28350: lack of base tag handling can allow the hijacking of the + resolution of relative
URLs (bsc#1259379) + * added CVE-2026-28350.patch + ------------------------------------------------------------------- Fri Apr 11 20:57:19 UTC 2025 - Dirk Müller diff --git a/python-lxml_html_clean.spec b/python-lxml_html_clean.spec index 5b5b953..e58348b 100644 --- a/python-lxml_html_clean.spec +++ b/python-lxml_html_clean.spec @@ -26,6 +26,10 @@ License: BSD-3-Clause Group: Development/Languages/Python URL: https://github.com/fedora-python/lxml_html_clean/ Source: https://files.pythonhosted.org/packages/source/l/lxml-html-clean/lxml_html_clean-%{version}.tar.gz +# PATCH-FIX-UPSTREAM CVE-2026-28348.patch bsc#1259378 gh#fedora-python/lxml_html_clean@2ef7326 +Patch1: CVE-2026-28348.patch +# PATCH-FIX-UPSTREAM CVE-2026-28350.patch bsc#1259379 gh#fedora-python/lxml_html_clean@9c5612c +Patch2: CVE-2026-28350.patch BuildRequires: %{python_module base >= 3.6} BuildRequires: %{python_module pip} BuildRequires: %{python_module setuptools >= 61.0}