149 lines
6.4 KiB
Diff
149 lines
6.4 KiB
Diff
|
|
From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001
|
|||
|
|
From: Leonard Richardson <leonardr@segfault.org>
|
|||
|
|
Date: Mon, 8 Dec 2025 19:34:16 -0500
|
|||
|
|
Subject: * Change the html.parser tree builder's code for handling numeric
|
|||
|
|
character references, to avoid a crash when using Python versions that
|
|||
|
|
include the fix to Python issue https://bugs.python.org/issue13633 (e.g.
|
|||
|
|
Python 3.11.13). [bug=2134393]
|
|||
|
|
|
|||
|
|
---
|
|||
|
|
CHANGELOG | 5 +++
|
|||
|
|
bs4/builder/_htmlparser.py | 78 ++++++++++++++++++++++++++++++++++++--------
|
|||
|
|
bs4/tests/test_htmlparser.py | 17 ++++++++++
|
|||
|
|
3 files changed, 86 insertions(+), 14 deletions(-)
|
|||
|
|
|
|||
|
|
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
|
|||
|
|
index 165a3d8..ead800f 100644
|
|||
|
|
--- a/bs4/builder/_htmlparser.py
|
|||
|
|
+++ b/bs4/builder/_htmlparser.py
|
|||
|
|
@@ -10,6 +10,7 @@ __all__ = [
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
from html.parser import HTMLParser
|
|||
|
|
+import re
|
|||
|
|
|
|||
|
|
from typing import (
|
|||
|
|
Any,
|
|||
|
|
@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
|||
|
|
"""Handle some textual data that shows up between tags."""
|
|||
|
|
self.soup.handle_data(data)
|
|||
|
|
|
|||
|
|
+ _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9]+)(.*)")
|
|||
|
|
+ _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9a-f]+)(.*)")
|
|||
|
|
+
|
|||
|
|
+ @classmethod
|
|||
|
|
+ def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]:
|
|||
|
|
+ """Convert a numeric character reference into an actual character.
|
|||
|
|
+
|
|||
|
|
+ :param name: The number of the character reference, as
|
|||
|
|
+ obtained by html.parser
|
|||
|
|
+
|
|||
|
|
+ :return: A 3-tuple (dereferenced, replacement_added,
|
|||
|
|
+ extra_data). `dereferenced` is the dereferenced character
|
|||
|
|
+ reference, or the empty string if there was no
|
|||
|
|
+ reference. `replacement_added` is True if the reference
|
|||
|
|
+ could only be dereferenced by replacing content with U+FFFD
|
|||
|
|
+ REPLACEMENT CHARACTER. `extra_data` is a portion of data
|
|||
|
|
+ following the character reference, which was deemed to be
|
|||
|
|
+ normal data and not part of the reference at all.
|
|||
|
|
+ """
|
|||
|
|
+ dereferenced:str = ""
|
|||
|
|
+ replacement_added:bool = False
|
|||
|
|
+ extra_data:str = ""
|
|||
|
|
+
|
|||
|
|
+ base:int = 10
|
|||
|
|
+ reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA
|
|||
|
|
+ if name.startswith("x") or name.startswith("X"):
|
|||
|
|
+ # Hex reference
|
|||
|
|
+ name = name[1:]
|
|||
|
|
+ base = 16
|
|||
|
|
+ reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA
|
|||
|
|
+
|
|||
|
|
+ real_name:Optional[int] = None
|
|||
|
|
+ try:
|
|||
|
|
+ real_name = int(name, base)
|
|||
|
|
+ except ValueError:
|
|||
|
|
+ # This is either bad data that starts with what looks like
|
|||
|
|
+ # a numeric character reference, or a real numeric
|
|||
|
|
+ # reference that wasn't terminated by a semicolon.
|
|||
|
|
+ #
|
|||
|
|
+ # The fix to https://bugs.python.org/issue13633 made it
|
|||
|
|
+ # our responsibility to handle the extra data.
|
|||
|
|
+ #
|
|||
|
|
+ # To preserve the old behavior, we extract the numeric
|
|||
|
|
+ # portion of the incoming "reference" and treat that as a
|
|||
|
|
+ # numeric reference. All subsequent data will be processed
|
|||
|
|
+ # as string data.
|
|||
|
|
+ match = reg.search(name)
|
|||
|
|
+ if match is not None:
|
|||
|
|
+ real_name = int(match.groups()[0], base)
|
|||
|
|
+ extra_data = match.groups()[1]
|
|||
|
|
+
|
|||
|
|
+ if real_name is None:
|
|||
|
|
+ dereferenced = ""
|
|||
|
|
+ extra_data = name
|
|||
|
|
+ else:
|
|||
|
|
+ dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
|
|||
|
|
+ return dereferenced, replacement_added, extra_data
|
|||
|
|
+
|
|||
|
|
def handle_charref(self, name: str) -> None:
|
|||
|
|
"""Handle a numeric character reference by converting it to the
|
|||
|
|
corresponding Unicode character and treating it as textual
|
|||
|
|
@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
|||
|
|
|
|||
|
|
:param name: Character number, possibly in hexadecimal.
|
|||
|
|
"""
|
|||
|
|
- # TODO: This was originally a workaround for a bug in
|
|||
|
|
- # HTMLParser. (http://bugs.python.org/issue13633) The bug has
|
|||
|
|
- # been fixed, but removing this code still makes some
|
|||
|
|
- # Beautiful Soup tests fail. This needs investigation.
|
|||
|
|
- real_name:int
|
|||
|
|
- if name.startswith("x"):
|
|||
|
|
- real_name = int(name.lstrip("x"), 16)
|
|||
|
|
- elif name.startswith("X"):
|
|||
|
|
- real_name = int(name.lstrip("X"), 16)
|
|||
|
|
- else:
|
|||
|
|
- real_name = int(name)
|
|||
|
|
-
|
|||
|
|
- data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
|
|||
|
|
+ dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name)
|
|||
|
|
if replacement_added:
|
|||
|
|
self.soup.contains_replacement_characters = True
|
|||
|
|
- self.handle_data(data)
|
|||
|
|
+ if dereferenced is not None:
|
|||
|
|
+ self.handle_data(dereferenced)
|
|||
|
|
+ if extra_data is not None:
|
|||
|
|
+ self.handle_data(extra_data)
|
|||
|
|
|
|||
|
|
def handle_entityref(self, name: str) -> None:
|
|||
|
|
"""Handle a named entity reference by converting it to the
|
|||
|
|
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
|
|||
|
|
index 0086a9d..cb85b53 100644
|
|||
|
|
--- a/bs4/tests/test_htmlparser.py
|
|||
|
|
+++ b/bs4/tests/test_htmlparser.py
|
|||
|
|
@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
|
|||
|
|
# Since we do the replacement ourselves, we can set contains_replacement_characters appropriately.
|
|||
|
|
# lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER.
|
|||
|
|
assert soup.contains_replacement_characters == True
|
|||
|
|
+
|
|||
|
|
+class TestBeautifulSoupHTMLParser:
|
|||
|
|
+ def test_dereference_numeric_character_reference(self):
|
|||
|
|
+ m = BeautifulSoupHTMLParser._dereference_numeric_character_reference
|
|||
|
|
+ assert m("64") == ("@", False, "")
|
|||
|
|
+ assert m("x64") == ("d", False, "")
|
|||
|
|
+ assert m("X64") == ("d", False, "")
|
|||
|
|
+ assert m("64andsomeextra") == ("@", False, "andsomeextra")
|
|||
|
|
+ assert m("") == ("", False, "")
|
|||
|
|
+ assert m("00whee") == ("<22>", True, "whee")
|
|||
|
|
+ assert m("xfffdthatsit") == ("<22>", False, "thatsit")
|
|||
|
|
+ assert m("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra")
|
|||
|
|
+ assert m("obviouslynotnumeric") == ("", False, "obviouslynotnumeric")
|
|||
|
|
+
|
|||
|
|
+ # These are almost certainly wrong but at least it doesn't crash.
|
|||
|
|
+ assert m("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra")
|
|||
|
|
+ assert m("xffffffffffffffffffffffbeep") == ("<22>", True, "p")
|
|||
|
|
--
|
|||
|
|
cgit v1.2.3
|
|||
|
|
|
|||
|
|
|