* When using one of the lxml tree builders, you can pass in
huge_tree=True to disable lxml's security restrictions and process
files that include huge text nodes. ("huge" means more than
10,000,000 bytes of text in a single node). Without this, lxml may
silently stop processing the file after encountering a huge text
node.
* The html.parser tree builder processes numeric character entities
using the algorithm described in the HTML spec. If this means
replacing some other character with REPLACEMENT CHARACTER, it will
set BeautifulSoup.contains_replacement_characters.
* Added a general test of the html.parser tree builder's ability to
turn any parsing exception from html.parser into a
ParserRejectedMarkup exception. This makes it possible to remove
version-dependent tests that depended on the existence of specific
bugs in html.parser.
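The huge_tree entry above refers to a keyword argument that gets forwarded to the lxml tree builders. A minimal sketch of the intended usage, assuming an lxml install and a synthetic oversized text node (the markup and variable names are illustrative, not from this package):

    from bs4 import BeautifulSoup

    # A single text node past lxml's default 10,000,000-byte limit.
    markup = "<p>" + ("x" * 11_000_000) + "</p>"

    # Without huge_tree=True, lxml may silently stop at the huge node.
    soup = BeautifulSoup(markup, "lxml", huge_tree=True)
    print(len(soup.p.string))  # expected to be about 11,000,000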
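Similarly, the contains_replacement_characters entry describes a flag on the resulting soup object. A rough illustration, assuming the html.parser builder substitutes U+FFFD for an invalid numeric reference as described above:

    from bs4 import BeautifulSoup

    # &#0; has no valid character, so the HTML spec algorithm substitutes
    # U+FFFD REPLACEMENT CHARACTER and the builder records that it did so.
    soup = BeautifulSoup("<p>bad reference: &#0;</p>", "html.parser")
    assert soup.contains_replacement_characters
    assert "\N{REPLACEMENT CHARACTER}" in soup.p.get_text()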
- Add htmlparser.patch to fix a crash in the html.parser tree builder on CPython interpreters that include the fix for https://bugs.python.org/issue13633 (e.g. Python 3.11.13)
OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-beautifulsoup4?expand=0&rev=106
From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Mon, 8 Dec 2025 19:34:16 -0500
Subject: * Change the html.parser tree builder's code for handling numeric
 character references, to avoid a crash when using Python versions that
 include the fix to Python issue https://bugs.python.org/issue13633 (e.g.
 Python 3.11.13). [bug=2134393]

---
 CHANGELOG                    |  5 +++
 bs4/builder/_htmlparser.py   | 78 ++++++++++++++++++++++++++++++++++++--------
 bs4/tests/test_htmlparser.py | 17 ++++++++++
 3 files changed, 86 insertions(+), 14 deletions(-)

diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 165a3d8..ead800f 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -10,6 +10,7 @@ __all__ = [
 ]
 
 from html.parser import HTMLParser
+import re
 
 from typing import (
     Any,
@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
         """Handle some textual data that shows up between tags."""
         self.soup.handle_data(data)
 
+    _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9]+)(.*)")
+    _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9a-f]+)(.*)")
+
+    @classmethod
+    def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]:
+        """Convert a numeric character reference into an actual character.
+
+        :param name: The number of the character reference, as
+          obtained by html.parser
+
+        :return: A 3-tuple (dereferenced, replacement_added,
+          extra_data). `dereferenced` is the dereferenced character
+          reference, or the empty string if there was no
+          reference. `replacement_added` is True if the reference
+          could only be dereferenced by replacing content with U+FFFD
+          REPLACEMENT CHARACTER. `extra_data` is a portion of data
+          following the character reference, which was deemed to be
+          normal data and not part of the reference at all.
+        """
+        dereferenced:str = ""
+        replacement_added:bool = False
+        extra_data:str = ""
+
+        base:int = 10
+        reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA
+        if name.startswith("x") or name.startswith("X"):
+            # Hex reference
+            name = name[1:]
+            base = 16
+            reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA
+
+        real_name:Optional[int] = None
+        try:
+            real_name = int(name, base)
+        except ValueError:
+            # This is either bad data that starts with what looks like
+            # a numeric character reference, or a real numeric
+            # reference that wasn't terminated by a semicolon.
+            #
+            # The fix to https://bugs.python.org/issue13633 made it
+            # our responsibility to handle the extra data.
+            #
+            # To preserve the old behavior, we extract the numeric
+            # portion of the incoming "reference" and treat that as a
+            # numeric reference. All subsequent data will be processed
+            # as string data.
+            match = reg.search(name)
+            if match is not None:
+                real_name = int(match.groups()[0], base)
+                extra_data = match.groups()[1]
+
+        if real_name is None:
+            dereferenced = ""
+            extra_data = name
+        else:
+            dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
+        return dereferenced, replacement_added, extra_data
+
     def handle_charref(self, name: str) -> None:
         """Handle a numeric character reference by converting it to the
         corresponding Unicode character and treating it as textual
@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
 
         :param name: Character number, possibly in hexadecimal.
         """
-        # TODO: This was originally a workaround for a bug in
-        # HTMLParser. (http://bugs.python.org/issue13633) The bug has
-        # been fixed, but removing this code still makes some
-        # Beautiful Soup tests fail. This needs investigation.
-        real_name:int
-        if name.startswith("x"):
-            real_name = int(name.lstrip("x"), 16)
-        elif name.startswith("X"):
-            real_name = int(name.lstrip("X"), 16)
-        else:
-            real_name = int(name)
-
-        data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
+        dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name)
         if replacement_added:
             self.soup.contains_replacement_characters = True
-        self.handle_data(data)
+        if dereferenced is not None:
+            self.handle_data(dereferenced)
+        if extra_data is not None:
+            self.handle_data(extra_data)
 
     def handle_entityref(self, name: str) -> None:
         """Handle a named entity reference by converting it to the
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 0086a9d..cb85b53 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
         # Since we do the replacement ourselves, we can set contains_replacement_characters appropriately.
         # lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER.
         assert soup.contains_replacement_characters == True
+
+class TestBeautifulSoupHTMLParser:
+    def test_dereference_numeric_character_reference(self):
+        m = BeautifulSoupHTMLParser._dereference_numeric_character_reference
+        assert m("64") == ("@", False, "")
+        assert m("x64") == ("d", False, "")
+        assert m("X64") == ("d", False, "")
+        assert m("64andsomeextra") == ("@", False, "andsomeextra")
+        assert m("") == ("", False, "")
+        assert m("00whee") == ("�", True, "whee")
+        assert m("xfffdthatsit") == ("�", False, "thatsit")
+        assert m("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra")
+        assert m("obviouslynotnumeric") == ("", False, "obviouslynotnumeric")
+
+        # These are almost certainly wrong but at least it doesn't crash.
+        assert m("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra")
+        assert m("xffffffffffffffffffffffbeep") == ("�", True, "p")
--
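For quick experimentation outside the test suite, the helper added by the patch can be called directly; a small sketch assuming the patched bs4 is importable:

    from bs4.builder._htmlparser import BeautifulSoupHTMLParser

    # The numeric prefix is dereferenced (64 -> "@"); the trailing junk is
    # returned separately so handle_charref() can emit it as ordinary data.
    print(BeautifulSoupHTMLParser._dereference_numeric_character_reference("64andsomeextra"))
    # -> ('@', False, 'andsomeextra')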