From 9043edabc7e2f0dd655146e0a4571e2a0b2906af Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 13 Jun 2025 19:57:48 +0300 Subject: [PATCH] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) End-of-file errors are now handled according to the HTML5 specs -- comments and declarations are automatically closed, tags are ignored. (cherry picked from commit 6eb6c5dbfb528bd07d77b60fd71fd05d81d45c41) Co-authored-by: Serhiy Storchaka --- Lib/html/parser.py | 41 +++++--- Lib/test/test_htmlparser.py | 51 +++++++--- Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst | 4 3 files changed, 74 insertions(+), 22 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst Index: Python-3.11.13/Lib/html/parser.py =================================================================== --- Python-3.11.13.orig/Lib/html/parser.py 2025-07-02 18:12:07.084569398 +0200 +++ Python-3.11.13/Lib/html/parser.py 2025-07-02 18:12:12.582519793 +0200 @@ -25,6 +25,7 @@ charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') +endtagopen = re.compile('') commentclose = re.compile(r'--\s*>') # Note: @@ -176,7 +177,7 @@ k = self.parse_pi(i) elif startswith("', i + 1) - if k < 0: - k = rawdata.find('<', i + 1) - if k < 0: - k = i + 1 + if starttagopen.match(rawdata, i): # < + letter + pass + elif startswith("'), - ('comment', '/img'), - ('endtag', 'html<')]) + ('data', '\n')]) def test_starttag_junk_chars(self): + self._run_check("<", [('data', '<')]) + self._run_check("<>", [('data', '<>')]) + self._run_check("< >", [('data', '< >')]) + self._run_check("< ", [('data', '< ')]) self._run_check("", []) + self._run_check("<$>", [('data', '<$>')]) self._run_check("", [('comment', '$')]) self._run_check("", [('endtag', 'a')]) + self._run_check("", [('starttag', 'a", [('endtag', 'a'", [('data', "'", []) + self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) + self._run_check("", [('endtag', 'a$b')]) def test_slashes_in_starttag(self): self._run_check('', [('startendtag', 'a', [('foo', 'var')])]) @@ -549,8 +557,9 @@ ('comment', ' -- close enough --'), ('comment', ''), ('comment', '<-- this was an empty comment'), - ('comment', '!! another bogus comment !!!'), + ('comment', '!! another bogus comment !!!') ] + self._run_check(html, expected) def test_broken_condcoms(self): @@ -598,6 +607,26 @@ ('endtag', 'a'), ('data', ' bar & baz')] ) + @support.requires_resource('cpu') + def test_eof_no_quadratic_complexity(self): + # Each of these examples used to take about an hour. + # Now they take a fraction of a second. + def check(source): + parser = html.parser.HTMLParser() + parser.feed(source) + parser.close() + n = 120_000 + check("