From 9043edabc7e2f0dd655146e0a4571e2a0b2906af Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 13 Jun 2025 19:57:48 +0300 Subject: [PATCH] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) End-of-file errors are now handled according to the HTML5 specs -- comments and declarations are automatically closed, tags are ignored. (cherry picked from commit 6eb6c5dbfb528bd07d77b60fd71fd05d81d45c41) Co-authored-by: Serhiy Storchaka --- Lib/html/parser.py | 41 +++- Lib/test/test_htmlparser.py | 97 +++++++--- Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst | 4 3 files changed, 111 insertions(+), 31 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst Index: Python-3.13.5/Lib/html/parser.py =================================================================== --- Python-3.13.5.orig/Lib/html/parser.py 2025-06-11 17:36:57.000000000 +0200 +++ Python-3.13.5/Lib/html/parser.py 2025-07-02 16:49:52.020175099 +0200 @@ -27,6 +27,7 @@ attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?') starttagopen = re.compile('<[a-zA-Z]') +endtagopen = re.compile('') commentclose = re.compile(r'--\s*>') # Note: @@ -195,7 +196,7 @@ k = self.parse_pi(i) elif startswith("', i + 1) - if k < 0: - k = rawdata.find('<', i + 1) - if k < 0: - k = i + 1 + if starttagopen.match(rawdata, i): # < + letter + pass + elif startswith("'), - ('comment', '/img'), - ('endtag', 'html<')]) + ('data', '\n')]) def test_starttag_junk_chars(self): + self._run_check("<", [('data', '<')]) + self._run_check("<>", [('data', '<>')]) + self._run_check("< >", [('data', '< >')]) + self._run_check("< ", [('data', '< ')]) self._run_check("", []) + self._run_check("<$>", [('data', '<$>')]) self._run_check("", [('comment', '$')]) self._run_check("", [('endtag', 'a')]) + self._run_check("", [('starttag', 'a", [('endtag', 'a'", [('data', "'", []) + self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) + self._run_check("", [('endtag', 'a$b')]) def test_slashes_in_starttag(self): self._run_check('', [('startendtag', 'a', [('foo', 'var')])]) @@ -576,21 +583,50 @@ for html, expected in data: self._run_check(html, expected) - def test_EOF_in_comments_or_decls(self): + def test_eof_in_comments(self): data = [ - ('', [('comment', '-!>')]), + ('' '' '' @@ -604,6 +640,7 @@ '' # required '[' after CDATA ) expected = [ + ('comment', 'ELEMENT br EMPTY'), ('comment', ' not really a comment '), ('comment', ' not a comment either --'), ('comment', ' -- close enough --'), @@ -684,6 +721,26 @@ ('endtag', 'a'), ('data', ' bar & baz')] ) + @support.requires_resource('cpu') + def test_eof_no_quadratic_complexity(self): + # Each of these examples used to take about an hour. + # Now they take a fraction of a second. + def check(source): + parser = html.parser.HTMLParser() + parser.feed(source) + parser.close() + n = 120_000 + check("