Files
python314/CVE-2025-4516-DecodeError-handler.patch
Matej Cepl 4bde241d1c - Update to 3.14.0~rc2:
- Library
    - gh-137426: Remove the code deprecation of
      importlib.abc.ResourceLoader. It is documented as
      deprecated, but left for backwards compatibility with other
      classes in importlib.abc.
    - gh-137282: Fix tab completion and dir() on
      concurrent.futures.
    - gh-137257: Bump the version of pip bundled in ensurepip to
      version 25.2
    - gh-137226: Fix behavior of
      annotationlib.ForwardRef.evaluate() when the type_params
      parameter is passed and the name of a type param is also
      present in an enclosing scope.
    - gh-130522: Fix unraisable TypeError raised during
      interpreter shutdown in the threading module.
    - gh-137059: Fix handling of file URLs with a
      Windows drive letter in the URL authority by
      urllib.request.url2pathname(). This fixes a regression in
      earlier pre-releases of Python 3.14.
    - gh-130577: tarfile now validates archives to ensure member
      offsets are non-negative. (Contributed by Alexander Enrique
      Urieles Nieto in gh-130577; CVE-2025-8194, bsc#1247249).
    - gh-135228: When dataclasses replaces a class with a slotted
      dataclass, the original class can now be garbage collected
      again. Earlier changes in Python 3.14 caused this class to
      always remain in existence together with the replacement
      class synthesized by dataclasses.
  - Documentation
    - gh-136155: We are now checking for fatal errors in EPUB

OBS-URL: https://build.opensuse.org/package/show/devel:languages:python:Factory/python314?expand=0&rev=91
2025-08-15 14:16:06 +00:00

481 lines
23 KiB
Diff

From 3a939ff2298d147459116f98a09549d0f1954039 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Tue, 4 Feb 2025 11:44:39 +0200
Subject: Fix use-after-free in the unicode-escape decoder with error handler
If the error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer to the decoded data will became invalid
after destroying that temporary bytes object. So we need other way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().
_PyBytes_DecodeEscape() does not have such issue, because it does not
use the error handlers registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
---
Include/internal/pycore_bytesobject.h | 5
Include/internal/pycore_unicodeobject.h | 12 +-
Lib/test/test_codeccallbacks.py | 39 +++++++
Lib/test/test_codecs.py | 52 ++++++++--
Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst | 2
Objects/bytesobject.c | 41 ++++---
Objects/unicodeobject.c | 46 +++++---
Parser/string_parser.c | 26 +++--
8 files changed, 160 insertions(+), 63 deletions(-)
Index: Python-3.14.0b1/Include/internal/pycore_bytesobject.h
===================================================================
--- Python-3.14.0b1.orig/Include/internal/pycore_bytesobject.h 2025-05-06 15:33:52.000000000 +0000
+++ Python-3.14.0b1/Include/internal/pycore_bytesobject.h 2025-05-17 06:44:53.614667081 +0000
@@ -20,8 +20,9 @@
// Helper for PyBytes_DecodeEscape that detects invalid escape chars.
// Export for test_peg_generator.
-PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
- const char *, const char **);
+PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
+ const char *,
+ int *, const char **);
// Substring Search.
Index: Python-3.14.0b1/Include/internal/pycore_unicodeobject.h
===================================================================
--- Python-3.14.0b1.orig/Include/internal/pycore_unicodeobject.h 2025-05-06 15:33:52.000000000 +0000
+++ Python-3.14.0b1/Include/internal/pycore_unicodeobject.h 2025-05-17 06:44:53.614817134 +0000
@@ -139,14 +139,18 @@
// Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
// chars.
// Export for test_peg_generator.
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
const char *string, /* Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed, /* bytes consumed */
- const char **first_invalid_escape); /* on return, points to first
- invalid escaped char in
- string. */
+ int *first_invalid_escape_char, /* on return, if not -1, contain the first
+ invalid escaped char (<= 0xff) or invalid
+ octal escape (> 0xff) in string. */
+ const char **first_invalid_escape_ptr); /* on return, if not NULL, may
+ point to the first invalid escaped
+ char in string.
+ May be NULL if errors is not NULL. */
/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
Index: Python-3.14.0b1/Lib/test/test_codeccallbacks.py
===================================================================
--- Python-3.14.0b1.orig/Lib/test/test_codeccallbacks.py 2025-05-17 06:44:47.673341939 +0000
+++ Python-3.14.0b1/Lib/test/test_codeccallbacks.py 2025-05-17 06:44:53.615018793 +0000
@@ -2,6 +2,7 @@
import codecs
import html.entities
import itertools
+import re
import sys
import unicodedata
import unittest
@@ -1125,7 +1126,7 @@
text = 'abc<def>ghi'*n
text.translate(charmap)
- def test_mutatingdecodehandler(self):
+ def test_mutating_decode_handler(self):
baddata = [
("ascii", b"\xff"),
("utf-7", b"++"),
@@ -1160,6 +1161,42 @@
for (encoding, data) in baddata:
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
+ def test_mutating_decode_handler_unicode_escape(self):
+ decode = codecs.unicode_escape_decode
+ def mutating(exc):
+ if isinstance(exc, UnicodeDecodeError):
+ r = data.get(exc.object[:exc.end])
+ if r is not None:
+ exc.object = r[0] + exc.object[exc.end:]
+ return ('\u0404', r[1])
+ raise AssertionError("don't know how to handle %r" % exc)
+
+ codecs.register_error('test.mutating2', mutating)
+ data = {
+ br'\x0': (b'\\', 0),
+ br'\x3': (b'xxx\\', 3),
+ br'\x5': (b'x\\', 1),
+ }
+ def check(input, expected, msg):
+ with self.assertWarns(DeprecationWarning) as cm:
+ self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
+ self.assertIn(msg, str(cm.warning))
+
+ check(br'\x0n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
+ check(br'\x0n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
+ check(br'\x0z', '\u0404\\z', r'"\z" is an invalid escape sequence')
+
+ check(br'\x3n\zr', '\u0404\n\\zr', r'"\z" is an invalid escape sequence')
+ check(br'\x3zr', '\u0404\\zr', r'"\z" is an invalid escape sequence')
+ check(br'\x3z5', '\u0404\\z5', r'"\z" is an invalid escape sequence')
+ check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r'"\z" is an invalid escape sequence')
+ check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r'"\z" is an invalid escape sequence')
+
+ check(br'\x5n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
+ check(br'\x5n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
+ check(br'\x5z', '\u0404\\z', r'"\z" is an invalid escape sequence')
+ check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r'"\z" is an invalid escape sequence')
+
# issue32583
def test_crashing_decode_handler(self):
# better generating one more character to fill the extra space slot
Index: Python-3.14.0b1/Lib/test/test_codecs.py
===================================================================
--- Python-3.14.0b1.orig/Lib/test/test_codecs.py 2025-05-17 06:44:47.696604117 +0000
+++ Python-3.14.0b1/Lib/test/test_codecs.py 2025-05-17 06:44:53.615449954 +0000
@@ -1196,23 +1196,39 @@
check(br"[\1010]", b"[A0]")
check(br"[\x41]", b"[A]")
check(br"[\x410]", b"[A0]")
+
+ def test_warnings(self):
+ decode = codecs.escape_decode
+ check = coding_checker(self, decode)
for i in range(97, 123):
b = bytes([i])
if b not in b'abfnrtvx':
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\%c" is an invalid escape sequence' % i):
check(b"\\" + b, b"\\" + b)
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\%c" is an invalid escape sequence' % (i-32)):
check(b"\\" + b.upper(), b"\\" + b.upper())
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\8" is an invalid escape sequence'):
check(br"\8", b"\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", b"\\9")
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\\xfa" is an invalid escape sequence') as cm:
check(b"\\\xfa", b"\\\xfa")
for i in range(0o400, 0o1000):
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\%o" is an invalid octal escape sequence' % i):
check(rb'\%o' % i, bytes([i & 0o377]))
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\z" is an invalid escape sequence'):
+ self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\501" is an invalid octal escape sequence'):
+ self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6))
+
def test_errors(self):
decode = codecs.escape_decode
self.assertRaises(ValueError, decode, br"\x")
@@ -2661,24 +2677,40 @@
check(br"[\x410]", "[A0]")
check(br"\u20ac", "\u20ac")
check(br"\U0001d120", "\U0001d120")
+
+ def test_decode_warnings(self):
+ decode = codecs.unicode_escape_decode
+ check = coding_checker(self, decode)
for i in range(97, 123):
b = bytes([i])
if b not in b'abfnrtuvx':
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\%c" is an invalid escape sequence' % i):
check(b"\\" + b, "\\" + chr(i))
if b.upper() not in b'UN':
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\%c" is an invalid escape sequence' % (i-32)):
check(b"\\" + b.upper(), "\\" + chr(i-32))
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\8" is an invalid escape sequence'):
check(br"\8", "\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", "\\9")
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\\xfa" is an invalid escape sequence') as cm:
check(b"\\\xfa", "\\\xfa")
for i in range(0o400, 0o1000):
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\%o" is an invalid octal escape sequence' % i):
check(rb'\%o' % i, chr(i))
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\z" is an invalid escape sequence'):
+ self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
+ with self.assertWarnsRegex(DeprecationWarning,
+ r'"\\501" is an invalid octal escape sequence'):
+ self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6))
+
def test_decode_errors(self):
decode = codecs.unicode_escape_decode
for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
Index: Python-3.14.0b1/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ Python-3.14.0b1/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst 2025-05-17 06:44:53.615887918 +0000
@@ -0,0 +1,2 @@
+Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
+handler.
Index: Python-3.14.0b1/Objects/bytesobject.c
===================================================================
--- Python-3.14.0b1.orig/Objects/bytesobject.c 2025-05-06 15:33:52.000000000 +0000
+++ Python-3.14.0b1/Objects/bytesobject.c 2025-05-17 06:44:53.616354335 +0000
@@ -1075,10 +1075,11 @@
}
/* Unescape a backslash-escaped string. */
-PyObject *_PyBytes_DecodeEscape(const char *s,
+PyObject *_PyBytes_DecodeEscape2(const char *s,
Py_ssize_t len,
const char *errors,
- const char **first_invalid_escape)
+ int *first_invalid_escape_char,
+ const char **first_invalid_escape_ptr)
{
int c;
char *p;
@@ -1092,7 +1093,8 @@
return NULL;
writer.overallocate = 1;
- *first_invalid_escape = NULL;
+ *first_invalid_escape_char = -1;
+ *first_invalid_escape_ptr = NULL;
end = s + len;
while (s < end) {
@@ -1130,9 +1132,10 @@
c = (c<<3) + *s++ - '0';
}
if (c > 0377) {
- if (*first_invalid_escape == NULL) {
- *first_invalid_escape = s-3; /* Back up 3 chars, since we've
- already incremented s. */
+ if (*first_invalid_escape_char == -1) {
+ *first_invalid_escape_char = c;
+ /* Back up 3 chars, since we've already incremented s. */
+ *first_invalid_escape_ptr = s - 3;
}
}
*p++ = c;
@@ -1173,9 +1176,10 @@
break;
default:
- if (*first_invalid_escape == NULL) {
- *first_invalid_escape = s-1; /* Back up one char, since we've
- already incremented s. */
+ if (*first_invalid_escape_char == -1) {
+ *first_invalid_escape_char = (unsigned char)s[-1];
+ /* Back up one char, since we've already incremented s. */
+ *first_invalid_escape_ptr = s - 1;
}
*p++ = '\\';
s--;
@@ -1195,18 +1199,19 @@
Py_ssize_t Py_UNUSED(unicode),
const char *Py_UNUSED(recode_encoding))
{
- const char* first_invalid_escape;
- PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
- &first_invalid_escape);
+ int first_invalid_escape_char;
+ const char *first_invalid_escape_ptr;
+ PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
+ &first_invalid_escape_char,
+ &first_invalid_escape_ptr);
if (result == NULL)
return NULL;
- if (first_invalid_escape != NULL) {
- unsigned char c = *first_invalid_escape;
- if ('4' <= c && c <= '7') {
+ if (first_invalid_escape_char != -1) {
+ if (first_invalid_escape_char > 0xff) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
- "b\"\\%.3s\" is an invalid octal escape sequence. "
+ "b\"\\%o\" is an invalid octal escape sequence. "
"Such sequences will not work in the future. ",
- first_invalid_escape) < 0)
+ first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;
@@ -1216,7 +1221,7 @@
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"b\"\\%c\" is an invalid escape sequence. "
"Such sequences will not work in the future. ",
- c) < 0)
+ first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;
Index: Python-3.14.0b1/Objects/unicodeobject.c
===================================================================
--- Python-3.14.0b1.orig/Objects/unicodeobject.c 2025-05-06 15:33:52.000000000 +0000
+++ Python-3.14.0b1/Objects/unicodeobject.c 2025-05-17 06:44:53.617547540 +0000
@@ -6621,13 +6621,15 @@
/* --- Unicode Escape Codec ----------------------------------------------- */
PyObject *
-_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
+_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
- const char **first_invalid_escape)
+ int *first_invalid_escape_char,
+ const char **first_invalid_escape_ptr)
{
const char *starts = s;
+ const char *initial_starts = starts;
_PyUnicodeWriter writer;
const char *end;
PyObject *errorHandler = NULL;
@@ -6635,7 +6637,8 @@
_PyUnicode_Name_CAPI *ucnhash_capi;
// so we can remember if we've seen an invalid escape char or not
- *first_invalid_escape = NULL;
+ *first_invalid_escape_char = -1;
+ *first_invalid_escape_ptr = NULL;
if (size == 0) {
if (consumed) {
@@ -6723,9 +6726,12 @@
}
}
if (ch > 0377) {
- if (*first_invalid_escape == NULL) {
- *first_invalid_escape = s-3; /* Back up 3 chars, since we've
- already incremented s. */
+ if (*first_invalid_escape_char == -1) {
+ *first_invalid_escape_char = ch;
+ if (starts == initial_starts) {
+ /* Back up 3 chars, since we've already incremented s. */
+ *first_invalid_escape_ptr = s - 3;
+ }
}
}
WRITE_CHAR(ch);
@@ -6820,9 +6826,12 @@
goto error;
default:
- if (*first_invalid_escape == NULL) {
- *first_invalid_escape = s-1; /* Back up one char, since we've
- already incremented s. */
+ if (*first_invalid_escape_char == -1) {
+ *first_invalid_escape_char = c;
+ if (starts == initial_starts) {
+ /* Back up one char, since we've already incremented s. */
+ *first_invalid_escape_ptr = s - 1;
+ }
}
WRITE_ASCII_CHAR('\\');
WRITE_CHAR(c);
@@ -6867,19 +6876,20 @@
const char *errors,
Py_ssize_t *consumed)
{
- const char *first_invalid_escape;
- PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
+ int first_invalid_escape_char;
+ const char *first_invalid_escape_ptr;
+ PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
consumed,
- &first_invalid_escape);
+ &first_invalid_escape_char,
+ &first_invalid_escape_ptr);
if (result == NULL)
return NULL;
- if (first_invalid_escape != NULL) {
- unsigned char c = *first_invalid_escape;
- if ('4' <= c && c <= '7') {
+ if (first_invalid_escape_char != -1) {
+ if (first_invalid_escape_char > 0xff) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
- "\"\\%.3s\" is an invalid octal escape sequence. "
+ "\"\\%o\" is an invalid octal escape sequence. "
"Such sequences will not work in the future. ",
- first_invalid_escape) < 0)
+ first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;
@@ -6889,7 +6899,7 @@
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"\"\\%c\" is an invalid escape sequence. "
"Such sequences will not work in the future. ",
- c) < 0)
+ first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;
Index: Python-3.14.0b1/Parser/string_parser.c
===================================================================
--- Python-3.14.0b1.orig/Parser/string_parser.c 2025-05-06 15:33:52.000000000 +0000
+++ Python-3.14.0b1/Parser/string_parser.c 2025-05-17 06:44:53.618734552 +0000
@@ -196,15 +196,18 @@
len = (size_t)(p - buf);
s = buf;
- const char *first_invalid_escape;
- v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape);
+ int first_invalid_escape_char;
+ const char *first_invalid_escape_ptr;
+ v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
+ &first_invalid_escape_char,
+ &first_invalid_escape_ptr);
// HACK: later we can simply pass the line no, since we don't preserve the tokens
// when we are decoding the string but we preserve the line numbers.
- if (v != NULL && first_invalid_escape != NULL && t != NULL) {
- if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) {
- /* We have not decref u before because first_invalid_escape points
- inside u. */
+ if (v != NULL && first_invalid_escape_ptr != NULL && t != NULL) {
+ if (warn_invalid_escape_sequence(parser, s, first_invalid_escape_ptr, t) < 0) {
+ /* We have not decref u before because first_invalid_escape_ptr
+ points inside u. */
Py_XDECREF(u);
Py_DECREF(v);
return NULL;
@@ -217,14 +220,17 @@
static PyObject *
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
{
- const char *first_invalid_escape;
- PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
+ int first_invalid_escape_char;
+ const char *first_invalid_escape_ptr;
+ PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
+ &first_invalid_escape_char,
+ &first_invalid_escape_ptr);
if (result == NULL) {
return NULL;
}
- if (first_invalid_escape != NULL) {
- if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) {
+ if (first_invalid_escape_ptr != NULL) {
+ if (warn_invalid_escape_sequence(p, s, first_invalid_escape_ptr, t) < 0) {
Py_DECREF(result);
return NULL;
}