From 77d7b8d7cfbe8dd179858dfa42666f73fc6e57a2 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Tue, 17 Feb 2026 17:46:56 +0100
Subject: [PATCH] SEC: Limit size of `/ToUnicode` entries (#3646)

---
 PyPDF2/_cmap.py    | 20 ++++++++++
 tests/test_cmap.py | 92 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 110 insertions(+), 2 deletions(-)

Index: PyPDF2-2.11.1/PyPDF2/_cmap.py
===================================================================
--- PyPDF2-2.11.1.orig/PyPDF2/_cmap.py
+++ PyPDF2-2.11.1/PyPDF2/_cmap.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Tupl
 
 from ._codecs import adobe_glyphs, charset_encoding
 from ._utils import logger_warning
-from .errors import PdfReadWarning
+from .errors import PdfReadWarning, LimitReachedError
 from .generic import DecodedStreamObject, DictionaryObject
 
 
@@ -262,6 +262,15 @@ def process_cm_line(
     return process_rg, process_char, multiline_rg
 
 
+# Usual values should be up to 65_536.
+MAPPING_DICTIONARY_SIZE_LIMIT = 100_000
+
+
+def _check_mapping_size(size: int) -> None:
+    if size > MAPPING_DICTIONARY_SIZE_LIMIT:
+        raise LimitReachedError(f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}.")
+
+
 def parse_bfrange(
     l: bytes,
     map_dict: Dict[Any, Any],
@@ -273,6 +282,8 @@
     nbi = max(len(lst[0]), len(lst[1]))
     map_dict[-1] = ceil(nbi / 2)
     fmt = b"%%0%dX" % (map_dict[-1] * 2)
+    entry_count = len(int_entry)
+    _check_mapping_size(entry_count)
     if multiline_rg is not None:
         a = multiline_rg[0]  # a, b not in the current line
         b = multiline_rg[1]
@@ -280,6 +291,8 @@
             if sq == b"]":
                 closure_found = True
                 break
+            entry_count += 1
+            _check_mapping_size(entry_count)
             map_dict[
                 unhexlify(fmt % a).decode(
                     "charmap" if map_dict[-1] == 1 else "utf-16-be",
@@ -296,6 +309,8 @@
                 if sq == b"]":
                     closure_found = True
                     break
+                entry_count += 1
+                _check_mapping_size(entry_count)
                 map_dict[
                     unhexlify(fmt % a).decode(
                         "charmap" if map_dict[-1] == 1 else "utf-16-be",
@@ -308,6 +323,8 @@ def parse_bfrange(
             c = int(lst[2], 16)
             fmt2 = b"%%0%dX" % max(4, len(lst[2]))
             closure_found = True
+            range_size = max(0, b - a + 1)
+            _check_mapping_size(entry_count + range_size)  # This can be checked beforehand.
             while a <= b:
                 map_dict[
                     unhexlify(fmt % a).decode(
@@ -323,6 +340,8 @@
 
 def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
     lst = [x for x in l.split(b" ") if x]
+    new_count = len(lst) // 2
+    _check_mapping_size(len(int_entry) + new_count)  # This can be checked beforehand.
     map_dict[-1] = len(lst[0]) // 2
     while len(lst) > 1:
         map_to = ""
Index: PyPDF2-2.11.1/tests/test_cmap.py
===================================================================
--- PyPDF2-2.11.1.orig/tests/test_cmap.py
+++ PyPDF2-2.11.1/tests/test_cmap.py
@@ -3,7 +3,10 @@ from io import BytesIO
 import pytest
 
 from PyPDF2 import PdfReader
-from PyPDF2.errors import PdfReadWarning
+from PyPDF2 import PdfWriter
+from PyPDF2._cmap import parse_bfchar, parse_bfrange
+from PyPDF2.errors import PdfReadWarning, LimitReachedError
+from PyPDF2.generic import DictionaryObject, NameObject, StreamObject
 
 from . import get_pdf_from_url
 
@@ -91,3 +94,89 @@
     name = "02voc.pdf"
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
     reader.pages[2].extract_text()
+
+
+def test_parse_bfrange__iteration_limit():
+    writer = PdfWriter()
+
+    to_unicode = StreamObject()
+    to_unicode.set_data(
+        b"beginbfrange\n"
+        b"<00000000> <001FFFFF> <00000000>\n"
+        b"endbfrange\n"
+    )
+    font = writer._add_object(DictionaryObject({
+        NameObject("/Type"): NameObject("/Font"),
+        NameObject("/Subtype"): NameObject("/Type1"),
+        NameObject("/BaseFont"): NameObject("/Helvetica"),
+        NameObject("/ToUnicode"): to_unicode,
+    }))
+
+    page = writer.add_blank_page(width=100, height=100)
+    page[NameObject("/Resources")] = DictionaryObject({
+        NameObject("/Font"): DictionaryObject({
+            NameObject("/F1"): font.indirect_reference,
+        })
+    })
+
+    # Case without list, exceeding list directly.
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 2097152 > 100000\.$"
+    ):
+        _ = page.extract_text()
+
+    # Use a pre-filled dummy list to simulate multiple calls where the upper bound does
+    # not overflow, but the overall size does. Case without list.
+    int_entry = [0] * 99_999
+    map_dict = {}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 165535 > 100000\.$"
+    ):
+        _ = parse_bfrange(l=b"0000 FFFF 0000", map_dict=map_dict, int_entry=int_entry, multiline_rg=None)
+    assert map_dict == {-1: 2}
+
+    # Exceeding from previous call.
+    int_entry.append(1)
+    map_dict = {}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
+    ):
+        _ = parse_bfrange(l=b"00000000 00000000 00000000", map_dict=map_dict, int_entry=int_entry, multiline_rg=None)
+    assert map_dict == {-1: 4}
+
+    # multiline_rg
+    int_entry = [0] * 99_995
+    map_dict = {-1: 1}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
+    ):
+        _ = parse_bfrange(
+            l=b"0020 0021 0022 0023 0024 0025 0026 2019",
+            map_dict=map_dict, int_entry=int_entry, multiline_rg=(32, 251)
+        )
+    assert map_dict == {-1: 1, " ": " ", "!": "!", '"': '"', "#": "#", "$": "$"}
+
+    # No multiline_rg, but list.
+    int_entry = [0] * 99_995
+    map_dict = {}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
+    ):
+        _ = parse_bfrange(
+            l=b"01 8A [ FFFD FFFD FFFD FFFF FFAB AAAA BBBB",
+            map_dict=map_dict, int_entry=int_entry, multiline_rg=None
+        )
+    assert map_dict == {-1: 1, "\x01": "�", "\x02": "�", "\x03": "�", "\x04": "\uffff", "\x05": "ᆱ"}
+
+
+def test_parse_bfchar__iteration_limit():
+    int_entry = [0] * 99_995
+    map_dict = {}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100002 > 100000\.$"
+    ):
+        parse_bfchar(
+            l=b"0003 0020 0008 0025 0009 0026 000A 0027 000B 0028 000C 0029 000D 002A",
+            map_dict=map_dict, int_entry=int_entry,
+        )
+    assert map_dict == {}