From 77d7b8d7cfbe8dd179858dfa42666f73fc6e57a2 Mon Sep 17 00:00:00 2001
From: Stefan <96178532+stefan6419846@users.noreply.github.com>
Date: Tue, 17 Feb 2026 17:46:56 +0100
Subject: [PATCH] SEC: Limit size of `/ToUnicode` entries (#3646)

---
 PyPDF2/_cmap.py    | 20 ++++++++++
 tests/test_cmap.py | 92 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 110 insertions(+), 2 deletions(-)

Index: PyPDF2-2.11.1/PyPDF2/_cmap.py
===================================================================
--- PyPDF2-2.11.1.orig/PyPDF2/_cmap.py
+++ PyPDF2-2.11.1/PyPDF2/_cmap.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Tupl
 
 from ._codecs import adobe_glyphs, charset_encoding
 from ._utils import logger_warning
-from .errors import PdfReadWarning
+from .errors import PdfReadWarning, LimitReachedError
 from .generic import DecodedStreamObject, DictionaryObject
 
 
@@ -262,6 +262,15 @@ def process_cm_line(
     return process_rg, process_char, multiline_rg
 
 
+# Usual values should be up to 65_536.
+MAPPING_DICTIONARY_SIZE_LIMIT = 100_000
+
+
+def _check_mapping_size(size: int) -> None:
+    if size > MAPPING_DICTIONARY_SIZE_LIMIT:
+        raise LimitReachedError(f"Maximum /ToUnicode size limit reached: {size} > {MAPPING_DICTIONARY_SIZE_LIMIT}.")
+
+
 def parse_bfrange(
     l: bytes,
     map_dict: Dict[Any, Any],
@@ -273,6 +282,8 @@
     nbi = max(len(lst[0]), len(lst[1]))
     map_dict[-1] = ceil(nbi / 2)
     fmt = b"%%0%dX" % (map_dict[-1] * 2)
+    entry_count = len(int_entry)
+    _check_mapping_size(entry_count)
     if multiline_rg is not None:
         a = multiline_rg[0]  # a, b not in the current line
         b = multiline_rg[1]
@@ -280,6 +291,8 @@
             if sq == b"]":
                 closure_found = True
                 break
+            entry_count += 1
+            _check_mapping_size(entry_count)
             map_dict[
                 unhexlify(fmt % a).decode(
                     "charmap" if map_dict[-1] == 1 else "utf-16-be",
@@ -296,6 +309,8 @@
                 if sq == b"]":
                     closure_found = True
                     break
+                entry_count += 1
+                _check_mapping_size(entry_count)
                 map_dict[
                     unhexlify(fmt % a).decode(
                         "charmap" if map_dict[-1] == 1 else "utf-16-be",
@@ -308,6 +323,8 @@ def parse_bfrange(
             c = int(lst[2], 16)
             fmt2 = b"%%0%dX" % max(4, len(lst[2]))
             closure_found = True
+            range_size = max(0, b - a + 1)
+            _check_mapping_size(entry_count + range_size)  # This can be checked beforehand.
             while a <= b:
                 map_dict[
                     unhexlify(fmt % a).decode(
@@ -323,6 +340,8 @@
 
 def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
     lst = [x for x in l.split(b" ") if x]
+    new_count = len(lst) // 2
+    _check_mapping_size(len(int_entry) + new_count)  # This can be checked beforehand.
     map_dict[-1] = len(lst[0]) // 2
     while len(lst) > 1:
         map_to = ""
Index: PyPDF2-2.11.1/tests/test_cmap.py
===================================================================
--- PyPDF2-2.11.1.orig/tests/test_cmap.py
+++ PyPDF2-2.11.1/tests/test_cmap.py
@@ -3,7 +3,10 @@ from io import BytesIO
 import pytest
 
 from PyPDF2 import PdfReader
-from PyPDF2.errors import PdfReadWarning
+from PyPDF2 import PdfWriter
+from PyPDF2._cmap import parse_bfchar, parse_bfrange
+from PyPDF2.errors import PdfReadWarning, LimitReachedError
+from PyPDF2.generic import DictionaryObject, NameObject, StreamObject
 
 from . import get_pdf_from_url
 
@@ -91,3 +94,89 @@
     name = "02voc.pdf"
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
     reader.pages[2].extract_text()
+
+
+def test_parse_bfrange__iteration_limit():
+    writer = PdfWriter()
+
+    to_unicode = StreamObject()
+    to_unicode.set_data(
+        b"beginbfrange\n"
+        b"<00000000> <001FFFFF> <00000000>\n"
+        b"endbfrange\n"
+    )
+    font = writer._add_object(DictionaryObject({
+        NameObject("/Type"): NameObject("/Font"),
+        NameObject("/Subtype"): NameObject("/Type1"),
+        NameObject("/BaseFont"): NameObject("/Helvetica"),
+        NameObject("/ToUnicode"): to_unicode,
+    }))
+
+    page = writer.add_blank_page(width=100, height=100)
+    page[NameObject("/Resources")] = DictionaryObject({
+        NameObject("/Font"): DictionaryObject({
+            NameObject("/F1"): font.indirect_reference,
+        })
+    })
+
+    # Case without list, exceeding list directly.
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 2097152 > 100000\.$"
+    ):
+        _ = page.extract_text()
+
+    # Use a pre-filled dummy list to simulate multiple calls where the upper bound does
+    # not overflow, but the overall size does. Case without list.
+    int_entry = [0] * 99_999
+    map_dict = {}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 165535 > 100000\.$"
+    ):
+        _ = parse_bfrange(l=b"0000 FFFF 0000", map_dict=map_dict, int_entry=int_entry, multiline_rg=None)
+    assert map_dict == {-1: 2}
+
+    # Exceeding from previous call.
+    int_entry.append(1)
+    map_dict = {}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
+    ):
+        _ = parse_bfrange(l=b"00000000 00000000 00000000", map_dict=map_dict, int_entry=int_entry, multiline_rg=None)
+    assert map_dict == {-1: 4}
+
+    # multiline_rg
+    int_entry = [0] * 99_995
+    map_dict = {-1: 1}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
+    ):
+        _ = parse_bfrange(
+            l=b"0020 0021 0022 0023 0024 0025 0026 2019",
+            map_dict=map_dict, int_entry=int_entry, multiline_rg=(32, 251)
+        )
+    assert map_dict == {-1: 1, " ": " ", "!": "!", '"': '"', "#": "#", "$": "$"}
+
+    # No multiline_rg, but list.
+    int_entry = [0] * 99_995
+    map_dict = {}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100001 > 100000\.$"
+    ):
+        _ = parse_bfrange(
+            l=b"01 8A [ FFFD FFFD FFFD FFFF FFAB AAAA BBBB",
+            map_dict=map_dict, int_entry=int_entry, multiline_rg=None
+        )
+    assert map_dict == {-1: 1, "\x01": "�", "\x02": "�", "\x03": "�", "\x04": "\uffff", "\x05": "ᆱ"}
+
+
+def test_parse_bfchar__iteration_limit():
+    int_entry = [0] * 99_995
+    map_dict = {}
+    with pytest.raises(
+        expected_exception=LimitReachedError, match=r"^Maximum /ToUnicode size limit reached: 100002 > 100000\.$"
+    ):
+        parse_bfchar(
+            l=b"0003 0020 0008 0025 0009 0026 000A 0027 000B 0028 000C 0029 000D 002A",
+            map_dict=map_dict, int_entry=int_entry,
+        )
+    assert map_dict == {}