Improve sanitize_text() to keep selected CSI escape sequences

2025-08-21 05:58:52 +02:00 · 2024-03-07 11:59:51 +01:00
parent 2d5399442d
commit f9b17347da
2 changed files with 150 additions and 12 deletions
--- a/osc/output/output.py
+++ b/osc/output/output.py
@@ -1,5 +1,7 @@
 import os
+import re
 import sys
+from typing import Dict
 from typing import Optional
 from typing import TextIO
 from typing import Union
@@ -44,24 +46,82 @@ def print_msg(*args, print_to: Optional[str] = "debug"):
        raise ValueError(f"Invalid value of the 'print_to' option: {print_to}")


-# Forbidden characters are nearly all control characters 0-31 with the exception of:
-#   0x09 - horizontal tab (\t)
-#   0x0A - line feed (\n)
-#   0x0D - carriage return (\r)
-# (related to CVE-2012-1095)
-#
-# It would be good to selectively allow 0x1B with safe & trusted escape sequences.
-FORBIDDEN_BYTES = b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
-FORBIDDEN_CHARS = dict.fromkeys(FORBIDDEN_BYTES)
+# cache of compiled regular expressions keyed by name; filled by sanitize_text() on its first call
+SANITIZE_TEXT_RE: Optional[Dict] = None


 def sanitize_text(text: Union[bytes, str]) -> Union[bytes, str]:
    """
-    Remove forbidden characters from ``text``.
+    Remove forbidden characters and escape sequences from ``text``.
+
+    Only allowed color CSI sequences (such as ESC[1;31;47m) are kept; bytes in, bytes out.
+    This must be run on lines or the whole text to work correctly: processing blocks
+    of constant size might split escape sequences and leave garbage after sanitizing.
    """
+    global SANITIZE_TEXT_RE
+
+    if not SANITIZE_TEXT_RE:
+        SANITIZE_TEXT_RE = {}
+
+        # CONTROL CHARACTERS
+        # remove all control characters with the exception of:
+        #   0x09 - horizontal tab (\t)
+        #   0x0A - line feed (\n)
+        #   0x0D - carriage return (\r)
+        #   0x1B - escape - is selectively handled later as part of sanitizing escape sequences
+
+        regex = r"[\x00-\x08\x0B\x0C\x0E-\x1A\x1C-\x1F]"
+        SANITIZE_TEXT_RE["str_control"] = re.compile(regex)
+        SANITIZE_TEXT_RE["bytes_control"] = re.compile(regex.encode("ascii"))
+
+        # CSI ESCAPE SEQUENCES
+        # https://en.wikipedia.org/wiki/ANSI_escape_code#CSI_codes
+        # remove all but allowed CSI escape sequences
+
+        # negative lookahead assertion that allows safe color escape sequences
+        neg_allowed_csi_sequences = r"(?!\[([0-5]|[34][0-7]|;)+m)"
+
+        # range 0x30–0x3F (OCT \060-\077) (ASCII 0–9:;<=>?); zero or more characters
+        csi_parameter_bytes = r"[\x30-\x3F]*"
+
+        # range 0x20–0x2F (OCT \040-\057) (ASCII space and !"#$%&'()*+,-./); zero or more characters
+        csi_intermediate_bytes = r"[\x20-\x2F]*"
+
+        # range 0x40–0x7E (OCT \100-\176) (ASCII @A–Z[\]^_`a–z{|}~); 1 character
+        csi_final_byte = r"[\x40-\x7E]"
+
+        regex = rf"\033{neg_allowed_csi_sequences}\[{csi_parameter_bytes}{csi_intermediate_bytes}{csi_final_byte}"
+        SANITIZE_TEXT_RE["str_csi_sequences"] = re.compile(regex)
+        SANITIZE_TEXT_RE["bytes_csi_sequences"] = re.compile(regex.encode("ascii"))
+
+        # FE ESCAPE SEQUENCES
+        # https://en.wikipedia.org/wiki/ANSI_escape_code#Fe_Escape_sequences
+        # remove all Fe escape sequences
+
+        # range 0x40 to 0x5F (ASCII @A–Z[\]^_); 1 character
+        fe = r"[\x40-\x5F]"
+        regex = rf"\033{neg_allowed_csi_sequences}{fe}"
+        SANITIZE_TEXT_RE["str_fe_sequences"] = re.compile(regex)
+        SANITIZE_TEXT_RE["bytes_fe_sequences"] = re.compile(regex.encode("ascii"))
+
+        # REMAINING ESCAPE CHARACTERS
+        # remove all remaining escape characters that are not followed with the allowed CSI escape sequences
+
+        regex = rf"\033{neg_allowed_csi_sequences}"
+        SANITIZE_TEXT_RE["str_esc"] = re.compile(regex)
+        SANITIZE_TEXT_RE["bytes_esc"] = re.compile(regex.encode("ascii"))
+
     if isinstance(text, bytes):
-        return text.translate(None, FORBIDDEN_BYTES)
-    return text.translate(FORBIDDEN_CHARS)
+        text = SANITIZE_TEXT_RE["bytes_control"].sub(b"", text)
+        text = SANITIZE_TEXT_RE["bytes_csi_sequences"].sub(b"", text)
+        text = SANITIZE_TEXT_RE["bytes_fe_sequences"].sub(b"", text)
+        text = SANITIZE_TEXT_RE["bytes_esc"].sub(b"", text)
+    else:
+        text = SANITIZE_TEXT_RE["str_control"].sub("", text)
+        text = SANITIZE_TEXT_RE["str_csi_sequences"].sub("", text)
+        text = SANITIZE_TEXT_RE["str_fe_sequences"].sub("", text)
+        text = SANITIZE_TEXT_RE["str_esc"].sub("", text)
+    return text


 def safe_print(*args, **kwargs):
--- a/tests/test_output.py
+++ b/tests/test_output.py
@@ -5,6 +5,7 @@ import unittest
 import osc.conf
 from osc.output import KeyValueTable
 from osc.output import print_msg
+from osc.output import sanitize_text
 from osc.output import tty


@@ -160,5 +161,82 @@ class TestPrintMsg(unittest.TestCase):
        self.assertEqual("foo bar\n", stderr.getvalue())


+class TestSanitization(unittest.TestCase):
+    def test_control_chars_bytes(self):
+        original = b"".join([i.to_bytes(1, byteorder="big") for i in range(32)])
+        sanitized = sanitize_text(original)
+        self.assertEqual(sanitized, b"\t\n\r")
+
+    def test_control_chars_str(self):
+        original = "".join([chr(i) for i in range(32)])
+        sanitized = sanitize_text(original)
+        self.assertEqual(sanitized, "\t\n\r")
+
+    def test_csi_escape_sequences_str(self):
+        # allowed CSI escape sequences pass through unchanged; the "]" after "m" is plain text
+        originals = [">\033[0m<", ">\033[1;31;47m]<"]
+        for original in originals:
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, original)
+
+        # CSI escape sequences outside the allow-list are removed entirely, final byte included
+        originals = [">\033[8m<"]
+        for original in originals:
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, "><")
+
+    def test_csi_escape_sequences_bytes(self):
+        # allowed CSI escape sequences pass through unchanged; the "]" after "m" is plain text
+        originals = [b">\033[0m<", b">\033[1;31;47m]<"]
+        for original in originals:
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, original)
+
+        # CSI escape sequences outside the allow-list are removed entirely, final byte included
+        originals = [b">\033[8m<"]
+        for original in originals:
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, b"><")
+
+    def test_standalone_escape_str(self):
+        original = ">\033<"
+        sanitized = sanitize_text(original)
+        self.assertEqual(sanitized, "><")
+
+    def test_standalone_escape_bytes(self):
+        # an escape byte that does not start any sequence is removed on its own
+        original = b">\033<"
+        sanitized = sanitize_text(original)
+        self.assertEqual(sanitized, b"><")
+
+    def test_fe_escape_sequences_str(self):
+        for i in range(0x40, 0x5F + 1):
+            char = chr(i)
+            original = f">\033{char}<"
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, "><")
+
+    def test_fe_escape_sequences_bytes(self):
+        for i in range(0x40, 0x5F + 1):
+            byte = i.to_bytes(1, byteorder="big")
+            original = b">\033" + byte + b"<"
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, b"><")
+
+    def test_osc_escape_sequences_str(self):
+        # OSC (Operating System Command) sequences
+        original = "\033]0;this is the window title\007"
+        sanitized = sanitize_text(original)
+        # \033] is removed with the Fe sequences, \007 as a control character; the payload stays
+        self.assertEqual(sanitized, "0;this is the window title")
+
+    def test_osc_escape_sequences_bytes(self):
+        # OSC (Operating System Command) sequences
+        original = b"\033]0;this is the window title\007"
+        sanitized = sanitize_text(original)
+        # \033] is removed with the Fe sequences, \007 as a control character; the payload stays
+        self.assertEqual(sanitized, b"0;this is the window title")
+
+
 if __name__ == "__main__":
    unittest.main()