From f9b17347da9f0eee16b3f7890c1091ce62f15ce9 Mon Sep 17 00:00:00 2001
From: Daniel Mach
Date: Thu, 7 Mar 2024 11:59:51 +0100
Subject: [PATCH] Improve sanitize_text() to keep selected CSI escape sequences

---
 osc/output/output.py | 84 +++++++++++++++++++++++++++++++++++++-------
 tests/test_output.py | 78 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 150 insertions(+), 12 deletions(-)

diff --git a/osc/output/output.py b/osc/output/output.py
index 2919ab77..63b46a46 100644
--- a/osc/output/output.py
+++ b/osc/output/output.py
@@ -1,5 +1,7 @@
 import os
+import re
 import sys
+from typing import Dict
 from typing import Optional
 from typing import TextIO
 from typing import Union
@@ -44,24 +46,82 @@ def print_msg(*args, print_to: Optional[str] = "debug"):
         raise ValueError(f"Invalid value of the 'print_to' option: {print_to}")
 
 
-# Forbidden characters are nearly all control characters 0-31 with the exception of:
-# 0x09 - horizontal tab (\t)
-# 0x0A - line feed (\n)
-# 0x0D - carriage return (\r)
-# (related to CVE-2012-1095)
-#
-# It would be good to selectively allow 0x1B with safe & trusted escape sequences.
-FORBIDDEN_BYTES = b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
-FORBIDDEN_CHARS = dict.fromkeys(FORBIDDEN_BYTES)
+# cached compiled regular expressions; they are created on the first use
+SANITIZE_TEXT_RE: Optional[Dict] = None
 
 
 def sanitize_text(text: Union[bytes, str]) -> Union[bytes, str]:
     """
-    Remove forbidden characters from ``text``.
+    Remove forbidden characters and escape sequences from ``text``.
+
+    This must be run on lines or the whole text to work correctly.
+    Processing blocks of constant size might lead to splitting escape sequences
+    and leaving garbage characters after sanitizing.
     """
+    global SANITIZE_TEXT_RE
+
+    if not SANITIZE_TEXT_RE:
+        SANITIZE_TEXT_RE = {}
+
+        # CONTROL CHARACTERS
+        # remove all control characters with the exception of:
+        # 0x09 - horizontal tab (\t)
+        # 0x0A - line feed (\n)
+        # 0x0D - carriage return (\r)
+        # 0x1B - escape - is selectively handled later as part of sanitizing escape sequences
+
+        regex = r"[\x00-\x08\x0B\x0C\x0E-\x1A\x1C-\x1F]"
+        SANITIZE_TEXT_RE["str_control"] = re.compile(regex)
+        SANITIZE_TEXT_RE["bytes_control"] = re.compile(regex.encode("ascii"))
+
+        # CSI ESCAPE SEQUENCES
+        # https://en.wikipedia.org/wiki/ANSI_escape_code#CSI_codes
+        # remove all but allowed CSI escape sequences
+
+        # negative lookahead assertion that allows safe color escape sequences
+        neg_allowed_csi_sequences = r"(?!\[([0-5]|[34][0-7]|;)+m)"
+
+        # range 0x30–0x3F (OCT \060-\077) (ASCII 0–9:;<=>?); zero or more characters
+        csi_parameter_bytes = r"[\x30-\x3F]*"
+
+        # range 0x20–0x2F (OCT \040-\057) (ASCII space and !"#$%&'()*+,-./); zero or more characters
+        csi_intermediate_bytes = r"[\x20-\x2F]*"
+
+        # range 0x40–0x7E (OCT \100-\176) (ASCII @A–Z[\]^_`a–z{|}~); 1 character
+        csi_final_byte = r"[\x40-\x7E]"
+
+        regex = rf"\033{neg_allowed_csi_sequences}\[{csi_parameter_bytes}{csi_intermediate_bytes}{csi_final_byte}"
+        SANITIZE_TEXT_RE["str_csi_sequences"] = re.compile(regex)
+        SANITIZE_TEXT_RE["bytes_csi_sequences"] = re.compile(regex.encode("ascii"))
+
+        # FE ESCAPE SEQUENCES
+        # https://en.wikipedia.org/wiki/ANSI_escape_code#Fe_Escape_sequences
+        # remove all Fe escape sequences
+
+        # range 0x40 to 0x5F (ASCII @A–Z[\]^_); 1 character
+        fe = r"[\x40-\x5F]"
+        regex = rf"\033{neg_allowed_csi_sequences}{fe}"
+        SANITIZE_TEXT_RE["str_fe_sequences"] = re.compile(regex)
+        SANITIZE_TEXT_RE["bytes_fe_sequences"] = re.compile(regex.encode("ascii"))
+
+        # REMAINING ESCAPE CHARACTERS
+        # remove all remaining escape characters that are not followed with the allowed CSI escape sequences
+
+        regex = rf"\033{neg_allowed_csi_sequences}"
+        SANITIZE_TEXT_RE["str_esc"] = re.compile(regex)
+        SANITIZE_TEXT_RE["bytes_esc"] = re.compile(regex.encode("ascii"))
+
     if isinstance(text, bytes):
-        return text.translate(None, FORBIDDEN_BYTES)
-    return text.translate(FORBIDDEN_CHARS)
+        text = SANITIZE_TEXT_RE["bytes_control"].sub(b"", text)
+        text = SANITIZE_TEXT_RE["bytes_csi_sequences"].sub(b"", text)
+        text = SANITIZE_TEXT_RE["bytes_fe_sequences"].sub(b"", text)
+        text = SANITIZE_TEXT_RE["bytes_esc"].sub(b"", text)
+    else:
+        text = SANITIZE_TEXT_RE["str_control"].sub("", text)
+        text = SANITIZE_TEXT_RE["str_csi_sequences"].sub("", text)
+        text = SANITIZE_TEXT_RE["str_fe_sequences"].sub("", text)
+        text = SANITIZE_TEXT_RE["str_esc"].sub("", text)
+    return text
 
 
 def safe_print(*args, **kwargs):
diff --git a/tests/test_output.py b/tests/test_output.py
index 49166837..33971392 100644
--- a/tests/test_output.py
+++ b/tests/test_output.py
@@ -5,6 +5,7 @@ import unittest
 import osc.conf
 from osc.output import KeyValueTable
 from osc.output import print_msg
+from osc.output import sanitize_text
 from osc.output import tty
 
 
@@ -160,5 +161,82 @@ class TestPrintMsg(unittest.TestCase):
         self.assertEqual("foo bar\n", stderr.getvalue())
 
 
+class TestSanitization(unittest.TestCase):
+    def test_control_chars_bytes(self):
+        original = b"".join([i.to_bytes(1, byteorder="big") for i in range(32)])
+        sanitized = sanitize_text(original)
+        self.assertEqual(sanitized, b"\t\n\r")
+
+    def test_control_chars_str(self):
+        original = "".join([chr(i) for i in range(32)])
+        sanitized = sanitize_text(original)
+        self.assertEqual(sanitized, "\t\n\r")
+
+    def test_csi_escape_sequences_str(self):
+        # allowed CSI escape sequences
+        originals = [">\033[0m<", ">\033[1;31;47m]<"]
+        for original in originals:
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, original)
+
+        # not allowed CSI escape sequences
+        originals = [">\033[8m<"]
+        for original in originals:
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, "><")
+
+    def test_csi_escape_sequences_bytes(self):
+        # allowed CSI escape sequences
+        originals = [b">\033[0m<", b">\033[1;31;47m]<"]
+        for original in originals:
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, original)
+
+        # not allowed CSI escape sequences
+        originals = [b">\033[8m<"]
+        for original in originals:
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, b"><")
+
+    def test_standalone_escape_str(self):
+        original = ">\033<"
+        sanitized = sanitize_text(original)
+        self.assertEqual(sanitized, "><")
+
+    def test_standalone_escape_bytes(self):
+        # standalone escape
+        original = b">\033<"
+        sanitized = sanitize_text(original)
+        self.assertEqual(sanitized, b"><")
+
+    def test_fe_escape_sequences_str(self):
+        for i in range(0x40, 0x5F + 1):
+            char = chr(i)
+            original = f">\033{char}<"
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, "><")
+
+    def test_fe_escape_sequences_bytes(self):
+        for i in range(0x40, 0x5F + 1):
+            byte = i.to_bytes(1, byteorder="big")
+            original = b">\033" + byte + b"<"
+            sanitized = sanitize_text(original)
+            self.assertEqual(sanitized, b"><")
+
+    def test_osc_escape_sequences_str(self):
+        # OSC (Operating System Command) sequences
+        original = "\033]0;this is the window title\007"
+        sanitized = sanitize_text(original)
+        # \033] is removed with the Fe sequences
+        self.assertEqual(sanitized, "0;this is the window title")
+
+    def test_osc_escape_sequences_bytes(self):
+        # OSC (Operating System Command) sequences
+        original = b"\033]0;this is the window title\007"
+        sanitized = sanitize_text(original)
+        # \033] is removed with the Fe sequences
+        self.assertEqual(sanitized, b"0;this is the window title")
+
+
 if __name__ == "__main__":
     unittest.main()