1
0
mirror of https://github.com/openSUSE/osc.git synced 2024-12-27 10:16:14 +01:00

Improve sanitize_text() to keep selected CSI escape sequences

This commit is contained in:
Daniel Mach 2024-03-07 11:59:51 +01:00
parent 2d5399442d
commit f9b17347da
2 changed files with 150 additions and 12 deletions

View File

@@ -1,5 +1,7 @@
import os
import re
import sys
from typing import Dict
from typing import Optional
from typing import TextIO
from typing import Union
@@ -44,24 +46,82 @@ def print_msg(*args, print_to: Optional[str] = "debug"):
raise ValueError(f"Invalid value of the 'print_to' option: {print_to}")
# Control characters in the range 0x00-0x1F are forbidden, with the exception
# of the following whitespace characters:
#   0x09 - horizontal tab (\t)
#   0x0A - line feed (\n)
#   0x0D - carriage return (\r)
# (related to CVE-2012-1095)
#
# NOTE: 0x1B (escape) is forbidden as well; it would be good to selectively
# allow it when it introduces a safe & trusted escape sequence.
FORBIDDEN_BYTES = (
    # 0x00-0x08
    b"\x00\x01\x02\x03\x04\x05\x06\x07\x08"
    # 0x0B-0x0C
    b"\x0b\x0c"
    # 0x0E-0x1F
    b"\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
)
# translation table usable with str.translate(): every forbidden code point maps to None
FORBIDDEN_CHARS = {code_point: None for code_point in FORBIDDEN_BYTES}
# cached compiled regular expressions; they are created on the first use
SANITIZE_TEXT_RE: Optional[Dict] = None


def sanitize_text(text: Union[bytes, str]) -> Union[bytes, str]:
    """
    Remove forbidden characters and escape sequences from ``text``.

    The following is kept:
      - horizontal tab, line feed and carriage return
      - CSI escape sequences that match the allow-list of color/attribute
        parameters (for example ``ESC[0m`` or ``ESC[1;31;47m``)

    The following is removed:
      - all other control characters in the range 0x00-0x1F
      - all CSI escape sequences that are not on the allow-list
      - the two-character introducer of Fe escape sequences; any payload
        that follows (for example an OSC window title) stays as plain text
      - any remaining escape character that does not start an allowed
        CSI escape sequence

    This must be run on lines or the whole text to work correctly.
    Processing blocks of constant size might lead to splitting escape sequences
    and leaving garbage characters after sanitizing.

    :param text: Text to sanitize; both ``bytes`` and ``str`` are supported.
    :returns: Sanitized text of the same type as ``text``.
    """
    global SANITIZE_TEXT_RE

    if not SANITIZE_TEXT_RE:
        # CSI ESCAPE SEQUENCES
        # https://en.wikipedia.org/wiki/ANSI_escape_code#CSI_codes
        # negative lookahead assertion that allows safe color escape sequences
        neg_allowed_csi_sequences = r"(?!\[([0-5]|[34][0-7]|;)+m)"
        # range 0x30-0x3F (OCT \060-\077) (ASCII 0-9:;<=>?); zero or more characters
        csi_parameter_bytes = r"[\x30-\x3F]*"
        # range 0x20-0x2F (OCT \040-\057) (ASCII space and !"#$%&'()*+,-./); zero or more characters
        csi_intermediate_bytes = r"[\x20-\x2F]*"
        # range 0x40-0x7E (OCT \100-\176) (ASCII @A-Z[\]^_`a-z{|}~); 1 character
        csi_final_byte = r"[\x40-\x7E]"

        # FE ESCAPE SEQUENCES
        # https://en.wikipedia.org/wiki/ANSI_escape_code#Fe_Escape_sequences
        # range 0x40-0x5F (ASCII @A-Z[\]^_); 1 character
        # NOTE: both ends of the range must be written as \xNN escapes; a missing
        # backslash turns the class into "0x40-'x', '5', 'F'" and makes it match
        # ordinary characters that follow an escape.
        fe = r"[\x40-\x5F]"

        # The patterns are applied in this order (dicts preserve insertion order).
        patterns = {
            # CONTROL CHARACTERS
            # remove all control characters with the exception of:
            #   0x09 - horizontal tab (\t)
            #   0x0A - line feed (\n)
            #   0x0D - carriage return (\r)
            #   0x1B - escape - is selectively handled by the patterns below
            "control": r"[\x00-\x08\x0B\x0C\x0E-\x1A\x1C-\x1F]",
            # remove all but allowed CSI escape sequences
            "csi_sequences": (
                rf"\033{neg_allowed_csi_sequences}"
                rf"\[{csi_parameter_bytes}{csi_intermediate_bytes}{csi_final_byte}"
            ),
            # remove all Fe escape sequences
            "fe_sequences": rf"\033{neg_allowed_csi_sequences}{fe}",
            # REMAINING ESCAPE CHARACTERS
            # remove all remaining escape characters that are not followed
            # with the allowed CSI escape sequences
            "esc": rf"\033{neg_allowed_csi_sequences}",
        }

        # Build the cache in a local dict and publish it only once it is complete.
        compiled = {}
        for name, regex in patterns.items():
            compiled[f"str_{name}"] = re.compile(regex)
            compiled[f"bytes_{name}"] = re.compile(regex.encode("ascii"))
        SANITIZE_TEXT_RE = compiled

    # Pick the regex flavor and the replacement that match the input type.
    if isinstance(text, bytes):
        prefix, replacement = "bytes", b""
    else:
        prefix, replacement = "str", ""

    for name in ("control", "csi_sequences", "fe_sequences", "esc"):
        text = SANITIZE_TEXT_RE[f"{prefix}_{name}"].sub(replacement, text)

    return text
def safe_print(*args, **kwargs):

View File

@@ -5,6 +5,7 @@ import unittest
import osc.conf
from osc.output import KeyValueTable
from osc.output import print_msg
from osc.output import sanitize_text
from osc.output import tty
@@ -160,5 +161,82 @@ class TestPrintMsg(unittest.TestCase):
self.assertEqual("foo bar\n", stderr.getvalue())
class TestSanitization(unittest.TestCase):
    """Checks of ``sanitize_text()`` for both ``str`` and ``bytes`` inputs."""

    def test_control_chars_bytes(self):
        # every byte in the range 0x00-0x1F
        all_control = bytes(range(32))
        result = sanitize_text(all_control)
        self.assertEqual(result, b"\t\n\r")

    def test_control_chars_str(self):
        # every character in the range 0x00-0x1F
        all_control = "".join(map(chr, range(32)))
        result = sanitize_text(all_control)
        self.assertEqual(result, "\t\n\r")

    def test_csi_escape_sequences_str(self):
        # allowed CSI escape sequences pass through untouched
        for text in (">\033[0m<", ">\033[1;31;47m]<"):
            result = sanitize_text(text)
            self.assertEqual(result, text)

        # not allowed CSI escape sequences are stripped
        for text in (">\033[8m<",):
            result = sanitize_text(text)
            self.assertEqual(result, "><")

    def test_csi_escape_sequences_bytes(self):
        # allowed CSI escape sequences pass through untouched
        for data in (b">\033[0m<", b">\033[1;31;47m]<"):
            result = sanitize_text(data)
            self.assertEqual(result, data)

        # not allowed CSI escape sequences are stripped
        for data in (b">\033[8m<",):
            result = sanitize_text(data)
            self.assertEqual(result, b"><")

    def test_standalone_escape_str(self):
        result = sanitize_text(">\033<")
        self.assertEqual(result, "><")

    def test_standalone_escape_bytes(self):
        # standalone escape
        result = sanitize_text(b">\033<")
        self.assertEqual(result, b"><")

    def test_fe_escape_sequences_str(self):
        # the second character of an Fe sequence is in the range 0x40-0x5F
        for code in range(0x40, 0x60):
            text = f">\033{chr(code)}<"
            result = sanitize_text(text)
            self.assertEqual(result, "><")

    def test_fe_escape_sequences_bytes(self):
        # the second byte of an Fe sequence is in the range 0x40-0x5F
        for code in range(0x40, 0x60):
            data = b">\033" + bytes([code]) + b"<"
            result = sanitize_text(data)
            self.assertEqual(result, b"><")

    def test_osc_escape_sequences_str(self):
        # OSC (Operating System Command) sequences
        text = "\033]0;this is the window title\007"
        result = sanitize_text(text)
        # \033] is removed with the Fe sequences
        self.assertEqual(result, "0;this is the window title")

    def test_osc_escape_sequences_bytes(self):
        # OSC (Operating System Command) sequences
        data = b"\033]0;this is the window title\007"
        result = sanitize_text(data)
        # \033] is removed with the Fe sequences
        self.assertEqual(result, b"0;this is the window title")
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()