mirror of
https://github.com/openSUSE/osc.git
synced 2024-12-27 10:16:14 +01:00
Improve sanitize_text() to keep selected CSI escape sequences
This commit is contained in:
parent
2d5399442d
commit
f9b17347da
@ -1,5 +1,7 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from typing import Dict
|
||||
from typing import Optional
|
||||
from typing import TextIO
|
||||
from typing import Union
|
||||
@ -44,24 +46,82 @@ def print_msg(*args, print_to: Optional[str] = "debug"):
|
||||
raise ValueError(f"Invalid value of the 'print_to' option: {print_to}")
|
||||
|
||||
|
||||
# Nearly every control character (code points 0-31) is forbidden.
# The only ones that are kept:
#   0x09 - horizontal tab (\t)
#   0x0A - line feed (\n)
#   0x0D - carriage return (\r)
# (related to CVE-2012-1095)
#
# 0x1B (escape) is forbidden too; it would be good to selectively allow it
# together with safe & trusted escape sequences.
FORBIDDEN_BYTES = bytes(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)
# translation table usable with str.translate(); every forbidden code point maps to None
FORBIDDEN_CHARS = dict.fromkeys(FORBIDDEN_BYTES)
|
||||
# cached compiled regular expressions; they are created on the first use
|
||||
SANITIZE_TEXT_RE: Optional[Dict] = None
|
||||
|
||||
|
||||
def sanitize_text(text: Union[bytes, str]) -> Union[bytes, str]:
|
||||
"""
|
||||
Remove forbidden characters from ``text``.
|
||||
Remove forbidden characters and escape sequences from ``text``.
|
||||
|
||||
This must be run on lines or the whole text to work correctly.
|
||||
Processing blocks of constant size might lead to splitting escape sequences
|
||||
and leaving garbage characters after sanitizing.
|
||||
"""
|
||||
global SANITIZE_TEXT_RE
|
||||
|
||||
if not SANITIZE_TEXT_RE:
|
||||
SANITIZE_TEXT_RE = {}
|
||||
|
||||
# CONTROL CHARACTERS
|
||||
# remove all control characters with the exception of:
|
||||
# 0x09 - horizontal tab (\t)
|
||||
# 0x0A - line feed (\n)
|
||||
# 0x0D - carriage return (\r)
|
||||
# 0x1B - escape - is selectively handled later as part of sanitizing escape sequences
|
||||
|
||||
regex = r"[\x00-\x08\x0B\x0C\x0E-\x1A\x1C-\x1F]"
|
||||
SANITIZE_TEXT_RE["str_control"] = re.compile(regex)
|
||||
SANITIZE_TEXT_RE["bytes_control"] = re.compile(regex.encode("ascii"))
|
||||
|
||||
# CSI ESCAPE SEQUENCES
|
||||
# https://en.wikipedia.org/wiki/ANSI_escape_code#CSI_codes
|
||||
# remove all but allowed CSI escape sequences
|
||||
|
||||
# negative lookahead assertion that allows safe color escape sequences
|
||||
neg_allowed_csi_sequences = r"(?!\[([0-5]|[34][0-7]|;)+m)"
|
||||
|
||||
# range 0x30–0x3F (OCT \040-\077) (ASCII 0–9:;<=>?); zero or more characters
|
||||
csi_parameter_bytes = r"[\x30-\x3F]*"
|
||||
|
||||
# range 0x20–0x2F (OCT \040-\057) (ASCII space and !"#$%&'()*+,-./); zero or more characters
|
||||
csi_itermediate_bytes = r"[\x20-\x2F]*"
|
||||
|
||||
# range 0x40–0x7E (OCT \100-\176) (ASCII @A–Z[\]^_`a–z{|}~); 1 character
|
||||
csi_final_byte = r"[\x40-\x7E]"
|
||||
|
||||
regex = rf"\033{neg_allowed_csi_sequences}\[{csi_parameter_bytes}{csi_itermediate_bytes}{csi_final_byte}"
|
||||
SANITIZE_TEXT_RE["str_csi_sequences"] = re.compile(regex)
|
||||
SANITIZE_TEXT_RE["bytes_csi_sequences"] = re.compile(regex.encode("ascii"))
|
||||
|
||||
# FE ESCAPE SEQUENCES
|
||||
# https://en.wikipedia.org/wiki/ANSI_escape_code#Fe_Escape_sequences
|
||||
# remove all Fe escape sequences
|
||||
|
||||
# range 0x40 to 0x5F (ASCII @A–Z[\]^_); 1 character
|
||||
fe = r"[\x40-x5F]"
|
||||
regex = rf"\033{neg_allowed_csi_sequences}{fe}"
|
||||
SANITIZE_TEXT_RE["str_fe_sequences"] = re.compile(regex)
|
||||
SANITIZE_TEXT_RE["bytes_fe_sequences"] = re.compile(regex.encode("ascii"))
|
||||
|
||||
# REMAINING ESCAPE CHARACTERS
|
||||
# remove all remaining escape characters that are not followed with the allowed CSI escape sequences
|
||||
|
||||
regex = rf"\033{neg_allowed_csi_sequences}"
|
||||
SANITIZE_TEXT_RE["str_esc"] = re.compile(regex)
|
||||
SANITIZE_TEXT_RE["bytes_esc"] = re.compile(regex.encode("ascii"))
|
||||
|
||||
if isinstance(text, bytes):
|
||||
return text.translate(None, FORBIDDEN_BYTES)
|
||||
return text.translate(FORBIDDEN_CHARS)
|
||||
text = SANITIZE_TEXT_RE["bytes_control"].sub(b"", text)
|
||||
text = SANITIZE_TEXT_RE["bytes_csi_sequences"].sub(b"", text)
|
||||
text = SANITIZE_TEXT_RE["bytes_fe_sequences"].sub(b"", text)
|
||||
text = SANITIZE_TEXT_RE["bytes_esc"].sub(b"", text)
|
||||
else:
|
||||
text = SANITIZE_TEXT_RE["str_control"].sub("", text)
|
||||
text = SANITIZE_TEXT_RE["str_csi_sequences"].sub("", text)
|
||||
text = SANITIZE_TEXT_RE["str_fe_sequences"].sub("", text)
|
||||
text = SANITIZE_TEXT_RE["str_esc"].sub("", text)
|
||||
return text
|
||||
|
||||
|
||||
def safe_print(*args, **kwargs):
|
||||
|
@ -5,6 +5,7 @@ import unittest
|
||||
import osc.conf
|
||||
from osc.output import KeyValueTable
|
||||
from osc.output import print_msg
|
||||
from osc.output import sanitize_text
|
||||
from osc.output import tty
|
||||
|
||||
|
||||
@ -160,5 +161,82 @@ class TestPrintMsg(unittest.TestCase):
|
||||
self.assertEqual("foo bar\n", stderr.getvalue())
|
||||
|
||||
|
||||
class TestSanitization(unittest.TestCase):
    """Exercise ``sanitize_text()`` with both ``str`` and ``bytes`` inputs."""

    def test_control_chars_bytes(self):
        # every byte value from 0 to 31; only tab, line feed and carriage return may survive
        every_control_byte = bytes(range(32))
        self.assertEqual(sanitize_text(every_control_byte), b"\t\n\r")

    def test_control_chars_str(self):
        # every code point from 0 to 31; only tab, line feed and carriage return may survive
        every_control_char = "".join(map(chr, range(32)))
        self.assertEqual(sanitize_text(every_control_char), "\t\n\r")

    def test_csi_escape_sequences_str(self):
        # allowed CSI escape sequences pass through untouched
        for kept in (">\033[0m<", ">\033[1;31;47m]<"):
            self.assertEqual(sanitize_text(kept), kept)

        # not allowed CSI escape sequences are removed completely
        for dropped in (">\033[8m<",):
            self.assertEqual(sanitize_text(dropped), "><")

    def test_csi_escape_sequences_bytes(self):
        # allowed CSI escape sequences pass through untouched
        for kept in (b">\033[0m<", b">\033[1;31;47m]<"):
            self.assertEqual(sanitize_text(kept), kept)

        # not allowed CSI escape sequences are removed completely
        for dropped in (b">\033[8m<",):
            self.assertEqual(sanitize_text(dropped), b"><")

    def test_standalone_escape_str(self):
        # an escape character that starts no sequence is dropped
        self.assertEqual(sanitize_text(">\033<"), "><")

    def test_standalone_escape_bytes(self):
        # an escape byte that starts no sequence is dropped
        self.assertEqual(sanitize_text(b">\033<"), b"><")

    def test_fe_escape_sequences_str(self):
        # escape followed by any character in range 0x40-0x5F is an Fe sequence
        for code in range(0x40, 0x60):
            char = chr(code)
            result = sanitize_text(f">\033{char}<")
            self.assertEqual(result, "><")

    def test_fe_escape_sequences_bytes(self):
        # escape followed by any byte in range 0x40-0x5F is an Fe sequence
        for code in range(0x40, 0x60):
            result = sanitize_text(b">\033" + bytes([code]) + b"<")
            self.assertEqual(result, b"><")

    def test_osc_escape_sequences_str(self):
        # OSC (Operating System Command) sequences
        window_title = "\033]0;this is the window title\007"
        # \033] is removed with the Fe sequences
        self.assertEqual(sanitize_text(window_title), "0;this is the window title")

    def test_osc_escape_sequences_bytes(self):
        # OSC (Operating System Command) sequences
        window_title = b"\033]0;this is the window title\007"
        # \033] is removed with the Fe sequences
        self.assertEqual(sanitize_text(window_title), b"0;this is the window title")
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
Loading…
Reference in New Issue
Block a user