From a8106f3f66220a65ce2afc58316ce1448f348676 Mon Sep 17 00:00:00 2001
From: Daniel Mach
Date: Mon, 3 Feb 2025 09:12:19 +0100
Subject: [PATCH] Fix detecting binary files

Replace the "any NUL byte means binary" check with a ratio heuristic:
data is considered binary when more than 1 in 40 of its bytes is a
control character in the ranges 0x00-0x07 or 0x0e-0x1f.

Note that the upper bound of the first range has to be spelled "\x07".
Spelled "\0x07", Python reads it as a NUL byte followed by the literal
characters "x", "0" and "7", which would count those ordinary text
characters as control characters and skip 0x01-0x07.

---
 osc/core.py        | 13 +++++++-----
 tests/test_core.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/osc/core.py b/osc/core.py
index e64ee139..692dd14e 100644
--- a/osc/core.py
+++ b/osc/core.py
@@ -2790,11 +2790,14 @@ def sha256_dgst(file):
     return s.hexdigest()
 
 
-def binary(s):
-    """return ``True`` if a string is binary data using diff's heuristic"""
-    if s and bytes('\0', "utf-8") in s[:4096]:
-        return True
-    return False
+def binary(data: bytes):
+    """
+    Return ``True`` if ``data`` is binary data.
+
+    We're using heuristics according to OBS: src/backend/BSSrcServer/filediff - look for "diff binary detection"
+    """
+    binary_chars = re.findall(b"[\x00-\x07\x0e-\x1f]", data)
+    return len(binary_chars) * 40 > len(data)
 
 
 def binary_file(fn):
diff --git a/tests/test_core.py b/tests/test_core.py
index 6b822879..1303645c 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1,5 +1,9 @@
+import os
+import shutil
+import tempfile
 import unittest
 
+from osc.core import binary_file
 from osc.core import makeurl
 from osc.core import UrlQueryArray
 from osc.core import parseRevisionOption
@@ -138,5 +142,52 @@ class TestMakeurl(unittest.TestCase):
         self.assertEqual(url, f"https://example.com/api/v1?{encoded_char}={encoded_char}")
 
 
+class TestBinaryFile(unittest.TestCase):
+    def setUp(self):
+        self.tmpdir = tempfile.mkdtemp(prefix="osc_test_")
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdir)
+
+    def test_text(self):
+        path = os.path.join(self.tmpdir, "text")
+        with open(path, "w") as f:
+            f.write(1000 * "a")
+        self.assertFalse(binary_file(path))
+
+    def test_text_with_binary_chars(self):
+        path = os.path.join(self.tmpdir, "binary")
+        with open(path, "wb") as f:
+            f.write(1000 * b"a")
+            f.write(b"\0")
+        self.assertFalse(binary_file(path))
+
+        with open(path, "wb") as f:
+            f.write(4096 * b"a")
+            f.write(b"\0")
+        self.assertFalse(binary_file(path))
+
+    def test_binary(self):
+        path = os.path.join(self.tmpdir, "binary")
+
+        # sufficient control chars in first 4k
+        with open(path, "wb") as f:
+            f.write(1000 * b"a")
+            f.write(26 * b"\0")
+        self.assertTrue(binary_file(path))
+
+        # sufficient control chars in first 4k
+        with open(path, "wb") as f:
+            f.write(3993 * b"a")
+            f.write(103 * b"\0")
+        self.assertTrue(binary_file(path))
+
+        # detected as text because we're reading only first 4k characters
+        with open(path, "wb") as f:
+            f.write(4096 * b"a")
+            f.write(1000 * b"\0")
+        self.assertFalse(binary_file(path))
+
+
 if __name__ == "__main__":
     unittest.main()