From 3c62dc0651025a6838434ed5e9e20dd4e9930b4c Mon Sep 17 00:00:00 2001 From: Daniel Mach Date: Mon, 10 Feb 2025 14:13:06 +0100 Subject: [PATCH 1/2] Fix typo in core.binary() that caused text files to be detected as binary --- osc/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/osc/core.py b/osc/core.py index 00ed806c..cb08e2dc 100644 --- a/osc/core.py +++ b/osc/core.py @@ -2796,7 +2796,7 @@ def binary(data: bytes): We're using heuristics according to OBS: src/backend/BSSrcServer/filediff - look for "diff binary detection" """ - binary_chars = re.findall(b"[\x00-\0x07\x0e-\x1f]", data) + binary_chars = re.findall(b"[\x00-\x07\x0e-\x1f]", data) return len(binary_chars) * 40 > len(data) From 8287354d48af6566f1a03e5dde4bcba83a14e09b Mon Sep 17 00:00:00 2001 From: Daniel Mach Date: Tue, 11 Feb 2025 14:46:06 +0100 Subject: [PATCH 2/2] Improve core.binary() by always considering data that contains \0 binary --- osc/core.py | 2 ++ tests/test_core.py | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/osc/core.py b/osc/core.py index cb08e2dc..5ac4a6af 100644 --- a/osc/core.py +++ b/osc/core.py @@ -2796,6 +2796,8 @@ def binary(data: bytes): We're using heuristics according to OBS: src/backend/BSSrcServer/filediff - look for "diff binary detection" """ + if b"\0" in data: + return True binary_chars = re.findall(b"[\x00-\x07\x0e-\x1f]", data) return len(binary_chars) * 40 > len(data) diff --git a/tests/test_core.py b/tests/test_core.py index 1303645c..9009d967 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -159,12 +159,12 @@ class TestBinaryFile(unittest.TestCase): path = os.path.join(self.tmpdir, "binary") with open(path, "wb") as f: f.write(1000 * b"a") - f.write(b"\0") + f.write(b"\1") self.assertFalse(binary_file(path)) with open(path, "wb") as f: f.write(4096 * b"a") - f.write(b"\0") + f.write(b"\1") self.assertFalse(binary_file(path)) def test_binary(self): @@ -173,19 +173,26 @@ class 
TestBinaryFile(unittest.TestCase): # sufficient control chars in first 4k with open(path, "wb") as f: f.write(1000 * b"a") - f.write(26 * b"\0") + f.write(26 * b"\1") self.assertTrue(binary_file(path)) # sufficient control chars in first 4k with open(path, "wb") as f: f.write(3993 * b"a") - f.write(103 * b"\0") + f.write(103 * b"\1") + self.assertTrue(binary_file(path)) + + # a single \0 is good enough for us to say it's a binary file + with open(path, "wb") as f: + f.write(3993 * b"a") + f.write(b"\0") + f.write(999 * b"\1") self.assertTrue(binary_file(path)) # detected as text because we're reading only first 4k characters with open(path, "wb") as f: f.write(4096 * b"a") - f.write(1000 * b"\0") + f.write(1000 * b"\1") self.assertFalse(binary_file(path))