1
0
mirror of https://github.com/openSUSE/osc.git synced 2025-02-22 18:22:12 +01:00

Merge pull request #1697 from dmach/fix-detecting-binaries

Fix detecting binary files
This commit is contained in:
Daniel Mach 2025-02-03 14:38:34 +01:00 committed by GitHub
commit b469f31d6c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 59 additions and 5 deletions

View File

@ -2790,11 +2790,14 @@ def sha256_dgst(file):
return s.hexdigest()
def binary(s):
    """Report whether ``s`` looks like binary data (diff's heuristic).

    The data counts as binary when a NUL byte occurs within its first
    4096 bytes. Empty or otherwise falsy input is never binary.
    """
    if not s:
        return False
    nul = bytes('\0', "utf-8")
    head = s[:4096]
    return nul in head
def binary(data: bytes) -> bool:
    """
    Return ``True`` if ``data`` is binary data.

    We're using heuristics according to OBS: src/backend/BSSrcServer/filediff - look for "diff binary detection"

    The data is considered binary when more than 1 in 40 of its bytes are
    control characters in the ranges 0x00-0x07 or 0x0e-0x1f. Common
    whitespace controls (backspace, tab, newline, vertical tab, form feed,
    carriage return: 0x08-0x0d) are not counted. Empty input is not binary.

    :param data: the bytes to inspect
    :return: ``True`` when the share of control bytes exceeds the threshold
    """
    # A raw pattern lets the regex engine interpret the \xNN escapes itself.
    # The previous non-raw literal spelled the upper bound as "\0x07", which is
    # a NUL byte followed by the literal characters "x", "0" and "7": ordinary
    # text containing those characters was counted as binary and the bytes
    # 0x01-0x07 were not counted at all.
    binary_chars = re.findall(rb"[\x00-\x07\x0e-\x1f]", data)
    return len(binary_chars) * 40 > len(data)
def binary_file(fn):

View File

@ -1,5 +1,9 @@
import os
import shutil
import tempfile
import unittest
from osc.core import binary_file
from osc.core import makeurl
from osc.core import UrlQueryArray
from osc.core import parseRevisionOption
@ -138,5 +142,52 @@ class TestMakeurl(unittest.TestCase):
self.assertEqual(url, f"https://example.com/api/v1?{encoded_char}={encoded_char}")
class TestBinaryFile(unittest.TestCase):
    """Check ``binary_file`` against small files written to a scratch directory."""

    def setUp(self):
        # each test works in its own temporary directory, removed in tearDown()
        self.tmpdir = tempfile.mkdtemp(prefix="osc_test_")

    def tearDown(self):
        shutil.rmtree(self.tmpdir)

    def _write_bytes(self, name, *chunks):
        """Create or overwrite ``name`` in the scratch directory and return its path."""
        target = os.path.join(self.tmpdir, name)
        with open(target, "wb") as handle:
            for chunk in chunks:
                handle.write(chunk)
        return target

    def test_text(self):
        target = os.path.join(self.tmpdir, "text")
        with open(target, "w") as handle:
            handle.write("a" * 1000)
        self.assertFalse(binary_file(target))

    def test_text_with_binary_chars(self):
        # a single NUL byte among 1000 printable bytes is still text
        target = self._write_bytes("binary", b"a" * 1000, b"\0")
        self.assertFalse(binary_file(target))

        # same file name, now with the single NUL byte after 4096 printable bytes
        target = self._write_bytes("binary", b"a" * 4096, b"\0")
        self.assertFalse(binary_file(target))

    def test_binary(self):
        # 26 NUL bytes after 1000 printable bytes: enough control chars in the first 4k
        target = self._write_bytes("binary", b"a" * 1000, b"\0" * 26)
        self.assertTrue(binary_file(target))

        # 103 NUL bytes filling up exactly 4096 bytes: enough control chars in the first 4k
        target = self._write_bytes("binary", b"a" * 3993, b"\0" * 103)
        self.assertTrue(binary_file(target))

        # detected as text because we're reading only first 4k characters
        target = self._write_bytes("binary", b"a" * 4096, b"\0" * 1000)
        self.assertFalse(binary_file(target))
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()