From a8b2e001d6f4d2e4683aa65d6331b971f8e2ce33 Mon Sep 17 00:00:00 2001 From: Pierre-Yves Chibon Date: Mon, 14 Sep 2020 16:03:31 +0200 Subject: [PATCH 8/9] Add support for using cchardet to detect files' encoding cchardet is a much faster version of the chardet library that can be used to automatically detect the encoding of a file. Because cchardet is only available on Python 3, it is an optional dependency for now: when it cannot be imported, pagure falls back to chardet. Fixes https://pagure.io/pagure/issue/4977 Signed-off-by: Pierre-Yves Chibon --- pagure/lib/encoding_utils.py | 12 ++++- tests/test_pagure_flask_ui_repo.py | 58 ++++++++++++++++++++----- tests/test_pagure_lib_encoding_utils.py | 40 ++++++++++++----- tests/test_pagure_lib_mimetype.py | 28 ++++++++++-- 4 files changed, 111 insertions(+), 27 deletions(-) diff --git a/pagure/lib/encoding_utils.py b/pagure/lib/encoding_utils.py index 66f7dced..304e7d84 100644 --- a/pagure/lib/encoding_utils.py +++ b/pagure/lib/encoding_utils.py @@ -15,7 +15,12 @@ from __future__ import unicode_literals, division, absolute_import from collections import namedtuple import logging -from chardet import universaldetector, __version__ as ch_version +try: + import cchardet + from cchardet import __version__ as ch_version +except ImportError: + cchardet = None + from chardet import universaldetector, __version__ as ch_version from pagure.exceptions import PagureEncodingException @@ -44,7 +49,10 @@ def detect_encodings(data): # We can't use ``chardet.detect`` because we want to dig in the internals # of the detector to bias the utf-8 result. 
- detector = universaldetector.UniversalDetector() + if cchardet is not None: + detector = cchardet.UniversalDetector() + else: + detector = universaldetector.UniversalDetector() detector.reset() detector.feed(data) result = detector.close() diff --git a/tests/test_pagure_flask_ui_repo.py b/tests/test_pagure_flask_ui_repo.py index b4322e7d..e816e7f7 100644 --- a/tests/test_pagure_flask_ui_repo.py +++ b/tests/test_pagure_flask_ui_repo.py @@ -20,6 +20,12 @@ import tempfile import time import os +cchardet = None +try: + import cchardet +except ImportError: + pass + import pygit2 import six from mock import ANY, patch, MagicMock @@ -2763,9 +2769,16 @@ class PagureFlaskRepotests(tests.Modeltests): output = self.app.get("/test/raw/master") self.assertEqual(output.status_code, 200) output_text = output.get_data(as_text=True) - self.assertEqual( - output.headers["Content-Type"].lower(), "text/plain; charset=ascii" - ) + if cchardet is not None: + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=utf-8", + ) + else: + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=ascii", + ) self.assertIn(":Author: Pierre-Yves Chibon", output_text) # Add some more content to the repo @@ -2784,9 +2797,16 @@ class PagureFlaskRepotests(tests.Modeltests): # View in a branch output = self.app.get("/test/raw/master/f/sources") - self.assertEqual( - output.headers["Content-Type"].lower(), "text/plain; charset=ascii" - ) + if cchardet is not None: + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=utf-8", + ) + else: + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=ascii", + ) self.assertEqual(output.status_code, 200) output_text = output.get_data(as_text=True) self.assertIn("foo\n bar", output_text) @@ -2837,9 +2857,16 @@ class PagureFlaskRepotests(tests.Modeltests): output = self.app.get("/test/raw/master") self.assertEqual(output.status_code, 200) 
output_text = output.get_data(as_text=True) - self.assertEqual( - output.headers["Content-Type"].lower(), "text/plain; charset=ascii" - ) + if cchardet is not None: + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=utf-8", + ) + else: + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=ascii", + ) self.assertTrue( output_text.startswith("diff --git a/test_binary b/test_binary\n") ) @@ -2877,9 +2904,16 @@ class PagureFlaskRepotests(tests.Modeltests): output = self.app.get("/fork/pingou/test3/raw/master/f/sources") self.assertEqual(output.status_code, 200) output_text = output.get_data(as_text=True) - self.assertEqual( - output.headers["Content-Type"].lower(), "text/plain; charset=ascii" - ) + if cchardet is not None: + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=utf-8", + ) + else: + self.assertEqual( + output.headers["Content-Type"].lower(), + "text/plain; charset=ascii", + ) self.assertIn("foo\n bar", output_text) def test_view_commit(self): diff --git a/tests/test_pagure_lib_encoding_utils.py b/tests/test_pagure_lib_encoding_utils.py index ccc8825f..aff7d8ba 100644 --- a/tests/test_pagure_lib_encoding_utils.py +++ b/tests/test_pagure_lib_encoding_utils.py @@ -5,11 +5,18 @@ Tests for :module:`pagure.lib.encoding_utils`. 
from __future__ import unicode_literals, absolute_import -import chardet import os import unittest import sys +cchardet = None +try: + import cchardet +except ImportError: + pass + +import chardet + sys.path.insert( 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") ) @@ -24,7 +31,10 @@ class TestGuessEncoding(unittest.TestCase): """ data = "Twas bryllyg, and the slythy toves did gyre and gymble" result = encoding_utils.guess_encoding(data.encode("ascii")) - self.assertEqual(result, "ascii") + if cchardet is not None: + self.assertEqual(result, "utf-8") + else: + self.assertEqual(result, "ascii") def test_guess_encoding_favor_utf_8(self): """ @@ -56,17 +66,24 @@ class TestGuessEncodings(unittest.TestCase): chardet_result = chardet.detect(data) if chardet.__version__[0] == "3": # The first three have different confidence values + if cchardet is not None: + expexted_list = ["utf-8"] + # The last one in the list (which apparently has only one) + self.assertEqual(result[-1].encoding, "utf-8") + else: + expexted_list = ["utf-8", "ISO-8859-9", "ISO-8859-1"] + # This is the one with the least confidence + self.assertEqual(result[-1].encoding, "windows-1255") self.assertListEqual( - [encoding.encoding for encoding in result][:3], - ["utf-8", "ISO-8859-9", "ISO-8859-1"], + [encoding.encoding for encoding in result][:3], expexted_list ) - # This is the one with the least confidence - self.assertEqual(result[-1].encoding, "windows-1255") + # The values in the middle of the list all have the same confidence # value and can't be sorted reliably: use sets. 
- self.assertEqual( - set([encoding.encoding for encoding in result]), - set( + if cchardet is not None: + expected_list = sorted(["utf-8"]) + else: + expected_list = sorted( [ "utf-8", "ISO-8859-9", @@ -89,7 +106,10 @@ class TestGuessEncodings(unittest.TestCase): "windows-1251", "windows-1255", ] - ), + ) + self.assertListEqual( + sorted(set([encoding.encoding for encoding in result])), + expected_list, ) self.assertEqual(chardet_result["encoding"], "ISO-8859-9") else: diff --git a/tests/test_pagure_lib_mimetype.py b/tests/test_pagure_lib_mimetype.py index d5947bee..8c2f4a31 100644 --- a/tests/test_pagure_lib_mimetype.py +++ b/tests/test_pagure_lib_mimetype.py @@ -9,6 +9,12 @@ import os import unittest import sys +cchardet = None +try: + import cchardet +except ImportError: + pass + from pagure.lib import mimetype sys.path.insert( @@ -20,8 +26,18 @@ class TestMIMEType(unittest.TestCase): def test_guess_type(self): dataset = [ ("hello.html", None, "text/html", None), - ("hello.html", b"#!", "text/html", "ascii"), - ("hello", b"#!", "text/plain", "ascii"), + ( + "hello.html", + b"#!", + "text/html", + "ascii" if cchardet is None else "utf-8", + ), + ( + "hello", + b"#!", + "text/plain", + "ascii" if cchardet is None else "utf-8", + ), ("hello.jpg", None, "image/jpeg", None), ("hello.jpg", b"#!", "image/jpeg", None), ("hello.jpg", b"\0", "image/jpeg", None), @@ -49,7 +65,13 @@ class TestMIMEType(unittest.TestCase): def test_get_normal_headers(self): dataset = [ - ("hello", b"#!", "text/plain; charset=ascii"), + ( + "hello", + b"#!", + "text/plain; charset=ascii" + if cchardet is None + else "text/plain; charset=utf-8", + ), ("hello.jpg", None, "image/jpeg"), ("hello.jpg", b"#!", "image/jpeg"), ("hello.jpg", b"\0", "image/jpeg"), -- 2.26.2