pagure/0008-Add-support-for-using-cchardet-to-detect-files-encod.patch
Neal Gompa a61ee7e487 - Backport various fixes from upstream
  + Patch: 0001-Display-real-line-numbers-on-pull-request-s-diff-vie.patch
  + Patch: 0002-Show-the-assignee-s-avatar-on-the-board.patch
  + Patch: 0003-Allow-setting-a-status-as-closing-even-if-the-projec.patch
  + Patch: 0004-Include-the-assignee-in-the-list-of-people-notified-.patch
  + Patch: 0005-Introduce-the-collaborator_project_groups-mapping.patch
  + Patch: 0006-When-a-file-a-detected-as-a-binary-file-return-the-r.patch
  + Patch: 0007-Remove-fenced-code-block-when-checking-mention.patch
  + Patch: 0008-Add-support-for-using-cchardet-to-detect-files-encod.patch
  + Patch: 0009-Add-support-for-disabling-user-registration.patch
- Remove mandatory dependency on systemd to ease containerization

OBS-URL: https://build.opensuse.org/package/show/devel:tools:scm/pagure?expand=0&rev=46
2020-09-24 23:02:09 +00:00

285 lines
10 KiB
Diff

From a8b2e001d6f4d2e4683aa65d6331b971f8e2ce33 Mon Sep 17 00:00:00 2001
From: Pierre-Yves Chibon <pingou@pingoured.fr>
Date: Mon, 14 Sep 2020 16:03:31 +0200
Subject: [PATCH 8/9] Add support for using cchardet to detect files' encoding

cchardet is a much faster version of the chardet library that can
be used to automatically detect the encoding of a file.
Since this library is only available on python3, we're making it
an optional dependency for now.

Fixes https://pagure.io/pagure/issue/4977

Signed-off-by: Pierre-Yves Chibon <pingou@pingoured.fr>
---
pagure/lib/encoding_utils.py | 12 ++++-
tests/test_pagure_flask_ui_repo.py | 58 ++++++++++++++++++++-----
tests/test_pagure_lib_encoding_utils.py | 40 ++++++++++++-----
tests/test_pagure_lib_mimetype.py | 28 ++++++++++--
4 files changed, 111 insertions(+), 27 deletions(-)
diff --git a/pagure/lib/encoding_utils.py b/pagure/lib/encoding_utils.py
index 66f7dced..304e7d84 100644
--- a/pagure/lib/encoding_utils.py
+++ b/pagure/lib/encoding_utils.py
@@ -15,7 +15,12 @@ from __future__ import unicode_literals, division, absolute_import
from collections import namedtuple
import logging
-from chardet import universaldetector, __version__ as ch_version
+try:
+ import cchardet
+ from cchardet import __version__ as ch_version
+except ImportError:
+ cchardet = None
+ from chardet import universaldetector, __version__ as ch_version
from pagure.exceptions import PagureEncodingException
@@ -44,7 +49,10 @@ def detect_encodings(data):
# We can't use ``chardet.detect`` because we want to dig in the internals
# of the detector to bias the utf-8 result.
- detector = universaldetector.UniversalDetector()
+ if cchardet is not None:
+ detector = cchardet.UniversalDetector()
+ else:
+ detector = universaldetector.UniversalDetector()
detector.reset()
detector.feed(data)
result = detector.close()
diff --git a/tests/test_pagure_flask_ui_repo.py b/tests/test_pagure_flask_ui_repo.py
index b4322e7d..e816e7f7 100644
--- a/tests/test_pagure_flask_ui_repo.py
+++ b/tests/test_pagure_flask_ui_repo.py
@@ -20,6 +20,12 @@ import tempfile
import time
import os
+cchardet = None
+try:
+ import cchardet
+except ImportError:
+ pass
+
import pygit2
import six
from mock import ANY, patch, MagicMock
@@ -2763,9 +2769,16 @@ class PagureFlaskRepotests(tests.Modeltests):
output = self.app.get("/test/raw/master")
self.assertEqual(output.status_code, 200)
output_text = output.get_data(as_text=True)
- self.assertEqual(
- output.headers["Content-Type"].lower(), "text/plain; charset=ascii"
- )
+ if cchardet is not None:
+ self.assertEqual(
+ output.headers["Content-Type"].lower(),
+ "text/plain; charset=utf-8",
+ )
+ else:
+ self.assertEqual(
+ output.headers["Content-Type"].lower(),
+ "text/plain; charset=ascii",
+ )
self.assertIn(":Author: Pierre-Yves Chibon", output_text)
# Add some more content to the repo
@@ -2784,9 +2797,16 @@ class PagureFlaskRepotests(tests.Modeltests):
# View in a branch
output = self.app.get("/test/raw/master/f/sources")
- self.assertEqual(
- output.headers["Content-Type"].lower(), "text/plain; charset=ascii"
- )
+ if cchardet is not None:
+ self.assertEqual(
+ output.headers["Content-Type"].lower(),
+ "text/plain; charset=utf-8",
+ )
+ else:
+ self.assertEqual(
+ output.headers["Content-Type"].lower(),
+ "text/plain; charset=ascii",
+ )
self.assertEqual(output.status_code, 200)
output_text = output.get_data(as_text=True)
self.assertIn("foo\n bar", output_text)
@@ -2837,9 +2857,16 @@ class PagureFlaskRepotests(tests.Modeltests):
output = self.app.get("/test/raw/master")
self.assertEqual(output.status_code, 200)
output_text = output.get_data(as_text=True)
- self.assertEqual(
- output.headers["Content-Type"].lower(), "text/plain; charset=ascii"
- )
+ if cchardet is not None:
+ self.assertEqual(
+ output.headers["Content-Type"].lower(),
+ "text/plain; charset=utf-8",
+ )
+ else:
+ self.assertEqual(
+ output.headers["Content-Type"].lower(),
+ "text/plain; charset=ascii",
+ )
self.assertTrue(
output_text.startswith("diff --git a/test_binary b/test_binary\n")
)
@@ -2877,9 +2904,16 @@ class PagureFlaskRepotests(tests.Modeltests):
output = self.app.get("/fork/pingou/test3/raw/master/f/sources")
self.assertEqual(output.status_code, 200)
output_text = output.get_data(as_text=True)
- self.assertEqual(
- output.headers["Content-Type"].lower(), "text/plain; charset=ascii"
- )
+ if cchardet is not None:
+ self.assertEqual(
+ output.headers["Content-Type"].lower(),
+ "text/plain; charset=utf-8",
+ )
+ else:
+ self.assertEqual(
+ output.headers["Content-Type"].lower(),
+ "text/plain; charset=ascii",
+ )
self.assertIn("foo\n bar", output_text)
def test_view_commit(self):
diff --git a/tests/test_pagure_lib_encoding_utils.py b/tests/test_pagure_lib_encoding_utils.py
index ccc8825f..aff7d8ba 100644
--- a/tests/test_pagure_lib_encoding_utils.py
+++ b/tests/test_pagure_lib_encoding_utils.py
@@ -5,11 +5,18 @@ Tests for :module:`pagure.lib.encoding_utils`.
from __future__ import unicode_literals, absolute_import
-import chardet
import os
import unittest
import sys
+cchardet = None
+try:
+ import cchardet
+except ImportError:
+ pass
+
+import chardet
+
sys.path.insert(
0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
)
@@ -24,7 +31,10 @@ class TestGuessEncoding(unittest.TestCase):
"""
data = "Twas bryllyg, and the slythy toves did gyre and gymble"
result = encoding_utils.guess_encoding(data.encode("ascii"))
- self.assertEqual(result, "ascii")
+ if cchardet is not None:
+ self.assertEqual(result, "utf-8")
+ else:
+ self.assertEqual(result, "ascii")
def test_guess_encoding_favor_utf_8(self):
"""
@@ -56,17 +66,24 @@ class TestGuessEncodings(unittest.TestCase):
chardet_result = chardet.detect(data)
if chardet.__version__[0] == "3":
# The first three have different confidence values
+ if cchardet is not None:
+ expexted_list = ["utf-8"]
+ # The last one in the list (which apparently has only one)
+ self.assertEqual(result[-1].encoding, "utf-8")
+ else:
+ expexted_list = ["utf-8", "ISO-8859-9", "ISO-8859-1"]
+ # This is the one with the least confidence
+ self.assertEqual(result[-1].encoding, "windows-1255")
self.assertListEqual(
- [encoding.encoding for encoding in result][:3],
- ["utf-8", "ISO-8859-9", "ISO-8859-1"],
+ [encoding.encoding for encoding in result][:3], expexted_list
)
- # This is the one with the least confidence
- self.assertEqual(result[-1].encoding, "windows-1255")
+
# The values in the middle of the list all have the same confidence
# value and can't be sorted reliably: use sets.
- self.assertEqual(
- set([encoding.encoding for encoding in result]),
- set(
+ if cchardet is not None:
+ expected_list = sorted(["utf-8"])
+ else:
+ expected_list = sorted(
[
"utf-8",
"ISO-8859-9",
@@ -89,7 +106,10 @@ class TestGuessEncodings(unittest.TestCase):
"windows-1251",
"windows-1255",
]
- ),
+ )
+ self.assertListEqual(
+ sorted(set([encoding.encoding for encoding in result])),
+ expected_list,
)
self.assertEqual(chardet_result["encoding"], "ISO-8859-9")
else:
diff --git a/tests/test_pagure_lib_mimetype.py b/tests/test_pagure_lib_mimetype.py
index d5947bee..8c2f4a31 100644
--- a/tests/test_pagure_lib_mimetype.py
+++ b/tests/test_pagure_lib_mimetype.py
@@ -9,6 +9,12 @@ import os
import unittest
import sys
+cchardet = None
+try:
+ import cchardet
+except ImportError:
+ pass
+
from pagure.lib import mimetype
sys.path.insert(
@@ -20,8 +26,18 @@ class TestMIMEType(unittest.TestCase):
def test_guess_type(self):
dataset = [
("hello.html", None, "text/html", None),
- ("hello.html", b"#!", "text/html", "ascii"),
- ("hello", b"#!", "text/plain", "ascii"),
+ (
+ "hello.html",
+ b"#!",
+ "text/html",
+ "ascii" if cchardet is None else "utf-8",
+ ),
+ (
+ "hello",
+ b"#!",
+ "text/plain",
+ "ascii" if cchardet is None else "utf-8",
+ ),
("hello.jpg", None, "image/jpeg", None),
("hello.jpg", b"#!", "image/jpeg", None),
("hello.jpg", b"\0", "image/jpeg", None),
@@ -49,7 +65,13 @@ class TestMIMEType(unittest.TestCase):
def test_get_normal_headers(self):
dataset = [
- ("hello", b"#!", "text/plain; charset=ascii"),
+ (
+ "hello",
+ b"#!",
+ "text/plain; charset=ascii"
+ if cchardet is None
+ else "text/plain; charset=utf-8",
+ ),
("hello.jpg", None, "image/jpeg"),
("hello.jpg", b"#!", "image/jpeg"),
("hello.jpg", b"\0", "image/jpeg"),
--
2.26.2