From 276d6e2439c8c53c182dbe785b038919e64da9f3 Mon Sep 17 00:00:00 2001 From: Marcus Huewe Date: Thu, 4 Jun 2020 13:12:22 +0200 Subject: [PATCH] Do not use the chardet module in util.helper.decode_it In general, decode_it is used to get a str from an arbitrary bytes instance. For this, decode_it used the chardet module (if present) to detect the underlying encoding (if the bytes instance corresponds to a "supported" encoding). The drawback of this detection is that it can take quite some time in case of a large bytes instance that represents no "supported" encoding (see #669 and #746). Instead of doing a potentially "time consuming" detection, either assume a utf-8 encoding or a latin-1 encoding. Rationale: it is just not worth the effort to detect a _potential_ encoding because we have no clue what the _correct_ encoding is. For instance, consider the following bytes instance: b'This character group is not supported: [abc\xc3\xbf]' It is valid as both utf-8 and latin-1. What is the "correct" one? We don't know... Even if you interpret the bytes instance as a human you cannot give a definite answer (implicit assumption: there is no additional context available). That is, if we cannot give a definite answer in case of two potential encodings, there is no point in bringing even more potential encodings into play. Hence, do not use the chardet module. Note: the rationale for trying utf-8 first is that utf-8 is pretty much in vogue these days and, hence, the chances are "high" that we guess the "correct" encoding. 
Fixes: #669 ("check in huge shell archives is insanely slow") Fixes: #746 ("Very slow local buildlog parsing") --- osc/util/helper.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/osc/util/helper.py b/osc/util/helper.py index d834482a..e7b4a2f7 100644 --- a/osc/util/helper.py +++ b/osc/util/helper.py @@ -56,22 +56,19 @@ def decode_list(ilist): def decode_it(obj): - """ Decodes the given object if obj is not a string - based on the chardet module if possible - """ + """Decode the given object. - if obj is None or isinstance(obj, str): + If the given object has no decode method, the object itself is + returned. Otherwise, try to decode the object using utf-8. If this + fails due to a UnicodeDecodeError, try to decode the object using + latin-1. + """ + if not hasattr(obj, 'decode'): return obj - else: - try: - import chardet - return obj.decode(chardet.detect(obj)['encoding']) - except: - try: - import locale - return obj.decode(locale.getlocale()[1]) - except: - return obj.decode('latin-1') + try: + return obj.decode('utf-8') + except UnicodeDecodeError: + return obj.decode('latin-1') def raw_input(*args):