1
0
mirror of https://github.com/openSUSE/osc.git synced 2025-01-04 05:46:16 +01:00
github.com_openSUSE_osc/osc/util/helper.py

92 lines
2.2 KiB
Python
Raw Normal View History

# Copyright (C) 2018 SUSE Linux. All rights reserved.
# This program is free software; it may be used, copied, modified
# and distributed under the terms of the GNU General Public Licence,
# either version 2, or (at your option) any later version.
try:
import html
except ImportError:
import cgi as html
2020-02-20 08:45:02 +01:00
from osc import oscerr
def cmp_to_key(mycmp):
    """Wrap an old-style cmp= function so it can be used as a key= function.

    The returned class stores the wrapped value in ``obj`` and implements
    every rich comparison by calling ``mycmp`` on the two stored values and
    comparing the result against zero. Instances are not hashable.
    """
    def compare(left, right):
        # Single place where the wrapped values are handed to mycmp.
        return mycmp(left.obj, right.obj)

    class K(object):
        def __init__(self, obj, *args):
            # Additional positional arguments are accepted and ignored.
            self.obj = obj

        def __lt__(self, other):
            return compare(self, other) < 0

        def __gt__(self, other):
            return compare(self, other) > 0

        def __eq__(self, other):
            return compare(self, other) == 0

        def __le__(self, other):
            return compare(self, other) <= 0

        def __ge__(self, other):
            return compare(self, other) >= 0

        def __ne__(self, other):
            return compare(self, other) != 0

        def __hash__(self):
            raise TypeError('hash not implemented')

    return K
def decode_list(ilist):
    """Return a new list with every non-str element of ilist decoded.

    Elements that already are str are copied over untouched; all other
    elements are passed through decode_it.
    """
    return [
        item if isinstance(item, str) else decode_it(item)
        for item in ilist
    ]
def decode_it(obj):
    """Decode the given object unless it is a str.

    If the given object is a str or has no decode method, the object itself
    is returned. Otherwise, try to decode the object using utf-8. If this
    fails due to a UnicodeDecodeError, try to decode the object using
    latin-1.
    """
    if isinstance(obj, str) or not hasattr(obj, 'decode'):
        return obj
    # No encoding detection is attempted on purpose: detection can be very
    # slow on large inputs, so only utf-8 (tried first) and latin-1 are
    # considered.
    try:
        return obj.decode('utf-8')
    except UnicodeDecodeError:
        # For bytes input, latin-1 assigns a character to every byte value.
        return obj.decode('latin-1')
def raw_input(*args):
    """Read a line from the user, on both Python 3 and Python 2.7.

    All arguments are forwarded to the underlying input function. An
    EOFError raised while reading is turned into oscerr.UserAbort.
    """
    try:
        import builtins
    except ImportError:
        #python 2.7
        import __builtin__
        reader = __builtin__.raw_input
    else:
        reader = builtins.input

    try:
        answer = reader(*args)
    except EOFError:
        # interpret ctrl-d as user abort
        raise oscerr.UserAbort()
    return answer
def _html_escape(data):
    """Return data with HTML special characters escaped, quotes left alone."""
    escaped = html.escape(data, quote=False)
    return escaped