diff --git a/declare-dollar-sign-as-safe-in-urlquote.patch b/declare-dollar-sign-as-safe-in-urlquote.patch
deleted file mode 100644
index 054fecc..0000000
--- a/declare-dollar-sign-as-safe-in-urlquote.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-Index: urlgrabber-3.9.1/urlgrabber/grabber.py
-===================================================================
---- urlgrabber-3.9.1.orig/urlgrabber/grabber.py
-+++ urlgrabber-3.9.1/urlgrabber/grabber.py
-@@ -715,7 +715,7 @@ class URLParser:
-         passing into urlgrabber.
-         """
-         (scheme, host, path, parm, query, frag) = parts
--        path = urllib.quote(path)
-+        path = urllib.quote(path, safe='/$')
-         return (scheme, host, path, parm, query, frag)
- 
-     hexvals = '0123456789ABCDEF'
diff --git a/grabber_fix.diff b/grabber_fix.diff
index 2cf3257..535c6b8 100644
--- a/grabber_fix.diff
+++ b/grabber_fix.diff
@@ -1,161 +1,21 @@
---- urlgrabber-3.9.1/urlgrabber/grabber.py.orig	2010-07-02 21:24:12.000000000 -0400
-+++ urlgrabber-3.9.1/urlgrabber/grabber.py	2010-07-02 20:30:25.000000000 -0400
-@@ -68,14 +68,14 @@
-   (which can be set on default_grabber.throttle) is used. See
-   BANDWIDTH THROTTLING for more information.
- 
--  timeout = None
-+  timeout = 300
- 
--    a positive float expressing the number of seconds to wait for socket
--    operations. If the value is None or 0.0, socket operations will block
--    forever. Setting this option causes urlgrabber to call the settimeout
--    method on the Socket object used for the request. See the Python
--    documentation on settimeout for more information.
--    http://www.python.org/doc/current/lib/socket-objects.html
-+    a positive integer expressing the number of seconds to wait before
-+    timing out attempts to connect to a server. If the value is None
-+    or 0, connection attempts will not time out. The timeout is passed
-+    to the underlying pycurl object as its CONNECTTIMEOUT option, see
-+    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
-+    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
- 
-   bandwidth = 0
- 
-@@ -439,6 +439,12 @@
- except:
-     __version__ = '???'
+--- a/urlgrabber/grabber.py
++++ b/urlgrabber/grabber.py
+@@ -594,6 +594,12 @@ def _urlunquote_convert(s):
+         s = s.decode('utf8')
+     return urlunquote(s)
  
 +try:
 +    # this part isn't going to do much - need to talk to gettext
 +    from i18n import _
 +except ImportError, msg:
 +    def _(st): return st
-+
++
  ########################################################################
- # functions for debugging output. These functions are here because they
- # are also part of the module initialization.
-@@ -808,7 +814,7 @@
-         self.prefix = None
-         self.opener = None
-         self.cache_openers = True
--        self.timeout = None
-+        self.timeout = 300
-         self.text = None
-         self.http_headers = None
-         self.ftp_headers = None
-@@ -1052,9 +1058,15 @@
-         self._reget_length = 0
-         self._prog_running = False
-         self._error = (None, None)
--        self.size = None
-+        self.size = 0
-+        self._hdr_ended = False
-         self._do_open()
- 
-+
-+    def geturl(self):
-+        """ Provide the geturl() method, used to be got from
-+            urllib.addinfourl, via. urllib.URLopener.* """
-+        return self.url
- 
-     def __getattr__(self, name):
-         """This effectively allows us to wrap at the instance level.
-@@ -1085,9 +1097,14 @@
-         return -1
- 
-     def _hdr_retrieve(self, buf):
-+        if self._hdr_ended:
-+            self._hdr_dump = ''
-+            self.size = 0
-+            self._hdr_ended = False
-+
-         if self._over_max_size(cur=len(self._hdr_dump),
-                                max_size=self.opts.max_header_size):
--            return -1
-+            return -1
-         try:
-             self._hdr_dump += buf
-             # we have to get the size before we do the progress obj start
-@@ -1104,7 +1121,17 @@
-                 s = parse150(buf)
-             if s:
-                 self.size = int(s)
--
-+
-+            if buf.lower().find('location') != -1:
-+                location = ':'.join(buf.split(':')[1:])
-+                location = location.strip()
-+                self.scheme = urlparse.urlsplit(location)[0]
-+                self.url = location
-+
-+            if len(self._hdr_dump) != 0 and buf == '\r\n':
-+                self._hdr_ended = True
-+                if DEBUG: DEBUG.info('header ended:')
-+
-             return len(buf)
-         except KeyboardInterrupt:
-             return pycurl.READFUNC_ABORT
-@@ -1113,8 +1140,10 @@
-         if self._parsed_hdr:
-             return self._parsed_hdr
-         statusend = self._hdr_dump.find('\n')
-+        statusend += 1 # ridiculous as it may seem.
-         hdrfp = StringIO()
-         hdrfp.write(self._hdr_dump[statusend:])
-+        hdrfp.seek(0)
-         self._parsed_hdr = mimetools.Message(hdrfp)
-         return self._parsed_hdr
- 
-@@ -1136,6 +1165,7 @@
-         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
-         self.curl_obj.setopt(pycurl.FAILONERROR, True)
-         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
-+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
- 
-         if DEBUG:
-             self.curl_obj.setopt(pycurl.VERBOSE, True)
-@@ -1148,9 +1178,11 @@
- 
-         # timeouts
-         timeout = 300
--        if opts.timeout:
--            timeout = int(opts.timeout)
--            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
-+        if hasattr(opts, 'timeout'):
-+            timeout = int(opts.timeout or 0)
-+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
-+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
-+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
- 
-         # ssl options
-         if self.scheme == 'https':
-@@ -1276,7 +1308,7 @@
-             raise err
- 
-         elif errcode == 60:
--            msg = _("client cert cannot be verified or client cert incorrect")
-+            msg = _("Peer cert cannot be verified or peer cert invalid")
-             err = URLGrabError(14, msg)
-             err.url = self.url
-             raise err
-@@ -1291,7 +1323,12 @@
-             raise err
- 
-         elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
--            msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
-+            if self.scheme in ['http', 'https']:
-+                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
-+            elif self.scheme in ['ftp']:
-+                msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
-+            else:
-+                msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
-         else:
-             msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
-         code = errcode
-@@ -1299,6 +1336,12 @@
-             err.code = code
-             err.exception = e
-             raise err
-+        else:
-+            if self._error[1]:
+ # MODULE INITIALIZATION
+ ########################################################################
+@@ -1298,6 +1304,12 @@ class URLGrabber(object):
+                                _('Exceeded limit (%i): %s') % (limit, url))
+             err.url = url
+             raise err
++        else:
++            if self._error[1]:
@@ -164,73 +24,64 @@
 +                err.url = self.url
 +                raise err
  
-     def _do_open(self):
-         self.curl_obj = _curl_cache
-@@ -1446,9 +1489,23 @@
-             # set the time
-             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
-             if mod_time != -1:
--                os.utime(self.filename, (mod_time, mod_time))
-+                try:
-+                    os.utime(self.filename, (mod_time, mod_time))
-+                except OSError, e:
-+                    err = URLGrabError(16, _(\
-+                      'error setting timestamp on file %s from %s, OSError: %s')
-+                              % (self.filename, self.url, e))
-+                    err.url = self.url
-+                    raise err
-             # re open it
--            self.fo = open(self.filename, 'r')
-+            try:
-+                self.fo = open(self.filename, 'r')
-+            except IOError, e:
-+                err = URLGrabError(16, _(\
-+                    'error opening file from %s, IOError: %s') % (self.url, e))
-+                err.url = self.url
-+                raise err
-+
-         else:
-             #self.fo = open(self._temp_name, 'r')
-             self.fo.seek(0)
-@@ -1532,11 +1589,14 @@
-     def _over_max_size(self, cur, max_size=None):
- 
-         if not max_size:
--            max_size = self.size
--            if self.opts.size: # if we set an opts size use that, no matter what
--                max_size = self.opts.size
-+            if not self.opts.size:
-+                max_size = self.size
-+            else:
-+                max_size = self.opts.size
-+
-         if not max_size: return False # if we have None for all of the Max then this is dumb
--        if cur > max_size + max_size*.10:
-+
-+        if cur > int(float(max_size) * 1.10):
- 
-             msg = _("Downloaded more than max size for %s: %s > %s") \
-                   % (self.url, cur, max_size)
-@@ -1582,9 +1642,21 @@
-         self.opts.progress_obj.end(self._amount_read)
-     self.fo.close()
- 
--
- _curl_cache = pycurl.Curl() # make one and reuse it over and over and over
-+def reset_curl_obj():
-+    """To make sure curl has reread the network/dns info we force a reload"""
-+    global _curl_cache
-+    _curl_cache.close()
-+    _curl_cache = pycurl.Curl()
-+
-+
-+
- 
- #####################################################################
- # DEPRECATED FUNCTIONS
+ 
+         return s
+ 
+@@ -1342,6 +1354,10 @@ class PyCurlFileObject(object):
+         self._tm_last = None
+         self._do_open()
+ 
++    def geturl(self):
++        """ Provide the geturl() method, used to be got from
++            urllib.addinfourl, via. urllib.URLopener.* """
++        return self.url
+ 
+     def __getattr__(self, name):
+         """This effectively allows us to wrap at the instance level.
+@@ -1391,7 +1407,7 @@ class PyCurlFileObject(object):
+ 
+     def _hdr_retrieve(self, buf):
+         if self._hdr_ended:
+-            self._hdr_dump = b''
++            self._hdr_dump = ''
+             self.size = 0
+             self._hdr_ended = False
+ 
+@@ -1426,16 +1442,15 @@ class PyCurlFileObject(object):
+             if s:
+                 self.size = int(s)
+ 
+-            if buf.lower().find(b'location') != -1:
+-                location = b':'.join(buf.split(b':')[1:])
++            if buf.lower().find('location') != -1:
++                location = ':'.join(buf.split(':')[1:])
+                 location = location.strip()
+                 self.scheme = urlparse.urlsplit(location)[0]
+                 self.url = location
+ 
+-            self._hdr_dump += buf
+-            if len(self._hdr_dump) != 0 and buf == b'\r\n':
++            if len(self._hdr_dump) != 0 and buf == '\r\n':
+                 self._hdr_ended = True
+-                if DEBUG: DEBUG.debug('header ended:')
++                if DEBUG: DEBUG.info('header ended:')
+ 
+             return len(buf)
+         except KeyboardInterrupt:
+@@ -1444,7 +1459,7 @@ class PyCurlFileObject(object):
+     def _return_hdr_obj(self):
+         if self._parsed_hdr:
+             return self._parsed_hdr
+-        statusend = self._hdr_dump.find(b'\n')
++        statusend = self._hdr_dump.find('\n')
+         statusend += 1 # ridiculous as it may seem.
+         hdrfp = StringIO()
+         hdrfp.write(self._hdr_dump[statusend:])
+@@ -1498,7 +1513,7 @@ class PyCurlFileObject(object):
+         if hasattr(opts, 'timeout'):
+             timeout = int(opts.timeout or 0)
+             self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+-            self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, opts.minrate or 1000)
++            self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+             self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
+ 
+         # ssl options
diff --git a/python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch b/python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch
deleted file mode 100644
index 6232d0d..0000000
--- a/python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch
+++ /dev/null
@@ -1,28 +0,0 @@
-diff --unified -u -r urlgrabber-3.9.1.orig/urlgrabber/mirror.py urlgrabber-3.9.1/urlgrabber/mirror.py
---- urlgrabber-3.9.1.orig/urlgrabber/mirror.py	2014-09-16 14:44:54.582048746 +0200
-+++ urlgrabber-3.9.1/urlgrabber/mirror.py	2014-09-16 14:49:24.138034099 +0200
-@@ -88,6 +88,7 @@
- 
- 
- import random
-+import urlparse
- import thread # needed for locking to make this threadsafe
- 
- from grabber import URLGrabError, CallbackObject, DEBUG
-@@ -366,11 +367,12 @@
-     # by overriding the configuration methods :)
- 
-     def _join_url(self, base_url, rel_url):
--        if base_url.endswith('/') or rel_url.startswith('/'):
--            return base_url + rel_url
-+        (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url)
-+        if path.endswith('/') or rel_url.startswith('/'):
-+            return urlparse.urlunsplit((scheme, netloc, path + rel_url, query, fragid))
-         else:
--            return base_url + '/' + rel_url
--
-+            return urlparse.urlunsplit((scheme, netloc, path + '/' + rel_url, query, fragid))
-+
-     def _mirror_try(self, func, url, kw):
-         gr = GrabRequest()
-         gr.func = func
diff --git a/python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif b/python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif
deleted file mode 100644
index 3231740..0000000
--- a/python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif
+++ /dev/null
@@ -1,14 +0,0 @@
-Index: urlgrabber-3.9.1/urlgrabber/grabber.py
-===================================================================
---- urlgrabber-3.9.1.orig/urlgrabber/grabber.py
-+++ urlgrabber-3.9.1/urlgrabber/grabber.py
-@@ -1190,7 +1190,8 @@ class PyCurlFileObject():
-             self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
-             self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
-             self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
--            self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
-+            if opts.ssl_verify_host: # 1 is meaningless to curl
-+                self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
-             if opts.ssl_key:
-                 self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
-             if opts.ssl_key_type:
diff --git a/python-urlgrabber.changes b/python-urlgrabber.changes
index 6b1926b..c458acb 100644
--- a/python-urlgrabber.changes
+++ b/python-urlgrabber.changes
@@ -1,3 +1,13 @@
+-------------------------------------------------------------------
+Mon Feb 25 17:44:43 CET 2019 - Matej Cepl
+
+- Update to the upstream version 4.0.0:
+  * Port to Python 3 rocket
+  * Add curl_obj option to grabber
+  * Throw an obvious error message when urlgrabber-ext-down is
+    missing when attempting to use external downloader
+  * Use setuptools for setup.py instead of distutils
+
 -------------------------------------------------------------------
 Tue Dec 4 12:55:41 UTC 2018 - Matej Cepl
 
diff --git a/python-urlgrabber.spec b/python-urlgrabber.spec
index a66931e..635fcd8 100644
--- a/python-urlgrabber.spec
+++ b/python-urlgrabber.spec
@@ -1,7 +1,7 @@
 #
 # spec file for package python-urlgrabber
 #
-# Copyright (c) 2018 SUSE LINUX GmbH, Nuernberg, Germany.
+# Copyright (c) 2019 SUSE LINUX GmbH, Nuernberg, Germany.
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -19,20 +19,20 @@
 %{!?python_sitelib: %global python_sitelib %(python -c "from distutils.sysconfig import get_python_lib; print get_python_lib()")}
 %{?!python_module:%define python_module() python-%{**} python3-%{**}}
 %define skip_python3 1
+%define modname urlgrabber
+
 Name:           python-urlgrabber
-Version:        3.9.1
+Version:        4.0.0
 Release:        0
 Summary:        A high-level cross-protocol url-grabber
 License:        LGPL-2.1-only
 Group:          Development/Languages/Python
 URL:            http://urlgrabber.baseurl.org
-Source:         https://files.pythonhosted.org/packages/source/u/urlgrabber/urlgrabber-%{version}.tar.gz
+Source:         https://github.com/rpm-software-management/%{modname}/releases/download/%{modname}-4-0-0/%{modname}-%{version}.tar.gz
 Patch0:         grabber_fix.diff
-# PATCH-FIX-UPSTREAM bnc#896844
-Patch1:         python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch
-Patch2:         declare-dollar-sign-as-safe-in-urlquote.patch
-Patch3:         python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif
 BuildRequires:  %{python_module pycurl}
+BuildRequires:  %{python_module setuptools}
+BuildRequires:  %{python_module six}
 BuildRequires:  fdupes
 BuildRequires:  python-rpm-macros
 Requires:       python-pycurl
@@ -47,10 +47,7 @@ throttling, authentication, proxies and more.
 %prep
 %setup -q -n urlgrabber-%{version}
 sed -i "13d" urlgrabber/__init__.py # Remove wrong license header, fixes bnc#781323
-%patch0 -p1
-%patch1 -p1
-%patch2 -p1
-%patch3 -p1
+%autopatch -p1
 
 %build
 %python_build
@@ -58,6 +55,7 @@
 %install
 %python_install
 rm -rf %{buildroot}%{_datadir}/doc/urlgrabber-%{version} # Remove wrongly installed docs
+mv -v %{buildroot}%{_usr}/libexec/urlgrabber-ext-down %{buildroot}%{_usr}/lib/urlgrabber-ext-down
 %python_expand %fdupes %{buildroot}%{$python_sitelib}
 
 %files %{python_files}
@@ -65,5 +63,6 @@
 %doc ChangeLog README TODO
 %{_bindir}/urlgrabber
 %{python_sitelib}/*
+%{_usr}/lib/urlgrabber*
 
 %changelog
diff --git a/urlgrabber-3.9.1.tar.gz b/urlgrabber-3.9.1.tar.gz
deleted file mode 100644
index b74a40e..0000000
--- a/urlgrabber-3.9.1.tar.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b4e276fa968c66671309a6d754c4b3b0cb2003dec8bca87a681378a22e0d3da7
-size 72071
diff --git a/urlgrabber-4.0.0.tar.gz b/urlgrabber-4.0.0.tar.gz
new file mode 100644
index 0000000..cd4f830
--- /dev/null
+++ b/urlgrabber-4.0.0.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d152d10c7decce45ce5b44f8ee8ee5fd0047217f4152eebb8e0c552ca5137af0
+size 86308
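
Background on the dropped declare-dollar-sign-as-safe-in-urlquote.patch: urllib.quote escapes '$' by default, so a repository path that still carries an unexpanded variable such as $releasever reaches the server as %24releasever. Declaring '$' safe leaves such paths intact. A minimal sketch of the behavior the patch changed (Python 2 urllib, as in the patch; the path is illustrative only):

    import urllib  # Python 2

    path = '/distribution/$releasever/repo/oss'
    print(urllib.quote(path))             # -> /distribution/%24releasever/repo/oss
    print(urllib.quote(path, safe='/$'))  # -> /distribution/$releasever/repo/oss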
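On the timeout handling that grabber_fix.diff keeps in place for both the 3.9.1 and 4.0.0 patches: pycurl's CONNECTTIMEOUT only bounds the connection phase, so the patch pairs it with LOW_SPEED_LIMIT/LOW_SPEED_TIME, which make libcurl abort an established transfer whose rate stays below the limit (1 byte/s in the patch) for the whole timeout window. A standalone sketch of that option combination — the URL and the 300-second value are placeholders, not taken from the patch:

    import pycurl
    from io import BytesIO

    buf = BytesIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, 'http://example.com/some/file')  # placeholder URL
    curl.setopt(pycurl.WRITEFUNCTION, buf.write)
    curl.setopt(pycurl.CONNECTTIMEOUT, 300)   # give up if connecting takes over 300s
    curl.setopt(pycurl.LOW_SPEED_LIMIT, 1)    # below 1 byte/s ...
    curl.setopt(pycurl.LOW_SPEED_TIME, 300)   # ... for 300s counts as a stalled transfer
    try:
        curl.perform()                        # raises pycurl.error on timeout or stall
    finally:
        curl.close()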
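The dropped preserve-queryparams patch (bnc#896844) fixed _join_url in mirror.py, where naive string concatenation spliced the relative path into the base URL's query string. Splitting the URL first keeps the query where it belongs; a sketch of the difference, with a hypothetical mirror URL (Python 2 urlparse, matching the deleted patch):

    import urlparse  # Python 2 module, as used by the deleted patch

    def join_url(base_url, rel_url):
        (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url)
        sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
        return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid))

    base = 'http://mirror.example.com/repo?credentials=abc'  # hypothetical URL
    print(base + '/' + 'repodata/repomd.xml')
    # naive join: http://mirror.example.com/repo?credentials=abc/repodata/repomd.xml
    print(join_url(base, 'repodata/repomd.xml'))
    # fixed join: http://mirror.example.com/repo/repodata/repomd.xml?credentials=abc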
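Finally, the dropped SSL_VERIFYHOST patch encodes a real libcurl quirk: CURLOPT_SSL_VERIFYHOST accepts 0 (no host check) or 2 (verify that the certificate matches the host name), while 1 is, as the patch comment put it, meaningless to curl. Mapping any truthy option value to 2 is the safe translation. A sketch of the same guard outside urlgrabber, where ssl_verify_host stands in for opts.ssl_verify_host:

    import pycurl

    curl = pycurl.Curl()
    ssl_verify_host = True  # stand-in for urlgrabber's opts.ssl_verify_host
    # libcurl's host check is 0 or 2; 1 is not a valid "on" value,
    # so never pass a boolean-ish setting through unchanged.
    curl.setopt(pycurl.SSL_VERIFYHOST, 2 if ssl_verify_host else 0)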