forked from pool/python-urlgrabber
Accepting request 679030 from home:mcepl:branches:devel:languages:python
- Update to the upstream version 4.0.0:
  * Port to Python 3 🚀
  * Add curl_obj option to grabber
  * Throw an obvious error message when urlgrabber-ext-down is
    missing when attempting to use external downloader
  * Use setuptools for setup.py instead of distutils

OBS-URL: https://build.opensuse.org/request/show/679030
OBS-URL: https://build.opensuse.org/package/show/devel:languages:python/python-urlgrabber?expand=0&rev=31
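As a side note on the new curl_obj option named in the changelog above, a minimal sketch of how a caller might hand the grabber a pre-configured pycurl handle; the option name comes from the upstream 4.0.0 release notes, and the VERBOSE setting is just an arbitrary example:

    import pycurl
    from urlgrabber.grabber import URLGrabber

    # Hypothetical example: pass a pre-configured pycurl handle to the
    # grabber instead of letting it use the module-level cached one.
    curl = pycurl.Curl()
    curl.setopt(pycurl.VERBOSE, True)
    g = URLGrabber(curl_obj=curl)
    g.urlgrab('http://example.com/some/file', filename='/tmp/some-file')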
declare-dollar-sign-as-safe-in-urlquote.patch
@@ -1,13 +0,0 @@
Index: urlgrabber-3.9.1/urlgrabber/grabber.py
===================================================================
--- urlgrabber-3.9.1.orig/urlgrabber/grabber.py
+++ urlgrabber-3.9.1/urlgrabber/grabber.py
@@ -715,7 +715,7 @@ class URLParser:
         passing into urlgrabber.
         """
         (scheme, host, path, parm, query, frag) = parts
-        path = urllib.quote(path)
+        path = urllib.quote(path, safe='/$')
         return (scheme, host, path, parm, query, frag)
 
 hexvals = '0123456789ABCDEF'
grabber_fix.diff
@@ -1,30 +1,8 @@
--- urlgrabber-3.9.1/urlgrabber/grabber.py.orig	2010-07-02 21:24:12.000000000 -0400
+++ urlgrabber-3.9.1/urlgrabber/grabber.py	2010-07-02 20:30:25.000000000 -0400
@@ -68,14 +68,14 @@
     (which can be set on default_grabber.throttle) is used. See
     BANDWIDTH THROTTLING for more information.

-  timeout = None
+  timeout = 300

-    a positive float expressing the number of seconds to wait for socket
-    operations. If the value is None or 0.0, socket operations will block
-    forever. Setting this option causes urlgrabber to call the settimeout
-    method on the Socket object used for the request. See the Python
-    documentation on settimeout for more information.
-    http://www.python.org/doc/current/lib/socket-objects.html
+    a positive integer expressing the number of seconds to wait before
+    timing out attempts to connect to a server. If the value is None
+    or 0, connection attempts will not time out. The timeout is passed
+    to the underlying pycurl object as its CONNECTTIMEOUT option, see
+    the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+    http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT

  bandwidth = 0

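To make the new documentation concrete, a minimal sketch of what the patched grabber effectively does with the option; pycurl.CONNECTTIMEOUT is the real libcurl CURLOPT_CONNECTTIMEOUT, and the URL is a placeholder:

    import pycurl

    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://example.com/repomd.xml')  # placeholder URL
    # Fail the transfer if the connect phase alone exceeds 300 seconds;
    # 0 would mean "never time out the connect".
    c.setopt(pycurl.CONNECTTIMEOUT, 300)
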
@@ -439,6 +439,12 @@
 except:
     __version__ = '???'
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -594,6 +594,12 @@ def _urlunquote_convert(s):
         s = s.decode('utf8')
     return urlunquote(s)

+try:
+    # this part isn't going to do much - need to talk to gettext
@@ -33,129 +11,11 @@
+    def _(st): return st
+
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -808,7 +814,7 @@
         self.prefix = None
         self.opener = None
         self.cache_openers = True
-        self.timeout = None
+        self.timeout = 300
         self.text = None
         self.http_headers = None
         self.ftp_headers = None
@@ -1052,9 +1058,15 @@
         self._reget_length = 0
         self._prog_running = False
         self._error = (None, None)
-        self.size = None
+        self.size = 0
+        self._hdr_ended = False
         self._do_open()

+
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+        urllib.addinfourl, via. urllib.URLopener.* """
+        return self.url

     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
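A short usage sketch of the geturl() method being added here, mirroring what urllib.addinfourl used to provide; the URL is a placeholder:

    import urlgrabber

    fo = urlgrabber.urlopen('http://example.com/file')
    # After any HTTP redirects, geturl() reports the URL actually fetched.
    print fo.geturl()
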
@@ -1085,9 +1097,14 @@
             return -1

     def _hdr_retrieve(self, buf):
+        if self._hdr_ended:
+            self._hdr_dump = ''
+            self.size = 0
+            self._hdr_ended = False
+
         if self._over_max_size(cur=len(self._hdr_dump),
                                max_size=self.opts.max_header_size):
-              return -1
+            return -1
         try:
             self._hdr_dump += buf
             # we have to get the size before we do the progress obj start
@@ -1104,7 +1121,17 @@
                     s = parse150(buf)
                 if s:
                     self.size = int(s)
-
+
+            if buf.lower().find('location') != -1:
+                location = ':'.join(buf.split(':')[1:])
+                location = location.strip()
+                self.scheme = urlparse.urlsplit(location)[0]
+                self.url = location
+
+            if len(self._hdr_dump) != 0 and buf == '\r\n':
+                self._hdr_ended = True
+                if DEBUG: DEBUG.info('header ended:')
+
             return len(buf)
         except KeyboardInterrupt:
             return pycurl.READFUNC_ABORT
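The redirect tracking added above can be read in isolation; a simplified standalone sketch of the same callback logic, where the state dict is a stand-in for the PyCurlFileObject attributes:

    def hdr_retrieve(state, buf):
        # A 'Location:' header means curl is about to follow a redirect;
        # remember the new target so later errors report the real URL.
        if buf.lower().find('location') != -1:
            state['url'] = ':'.join(buf.split(':')[1:]).strip()
        # A bare CRLF ends one header block; a followed redirect starts a
        # new block, so the flag lets the next call reset its state.
        if buf == '\r\n':
            state['hdr_ended'] = True
        return len(buf)  # header callbacks must report the bytes consumed
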
@@ -1113,8 +1140,10 @@
         if self._parsed_hdr:
             return self._parsed_hdr
         statusend = self._hdr_dump.find('\n')
+        statusend += 1 # ridiculous as it may seem.
         hdrfp = StringIO()
         hdrfp.write(self._hdr_dump[statusend:])
+        hdrfp.seek(0)
         self._parsed_hdr = mimetools.Message(hdrfp)
         return self._parsed_hdr

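Why the two added lines matter: mimetools.Message reads from the current file position, so the freshly written buffer must be rewound, and the status line must be skipped including its newline. A standalone sketch using the Python 2 stdlib; the header string is made up:

    import mimetools
    from StringIO import StringIO

    hdr = 'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\n\r\n'
    statusend = hdr.find('\n') + 1      # skip past the status line itself
    hdrfp = StringIO()
    hdrfp.write(hdr[statusend:])
    hdrfp.seek(0)                       # rewind before parsing
    print mimetools.Message(hdrfp).gettype()   # -> text/plain
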
@@ -1136,6 +1165,7 @@
         self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
         self.curl_obj.setopt(pycurl.FAILONERROR, True)
         self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+        self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)

         if DEBUG:
             self.curl_obj.setopt(pycurl.VERBOSE, True)
@@ -1148,9 +1178,11 @@

         # timeouts
         timeout = 300
-        if opts.timeout:
-            timeout = int(opts.timeout)
-            self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        if hasattr(opts, 'timeout'):
+            timeout = int(opts.timeout or 0)
+        self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)

         # ssl options
         if self.scheme == 'https':
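Taken together, the options set above give both a connect timeout and a stall detector; a condensed sketch, where all option names are real pycurl constants:

    import pycurl

    c = pycurl.Curl()
    c.setopt(pycurl.FOLLOWLOCATION, True)   # follow HTTP redirects
    c.setopt(pycurl.CONNECTTIMEOUT, 300)    # cap the connect phase
    # Abort if the transfer runs slower than 1 byte/s for 300 seconds,
    # i.e. treat a stalled download like a timeout.
    c.setopt(pycurl.LOW_SPEED_LIMIT, 1)
    c.setopt(pycurl.LOW_SPEED_TIME, 300)
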
@@ -1276,7 +1308,7 @@
             raise err

         elif errcode == 60:
-            msg = _("client cert cannot be verified or client cert incorrect")
+            msg = _("Peer cert cannot be verified or peer cert invalid")
             err = URLGrabError(14, msg)
             err.url = self.url
             raise err
@@ -1291,7 +1323,12 @@
             raise err

         elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
-            msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+            if self.scheme in ['http', 'https']:
+                msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+            elif self.scheme in ['ftp']:
+                msg = 'FTP Error %s : %s ' % (self.http_code, self.url)
+            else:
+                msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme)
         else:
             msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
             code = errcode
@@ -1299,6 +1336,12 @@
         err.code = code
         err.exception = e
# MODULE INITIALIZATION
########################################################################
@@ -1298,6 +1304,12 @@ class URLGrabber(object):
                   _('Exceeded limit (%i): %s') % (limit, url))
             err.url = url
             raise err
+        else:
+            if self._error[1]:
@@ -164,73 +24,64 @@
+                err.url = self.url
+                raise err

     def _do_open(self):
         self.curl_obj = _curl_cache
@@ -1446,9 +1489,23 @@
             # set the time
             mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
             if mod_time != -1:
-                os.utime(self.filename, (mod_time, mod_time))
+                try:
+                    os.utime(self.filename, (mod_time, mod_time))
+                except OSError, e:
+                    err = URLGrabError(16, _(\
+                      'error setting timestamp on file %s from %s, OSError: %s')
+                              % (self.filename, self.url, e))
+                    err.url = self.url
+                    raise err
             # re open it
-            self.fo = open(self.filename, 'r')
+            try:
+                self.fo = open(self.filename, 'r')
+            except IOError, e:
+                err = URLGrabError(16, _(\
+                  'error opening file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
         else:
             #self.fo = open(self._temp_name, 'r')
             self.fo.seek(0)
@@ -1532,11 +1589,14 @@
         return s

     def _over_max_size(self, cur, max_size=None):
         if not max_size:
-            max_size = self.size
-        if self.opts.size: # if we set an opts size use that, no matter what
-            max_size = self.opts.size
+            if not self.opts.size:
+                max_size = self.size
+            else:
+                max_size = self.opts.size
+
         if not max_size: return False # if we have None for all of the Max then this is dumb
-        if cur > max_size + max_size*.10:
+
+        if cur > int(float(max_size) * 1.10):
@@ -1342,6 +1354,10 @@ class PyCurlFileObject(object):
         self._tm_last = None
         self._do_open()

            msg = _("Downloaded more than max size for %s: %s > %s") \
                  % (self.url, cur, max_size)
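The size check being rewritten above reduces to a small pure function; a sketch of the resulting logic, with hypothetical names:

    def over_max_size(cur, max_size):
        # No limit known at all: nothing sensible to enforce.
        if not max_size:
            return False
        # Allow 10% slack over the expected size before aborting.
        return cur > int(float(max_size) * 1.10)
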
@@ -1582,9 +1642,21 @@
             self.opts.progress_obj.end(self._amount_read)
         self.fo.close()

-
+    def geturl(self):
+        """ Provide the geturl() method, used to be got from
+        urllib.addinfourl, via. urllib.URLopener.* """
+        return self.url
+
_curl_cache = pycurl.Curl() # make one and reuse it over and over and over

+def reset_curl_obj():
+    """To make sure curl has reread the network/dns info we force a reload"""
+    global _curl_cache
+    _curl_cache.close()
+    _curl_cache = pycurl.Curl()
+
+
+
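A usage sketch for the reset_curl_obj() helper added above: since the module caches one pycurl handle forever, dropping it is the only way to force libcurl to re-read DNS and network state. The call site is hypothetical:

    import urlgrabber.grabber as grabber

    # e.g. after a VPN toggle or a resolv.conf change:
    grabber.reset_curl_obj()   # closes the cached handle, makes a fresh one
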
     def __getattr__(self, name):
         """This effectively allows us to wrap at the instance level.
@@ -1391,7 +1407,7 @@ class PyCurlFileObject(object):

#####################################################################
# DEPRECATED FUNCTIONS
     def _hdr_retrieve(self, buf):
         if self._hdr_ended:
-            self._hdr_dump = b''
+            self._hdr_dump = ''
             self.size = 0
             self._hdr_ended = False

@@ -1426,16 +1442,15 @@ class PyCurlFileObject(object):
             if s:
                 self.size = int(s)

-            if buf.lower().find(b'location') != -1:
-                location = b':'.join(buf.split(b':')[1:])
+            if buf.lower().find('location') != -1:
+                location = ':'.join(buf.split(':')[1:])
                 location = location.strip()
                 self.scheme = urlparse.urlsplit(location)[0]
                 self.url = location

-            self._hdr_dump += buf
-            if len(self._hdr_dump) != 0 and buf == b'\r\n':
+            if len(self._hdr_dump) != 0 and buf == '\r\n':
                 self._hdr_ended = True
-                if DEBUG: DEBUG.debug('header ended:')
+                if DEBUG: DEBUG.info('header ended:')

             return len(buf)
         except KeyboardInterrupt:
@@ -1444,7 +1459,7 @@ class PyCurlFileObject(object):
     def _return_hdr_obj(self):
         if self._parsed_hdr:
             return self._parsed_hdr
-        statusend = self._hdr_dump.find(b'\n')
+        statusend = self._hdr_dump.find('\n')
         statusend += 1 # ridiculous as it may seem.
         hdrfp = StringIO()
         hdrfp.write(self._hdr_dump[statusend:])
@@ -1498,7 +1513,7 @@ class PyCurlFileObject(object):
         if hasattr(opts, 'timeout'):
             timeout = int(opts.timeout or 0)
         self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
-        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, opts.minrate or 1000)
+        self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, 1)
         self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)

         # ssl options

python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch
@@ -1,28 +0,0 @@
diff --unified -u -r urlgrabber-3.9.1.orig/urlgrabber/mirror.py urlgrabber-3.9.1/urlgrabber/mirror.py
--- urlgrabber-3.9.1.orig/urlgrabber/mirror.py	2014-09-16 14:44:54.582048746 +0200
+++ urlgrabber-3.9.1/urlgrabber/mirror.py	2014-09-16 14:49:24.138034099 +0200
@@ -88,6 +88,7 @@
 
 
 import random
+import urlparse
 import thread  # needed for locking to make this threadsafe
 
 from grabber import URLGrabError, CallbackObject, DEBUG
@@ -366,11 +367,12 @@
     # by overriding the configuration methods :)
 
     def _join_url(self, base_url, rel_url):
-        if base_url.endswith('/') or rel_url.startswith('/'):
-            return base_url + rel_url
+        (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url)
+        if path.endswith('/') or rel_url.startswith('/'):
+            return urlparse.urlunsplit((scheme, netloc, path + rel_url, query, fragid))
         else:
-            return base_url + '/' + rel_url
-
+            return urlparse.urlunsplit((scheme, netloc, path + '/' + rel_url, query, fragid))
+
     def _mirror_try(self, func, url, kw):
         gr = GrabRequest()
         gr.func = func
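What the rewritten _join_url buys, shown standalone with the Python 2 urlparse module; the token query string is an invented example:

    import urlparse

    base = 'http://mirror.example.com/repo?token=abc'
    rel = 'repodata/repomd.xml'
    (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base)
    # The query string now survives the join instead of being glued
    # into the middle of the path:
    print urlparse.urlunsplit((scheme, netloc, path + '/' + rel, query, fragid))
    # -> http://mirror.example.com/repo/repodata/repomd.xml?token=abc
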
python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif
@@ -1,14 +0,0 @@
Index: urlgrabber-3.9.1/urlgrabber/grabber.py
===================================================================
--- urlgrabber-3.9.1.orig/urlgrabber/grabber.py
+++ urlgrabber-3.9.1/urlgrabber/grabber.py
@@ -1190,7 +1190,8 @@ class PyCurlFileObject():
         self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
         self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
         self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
-        self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+        if opts.ssl_verify_host: # 1 is meaningless to curl
+            self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
         if opts.ssl_key:
             self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
         if opts.ssl_key_type:
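Background on the "1 is meaningless" comment: libcurl's CURLOPT_SSL_VERIFYHOST takes 0 (no host check) or 2 (verify the certificate matches the host name); the value 1 was historically a weaker debug setting, and later libcurl versions reject it outright. A sketch of the mapping, with opts and curl_obj as stand-ins:

    # Map any truthy ssl_verify_host setting to the only value that
    # actually verifies the host name, instead of passing 1 through.
    if opts.ssl_verify_host:
        curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
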
python-urlgrabber.changes
@@ -1,3 +1,13 @@
+-------------------------------------------------------------------
+Mon Feb 25 17:44:43 CET 2019 - Matej Cepl <mcepl@suse.com>
+
+- Update to the upstream version 4.0.0:
+  * Port to Python 3 🚀
+  * Add curl_obj option to grabber
+  * Throw an obvious error message when urlgrabber-ext-down is
+    missing when attempting to use external downloader
+  * Use setuptools for setup.py instead of distutils
+
 -------------------------------------------------------------------
 Tue Dec 4 12:55:41 UTC 2018 - Matej Cepl <mcepl@suse.com>
 
python-urlgrabber.spec
@@ -1,7 +1,7 @@
 #
 # spec file for package python-urlgrabber
 #
-# Copyright (c) 2018 SUSE LINUX GmbH, Nuernberg, Germany.
+# Copyright (c) 2019 SUSE LINUX GmbH, Nuernberg, Germany.
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -19,20 +19,20 @@
 %{!?python_sitelib: %global python_sitelib %(python -c "from distutils.sysconfig import get_python_lib; print get_python_lib()")}
 %{?!python_module:%define python_module() python-%{**} python3-%{**}}
 %define skip_python3 1
 %define modname urlgrabber
 
 Name:           python-urlgrabber
-Version:        3.9.1
+Version:        4.0.0
 Release:        0
 Summary:        A high-level cross-protocol url-grabber
 License:        LGPL-2.1-only
 Group:          Development/Languages/Python
 URL:            http://urlgrabber.baseurl.org
-Source:         https://files.pythonhosted.org/packages/source/u/urlgrabber/urlgrabber-%{version}.tar.gz
+Source:         https://github.com/rpm-software-management/%{modname}/releases/download/%{modname}-4-0-0/%{modname}-%{version}.tar.gz
 Patch0:         grabber_fix.diff
 # PATCH-FIX-UPSTREAM bnc#896844
 Patch1:         python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch
 Patch2:         declare-dollar-sign-as-safe-in-urlquote.patch
 Patch3:         python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif
 BuildRequires:  %{python_module pycurl}
 BuildRequires:  %{python_module setuptools}
 BuildRequires:  %{python_module six}
 BuildRequires:  fdupes
 BuildRequires:  python-rpm-macros
 Requires:       python-pycurl
@@ -47,10 +47,7 @@ throttling, authentication, proxies and more.
 %prep
 %setup -q -n urlgrabber-%{version}
 sed -i "13d" urlgrabber/__init__.py # Remove wrong license header, fixes bnc#781323
-%patch0 -p1
-%patch1 -p1
-%patch2 -p1
-%patch3 -p1
+%autopatch -p1
 
 %build
 %python_build
@@ -58,6 +55,7 @@ sed -i "13d" urlgrabber/__init__.py # Remove wrong license header, fixes bnc#781
 %install
 %python_install
 rm -rf %{buildroot}%{_datadir}/doc/urlgrabber-%{version} # Remove wrongly installed docs
+mv -v %{buildroot}%{_usr}/libexec/urlgrabber-ext-down %{buildroot}%{_usr}/lib/urlgrabber-ext-down
 %python_expand %fdupes %{buildroot}%{$python_sitelib}
 
 %files %{python_files}
@@ -65,5 +63,6 @@ rm -rf %{buildroot}%{_datadir}/doc/urlgrabber-%{version} # Remove wrongly instal
 %doc ChangeLog README TODO
 %{_bindir}/urlgrabber
 %{python_sitelib}/*
+%{_usr}/lib/urlgrabber*
 
 %changelog

urlgrabber-3.9.1.tar.gz
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b4e276fa968c66671309a6d754c4b3b0cb2003dec8bca87a681378a22e0d3da7
-size 72071

urlgrabber-4.0.0.tar.gz (new file)
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d152d10c7decce45ce5b44f8ee8ee5fd0047217f4152eebb8e0c552ca5137af0
+size 86308