From 7678967ae0b851b38db82bbc12e86e60853d2982 Mon Sep 17 00:00:00 2001 From: nkrapp Date: Wed, 2 Nov 2022 16:58:40 +0100 Subject: [PATCH 1/3] implement file caching to prevent having to download files multiple times --- lib/obs.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/lib/obs.py b/lib/obs.py index 05af330..e1a558e 100644 --- a/lib/obs.py +++ b/lib/obs.py @@ -1,9 +1,11 @@ import errno import logging +import shutil import time import urllib.parse import xml.etree.ElementTree as ET from urllib.error import HTTPError +from pathlib import Path import osc.core @@ -160,8 +162,13 @@ class OBS: dirpath: str, file_md5: str, ) -> None: - with (dirpath / name).open("wb") as f: - f.write(self._download(project, package, name, revision).read()) + cached_file = self._path_from_md5(name, dirpath, file_md5) + if not self.in_cache(name, dirpath, file_md5): + with (dirpath / name).open("wb") as f: + f.write(self._download(project, package, name, revision).read()) + shutil.copy(dirpath / name, cached_file) + else: + shutil.copy(cached_file, dirpath / name) def list(self, project, package, srcmd5, linkrev): params = {"rev": srcmd5, "expand": "1"} @@ -179,3 +186,16 @@ class OBS: raise e return root + + def _path_from_md5(self, name, dirpath, md5): + cache = dirpath.joinpath(".cache/") + if not Path(cache).exists(): + cache.mkdir() + filepath = cache.joinpath(f"{md5[0:3]}/{md5[3:6]}/{md5[6:9]}/") + filepath.mkdir(parents=True, exist_ok=True) + return filepath.joinpath(f"{md5[9:]}-{name}") + + def in_cache(self, name, dirpath, md5): + if self._path_from_md5(name, dirpath, md5).is_file(): + return True + return False From 639096b548002b45d632574c05b8be764cbc7b66 Mon Sep 17 00:00:00 2001 From: nkrapp Date: Thu, 3 Nov 2022 13:17:49 +0100 Subject: [PATCH 2/3] optimize cached file locations and add option for cache directory --- git-importer.py | 12 +++++++++++- lib/git_exporter.py | 9 ++++----- lib/obs.py | 27 +++++++++++++++------------ 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/git-importer.py b/git-importer.py index eabadbf..12b1910 100755 --- a/git-importer.py +++ b/git-importer.py @@ -52,6 +52,13 @@ def main(): type=pathlib.Path, help="Local git repository directory", ) + parser.add_argument( + "-c", + "--cachedir", + required=False, + type=pathlib.Path, + help="Local cache directory", + ) parser.add_argument( "-g", "--gc", @@ -93,9 +100,12 @@ def main(): if not args.repodir: args.repodir = pathlib.Path("repos") / args.package + if not args.cachedir: + args.cachedir = pathlib.Path("~/.cache/git-import/").expanduser() + importer = Importer(URL_OBS, "openSUSE:Factory", args.package) importer.import_into_db() - exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir) + exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir, args.cachedir) exporter.set_gc_interval(args.gc) exporter.export_as_git() diff --git a/lib/git_exporter.py b/lib/git_exporter.py index 4b14720..88cfc23 100644 --- a/lib/git_exporter.py +++ b/lib/git_exporter.py @@ -12,7 +12,7 @@ from lib.tree_builder import TreeBuilder class GitExporter: - def __init__(self, api_url, project, package, repodir): + def __init__(self, api_url, project, package, repodir, cachedir): self.obs = OBS() self.project = project self.package = package @@ -26,6 +26,7 @@ class GitExporter: ).create() self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml") self.gc_interval = 200 + self.cachedir = cachedir def download(self, revision): obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5) @@ -40,10 +41,7 @@ class GitExporter: # Download each file in OBS if it is not a binary (or large) # file - for (name, size, file_md5) in obs_files: - # Validate the MD5 of the downloaded file - if md5(self.git.path / name) != file_md5: - raise Exception(f"Download error in {name}") + for name in obs_files: self.git.add(name) def set_gc_interval(self, gc): @@ -121,6 +119,7 @@ class GitExporter: file.name, flat.commit.expanded_srcmd5, self.git.path, + self.cachedir, file_md5=md5, ) self.git.add(file) diff --git a/lib/obs.py b/lib/obs.py index e1a558e..4d0cade 100644 --- a/lib/obs.py +++ b/lib/obs.py @@ -6,6 +6,7 @@ import urllib.parse import xml.etree.ElementTree as ET from urllib.error import HTTPError from pathlib import Path +from lib.proxy_sha256 import md5 import osc.core @@ -160,16 +161,22 @@ class OBS: name: str, revision: str, dirpath: str, + cachedir: str, file_md5: str, ) -> None: - cached_file = self._path_from_md5(name, dirpath, file_md5) - if not self.in_cache(name, dirpath, file_md5): + + cached_file = self._path_from_md5(name, cachedir, file_md5) + if not self.in_cache(name, cachedir, file_md5): with (dirpath / name).open("wb") as f: f.write(self._download(project, package, name, revision).read()) shutil.copy(dirpath / name, cached_file) else: shutil.copy(cached_file, dirpath / name) + # Validate the MD5 of the downloaded file + if md5(dirpath / name) != file_md5: + raise Exception(f"Download error in {name}") + def list(self, project, package, srcmd5, linkrev): params = {"rev": srcmd5, "expand": "1"} if linkrev: @@ -187,15 +194,11 @@ class OBS: return root - def _path_from_md5(self, name, dirpath, md5): - cache = dirpath.joinpath(".cache/") - if not Path(cache).exists(): - cache.mkdir() - filepath = cache.joinpath(f"{md5[0:3]}/{md5[3:6]}/{md5[6:9]}/") + def _path_from_md5(self, name, cachedir, md5): + filepath = cachedir / md5[:3] + cached_file = f"{md5[3:]}-{name}" filepath.mkdir(parents=True, exist_ok=True) - return filepath.joinpath(f"{md5[9:]}-{name}") + return filepath / cached_file - def in_cache(self, name, dirpath, md5): - if self._path_from_md5(name, dirpath, md5).is_file(): - return True - return False + def in_cache(self, name, cachedir, md5): + return self._path_from_md5(name, cachedir, md5).exists() From 8aed76e52a0caf4492e972635d4fb34c59e6aa1a Mon Sep 17 00:00:00 2001 From: nkrapp Date: Thu, 3 Nov 2022 14:22:19 +0100 Subject: [PATCH 3/3] change cached file naming pattern --- lib/obs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/obs.py b/lib/obs.py index 4d0cade..73e6d8e 100644 --- a/lib/obs.py +++ b/lib/obs.py @@ -196,9 +196,8 @@ class OBS: def _path_from_md5(self, name, cachedir, md5): filepath = cachedir / md5[:3] - cached_file = f"{md5[3:]}-{name}" filepath.mkdir(parents=True, exist_ok=True) - return filepath / cached_file + return filepath / md5[3:] def in_cache(self, name, cachedir, md5): return self._path_from_md5(name, cachedir, md5).exists()