Merge pull request 'implement file caching' (#11) from file-cache into main

Reviewed-on: https://gitea.opensuse.org/importers/git-importer/pulls/11
This commit is contained in:
Nico Krapp 2022-11-03 14:24:22 +01:00
commit 6dd3cf3eba
3 changed files with 39 additions and 8 deletions

View File

@ -52,6 +52,13 @@ def main():
type=pathlib.Path, type=pathlib.Path,
help="Local git repository directory", help="Local git repository directory",
) )
parser.add_argument(
"-c",
"--cachedir",
required=False,
type=pathlib.Path,
help="Local cache directory",
)
parser.add_argument( parser.add_argument(
"-g", "-g",
"--gc", "--gc",
@ -93,9 +100,12 @@ def main():
if not args.repodir: if not args.repodir:
args.repodir = pathlib.Path("repos") / args.package args.repodir = pathlib.Path("repos") / args.package
if not args.cachedir:
args.cachedir = pathlib.Path("~/.cache/git-import/").expanduser()
importer = Importer(URL_OBS, "openSUSE:Factory", args.package) importer = Importer(URL_OBS, "openSUSE:Factory", args.package)
importer.import_into_db() importer.import_into_db()
exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir) exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir, args.cachedir)
exporter.set_gc_interval(args.gc) exporter.set_gc_interval(args.gc)
exporter.export_as_git() exporter.export_as_git()

View File

@ -12,7 +12,7 @@ from lib.tree_builder import TreeBuilder
class GitExporter: class GitExporter:
def __init__(self, api_url, project, package, repodir): def __init__(self, api_url, project, package, repodir, cachedir):
self.obs = OBS() self.obs = OBS()
self.project = project self.project = project
self.package = package self.package = package
@ -26,6 +26,7 @@ class GitExporter:
).create() ).create()
self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml") self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
self.gc_interval = 200 self.gc_interval = 200
self.cachedir = cachedir
def download(self, revision): def download(self, revision):
obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5) obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
@ -40,10 +41,7 @@ class GitExporter:
# Download each file in OBS if it is not a binary (or large) # Download each file in OBS if it is not a binary (or large)
# file # file
for (name, size, file_md5) in obs_files: for name in obs_files:
# Validate the MD5 of the downloaded file
if md5(self.git.path / name) != file_md5:
raise Exception(f"Download error in {name}")
self.git.add(name) self.git.add(name)
def set_gc_interval(self, gc): def set_gc_interval(self, gc):
@ -121,6 +119,7 @@ class GitExporter:
file.name, file.name,
flat.commit.expanded_srcmd5, flat.commit.expanded_srcmd5,
self.git.path, self.git.path,
self.cachedir,
file_md5=md5, file_md5=md5,
) )
self.git.add(file) self.git.add(file)

View File

@ -1,9 +1,12 @@
import errno import errno
import logging import logging
import shutil
import time import time
import urllib.parse import urllib.parse
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from urllib.error import HTTPError from urllib.error import HTTPError
from pathlib import Path
from lib.proxy_sha256 import md5
import osc.core import osc.core
@ -158,10 +161,21 @@ class OBS:
name: str, name: str,
revision: str, revision: str,
dirpath: str, dirpath: str,
cachedir: str,
file_md5: str, file_md5: str,
) -> None: ) -> None:
cached_file = self._path_from_md5(name, cachedir, file_md5)
if not self.in_cache(name, cachedir, file_md5):
with (dirpath / name).open("wb") as f: with (dirpath / name).open("wb") as f:
f.write(self._download(project, package, name, revision).read()) f.write(self._download(project, package, name, revision).read())
shutil.copy(dirpath / name, cached_file)
else:
shutil.copy(cached_file, dirpath / name)
# Validate the MD5 of the downloaded file
if md5(dirpath / name) != file_md5:
raise Exception(f"Download error in {name}")
def list(self, project, package, srcmd5, linkrev): def list(self, project, package, srcmd5, linkrev):
params = {"rev": srcmd5, "expand": "1"} params = {"rev": srcmd5, "expand": "1"}
@ -179,3 +193,11 @@ class OBS:
raise e raise e
return root return root
def _path_from_md5(self, name, cachedir, md5):
    """Return the cache path for the file whose MD5 hex digest is *md5*.

    The cache is sharded by digest: the first three characters of *md5*
    name a subdirectory of *cachedir*, and the remaining characters name
    the file inside it.

    Side effect: the shard directory is created (including missing
    parents) if it does not exist yet, so the returned path can be
    written to immediately.

    *name* is not used to build the path; only the digest determines it.
    *cachedir* may be a ``str`` or a path object.
    """
    # Coerce so that plain string directories work with the "/" operator.
    shard_dir = Path(cachedir) / md5[:3]
    shard_dir.mkdir(parents=True, exist_ok=True)
    return shard_dir / md5[3:]
def in_cache(self, name, cachedir, md5):
    """Report whether the cache path derived from *md5* already exists.

    The path is obtained from ``self._path_from_md5`` with the same
    arguments; the result is that path's ``exists()`` value.
    """
    cache_entry = self._path_from_md5(name, cachedir, md5)
    return cache_entry.exists()