implement file caching #11
```diff
@@ -52,6 +52,13 @@ def main():
         type=pathlib.Path,
         help="Local git repository directory",
     )
+    parser.add_argument(
+        "-c",
+        "--cachedir",
+        required=False,
+        type=pathlib.Path,
+        help="Local cache directory",
+    )
     parser.add_argument(
         "-g",
         "--gc",
@@ -93,9 +100,12 @@ def main():
     if not args.repodir:
         args.repodir = pathlib.Path("repos") / args.package
 
+    if not args.cachedir:
+        args.cachedir = pathlib.Path("~/.cache/git-import/").expanduser()
+
     importer = Importer(URL_OBS, "openSUSE:Factory", args.package)
     importer.import_into_db()
-    exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir)
+    exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir, args.cachedir)
     exporter.set_gc_interval(args.gc)
     exporter.export_as_git()
 
```
```diff
@@ -12,7 +12,7 @@ from lib.tree_builder import TreeBuilder
 
 
 class GitExporter:
-    def __init__(self, api_url, project, package, repodir):
+    def __init__(self, api_url, project, package, repodir, cachedir):
         self.obs = OBS()
         self.project = project
         self.package = package
@@ -26,6 +26,7 @@ class GitExporter:
         ).create()
         self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
         self.gc_interval = 200
+        self.cachedir = cachedir
 
     def download(self, revision):
        obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
@@ -40,10 +41,7 @@
 
         # Download each file in OBS if it is not a binary (or large)
         # file
-        for (name, size, file_md5) in obs_files:
-            # Validate the MD5 of the downloaded file
-            if md5(self.git.path / name) != file_md5:
-                raise Exception(f"Download error in {name}")
+        for name in obs_files:
             self.git.add(name)
 
     def set_gc_interval(self, gc):
@@ -121,6 +119,7 @@
                     file.name,
                     flat.commit.expanded_srcmd5,
                     self.git.path,
+                    self.cachedir,
                     file_md5=md5,
                 )
             self.git.add(file)
```
lib/obs.py
```diff
@@ -1,9 +1,12 @@
 import errno
 import logging
+import shutil
 import time
 import urllib.parse
 import xml.etree.ElementTree as ET
 from urllib.error import HTTPError
+from pathlib import Path
+from lib.proxy_sha256 import md5
 
 import osc.core
 
```
```diff
@@ -158,10 +161,21 @@
         name: str,
         revision: str,
         dirpath: str,
+        cachedir: str,
         file_md5: str,
     ) -> None:
-        with (dirpath / name).open("wb") as f:
-            f.write(self._download(project, package, name, revision).read())
+        cached_file = self._path_from_md5(name, cachedir, file_md5)
+
+        if not self.in_cache(name, cachedir, file_md5):
+            with (dirpath / name).open("wb") as f:
+                f.write(self._download(project, package, name, revision).read())
+            shutil.copy(dirpath / name, cached_file)
+        else:
+            shutil.copy(cached_file, dirpath / name)
+
+        # Validate the MD5 of the downloaded file
+        if md5(dirpath / name) != file_md5:
+            raise Exception(f"Download error in {name}")
 
     def list(self, project, package, srcmd5, linkrev):
         params = {"rev": srcmd5, "expand": "1"}
```
```diff
@@ -179,3 +193,11 @@
             raise e
 
```
aplanas commented:
I propose to use the same model as git (also drop joinpath): `filepath = cache / md5[:2] / md5[2:]`

Ghost commented:
As a datapoint: we're talking about 1.3M files - that would still be ~5000 files per directory. The git objects are packed for a reason - there are not supposed to be 1.3M of them.

Ghost commented:
This implies of course that we do the .cache independent of the repo path - which I agree is a bad idea.

Ghost commented:
And the .cache is actually committed: https://gitea.opensuse.org/coolo/rpm/src/branch/factory/.cache

aplanas commented:
> As a datapoint: we're talking about 1.3M files - that would still be ~5000 files per directory.

In that case `filepath = cache / md5[:3] / md5[3:]` will create 4096 entries in the first level and ~300 files in each one.

Ghost commented:
That sounds like a good compromise.
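The arithmetic behind those numbers, as a quick sketch (assuming the ~1.3M cached objects spread evenly over hex-digit prefixes):

```python
# Fan-out for the two proposed cache layouts, assuming ~1.3M objects
# distributed uniformly across hex md5 prefixes.
TOTAL_FILES = 1_300_000

for prefix_len in (2, 3):
    top_level_dirs = 16 ** prefix_len
    per_dir = TOTAL_FILES // top_level_dirs
    print(f"md5[:{prefix_len}] -> {top_level_dirs} dirs, ~{per_dir} files each")

# md5[:2] -> 256 dirs, ~5078 files each
# md5[:3] -> 4096 dirs, ~317 files each
```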
```diff
         return root
+
```
aplanas commented:
You create the directories here, so you can skip lines 192 and 193.
```diff
+    def _path_from_md5(self, name, cachedir, md5):
+        filepath = cachedir / md5[:3]
+        filepath.mkdir(parents=True, exist_ok=True)
```
aplanas commented:
`return self._path_from_md5(name, dirpath, md5).exists()`

aplanas commented:
I liked it more when it did not contain the {name}? That makes it a pure object access. Now if two files have different names but the same content (LICENSE / license, README / README.txt), it will create two files.

Ghost commented:
I added this mainly so I could differentiate between the files, but I know this isn't really needed outside of tests, so I don't mind taking it out.
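To illustrate the object-access point, a minimal sketch; `object_path` is a hypothetical helper that just mirrors the `md5[:3]` split from the diff, with the name dropped from the key:

```python
import hashlib
from pathlib import Path

def object_path(cachedir: Path, data: bytes) -> Path:
    # Name-free key: the cache path depends only on the content.
    h = hashlib.md5(data).hexdigest()
    return cachedir / h[:3] / h[3:]

cache = Path("~/.cache/git-import").expanduser()
license_bytes = b"same license text\n"
# LICENSE and license with identical bytes resolve to a single object,
# so the content is stored (and downloaded) only once.
assert object_path(cache, license_bytes) == object_path(cache, license_bytes)
```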
```diff
+        return filepath / md5[3:]
+
+    def in_cache(self, name, cachedir, md5):
+        return self._path_from_md5(name, cachedir, md5).exists()
```
Try first a hardlink to save space and speed up the cache:

You can use the same later.

Uhm, there is the risk of changing the cached version after the next update. Maybe it is better to unlink the file first if present (IIRC there is a parameter that suppresses the error if the file does not exist, removing the need to call .exists() before the unlink).

So we want to unlink the normal file (not the file in cache) every time at the start, and then download or link again depending on whether it is already in the cache?

I think so, right? If we unlink, we keep the cached version (copy or link) without risk of changing the content. What do you think?

I agree it's the better solution, I just wanted to make sure that I understand your logic correctly.
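In code, the unlink-first scheme under discussion might look like this (a sketch with illustrative names; `missing_ok` is the unlink parameter alluded to above, and the PR ultimately kept plain copies after the checkout test further down):

```python
import os
from pathlib import Path

def fetch(target: Path, cached: Path, download) -> None:
    # Always drop the working-tree file first; missing_ok=True removes
    # the need to call .exists() before the unlink (Python 3.8+).
    target.unlink(missing_ok=True)
    if cached.exists():
        # Cache hit: hardlink the cached object into the working tree.
        os.link(cached, target)
    else:
        # Cache miss: download into the tree, then hardlink into the cache.
        download(target)
        cached.parent.mkdir(parents=True, exist_ok=True)
        os.link(target, cached)
```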
If you're afraid of files getting corrupted, why hardlink at all?

Do we know how git handles files in its working copy when you e.g. check out a different branch?

Can copying 1.3M files be a bit expensive? (I did not measure it, but my crappy HD is killing me every morning.)

When possible, git uses hardlinks: https://git-scm.com/docs/git-clone/

It only talks about objects cloned from a local repository. And 1.3M files is for all the repos - and right now I'm downloading them again and again from OBS, which is actually worse :)

But I asked @nkrapp to check how git checkout treats files in the WC, and then we can decide.

The git documentation is very ambiguous here and doesn't say outright how checkout treats files, but it uses the wording "updating the index and the files in the working tree" (https://git-scm.com/docs/git-checkout/en), so I'd expect it to actually edit the files.

Could be fun to check if this is true: create a repo, commit a file, create a branch, update the file in the branch, and create a hard link outside of the repo. If a checkout into master also updates the external link, we have only one option (do the copy and forget).

Just tested it, and changing a file in a branch also updated the hardlinked file outside of the repo. I guess I'll leave the code as it was, then.
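For the record, a minimal reproduction of that experiment (a sketch assuming git is on PATH; file names and messages are made up):

```python
import os
import subprocess
import tempfile
from pathlib import Path

def git(*args, cwd):
    subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True)

tmp = Path(tempfile.mkdtemp())
repo = tmp / "repo"
repo.mkdir()
git("init", cwd=repo)
git("config", "user.email", "test@example.com", cwd=repo)
git("config", "user.name", "test", cwd=repo)

(repo / "file.txt").write_text("original\n")
git("add", "file.txt", cwd=repo)
git("commit", "-m", "initial", cwd=repo)

# Hardlink the committed file outside the repo, as a cache entry would be.
os.link(repo / "file.txt", tmp / "external-link")

# Update the file on a branch: the write reuses the same inode, so the
# external hardlink sees the new content too.
git("checkout", "-b", "update", cwd=repo)
(repo / "file.txt").write_text("changed\n")
git("commit", "-am", "update file", cwd=repo)

print((tmp / "external-link").read_text())  # prints "changed"
```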