implement file caching #11

Merged
Ghost merged 3 commits from file-cache into main 2022-11-03 14:24:22 +01:00
3 changed files with 39 additions and 8 deletions

View File

@@ -52,6 +52,13 @@ def main():
         type=pathlib.Path,
         help="Local git repository directory",
     )
+    parser.add_argument(
+        "-c",
+        "--cachedir",
+        required=False,
+        type=pathlib.Path,
+        help="Local cache directory",
+    )
     parser.add_argument(
         "-g",
         "--gc",
@@ -93,9 +100,12 @@ def main():
     if not args.repodir:
         args.repodir = pathlib.Path("repos") / args.package
+    if not args.cachedir:
+        args.cachedir = pathlib.Path("~/.cache/git-import/").expanduser()
     importer = Importer(URL_OBS, "openSUSE:Factory", args.package)
     importer.import_into_db()
-    exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir)
+    exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir, args.cachedir)
     exporter.set_gc_interval(args.gc)
     exporter.export_as_git()

View File

@@ -12,7 +12,7 @@ from lib.tree_builder import TreeBuilder
 class GitExporter:
-    def __init__(self, api_url, project, package, repodir):
+    def __init__(self, api_url, project, package, repodir, cachedir):
         self.obs = OBS()
         self.project = project
         self.package = package
@@ -26,6 +26,7 @@ class GitExporter:
         ).create()
         self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
         self.gc_interval = 200
+        self.cachedir = cachedir

     def download(self, revision):
         obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
@@ -40,10 +41,7 @@
         # Download each file in OBS if it is not a binary (or large)
         # file
         for (name, size, file_md5) in obs_files:
-            # Validate the MD5 of the downloaded file
-            if md5(self.git.path / name) != file_md5:
-                raise Exception(f"Download error in {name}")

         for name in obs_files:
             self.git.add(name)

     def set_gc_interval(self, gc):
@@ -121,6 +119,7 @@ class GitExporter:
                 file.name,
                 flat.commit.expanded_srcmd5,
                 self.git.path,
+                self.cachedir,
                 file_md5=md5,
             )
             self.git.add(file)

View File

@@ -1,9 +1,12 @@
 import errno
 import logging
+import shutil
 import time
 import urllib.parse
 import xml.etree.ElementTree as ET
 from urllib.error import HTTPError
+from pathlib import Path
+from lib.proxy_sha256 import md5

 import osc.core
@@ -158,10 +161,21 @@ class OBS:
         name: str,
         revision: str,
         dirpath: str,
+        cachedir: str,
+        file_md5: str,
     ) -> None:
-        with (dirpath / name).open("wb") as f:
-            f.write(self._download(project, package, name, revision).read())
+        cached_file = self._path_from_md5(name, cachedir, file_md5)
+        if not self.in_cache(name, cachedir, file_md5):
Ghost marked this conversation as resolved

Try first a hardlink to save space and speed up the cache:

```python
try:
    cached_file.hardlink_to(dirpath / name)
except OSError:
    # Maybe a different mount point
    # TODO use bindmount for files?
    shutil.copy(dirpath / name, cached_file)
```

You can use the same later.

Uhm, there is the risk of changing the cached version after the next update. Maybe it is better to unlink the file first if present (IIRC there is a parameter that suppresses the error if the file does not exist, removing the need to call .exists() before the unlink)

So we want to unlink the normal file (not the file in cache) every time at the start and then download or link again depending on whether it is already in cache?

I think so, right? If we unlink we should be keeping the cached version (copy or link) without risk of changing the content. What do you think?

I agree it's the better solution, I just wanted to make sure that I understand your logic correctly
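A minimal sketch of the unlink-first flow agreed on here (illustrative names; assumes Python 3.8+ for `Path.unlink(missing_ok=True)` and 3.10+ for `Path.hardlink_to`; note the thread below ultimately drops the hardlink in favor of plain copies):

```python
import shutil
from pathlib import Path

def refresh_from_cache(workfile: Path, cached_file: Path) -> None:
    # Unlink the working-copy file first; missing_ok=True avoids the
    # .exists() check mentioned above.
    workfile.unlink(missing_ok=True)
    try:
        # On a cache hit, hardlink the working copy to the cache entry
        workfile.hardlink_to(cached_file)
    except OSError:
        # Maybe a different mount point: fall back to a copy
        shutil.copy(cached_file, workfile)
```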

If you are afraid of files getting corrupted, why hardlink at all?

Do we know how git handles files in its working copy when you e.g. checkout a different branch?

Can copying 1.3M files be a bit expensive? (I did not measure it, but my crappy HD is killing me every morning)

> Do we know how git handles files in its working copy when you e.g. checkout a different branch?

When possible, hardlinks: https://git-scm.com/docs/git-clone/

It only talks about objects cloned from a local repository. And 1.3M files is for all the repos - and right now I'm downloading them again and again from OBS, which is actually worse :)

But I asked @nkrapp to check how git checkout treats files in the WC and then we can decide.

The git documentation is very ambiguous here and doesn't say outright how checkout treats files, but they used the wording `updating the index and the files in the working tree` (https://git-scm.com/docs/git-checkout/en) so I'd expect it to actually edit the files

It could be fun to check if this is true: create a repo, commit a file, create a branch, update the file in the branch, and create a hard link outside of the repo. If a checkout into master also updates the external link, then we have only one option (do the copy and forget)

Just tested it and changing a file in a branch also updated the hardlinked file outside of the repo. I guess I'll leave the code as it was then
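An illustrative reconstruction of that test (not necessarily the exact commands used; assumes git 2.28+ for `init -b`): it hardlinks a committed file outside the repo, edits and commits it on a branch, switches back, then checks whether the external link still shares the inode and what content it sees:

```python
import os
import subprocess
import tempfile
from pathlib import Path

def git(*args, cwd):
    # -c flags supply a throwaway identity so commit works anywhere
    subprocess.run(
        ["git", "-c", "user.name=test", "-c", "user.email=test@example.com", *args],
        cwd=cwd, check=True,
    )

with tempfile.TemporaryDirectory() as tmp:
    repo, outside = Path(tmp, "repo"), Path(tmp, "outside.txt")
    repo.mkdir()
    git("init", "-b", "main", cwd=repo)
    (repo / "file.txt").write_text("original\n")
    git("add", "file.txt", cwd=repo)
    git("commit", "-m", "initial", cwd=repo)
    os.link(repo / "file.txt", outside)           # hard link outside the repo
    git("switch", "-c", "branch", cwd=repo)
    (repo / "file.txt").write_text("changed\n")   # edit in the working tree
    git("commit", "-am", "update", cwd=repo)
    git("switch", "main", cwd=repo)
    print("same inode:", os.path.samefile(repo / "file.txt", outside))
    print("outside sees:", outside.read_text().strip())
```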
+            with (dirpath / name).open("wb") as f:
+                f.write(self._download(project, package, name, revision).read())
+            shutil.copy(dirpath / name, cached_file)
+        else:
+            shutil.copy(cached_file, dirpath / name)
+        # Validate the MD5 of the downloaded file
+        if md5(dirpath / name) != file_md5:
+            raise Exception(f"Download error in {name}")

     def list(self, project, package, srcmd5, linkrev):
         params = {"rev": srcmd5, "expand": "1"}
@@ -179,3 +193,11 @@ class OBS:
             raise e
Ghost marked this conversation as resolved

I propose to use the same model as git (also drop joinpath): `filepath = cache / md5[:2] / md5[2:]`

As a datapoint: we're talking about 1.3M files - that would still be ~5000 files per directory. The git objects are packed for a reason - there are not supposed to be 1.3M of them.

This implies of course we do the .cache independent of the repo path - which I agree is a bad idea
And the .cache is actually committed: https://gitea.opensuse.org/coolo/rpm/src/branch/factory/.cache

> As a datapoint: we're talking about 1.3M files - that would still be ~5000 files per directory. The git objects are packed for a reason - there are not supposed to be 1.3M of them.

In that case `filepath = cache / md5[:3] / md5[3:]` will create 4096 entries in the first level and ~300 files in each one.

That sounds like a good compromise
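For scale, the arithmetic behind the compromise: three hex digits give 16^3 = 4096 first-level directories, so 1.3M objects work out to roughly 1,300,000 / 4096 ≈ 317 files per directory. An illustrative sketch of the resulting layout, using the md5 of the empty string:

```python
from pathlib import Path

def cache_path(cachedir: Path, md5: str) -> Path:
    # e.g. <cachedir>/d41/d8cd98f00b204e9800998ecf8427e
    return cachedir / md5[:3] / md5[3:]

print(cache_path(Path("~/.cache/git-import"), "d41d8cd98f00b204e9800998ecf8427e"))
# ~/.cache/git-import/d41/d8cd98f00b204e9800998ecf8427e
print(16 ** 3, 1_300_000 // 16 ** 3)  # 4096 buckets, ~317 files per bucket
```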
         return root
Ghost marked this conversation as resolved

you create the directories here, so you can skip lines 192 and 193
+    def _path_from_md5(self, name, cachedir, md5):
+        filepath = cachedir / md5[:3]
+        filepath.mkdir(parents=True, exist_ok=True)
Ghost marked this conversation as resolved

`return self._path_from_md5(name, dirpath, md5).exists()`

I liked it more when it did not contain the {name}? That makes it a pure object access. Now if two files have the same content but a different name (LICENSE / license, README / README.txt) it will create two files.

I added this mainly so I could differentiate between the files but I know this isn't really needed outside of tests so I don't mind taking it out
+        return filepath / md5[3:]
+
+    def in_cache(self, name, cachedir, md5):
+        return self._path_from_md5(name, cachedir, md5).exists()