Merge pull request 'Fix the maintenance of .gitattributes file' (#17) from fix_lfs_attributes into main

Reviewed-on: https://gitea.opensuse.org/importers/git-importer/pulls/17
This commit is contained in:
coolo 2022-11-07 13:26:36 +01:00
commit 86490b51dd
12 changed files with 14686 additions and 84 deletions

View File

@ -5,3 +5,9 @@ all:
test:
python3 -m unittest -v tests/*.py
# Regenerate the "packages" file from the openSUSE:Factory source listing
# (osc api ...?view=info): keep the lines that have a srcmd5= attribute but
# do not mention lsrcmd5, reduce each to the value of its package="..."
# attribute, drop names containing ":", and append the "_project" entry.
# The list is built in a temporary file and only then moved to "packages".
update-packages:
f=$$(mktemp) ;\
osc api /source/openSUSE:Factory?view=info | grep -v lsrcmd5 | grep srcmd5= | sed -e 's,.*package=",,; s,".*,,' | grep -v : > $$f ;\
echo _project >> $$f ;\
mv $$f packages

View File

@ -113,14 +113,17 @@ def main():
importer = Importer(URL_OBS, "openSUSE:Factory", args.packages)
importer.import_into_db()
with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
fs = [
executor.submit(
export_package, package, args.repodir, args.cachedir, args.gc
)
for package in args.packages
]
concurrent.futures.wait(fs)
if len(args.packages) != 1:
with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
fs = [
executor.submit(
export_package, package, args.repodir, args.cachedir, args.gc
)
for package in args.packages
]
concurrent.futures.wait(fs)
else:
export_package(args.packages[0], args.repodir, args.cachedir, args.gc)
if __name__ == "__main__":

View File

@ -215,6 +215,12 @@ class DB:
"CREATE INDEX ON linked_revs(considered)",
"UPDATE scheme SET version=20",
)
schemes[21] = (
"ALTER TABLE revisions ADD COLUMN api_url VARCHAR(40)",
"UPDATE revisions SET api_url='https://api.opensuse.org'",
"ALTER TABLE revisions ALTER COLUMN api_url SET NOT NULL",
"UPDATE scheme SET version=21",
)
schema_version = self.schema_version()
if (schema_version + 1) not in schemes:
return

View File

@ -1,6 +1,5 @@
from __future__ import annotations
import logging
from hashlib import md5
from pathlib import Path
from typing import Optional
@ -27,6 +26,7 @@ class DBRevision:
self.request_number,
self.request_id,
self.files_hash,
self.api_url,
) = row
self.rev = float(self.rev)
self._files = None
@ -52,6 +52,28 @@ class DBRevision:
return self.package < other.package
return self.rev < other.rev
def request_accept_message(self):
    """Build the commit message for a revision that stems from a request.

    The message starts with a line naming the request number and its
    source project, followed by the stripped revision comment and an
    ``OBS-URL:`` trailer pointing at the request. The web URL is derived
    from ``self.api_url`` by replacing "api." with "build.".

    Returns:
        The complete commit message as a string.
    """
    request = Request.find(self.db, self.request_id)
    # A revision stored without a comment (None) must not abort the export;
    # treat it like an empty comment.
    comment = (self.comment or "").strip()
    url = self.api_url.replace("api.", "build.")
    return (
        f"Accepting request {request.number} from {request.source_project}\n\n"
        f"{comment}"
        f"\n\nOBS-URL: {url}/request/show/{self.request_number}"
    )
def git_commit_message(self):
    """Return the git commit message for this revision.

    Revisions with a ``request_id`` use ``request_accept_message()``.
    All other revisions use the stripped revision comment followed by an
    ``OBS-URL:`` trailer linking to the package revision. The web URL is
    derived from ``self.api_url`` by replacing "api." with "build.".

    Returns:
        The complete commit message as a string.
    """
    if self.request_id:
        return self.request_accept_message()

    # A revision stored without a comment (None) must not abort the export;
    # treat it like an empty comment.
    comment = (self.comment or "").strip()
    url = self.api_url.replace("api.", "build.")
    if self.rev == int(self.rev):
        # whole-numbered revision: link to it by its revision number
        rev = int(self.rev)
    else:
        # do not link to fake (fractional) revisions by number,
        # use the expanded srcmd5 instead
        rev = self.expanded_srcmd5
    return (
        f"{comment}\n"
        f"\nOBS-URL: {url}/package/show/{self.project}/{self.package}?expand=0&rev={rev}"
    )
def as_dict(self):
"""Return a dict we can put into YAML for test cases"""
ret = {
@ -64,6 +86,7 @@ class DBRevision:
"comment": self.comment,
"broken": self.broken,
"expanded_srcmd5": self.expanded_srcmd5,
"api_url": self.api_url,
"files_hash": self.files_hash,
"files": self.files_list(),
}
@ -82,8 +105,8 @@ class DBRevision:
def import_obs_rev(db: DB, revision: OBSRevision):
with db.cursor() as cur:
cur.execute(
"""INSERT INTO revisions (project, package, rev, unexpanded_srcmd5, commit_time, userid, comment, request_number)
VALUES(%s, %s, %s, %s, %s, %s, %s, %s)""",
"""INSERT INTO revisions (project, package, rev, unexpanded_srcmd5, commit_time, userid, comment, request_number, api_url)
VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
(
revision.project,
revision.package,
@ -93,6 +116,7 @@ class DBRevision:
revision.userid,
revision.comment,
revision.request_number,
revision.obs.url,
),
)
return DBRevision.fetch_revision(
@ -101,6 +125,8 @@ class DBRevision:
@staticmethod
def fetch_revision(db, project, package, rev):
"""Technically we would need the api_url as well, but we assume projects are unique
(e.g. not importing SLE from obs)"""
with db.cursor() as cur:
cur.execute(
"SELECT * FROM revisions where project=%s and package=%s and rev=%s",
@ -172,6 +198,13 @@ class DBRevision:
(xml.get("srcmd5"), self.dbid),
)
for entry in xml.findall("entry"):
# this file creates easily 100k commits and is just useless data :(
# unfortunately it's stored in the same meta package as the project config
if (
entry.get("name") == "_staging_workflow"
and self.package == "_project"
):
continue
cur.execute(
"""INSERT INTO files (name, md5, size, mtime, revision_id)
VALUES (%s,%s,%s,%s,%s)""",
@ -228,7 +261,6 @@ class DBRevision:
If it's None, the repository is empty.
"""
to_download = []
to_delete = []
if current_rev:
old_files = {
e["name"]: f"{e['md5']}-{e['size']}" for e in current_rev.files_list()
@ -237,12 +269,9 @@ class DBRevision:
old_files = dict()
for entry in self.files_list():
if old_files.get(entry["name"]) != f"{entry['md5']}-{entry['size']}":
logging.debug(f"Download {entry['name']}")
to_download.append((Path(entry["name"]), entry["size"], entry["md5"]))
old_files.pop(entry["name"], None)
for entry in old_files.keys():
logging.debug(f"Delete {entry}")
to_delete.append(Path(entry))
to_delete = [Path(e) for e in old_files.keys()]
return to_download, to_delete
@staticmethod
@ -260,9 +289,9 @@ class DBRevision:
"""Used in test cases to read a revision from fixtures into the test database"""
with db.cursor() as cur:
cur.execute(
"""INSERT INTO revisions (project, package, rev, unexpanded_srcmd5, expanded_srcmd5,
commit_time, userid, comment, broken, files_hash)
VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id""",
"""INSERT INTO revisions (project, package, rev, unexpanded_srcmd5, expanded_srcmd5,
commit_time, userid, comment, broken, files_hash, api_url)
VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id""",
(
rev_dict["project"],
rev_dict["package"],
@ -274,6 +303,7 @@ class DBRevision:
rev_dict["comment"],
rev_dict["broken"],
rev_dict["files_hash"],
rev_dict.get("api_url", "https://api.opensuse.org"),
),
)
rev_id = cur.fetchone()[0]

View File

@ -1,9 +1,11 @@
import fnmatch
import logging
import os
import pathlib
import subprocess
import pygit2
import requests
from lib.binary import BINARY
@ -23,7 +25,6 @@ class Git:
def is_open(self):
return self.repo is not None
# TODO: Extend it to packages and files
def exists(self):
"""Check if the path is a valid git repository"""
return (self.path / ".git").exists()
@ -31,10 +32,12 @@ class Git:
def create(self):
"""Create a local git repository"""
self.path.mkdir(parents=True, exist_ok=True)
self.open()
def open(self):
# Convert the path to string, to avoid some limitations in
# older pygit2
self.repo = pygit2.init_repository(str(self.path))
return self
def is_dirty(self):
"""Check if there is something to commit"""
@ -73,10 +76,8 @@ class Git:
committer=None,
committer_email=None,
committer_time=None,
allow_empty=False,
):
"""Add all the files and create a new commit in the current HEAD"""
assert allow_empty or self.is_dirty()
if not committer:
committer = self.committer if self.committer else self.user
@ -85,33 +86,20 @@ class Git:
)
committer_time = committer_time if committer_time else user_time
try:
if self.is_dirty():
self.repo.index.add_all()
except pygit2.GitError as e:
if not allow_empty:
raise e
self.repo.index.write()
author = pygit2.Signature(user, user_email, int(user_time.timestamp()))
committer = pygit2.Signature(
committer, committer_email, int(committer_time.timestamp())
)
if not parents:
try:
parents = [self.repo.head.target]
except pygit2.GitError as e:
parents = []
if not allow_empty:
raise e
tree = self.repo.index.write_tree()
return self.repo.create_commit(
"HEAD", author, committer, message, tree, parents
)
def merge_abort(self):
self.repo.state_cleanup()
def last_commit(self):
try:
return self.repo.head.target
@ -204,3 +192,25 @@ class Git:
if file.name in patterns:
patterns.remove(file.name)
self.add_specific_lfs_gitattributes(patterns)
def add_gitea_remote(self, package):
    """Create the package repository on gitea.opensuse.org and add it as "origin".

    The repository is created in the "rpm" organization through the Gitea
    API, authenticated with the token from ``$GITEA_TOKEN``. Any "+" in
    the package name is replaced by "_" for the repository name. Without
    the token only a warning is logged and no remote is added.

    An unexpected API status is logged; the remote is added regardless,
    as before.

    Args:
        package: name of the package the repository belongs to.
    """
    token = os.getenv("GITEA_TOKEN")
    if not token:
        logging.warning("Not adding a remote due to missing $GITEA_TOKEN")
        return

    repo_name = package.replace("+", "_")
    org_name = "rpm"

    url = f"https://gitea.opensuse.org/api/v1/org/{org_name}/repos"
    response = requests.post(
        url,
        data={"name": repo_name},
        headers={"Authorization": f"token {token}"},
        timeout=10,
    )
    # 409 Conflict (Already existing)
    # 201 Created
    if response.status_code not in (201, 409):
        # requests.Response has no `data` attribute - report status and body
        logging.error(
            f"Creating {org_name}/{repo_name} on gitea failed: "
            f"{response.status_code} {response.text}"
        )
    url = f"gitea@gitea.opensuse.org:{org_name}/{repo_name}.git"
    self.repo.remotes.create("origin", url)

View File

@ -7,23 +7,27 @@ from lib.binary import is_binary_or_large
from lib.db import DB
from lib.git import Git
from lib.obs import OBS
from lib.proxy_sha256 import ProxySHA256, md5
from lib.proxy_sha256 import ProxySHA256
from lib.tree_builder import TreeBuilder
from lib.user import User
class GitExporter:
def __init__(self, api_url, project, package, repodir, cachedir):
self.obs = OBS()
self.obs = OBS(api_url)
self.project = project
self.package = package
# TODO: Store the api url in the revision
self.obs.change_url(api_url)
self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)
self.git = Git(
repodir / package,
committer="Git OBS Bridge",
committer_email="obsbridge@suse.de",
).create()
)
if self.git.exists():
self.git.open()
else:
self.git.create()
self.git.add_gitea_remote(package)
self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
self.gc_interval = 200
self.cachedir = cachedir
@ -31,12 +35,7 @@ class GitExporter:
def set_gc_interval(self, gc):
self.gc_interval = gc
def export_as_git(self):
db = DB()
tree = TreeBuilder(db).build(self.project, self.package)
flats = tree.as_flat_list()
branch_state = {"factory": None, "devel": None}
def check_repo_state(self, flats, branch_state):
state_data = dict()
if os.path.exists(self.state_file):
with open(self.state_file, "r") as f:
@ -57,22 +56,41 @@ class GitExporter:
found_state = True
if not found_state:
left_to_commit.append(flat)
return left_to_commit
def export_as_git(self):
db = DB()
tree = TreeBuilder(db).build(self.project, self.package)
flats = tree.as_flat_list()
branch_state = {"factory": None, "devel": None}
left_to_commit = self.check_repo_state(flats, branch_state)
if not left_to_commit:
return
logging.info(f"Commiting into {self.git.path}")
self.run_gc()
users = dict()
gc_cnt = self.gc_interval
if len(left_to_commit) > 0:
logging.info(f"Commiting into {self.git.path}")
self.git.gc()
for flat in left_to_commit:
gc_cnt -= 1
if gc_cnt <= 0 and self.gc_interval:
self.git.gc()
gc_cnt = self.gc_interval
if flat.commit.userid not in users:
users[flat.commit.userid] = User(db, flat.commit.userid)
flat.user = users[flat.commit.userid]
logging.debug(f"USER {flat.user}")
self.gc_cnt -= 1
if self.gc_cnt <= 0 and self.gc_interval:
self.run_gc()
logging.debug(f"Committing {flat}")
self.commit_flat(flat, branch_state)
# Restart the countdown to the next garbage collection from the configured
# interval (self.gc_interval), then call gc() on the package's git repository.
def run_gc(self):
self.gc_cnt = self.gc_interval
self.git.gc()
def commit_file(self, flat, file, size, md5):
# have such files been detected as text mimetype before?
is_text = self.proxy_sha256.is_text(file.name)
is_text = self.proxy_sha256.is_text(flat.commit.package, file.name)
if not is_text and is_binary_or_large(file.name, size):
file_sha256 = self.proxy_sha256.get_or_put(
flat.commit.project,
@ -84,6 +102,7 @@ class GitExporter:
)
self.git.add_lfs(file.name, file_sha256["sha256"], size)
else:
self.obs.change_url(flat.commit.api_url)
self.obs.download(
flat.commit.project,
flat.commit.package,
@ -107,7 +126,7 @@ class GitExporter:
if flat.parent1:
if not self.branch_fits_parent1(flat, branch_state):
logging.info(f"Reset {flat.branch} onto {flat.parent1.short_string()}")
logging.debug(f"Reset {flat.branch} onto {flat.parent1.short_string()}")
assert flat.parent1.git_commit
self.git.set_branch_head(flat.branch, flat.parent1.git_commit)
self.git.checkout(flat.branch)
@ -116,8 +135,8 @@ class GitExporter:
assert flat.parent2.git_commit
parents.append(flat.parent2.git_commit)
# Overwrite ".gitattributes" with the
self.git.add_default_lfs_gitattributes(force=True)
# create file if not existent
self.git.add_default_lfs_gitattributes(force=False)
to_download, to_delete = flat.commit.calc_delta(branch_state[flat.branch])
for file in to_delete:
@ -126,12 +145,10 @@ class GitExporter:
self.commit_file(flat, file, size, md5)
commit = self.git.commit(
f"OBS User {flat.commit.userid}",
"null@suse.de",
flat.user.realname,
flat.user.email,
flat.commit.commit_time,
# TODO: Normalize better the commit message
f"{flat.commit.comment}\n\n{flat.commit}",
allow_empty=True,
flat.commit.git_commit_message(),
parents=parents,
)
flat.commit.git_commit = commit

View File

@ -28,9 +28,8 @@ class Importer:
self.project = project
self.db = DB()
self.obs = OBS()
self.obs = OBS(api_url)
assert project == "openSUSE:Factory"
self.obs.change_url(api_url)
self.refreshed_packages = set()
def import_request(self, number):
@ -137,10 +136,10 @@ class Importer:
)
return
fake_rev = linked.rev + rev.rev / 1000.0
comment = f"Updating link to change in {rev.project}/{rev.package} revision {rev.rev}"
comment = f"Updating link to change in {rev.project}/{rev.package} revision {int(rev.rev)}"
cur.execute(
"""INSERT INTO revisions (project,package,rev,unexpanded_srcmd5,
commit_time, userid, comment) VALUES(%s,%s,%s,%s,%s,%s,%s) RETURNING id""",
commit_time, userid, comment, api_url) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) RETURNING id""",
(
linked.project,
linked.package,
@ -149,6 +148,7 @@ class Importer:
rev.commit_time,
"buildservice-autocommit",
comment,
linked.api_url,
),
)
new_id = cur.fetchone()[0]

View File

@ -1,5 +1,6 @@
import errno
import logging
import os
import shutil
import time
import urllib.parse
@ -58,13 +59,14 @@ osc.core.http_GET = retry(osc.core.http_GET)
class OBS:
def __init__(self, url=None):
if url:
self.change_url(url)
def __init__(self, url):
self.url = None
self.change_url(url)
def change_url(self, url):
self.url = url
osc.conf.get_config(override_apiurl=url)
if url != self.url:
self.url = url
osc.conf.get_config(override_apiurl=url)
def _xml(self, url_path, **params):
url = osc.core.makeurl(self.url, [url_path], params)
@ -167,14 +169,18 @@ class OBS:
cached_file = self._path_from_md5(name, cachedir, file_md5)
if not self.in_cache(name, cachedir, file_md5):
with (dirpath / name).open("wb") as f:
logging.debug(f"Download {project}/{package}/{name}")
f.write(self._download(project, package, name, revision).read())
shutil.copy(dirpath / name, cached_file)
# Validate the MD5 of the downloaded file
if md5(dirpath / name) != file_md5:
raise Exception(f"Download error in {name}")
shutil.copy(dirpath / name, cached_file.with_suffix(".new"))
os.rename(cached_file.with_suffix(".new"), cached_file)
else:
shutil.copy(cached_file, dirpath / name)
# Validate the MD5 of the downloaded file
if md5(dirpath / name) != file_md5:
raise Exception(f"Download error in {name}")
logging.debug(f"Use cached {project}/{package}/{name}")
def list(self, project, package, srcmd5, linkrev):
params = {"rev": srcmd5, "expand": "1"}

View File

@ -31,18 +31,18 @@ class ProxySHA256:
self.url = url if url else "http://source.dyn.cloud.suse.de"
self.enabled = enabled
self.hashes = None
self.texts = set()
self.texts = None
def load_package(self, package):
# _project is unreachable for the proxy - due to being a fake package
if package == "_project":
self.enabled = False
self.texts = set(["_config", "_service"])
self.texts = set(["_config", "_service", "_staging_workflow"])
self.hashes = dict()
return
logging.debug("Retrieve all previously defined SHA256")
response = requests.get(
f"http://source.dyn.cloud.suse.de/package/{package}", timeout=5
f"http://source.dyn.cloud.suse.de/package/{package}", timeout=50
)
if response.status_code == 200:
json = response.json()
@ -94,7 +94,12 @@ class ProxySHA256:
return self._obs_put(project, package, name, revision, file_md5, size)
return self._proxy_put(project, package, name, revision, file_md5, size)
def is_text(self, filename):
def is_text(self, package, filename):
if self.texts is None:
if self.enabled:
self.load_package(package)
else:
self.texts = set()
return filename in self.texts
def get_or_put(self, project, package, name, revision, file_md5, size):

View File

@ -15,6 +15,19 @@ FAKE_ACCOUNTS = (
class User:
def __init__(self, db, userid) -> None:
    """Set up the user from the row ``User.lookup`` returns for ``userid``.

    The third and fourth fields of the row become ``email`` and
    ``realname``. When there is no row, or a field is empty, the email
    falls back to "null@suse.de" and the real name to "OBS User <userid>".
    """
    row = User.lookup(db, userid)
    self.userid = userid

    email, realname = "", ""
    if row:
        _, _, email, realname = row

    # placeholders for unknown users and for incomplete records
    self.email = email if email else "null@suse.de"
    self.realname = realname if realname else f"OBS User {userid}"
def parse(self, xml, userid):
self.userid = userid
self.realname = xml.find("realname").text

14502
packages Normal file

File diff suppressed because it is too large Load Diff

View File

@ -6,11 +6,14 @@ from lib.db_revision import DBRevision
from lib.obs import OBS
from lib.obs_revision import OBSRevision
# needs to exist in local oscrc (little tricky)
API_URL = "https://api.opensuse.org"
class TestDBMethods(unittest.TestCase):
def setUp(self):
self.db = DB(section="test")
self.obs = OBS()
self.obs = OBS(API_URL)
def test_import(self):
test_rev = OBSRevision(self.obs, "openSUSE:Factory", "xz")
@ -30,6 +33,7 @@ class TestDBMethods(unittest.TestCase):
db_rev = DBRevision.fetch_revision(
self.db, project="openSUSE:Factory", package="xz", rev="70"
)
self.assertEqual(db_rev.api_url, API_URL)
self.assertEqual(str(test_rev), str(db_rev))