1 Commit

Author SHA1 Message Date
Stephan Kulow
716db10adf Scan old imports 2022-11-24 10:24:14 +01:00
9 changed files with 168 additions and 1551 deletions

View File

@@ -7,6 +7,8 @@ import sys
import osc.core
from lib.db import DB
from lib.db_revision import DBRevision
from lib.git_exporter import GitExporter
from lib.importer import Importer
from lib.test_exporter import TestExporter
@@ -100,6 +102,56 @@ def main():
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True
def check_old_package(db: "DB", dir: pathlib.Path):
    """Look for openSUSE:Factory revisions matching an old package directory.

    Reads ``dir / "MD5SUMS"``, where each line is ``<md5> <filename>``, and
    for every entry except the ``ready`` marker queries the ``files`` table
    for revisions containing a file with that name and checksum.  The
    candidate set is the intersection over all entries.

    Args:
        db: Database handle; only ``db.cursor()`` is used here, as a context
            manager yielding a cursor with ``execute``/``fetchall``.
        dir: Directory that contains the ``MD5SUMS`` file.

    Returns:
        ``True`` when at least one candidate revision belongs to
        ``openSUSE:Factory``.  ``None`` when the file is corrupt, lists no
        files, or no revision matches.
    """
    md5file = dir / "MD5SUMS"
    print(md5file)
    valid_revisions = None
    with open(md5file, "rb") as f:
        for raw_line in f:
            try:
                # Split on any run of whitespace and at most once, so both
                # "md5 name" and md5sum's "md5  name" layouts parse and file
                # names containing spaces stay intact.
                fields = raw_line.decode("utf-8").strip().split(None, 1)
            except UnicodeDecodeError:
                logging.error(f"Corrupt MD5 file: {md5file}")
                return None
            if not fields:
                # Blank line: nothing to check.
                continue
            if len(fields) != 2:
                logging.error(f"Corrupt MD5 file: {md5file}")
                return None
            md5, file = fields
            if file == "ready":
                continue
            if len(md5) != 32:
                logging.error(f"Corrupt MD5 file: {md5file}")
                return None
            with db.cursor() as cur:
                cur.execute(
                    "SELECT revision_id FROM files WHERE md5=%s AND name=%s",
                    (md5, file),
                )
                nrevs = {row[0] for row in cur.fetchall()}
            if valid_revisions is None:
                valid_revisions = nrevs
            else:
                valid_revisions = valid_revisions & nrevs
            if not valid_revisions:
                # The intersection is already empty; further files cannot
                # bring a revision back.
                break
    if not valid_revisions:
        # Either no file entries were listed (still None) or nothing matched
        # (empty set): there is no revision to look up.
        return None
    with db.cursor() as cur:
        cur.execute(
            "SELECT * FROM revisions WHERE id = ANY(%s) AND project=%s",
            (list(valid_revisions), "openSUSE:Factory"),
        )
        for row in cur.fetchall():
            r = DBRevision(db, row)
            print("Valid", r, r.files_hash)
            # NOTE(review): reports the first matching revision and stops;
            # the original nesting of this return was lost - confirm.
            return True
    return None
if False:
import os
db = DB()
basedir = pathlib.Path(
f"/mounts/work/SAVE/oldpackages/stable/{args.packages[0]}"
)
for subdir in sorted(os.listdir(basedir)):
if check_old_package(db, basedir / subdir):
break
if args.export:
if len(args.packages) != 1:
print("Can only export one package")

File diff suppressed because it is too large Load Diff

View File

@@ -4,6 +4,7 @@ import os
import pathlib
import subprocess
import pygit2
import requests
from lib.binary import BINARY
@@ -19,6 +20,11 @@ class Git:
self.committer = committer
self.committer_email = committer_email
self.repo = None
def is_open(self):
    """Tell whether a repository object is currently attached (``self.repo``)."""
    return not (self.repo is None)
def exists(self):
    """Return True when ``self.path`` contains a ``.git`` entry."""
    git_dir = self.path / ".git"
    return git_dir.exists()
@@ -29,60 +35,35 @@ class Git:
self.open()
def open(self):
subprocess.run(
['git', 'init', '--object-format=sha256', '-b', 'factory'],
cwd=self.path,
check=True,
)
# Convert the path to string, to avoid some limitations in
# older pygit2
self.repo = pygit2.init_repository(str(self.path))
def is_dirty(self):
"""Check if there is something to commit"""
status_str = subprocess.run(
['git', 'status', '--porcelain=2'],
cwd=self.path,
stdout=subprocess.PIPE,
check=True
).stdout.decode('utf-8')
return len(list(filter(None, status_str.split('\n')))) > 0
assert self.is_open()
return self.repo.status()
def branches(self):
br=subprocess.run(
['git', 'for-each-ref', '--format=%(refname:short)', 'refs/heads/'],
cwd=self.path,
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').split()
if len(br) == 0:
br.append('factory') # unborn branch?
return br
return list(self.repo.branches)
def branch(self, branch, commit='HEAD'):
commit = subprocess.run(
['git', 'rev-parse', '--verify', '--end-of-options', commit + '^{commit}'],
cwd=self.path,
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').strip()
return subprocess.run(['git', 'branch', branch, commit], check=True)
def branch(self, branch, commit=None):
if not commit:
commit = self.repo.head
else:
commit = self.repo.get(commit)
self.repo.branches.local.create(branch, commit)
def checkout(self, branch):
"""Checkout into the branch HEAD"""
new_branch = False
ref = f"refs/heads/{branch}"
if branch not in self.branches():
subprocess.run(
['git', 'branch', '-q', branch, 'HEAD'],
cwd=self.path,
check=True
)
self.repo.references["HEAD"].set_target(ref)
new_branch = True
else:
ref = f"refs/heads/{branch}"
if (self.path/'.git'/ref).exists():
subprocess.run(
['git', 'checkout', '-q', branch],
cwd=self.path,
check=True
)
self.repo.checkout(ref)
return new_branch
def commit(
@@ -106,62 +87,30 @@ class Git:
committer_time = committer_time if committer_time else user_time
if self.is_dirty():
subprocess.run(
["git", "add", "--all", "."],
cwd=self.path,
check=True,
)
self.repo.index.add_all()
tree_id = subprocess.run(
['git', 'write-tree'],
cwd=self.path,
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').strip()
parent_array = []
if isinstance(parents, list):
for parent in filter(None, parents):
parent_array = parent_array + ['-p', parent]
elif isinstance(parents, str):
parents_array = ['-p', parents]
commit_id = subprocess.run(
['git', 'commit-tree'] + parent_array + [tree_id],
cwd=self.path,
env={
"GIT_AUTHOR_NAME": user,
"GIT_AUTHOR_EMAIL": user_email,
"GIT_AUTHOR_DATE": f"{int(user_time.timestamp())} +0000",
"GIT_COMMITTER_NAME": committer,
"GIT_COMMITTER_EMAIL": committer_email,
"GIT_COMMITTER_DATE": f"{int(committer_time.timestamp())} +0000",
},
input=message.encode('utf-8'),
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').rstrip()
subprocess.run(
['git', 'reset', '--soft', commit_id],
cwd=self.path,
check=True,
self.repo.index.write()
author = pygit2.Signature(user, user_email, int(user_time.timestamp()))
committer = pygit2.Signature(
committer, committer_email, int(committer_time.timestamp())
)
return commit_id
def branch_head(self, branch='HEAD'):
return subprocess.run(
['git', 'rev-parse', '--verify', '--end-of-options', branch],
cwd=self.path,
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').strip()
tree = self.repo.index.write_tree()
return self.repo.create_commit(
"HEAD", author, committer, message, tree, parents
)
def last_commit(self):
    """Return the target of the repository's HEAD, or None if unavailable.

    Any ordinary error raised while reading ``self.repo.head.target`` is
    treated as "no commit" and yields ``None``.
    """
    try:
        return self.repo.head.target
    except Exception:
        # Deliberately best-effort, but limited to Exception so that
        # KeyboardInterrupt and SystemExit still propagate.
        return None
def branch_head(self, branch):
return self.repo.references["refs/heads/" + branch].target
def set_branch_head(self, branch, commit):
return subprocess.run(
['git', 'branch', '-f', branch, commit],
cwd=self.path,
check=True,
)
self.repo.references["refs/heads/" + branch].set_target(commit)
def gc(self):
logging.debug(f"Garbage recollect and repackage {self.path}")
@@ -172,21 +121,17 @@ class Git:
stderr=subprocess.STDOUT,
)
# def clean(self):
# for path, _ in self.repo.status().items():
# logging.debug(f"Cleaning {path}")
# try:
# (self.path / path).unlink()
# self.repo.index.remove(path)
# except Exception as e:
# logging.warning(f"Error removing file {path}: {e}")
def clean(self):
    """Delete every path reported by ``self.repo.status()``.

    Each reported path is unlinked below ``self.path`` and then removed from
    the index.  A failure on one path is logged as a warning and the loop
    continues with the next one.
    """
    for path in self.repo.status():
        logging.debug(f"Cleaning {path}")
        try:
            target = self.path / path
            target.unlink()
            self.repo.index.remove(path)
        except Exception as e:
            logging.warning(f"Error removing file {path}: {e}")
def add(self, filename):
subprocess.run(
['git', 'add', filename],
cwd=self.path,
check=True,
)
self.repo.index.add(filename)
def add_default_lfs_gitattributes(self, force=False):
if not (self.path / ".gitattributes").exists() or force:
@@ -240,11 +185,9 @@ class Git:
return any(fnmatch.fnmatch(filename, line) for line in patterns)
def remove(self, file: pathlib.Path):
subprocess.run(
['git', 'rm', '-q', '--ignore-unmatch', file.name],
cwd=self.path,
check=True,
)
self.repo.index.remove(file.name)
(self.path / file).unlink()
patterns = self.get_specific_lfs_gitattributes()
if file.name in patterns:
patterns.remove(file.name)
@@ -258,7 +201,7 @@ class Git:
logging.warning("Not adding a remote due to missing $GITEA_TOKEN")
return
url = f"https://src.opensuse.org/api/v1/org/{org_name}/repos"
url = f"https://gitea.opensuse.org/api/v1/org/{org_name}/repos"
response = requests.post(
url,
data={"name": repo_name},
@@ -269,23 +212,16 @@ class Git:
# 201 Created
if response.status_code not in (201, 409):
print(response.data)
url = f"gitea@src.opensuse.org:{org_name}/{repo_name}.git"
subprocess.run(
['git', 'remote', 'add', 'origin', url],
cwd=self.path,
check=True,
)
url = f"gitea@gitea.opensuse.org:{org_name}/{repo_name}.git"
self.repo.remotes.create("origin", url)
def push(self, force=False):
cmd = ['git', 'push'];
if force:
cmd.append('-f')
cmd.append('origin')
cmd.append('refs/heads/factory');
cmd.append('refs/heads/devel');
subprocess.run(
cmd,
cwd=self.path,
check=True,
)
def push(self):
remo = self.repo.remotes["origin"]
keypair = pygit2.KeypairFromAgent("gitea")
callbacks = pygit2.RemoteCallbacks(credentials=keypair)
refspecs = ["refs/heads/factory"]
if "refs/heads/devel" in self.repo.references:
refspecs.append("refs/heads/devel")
remo.push(refspecs, callbacks=callbacks)

View File

@@ -86,7 +86,7 @@ class GitExporter:
logging.debug(f"Committing {flat}")
self.commit_flat(flat, branch_state)
self.git.push(force=True)
self.git.push()
def run_gc(self):
self.gc_cnt = self.gc_interval

View File

@@ -1,5 +1,5 @@
import concurrent.futures
import logging
import pathlib
import xml.etree.ElementTree as ET
from lib.db import DB
@@ -31,7 +31,6 @@ class Importer:
self.obs = OBS(api_url)
assert project == "openSUSE:Factory"
self.refreshed_packages = set()
self.gone_packages_set = None
def import_request(self, number):
self.obs.request(number).import_into_db(self.db)
@@ -106,7 +105,7 @@ class Importer:
with self.db.cursor() as cur:
cur.execute(
"""SELECT * FROM revisions WHERE id IN
(SELECT revision_id from linked_revs WHERE linked_id=%s)
(SELECT revision_id from linked_revs WHERE linked_id=%s)
AND commit_time <= %s ORDER BY commit_time""",
(prev.dbid, rev.commit_time),
)
@@ -139,7 +138,7 @@ class Importer:
fake_rev = linked.rev + rev.rev / 1000.0
comment = f"Updating link to change in {rev.project}/{rev.package} revision {int(rev.rev)}"
cur.execute(
"""INSERT INTO revisions (project,package,rev,unexpanded_srcmd5,
"""INSERT INTO revisions (project,package,rev,unexpanded_srcmd5,
commit_time, userid, comment, api_url) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) RETURNING id""",
(
linked.project,
@@ -162,12 +161,10 @@ class Importer:
(rev.dbid, linked.dbid),
)
def revisions_without_files(self, package):
logging.debug(f"revisions_without_files({package})")
def revisions_without_files(self):
with self.db.cursor() as cur:
cur.execute(
"SELECT * FROM revisions WHERE package=%s AND broken=FALSE AND expanded_srcmd5 IS NULL",
(package, )
"SELECT * FROM revisions WHERE broken=FALSE AND expanded_srcmd5 IS NULL"
)
return [DBRevision(self.db, row) for row in cur.fetchall()]
@@ -181,11 +178,11 @@ class Importer:
linked_rev = cur.fetchone()
if linked_rev:
linked_rev = linked_rev[0]
obs_dir_list = self.obs.list(
list = self.obs.list(
rev.project, rev.package, rev.unexpanded_srcmd5, linked_rev
)
if obs_dir_list:
rev.import_dir_list(obs_dir_list)
if list:
rev.import_dir_list(list)
md5 = rev.calculate_files_hash()
with self.db.cursor() as cur:
cur.execute(
@@ -199,43 +196,53 @@ class Importer:
self.find_linked_revs()
self.find_fake_revisions()
for package in self.packages:
for rev in self.revisions_without_files(package):
print(f"rev {rev} is without files")
self.import_rev(rev)
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
fs = [
executor.submit(import_rev, self, rev)
for rev in self.revisions_without_files()
]
concurrent.futures.wait(fs)
def refresh_package(self, project, package):
key = f"{project}/{package}"
if key in self.refreshed_packages:
# refreshing once is good enough
return
if self.package_gone(key):
return
logging.debug(f"Refresh {project}/{package}")
self.refreshed_packages.add(key)
self.update_db_package(project, package)
self.fetch_all_linked_packages(project, package)
def import_into_db(self):
for package in self.packages:
refresh_package(self, self.project, package)
self.db.conn.commit()
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
fs = [
executor.submit(refresh_package, self, self.project, package)
for package in self.packages
]
concurrent.futures.wait(fs)
for number in DBRevision.requests_to_fetch(self.db):
self.import_request(number)
self.db.conn.commit()
self.db.conn.commit()
fs = [
executor.submit(import_request, self, number)
for number in DBRevision.requests_to_fetch(self.db)
]
concurrent.futures.wait(fs)
with self.db.cursor() as cur:
cur.execute(
"""SELECT DISTINCT source_project,source_package FROM requests
WHERE id IN (SELECT request_id FROM revisions WHERE project=%s and package = ANY(%s));""",
(self.project, self.packages),
)
for project, package in cur.fetchall():
self.refresh_package(project, package)
self.db.conn.commit()
with self.db.cursor() as cur:
cur.execute(
"""SELECT DISTINCT source_project,source_package FROM requests
WHERE id IN (SELECT request_id FROM revisions WHERE project=%s and package = ANY(%s));""",
(self.project, self.packages),
)
fs = [
executor.submit(refresh_package, self, project, package)
for project, package in cur.fetchall()
]
concurrent.futures.wait(fs)
self.db.conn.commit()
missing_users = User.missing_users(self.db)
@@ -247,11 +254,3 @@ class Importer:
self.fill_file_lists()
self.db.conn.commit()
def package_gone(self, key):
    """Tell whether ``key`` is listed in ``gone-packages.txt``.

    The file, located two directories above this module, is read once on
    first use; its stripped lines are cached in ``self.gone_packages_set``.

    Args:
        key: Entry to look up; compared against the stripped lines verbatim.

    Returns:
        True when ``key`` is one of the listed entries.
    """
    # Compare against None rather than truthiness, so an empty list file is
    # cached too and not reread on every call.
    if self.gone_packages_set is None:
        gone_file = pathlib.Path(__file__).parent.parent / "gone-packages.txt"
        with open(gone_file) as f:
            self.gone_packages_set = {line.strip() for line in f}
    return key in self.gone_packages_set

View File

@@ -68,7 +68,7 @@ class LFSOid:
row = cur.fetchone()
lfs_oid_id = row[0]
cur.execute(
"""INSERT INTO lfs_oid_in_package (package,filename,lfs_oid_id)
"""INSERT INTO lfs_oid_in_package (package,filename,lfs_oid_id)
VALUES (%s,%s,%s)""",
(package, filename, lfs_oid_id),
)
@@ -83,7 +83,7 @@ class LFSOid:
self.register()
def check(self):
url = f"http://localhost:9999/check/{self.sha256}/{self.size}"
url = f"http://gitea.opensuse.org:9999/check/{self.sha256}/{self.size}"
response = requests.get(
url,
timeout=10,
@@ -127,13 +127,12 @@ class LFSOid:
"size": self.size,
}
url = "http://localhost:9999/register"
url = "http://gitea.opensuse.org:9999/register"
response = requests.post(
url,
json=data,
timeout=10,
)
response.raise_for_status()
logging.info(f"Register LFS returned {response.status_code}")
@@ -168,7 +167,7 @@ if __name__ == "__main__":
cur.execute(
"""
CREATE TEMPORARY TABLE lfs_oid_in_revision (
revision_id INTEGER,
revision_id INTEGER,
lfs_oid_id INTEGER NOT NULL,
name VARCHAR(255) NOT NULL
)

View File

@@ -148,28 +148,12 @@ class OBS:
]
def _download(self, project, package, name, revision):
# the object might be deleted but we can only pass deleted=1
# if it is actually deleted
deleted = 0
while deleted < 2:
url = osc.core.makeurl(
self.url,
["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1, "deleted": deleted if deleted else ()},
)
try:
osc.core.http_request("HEAD", url)
break
except Exception:
pass
deleted += 1
url = osc.core.makeurl(
self.url,
["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1, "deleted": 1 if deleted else ()},
)
return osc.core.http_request("GET", url)
self.url,
["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1},
)
return osc.core.http_GET(url)
def download(
self,

View File

@@ -7,6 +7,8 @@ except:
print("Install python3-python-magic, not python3-magic")
raise
import requests
from lib.db import DB
from lib.lfs_oid import LFSOid
from lib.obs import OBS

View File

@@ -114,7 +114,7 @@ class TreeBuilder:
candidates.append(node)
if node.merged_into:
# we can't have candidates that are crossing previous merges
# see https://src.opensuse.org/importers/git-importer/issues/14
# see https://gitea.opensuse.org/importers/git-importer/issues/14
candidates = []
node = node.parent
if candidates: