From fb80d0c1056e6dc079ecff86b888c206cfd39a45 Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Fri, 21 Feb 2025 13:24:40 +0100 Subject: [PATCH 01/10] Revert "Only stop importing when it isn't a jengelh repository" This reverts commit 44b4d690dbc8532125399298d8765d2548c32b6b. this breaks detection of scmsync projects. Jan can live with not synced git for a few weeks --- lib/importer.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/importer.py b/lib/importer.py index b82faa8..e72ffab 100644 --- a/lib/importer.py +++ b/lib/importer.py @@ -268,12 +268,9 @@ class Importer: return self.scmsync_cache[key] root = self.obs._meta(key) - scmsync = None scmsync_exists = False - if root and root.find('scmsync') is not None: - scmsync = root.find('scmsync').text - if scmsync: - scmsync_exists = scmsync.startswith('https://src.opensuse.org/pool/') + if root is not None: + scmsync_exists = root.find('scmsync') is not None self.scmsync_cache[key] = scmsync_exists return scmsync_exists -- 2.49.0 From 4ef980d1c87bf2d2726c7c343879bc6f49a65d6d Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Fri, 21 Feb 2025 17:24:23 +0100 Subject: [PATCH 02/10] fix bug --- lib/git_exporter.py | 2 ++ lib/lfs_oid.py | 1 + lib/tree_builder.py | 2 ++ 3 files changed, 5 insertions(+) diff --git a/lib/git_exporter.py b/lib/git_exporter.py index 1fe9f63..82919b3 100644 --- a/lib/git_exporter.py +++ b/lib/git_exporter.py @@ -64,6 +64,8 @@ class GitExporter: if os.getenv("CHECK_ALL_LFS"): LFSOid.check_all(self.db, self.package) tree = TreeBuilder(self.db).build(self.project, self.package) + if tree == None: # eg. 
python-M2Crypto errors + return flats = tree.as_flat_list() branch_state = {"factory": None, "devel": None} diff --git a/lib/lfs_oid.py b/lib/lfs_oid.py index d487f94..ba4d4a5 100644 --- a/lib/lfs_oid.py +++ b/lib/lfs_oid.py @@ -83,6 +83,7 @@ class LFSOid: self.register() def check(self): + return True url = f"http://localhost:9999/check/{self.sha256}/{self.size}" response = requests.get( url, diff --git a/lib/tree_builder.py b/lib/tree_builder.py index 8f179d5..a92121c 100644 --- a/lib/tree_builder.py +++ b/lib/tree_builder.py @@ -216,6 +216,8 @@ class TreeBuilder: def build(self, project, package): """Create a Factory tree (returning the top)""" factory_revisions = self.revisions_chain(project, package) + if factory_revisions == None: + return None self.add_merge_points(factory_revisions) # factory_revisions.print() self.prune_loose_end(factory_revisions) -- 2.49.0 From 76843881934a5deb176d014036b24dde81f9b5f8 Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Tue, 4 Mar 2025 16:23:29 +0100 Subject: [PATCH 03/10] re-enable LFS checks --- lib/lfs_oid.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/lfs_oid.py b/lib/lfs_oid.py index ba4d4a5..d487f94 100644 --- a/lib/lfs_oid.py +++ b/lib/lfs_oid.py @@ -83,7 +83,6 @@ class LFSOid: self.register() def check(self): - return True url = f"http://localhost:9999/check/{self.sha256}/{self.size}" response = requests.get( url, -- 2.49.0 From 32d1924a0daa3917159cc54a1b693517174cc3fa Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Tue, 25 Mar 2025 14:28:54 +0100 Subject: [PATCH 04/10] Ignore OBS service generated files _service: files should not be imported. They are created by OBS service and will be re-created by OBS during scm sync Also, not allowed in Factory by policy anyway. 
So this affects devel and home projects only --- lib/db_revision.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/db_revision.py b/lib/db_revision.py index bb56722..3da37f3 100644 --- a/lib/db_revision.py +++ b/lib/db_revision.py @@ -204,6 +204,11 @@ class DBRevision: and self.package == "_project" ): continue + + # do not import _service:* files as those are created by OBS on source imports + if entry.get("name")[0:9] == "_service:": + continue + cur.execute( """INSERT INTO files (name, md5, size, mtime, revision_id) VALUES (%s,%s,%s,%s,%s)""", -- 2.49.0 From 5d55a8c2feedd22756145f2882da6420071a9149 Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Tue, 25 Mar 2025 14:30:55 +0100 Subject: [PATCH 05/10] Empty files should still have a mimetype assigned --- lib/proxy_sha256.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/proxy_sha256.py b/lib/proxy_sha256.py index 605af08..a0353b1 100644 --- a/lib/proxy_sha256.py +++ b/lib/proxy_sha256.py @@ -50,12 +50,12 @@ class ProxySHA256: sha = hashlib.sha256() while True: buffer = fin.read(10000) - if not buffer: - break - sha.update(buffer) # only guess from the first 10K if not mimetype: mimetype = self.mime.from_buffer(buffer) + if not buffer: + break + sha.update(buffer) fin.close() LFSOid(self.db).add( project, package, name, revision, sha.hexdigest(), size, mimetype, file_md5 -- 2.49.0 From 5a6a55868fcaddb475932a7fc1bc287ed5bc9cc1 Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Thu, 24 Jul 2025 13:20:00 +0200 Subject: [PATCH 06/10] Look at deleted packages if they are not found --- lib/obs.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/lib/obs.py b/lib/obs.py index d36c56b..ec75019 100644 --- a/lib/obs.py +++ b/lib/obs.py @@ -148,12 +148,21 @@ class OBS: ] def _download(self, project, package, name, revision): - url = osc.core.makeurl( - self.url, - ["source", project, package, name], - {"rev": revision, "expand": 1}, - ) - return 
osc.core.http_GET(url) + try: + url = osc.core.makeurl( + self.url, + ["source", project, package, name], + {"rev": revision, "expand": 1}, + ) + return osc.core.http_GET(url) + except HTTPError as e: + if e.status == 404: + url = osc.core.makeurl( + self.url, + ["source", project, package, name], + {"rev": revision, "expand": 1, "deleted": 1}, + ) + return osc.core.http_GET(url) def download( self, @@ -189,7 +198,7 @@ class OBS: try: root = self._xml(f"source/{project}/{package}", **params) except HTTPError as e: - if e.code == 400: + if e.code == 400 or e.code == 404: logging.error( f"Package [{project}/{package} {params}] can't be expanded: {e}" ) -- 2.49.0 From d5eb5c0db63368b676181e793c632b5651da4457 Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Wed, 30 Jul 2025 14:59:42 +0200 Subject: [PATCH 07/10] Don't export service files from overlay --- lib/git_exporter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/git_exporter.py b/lib/git_exporter.py index 82919b3..7445b8c 100644 --- a/lib/git_exporter.py +++ b/lib/git_exporter.py @@ -105,6 +105,10 @@ class GitExporter: return not self.proxy_sha256.is_text(package, filename) def commit_file(self, flat, file, size, md5): + # don't export imported _service: files, if any + if file.name[0:9] == '_service:': + return + # have such files been detected as text mimetype before? if self.is_lfs_file(flat.commit.package, file.name, size): file_sha256 = self.proxy_sha256.get_or_put( -- 2.49.0 From 17888407df235f0c75a32b316f00f1eb4bb42236 Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Thu, 31 Jul 2025 15:58:13 +0200 Subject: [PATCH 08/10] Include all the devel commits If commits in devel are younger than factory, we should probably export them too.... 
--- lib/git_exporter.py | 61 +++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/lib/git_exporter.py b/lib/git_exporter.py index 7445b8c..2053944 100644 --- a/lib/git_exporter.py +++ b/lib/git_exporter.py @@ -37,7 +37,7 @@ class GitExporter: def set_gc_interval(self, gc): self.gc_interval = gc - def check_repo_state(self, flats, branch_state): + def check_repo_state(self, flats, branch_state, branch): state_data = dict() if os.path.exists(self.state_file): with open(self.state_file) as f: @@ -47,32 +47,36 @@ class GitExporter: left_to_commit = [] for flat in reversed(flats): found_state = False - for branch in ["factory"]: - if flat.commit.dbid == state_data.get(branch): - branch_state[branch] = flat.commit - flat.commit.git_commit = self.git.branch_head(branch) - logging.debug( - f"Found {self.git.path}'s {branch} branch in state {flat}" - ) - left_to_commit = [] - found_state = True + if flat.commit.dbid == state_data.get(branch): + branch_state[branch] = flat.commit + flat.commit.git_commit = self.git.branch_head(branch) + logging.debug( + f"Found {self.git.path}'s {branch} branch in state {flat}" + ) + left_to_commit = [] + found_state = True if not found_state: left_to_commit.append(flat) return left_to_commit + def devel_rev(self, tree): + for flat in tree: + if flat.branch == "devel": + return flat + return None + def export_as_git(self): if os.getenv("CHECK_ALL_LFS"): LFSOid.check_all(self.db, self.package) tree = TreeBuilder(self.db).build(self.project, self.package) + added_commits = False + if tree == None: # eg. 
python-M2Crypto errors return flats = tree.as_flat_list() - branch_state = {"factory": None, "devel": None} - left_to_commit = self.check_repo_state(flats, branch_state) - - if not left_to_commit: - return + self.check_repo_state(flats, branch_state, "devel") # used to set branch_state only + left_to_commit = self.check_repo_state(flats, branch_state, "factory") logging.info(f"Commiting into {self.git.path}") self.run_gc() @@ -87,13 +91,38 @@ class GitExporter: self.run_gc() logging.debug(f"Committing {flat}") self.commit_flat(flat, branch_state) + added_commits = True + + # export the devel_tree head commits based on the devel branch + if self.project == "openSUSE:Factory": + devel_head = self.devel_rev(flats) + flat_devel = None + if devel_head is not None: + flat_devel = TreeBuilder(self.db).revisions_chain(devel_head.commit.project, self.package).as_flat_list() + for f in flat_devel: + f.branch = "devel" + + left_to_commit = self.check_repo_state(flat_devel, branch_state, "devel") + print(branch_state) + logging.debug(f"appending {len(left_to_commit)} items on top of devel") + for flat in left_to_commit: + if flat.commit.userid not in users: + users[flat.commit.userid] = User.find(self.db, flat.commit.userid) + flat.user = users[flat.commit.userid] + self.gc_cnt -= 1 + if self.gc_cnt <= 0 and self.gc_interval: + self.run_gc() + logging.debug(f"Committing {flat}") + self.commit_flat(flat, branch_state) + added_commits = True # make sure that we create devel branch if not branch_state["devel"]: logging.debug("force creating devel") self.git.set_branch_head("devel", self.git.branch_head("factory")) - self.git.push(force=True) + if added_commits: + self.git.push(force=True) def run_gc(self): self.gc_cnt = self.gc_interval -- 2.49.0 From 5a28f62fb94a822a34e641bd4c87c869c3a137b7 Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Sat, 9 Aug 2025 18:06:35 +0200 Subject: [PATCH 09/10] reconstruct state data If the state file is missing, we can reconstruct which parts were 
exported based on revision ids Also, packages could have branches in Git, but not be in Git. We need to check (project,package) tuple for this and not just abort based on the package name alone. --- git-importer.py | 2 +- lib/flat_walker.py | 2 +- lib/git.py | 6 +++ lib/git_exporter.py | 107 ++++++++++++++++++++++++++++++++++++-------- lib/importer.py | 8 ++-- 5 files changed, 102 insertions(+), 23 deletions(-) diff --git a/git-importer.py b/git-importer.py index f05f9e9..8373358 100755 --- a/git-importer.py +++ b/git-importer.py @@ -119,7 +119,7 @@ def main(): importer = Importer(URL_OBS, args.project, args.packages) importer.import_into_db() for package in args.packages: - if not importer.package_with_scmsync(package): + if not importer.package_with_scmsync(args.project, package): export_package(args.project, package, args.repodir, args.cachedir, args.gc) else: logging.debug(f"{args.project}/{package} has scmsync links - skipping export") diff --git a/lib/flat_walker.py b/lib/flat_walker.py index 7176da6..b490e25 100644 --- a/lib/flat_walker.py +++ b/lib/flat_walker.py @@ -20,7 +20,7 @@ class FlatTreeWalker(AbstractWalker): def __init__(self, rebase_devel=False) -> None: super().__init__() - self.flats = [] + self.flats:list[FlatNode] = [] # the rebase_devel won't work as such as rebasing the branch needs an explicit action self.rebase_devel = rebase_devel # remember the last merge point so we can know the parent of it for the root of the sources diff --git a/lib/git.py b/lib/git.py index 3fa7c9c..ba89b72 100644 --- a/lib/git.py +++ b/lib/git.py @@ -160,6 +160,12 @@ class Git: .strip() ) + def branch_commit(self, branch="HEAD"): + try: + return (self.git_run(["cat-file", "commit", branch], stdout=subprocess.PIPE).stdout.decode("utf-8").strip()) + except: + return '' + def set_branch_head(self, branch, commit): return self.git_run(["update-ref", f"refs/heads/{branch}", commit]) diff --git a/lib/git_exporter.py b/lib/git_exporter.py index 2053944..458f26f 100644 
--- a/lib/git_exporter.py +++ b/lib/git_exporter.py @@ -1,8 +1,12 @@ import logging import os +from urllib.parse import parse_qs +import psycopg +from urllib3.util import url import yaml +from lib import db from lib.binary import is_binary_or_large from lib.db import DB from lib.git import Git @@ -12,6 +16,12 @@ from lib.proxy_sha256 import ProxySHA256 from lib.tree_builder import TreeBuilder from lib.user import User +def is_number(s): + try: + float(s) + return True + except ValueError: + return False class GitExporter: def __init__(self, api_url, project, package, repodir, cachedir): @@ -37,6 +47,63 @@ class GitExporter: def set_gc_interval(self, gc): self.gc_interval = gc + def reconstruct_state(self, flats): + state_data = dict() + prefix = "OBS-URL: " + for line in self.git.branch_commit("factory").splitlines(): + if line.startswith(prefix): + u = url.parse_url(line.strip(prefix)) + if u.path != f"/package/show/openSUSE:Factory/{self.package}" or "rev=" not in u.query: + continue + v = parse_qs(u.query) + rev = v['rev'][0] + with self.db.cursor() as cur: + try: + if is_number(rev): + cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND rev=%s", ('openSUSE:Factory', self.package, rev,)) + else: + cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND expanded_srcmd5=%s", ('openSUSE:Factory', self.package, rev, rev)) + except psycopg.Error as e: + logging.error(e) + self.db.conn.rollback() + row = cur.fetchone() + if not row: + return state_data + state_data['factory'] = row[0] + try: + print("devel reconstruct") + d = self.devel_rev(flats) + if d is not None: + prj = d.commit.project + for line in self.git.branch_commit("devel").splitlines(): + if line.startswith(prefix): + u = url.parse_url(line.strip(prefix)) + if u.path != f"/package/show/{prj}/{self.package}" or u.query is None or "rev=" not in u.query: + continue + v = parse_qs(u.query) + rev = v['rev'][0] + try: + with self.db.cursor() as cur: + 
logging.debug(f"finding id for ({prj, self.package, rev}") + if is_number(rev): + cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND rev=%s", (prj, self.package, rev,)) + else: + cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND expanded_srcmd5=%s", (prj, self.package, rev,)) + row = cur.fetchone() + if not row: + logging.info(" ** cannot find revision for devel branch:", rev) + return state_data + state_data['devel'] = row[0] + except psycopg.Error as e: + logging.error(e) + self.db.conn.rollback() + if state_data['factory'] is not None: + state_data['devel'] = state_data['factory'] + except: + if state_data['factory'] is not None: + state_data['devel'] = state_data['factory'] + return state_data + def check_repo_state(self, flats, branch_state, branch): state_data = dict() if os.path.exists(self.state_file): @@ -44,6 +111,10 @@ class GitExporter: state_data = yaml.safe_load(f) if not isinstance(state_data, dict): state_data = {} + else: + state_data = self.reconstruct_state(flats) + + logging.debug(f"state data: {state_data}") left_to_commit = [] for flat in reversed(flats): found_state = False @@ -75,10 +146,8 @@ class GitExporter: return flats = tree.as_flat_list() branch_state = {"factory": None, "devel": None} - self.check_repo_state(flats, branch_state, "devel") # used to set branch_state only left_to_commit = self.check_repo_state(flats, branch_state, "factory") - - logging.info(f"Commiting into {self.git.path}") + logging.info(f"Commiting into {self.git.path} {len(left_to_commit)} into factory") self.run_gc() users = dict() @@ -98,31 +167,33 @@ class GitExporter: devel_head = self.devel_rev(flats) flat_devel = None if devel_head is not None: + logging.debug(f"building devel revisions chain for {devel_head.commit.project} / {self.package}") flat_devel = TreeBuilder(self.db).revisions_chain(devel_head.commit.project, self.package).as_flat_list() for f in flat_devel: f.branch = "devel" - left_to_commit = 
self.check_repo_state(flat_devel, branch_state, "devel") - print(branch_state) - logging.debug(f"appending {len(left_to_commit)} items on top of devel") - for flat in left_to_commit: - if flat.commit.userid not in users: - users[flat.commit.userid] = User.find(self.db, flat.commit.userid) - flat.user = users[flat.commit.userid] - self.gc_cnt -= 1 - if self.gc_cnt <= 0 and self.gc_interval: - self.run_gc() - logging.debug(f"Committing {flat}") - self.commit_flat(flat, branch_state) - added_commits = True + if flat_devel is not None: + left_to_commit = self.check_repo_state(flat_devel, branch_state, "devel") + logging.debug(branch_state) + logging.debug(f"appending {len(left_to_commit)} items on top of devel") + for flat in left_to_commit: + if flat.commit.userid not in users: + users[flat.commit.userid] = User.find(self.db, flat.commit.userid) + flat.user = users[flat.commit.userid] + self.gc_cnt -= 1 + if self.gc_cnt <= 0 and self.gc_interval: + self.run_gc() + logging.debug(f"Committing {flat}") + self.commit_flat(flat, branch_state) + added_commits = True # make sure that we create devel branch if not branch_state["devel"]: logging.debug("force creating devel") self.git.set_branch_head("devel", self.git.branch_head("factory")) - if added_commits: - self.git.push(force=True) + #if added_commits: + # self.git.push(force=True) def run_gc(self): self.gc_cnt = self.gc_interval diff --git a/lib/importer.py b/lib/importer.py index e72ffab..455f038 100644 --- a/lib/importer.py +++ b/lib/importer.py @@ -42,6 +42,8 @@ class Importer: def update_db_package(self, project, package): root = self.obs._history(project, package) if root is None: + if self.project == "openSUSE:Factory" and project == self.project: + exit(10) return latest = DBRevision.max_rev(self.db, project, package) for r in root.findall("revision"): @@ -217,7 +219,7 @@ class Importer: logging.debug(f"Refresh {project}/{package}") self.refreshed_packages.add(key) if self.has_scmsync(project) or 
self.has_scmsync(key): - self.packages_with_scmsync.add(package) + self.packages_with_scmsync.add((project, package)) logging.debug(f"{project}/{package} already in Git - skipping") return self.update_db_package(project, package) @@ -274,6 +276,6 @@ class Importer: self.scmsync_cache[key] = scmsync_exists return scmsync_exists - def package_with_scmsync(self, package): - return package in self.packages_with_scmsync + def package_with_scmsync(self, project, package): + return (project, package) in self.packages_with_scmsync -- 2.49.0 From 94e57852a097890f3cd2f8af4f744da263df16bd Mon Sep 17 00:00:00 2001 From: Adam Majer Date: Sun, 24 Aug 2025 15:34:08 +0200 Subject: [PATCH 10/10] stop using saved state We can reconstruct the state based on log messages. Otherwise, it's incorrect as the trimming function removes exported parts anyway and then we have missing commits --- lib/git_exporter.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/lib/git_exporter.py b/lib/git_exporter.py index 458f26f..d8ff581 100644 --- a/lib/git_exporter.py +++ b/lib/git_exporter.py @@ -40,7 +40,6 @@ class GitExporter: else: self.git.create() self.git.add_gitea_remote(package) - self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml") self.gc_interval = 200 self.cachedir = cachedir @@ -105,14 +104,7 @@ class GitExporter: return state_data def check_repo_state(self, flats, branch_state, branch): - state_data = dict() - if os.path.exists(self.state_file): - with open(self.state_file) as f: - state_data = yaml.safe_load(f) - if not isinstance(state_data, dict): - state_data = {} - else: - state_data = self.reconstruct_state(flats) + state_data = self.reconstruct_state(flats) logging.debug(f"state data: {state_data}") left_to_commit = [] @@ -278,10 +270,3 @@ class GitExporter: ) flat.commit.git_commit = commit branch_state[flat.branch] = flat.commit - with open(self.state_file, "w") as f: - data = {} - for branch in ["factory", "devel"]: - 
commit = branch_state[branch] - if commit: - data[branch] = commit.dbid - yaml.dump(data, f) -- 2.49.0