From c4654dd89610dea6d22825c45355e90174d11b25 Mon Sep 17 00:00:00 2001
From: Stephan Kulow
Date: Wed, 2 Nov 2022 07:59:25 +0100
Subject: [PATCH] Split GitExporter out of Importer class

---
 git-importer.py      |  13 +--
 lib/git_exporter.py  | 167 ++++++++++++++++++++++++++++++++++
 lib/importer.py      | 207 +++----------------------------------------
 lib/test_exporter.py |   3 +-
 lib/tree_builder.py  |   4 +-
 5 files changed, 189 insertions(+), 205 deletions(-)
 create mode 100644 lib/git_exporter.py

diff --git a/git-importer.py b/git-importer.py
index 1f63dc5..2b0bb23 100755
--- a/git-importer.py
+++ b/git-importer.py
@@ -6,16 +6,17 @@ import pathlib
 import sys
 
 import osc.core
+from lib.git_exporter import GitExporter
 
-from lib.test_exporter import TestExporter
 from lib.importer import Importer
+from lib.test_exporter import TestExporter
 
 URL_OBS = "https://api.opensuse.org"
 URL_IBS = "https://api.suse.de"
 
 # The order is relevant (from older to newer initial codebase)
+# TODO: make something with these, for now we look purely at openSUSE:Factory
 PROJECTS = [
-    ("openSUSE:Factory", "factory", URL_OBS),
     # ("SUSE:SLE-12:GA", "SLE_12", URL_IBS),
     # ("SUSE:SLE-12:Update", "SLE_12", URL_IBS),
     # ("SUSE:SLE-12-SP1:GA", "SLE_12_SP1", URL_IBS),
@@ -92,11 +93,11 @@ def main():
     if not args.repodir:
         args.repodir = pathlib.Path(args.package)
 
-    # TODO: use a CLI parameter to describe the projects
-    importer = Importer(PROJECTS, args.package, args.repodir)
-    importer.set_gc_interval(args.gc)
+    importer = Importer(URL_OBS, "openSUSE:Factory", args.package)
     importer.import_into_db()
-    importer.export_as_git()
+    exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir)
+    exporter.set_gc_interval(args.gc)
+    exporter.export_as_git()
 
 
 if __name__ == "__main__":
diff --git a/lib/git_exporter.py b/lib/git_exporter.py
new file mode 100644
index 0000000..0d4d9c9
--- /dev/null
+++ b/lib/git_exporter.py
@@ -0,0 +1,167 @@
+import logging
+import os
+from lib.binary import is_binary_or_large
+import yaml
+from lib.db import DB
+from lib.git import Git
+from lib.obs import OBS
+from lib.proxy_sha256 import ProxySHA256, md5
+from lib.tree_builder import TreeBuilder
+
+
+class GitExporter:
+    def __init__(self, api_url, project, package, repodir):
+        self.obs = OBS()
+        self.project = project
+        self.package = package
+        # TODO: Store the api url in the revision
+        self.obs.change_url(api_url)
+        self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)
+        self.git = Git(
+            repodir,
+            committer="Git OBS Bridge",
+            committer_email="obsbridge@suse.de",
+        ).create()
+        self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
+        self.gc_interval = 200
+
+    def download(self, revision):
+        obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
+        git_files = {
+            (f.name, f.stat().st_size, md5(f))
+            for f in self.git.path.iterdir()
+            if f.is_file() and f.name not in (".gitattributes")
+        }
+
+        # Overwrite ".gitattributes" with the default LFS rules
+        self.git.add_default_lfs_gitattributes(force=True)
+
+        # Download each file in OBS if it is not a binary (or large)
+        # file
+        for (name, size, file_md5) in obs_files:
+            # this file creates easily 100k commits and is just useless data :(
+            # unfortunately it's stored in the same meta package as the project config
+            if revision.package == "_project" and name == "_staging_workflow":
+                continue
+            # have such files been detected as text mimetype before?
+            is_text = self.proxy_sha256.is_text(name)
+            if not is_text and is_binary_or_large(name, size):
+                file_sha256 = self.proxy_sha256.get_or_put(
+                    revision.project,
+                    revision.package,
+                    name,
+                    revision.srcmd5,
+                    file_md5,
+                    size,
+                )
+                self.git.add_lfs(name, file_sha256["sha256"], size)
+            else:
+                if (name, size, file_md5) not in git_files:
+                    logging.debug(f"Download {name}")
+                    self.obs.download(
+                        revision.project,
+                        revision.package,
+                        name,
+                        revision.srcmd5,
+                        self.git.path,
+                    )
+                    # Validate the MD5 of the downloaded file
+                    if md5(self.git.path / name) != file_md5:
+                        raise Exception(f"Download error in {name}")
+                self.git.add(name)
+
+        # Remove extra files
+        obs_names = {n for (n, _, _) in obs_files}
+        git_names = {n for (n, _, _) in git_files}
+        for name in git_names - obs_names:
+            logging.debug(f"Remove {name}")
+            self.git.remove(name)
+
+    def set_gc_interval(self, gc):
+        self.gc_interval = gc
+
+    def export_as_git(self):
+        db = DB()
+        tree = TreeBuilder(db).build(self.project, self.package)
+        flats = tree.as_flat_list()
+
+        branch_state = {"factory": None, "devel": None}
+        state_data = dict()
+        if os.path.exists(self.state_file):
+            with open(self.state_file, "r") as f:
+                state_data = yaml.safe_load(f)
+            if type(state_data) != dict:
+                state_data = {}
+        left_to_commit = []
+        for flat in reversed(flats):
+            found_state = False
+            for branch in ["factory", "devel"]:
+                if flat.commit.dbid == state_data.get(branch):
+                    branch_state[branch] = flat.commit
+                    flat.commit.git_commit = self.git.branch_head(branch)
+                    logging.debug(
+                        f"Found {self.git.path}'s {branch} branch in state {flat}"
+                    )
+                    left_to_commit = []
+                    found_state = True
+            if not found_state:
+                left_to_commit.append(flat)
+
+        gc_cnt = self.gc_interval
+        if len(left_to_commit) > 0:
+            self.git.gc()
+        for flat in left_to_commit:
+            gc_cnt -= 1
+            if gc_cnt <= 0 and self.gc_interval:
+                self.git.gc()
+                gc_cnt = self.gc_interval
+            logging.debug(f"Committing {flat}")
+            self.commit_flat(db, flat, branch_state)
+
+    def limit_download(self, file):
+        if file.endswith(".spec") or file.endswith(".changes"):
+            return True
+        return False
+
+    def commit_flat(self, db, flat, branch_state):
+        parents = []
+        self.git.checkout(flat.branch)
+        if flat.parent1:
+            parents.append(flat.parent1.git_commit)
+        if flat.parent2:
+            parents.append(flat.parent2.git_commit)
+        to_download, to_delete = flat.commit.calc_delta(db, branch_state[flat.branch])
+        for file in to_delete:
+            if not self.limit_download(file):
+                continue
+            self.git.remove(file)
+        for file in to_download:
+            if not self.limit_download(file):
+                continue
+            self.obs.download(
+                flat.commit.project,
+                flat.commit.package,
+                file,
+                flat.commit.expanded_srcmd5,
+                self.git.path,
+            )
+            self.git.add(file)
+
+        commit = self.git.commit(
+            f"OBS User {flat.commit.userid}",
+            "null@suse.de",
+            flat.commit.commit_time,
+            # TODO: Normalize better the commit message
+            f"{flat.commit.comment}\n\n{flat.commit}",
+            allow_empty=True,
+            parents=parents,
+        )
+        flat.commit.git_commit = commit
+        branch_state[flat.branch] = flat.commit
+        with open(self.state_file, "w") as f:
+            data = {}
+            for branch in ["factory", "devel"]:
+                commit = branch_state[branch]
+                if commit:
+                    data[branch] = commit.dbid
+            yaml.dump(data, f)
diff --git a/lib/importer.py b/lib/importer.py
index 5e752e9..1f56756 100644
--- a/lib/importer.py
+++ b/lib/importer.py
@@ -1,122 +1,23 @@
 import logging
-import os
 import xml.etree.ElementTree as ET
 
-import yaml
-
-from lib.binary import is_binary_or_large
 from lib.db import DB
 from lib.db_revision import DBRevision
-from lib.git import Git
 from lib.obs import OBS
 from lib.obs_revision import OBSRevision
-from lib.proxy_sha256 import ProxySHA256, md5
-from lib.tree_builder import AbstractWalker, TreeBuilder
+
 from lib.user import User
 
-
 class Importer:
-    def __init__(self, projects, package, repodir):
-        # The idea is to create each commit in order, and draw the
-        # same graph described by the revisions timeline. For that we
-        # need first to fetch all the revisions and sort them
-        # linearly, based on the timestamp.
-        #
-        # After that we recreate the commits, and if one revision is a
-        # request that contains a target inside the projects in the
-        # "history", we create a merge commit.
-        #
-        # Optionally, if a flag is set, we will try to find a common
-        # "Initial commit" from a reference branch (the first one in
-        # "projects", that is safe to assume to be "openSUSE:Factory".
-        # This is not always a good idea. For example, in a normal
-        # situation the "devel" project history is older than
-        # "factory", and we can root the tree on it. But for some
-        # other projects we lost partially the "devel" history project
-        # (could be moved), and "factory" is not the root.
-
+    def __init__(self, api_url, project, package):
+        # Import a Factory package into the database
         self.package = package
+        self.project = project
         self.obs = OBS()
-        self.git = Git(
-            repodir,
-            committer="Git OBS Bridge",
-            committer_email="obsbridge@suse.de",
-        ).create()
-        self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
-        self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)
-        self.gc_interval = 200
-
-        # Add the "devel" project
-        (project, branch, api_url) = projects[0]
         assert project == "openSUSE:Factory"
         self.obs.change_url(api_url)
-        devel_project = self.obs.devel_project(project, package)
-        if devel_project:
-            self.projects = [(devel_project, "devel", api_url)] + projects
-        else:
-            self.projects = projects
-
-        # Associate the branch and api_url information per project
-        self.projects_info = {
-            project: (branch, api_url) for (project, branch, api_url) in self.projects
-        }
-
-    def download(self, revision):
-        obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
-        git_files = {
-            (f.name, f.stat().st_size, md5(f))
-            for f in self.git.path.iterdir()
-            if f.is_file() and f.name not in (".gitattributes")
-        }
-
-        # Overwrite ".gitattributes" with the
-        self.git.add_default_lfs_gitattributes(force=True)
-
-        # Download each file in OBS if it is not a binary (or large)
-        # file
-        for (name, size, file_md5) in obs_files:
-            # this file creates easily 100k commits and is just useless data :(
-            # unfortunately it's stored in the same meta package as the project config
-            if revision.package == "_project" and name == "_staging_workflow":
-                continue
-            # have such files been detected as text mimetype before?
-            is_text = self.proxy_sha256.is_text(name)
-            if not is_text and is_binary_or_large(name, size):
-                file_sha256 = self.proxy_sha256.get_or_put(
-                    revision.project,
-                    revision.package,
-                    name,
-                    revision.srcmd5,
-                    file_md5,
-                    size,
-                )
-                self.git.add_lfs(name, file_sha256["sha256"], size)
-            else:
-                if (name, size, file_md5) not in git_files:
-                    logging.debug(f"Download {name}")
-                    self.obs.download(
-                        revision.project,
-                        revision.package,
-                        name,
-                        revision.srcmd5,
-                        self.git.path,
-                    )
-                    # Validate the MD5 of the downloaded file
-                    if md5(self.git.path / name) != file_md5:
-                        raise Exception(f"Download error in {name}")
-                self.git.add(name)
-
-        # Remove extra files
-        obs_names = {n for (n, _, _) in obs_files}
-        git_names = {n for (n, _, _) in git_files}
-        for name in git_names - obs_names:
-            logging.debug(f"Remove {name}")
-            self.git.remove(name)
-
-    def set_gc_interval(self, gc):
-        self.gc_interval = gc
-
+
     def update_db_package(self, db, project, package):
         root = self.obs._history(project, package)
         if root is None:
@@ -248,100 +149,14 @@ class Importer:
         )
         return [DBRevision(row) for row in cur.fetchall()]
 
-    def export_as_git(self):
-        db = DB()
-        tree = TreeBuilder(db).build(self.package)
-        flats = tree.as_flat_list()
-
-        branch_state = {"factory": None, "devel": None}
-        state_data = dict()
-        if os.path.exists(self.state_file):
-            with open(self.state_file, "r") as f:
-                state_data = yaml.safe_load(f)
-            if type(state_data) != dict:
-                state_data = {}
-        left_to_commit = []
-        for flat in reversed(flats):
-            found_state = False
-            for branch in ["factory", "devel"]:
-                if flat.commit.dbid == state_data.get(branch):
-                    branch_state[branch] = flat.commit
-                    flat.commit.git_commit = self.git.branch_head(branch)
-                    logging.debug(
-                        f"Found {self.git.path}'s {branch} branch in state {flat}"
-                    )
-                    left_to_commit = []
-                    found_state = True
-            if not found_state:
-                left_to_commit.append(flat)
-
-        gc_cnt = self.gc_interval
-        if len(left_to_commit) > 0:
-            self.git.gc()
-        for flat in left_to_commit:
-            gc_cnt -= 1
-            if gc_cnt <= 0 and self.gc_interval:
-                self.git.gc()
-                gc_cnt = self.gc_interval
-            logging.debug(f"Committing {flat}")
-            self.commit_flat(db, flat, branch_state)
-
-    def limit_download(self, file):
-        if file.endswith(".spec") or file.endswith(".changes"):
-            return True
-        return False
-
-    def commit_flat(self, db, flat, branch_state):
-        parents = []
-        self.git.checkout(flat.branch)
-        if flat.parent1:
-            parents.append(flat.parent1.git_commit)
-        if flat.parent2:
-            parents.append(flat.parent2.git_commit)
-        to_download, to_delete = flat.commit.calc_delta(db, branch_state[flat.branch])
-        for file in to_delete:
-            if not self.limit_download(file):
-                continue
-            self.git.remove(file)
-        for file in to_download:
-            if not self.limit_download(file):
-                continue
-            self.obs.download(
-                flat.commit.project,
-                flat.commit.package,
-                file,
-                flat.commit.expanded_srcmd5,
-                self.git.path,
-            )
-            self.git.add(file)
-
-        commit = self.git.commit(
-            f"OBS User {flat.commit.userid}",
-            "null@suse.de",
-            flat.commit.commit_time,
-            # TODO: Normalize better the commit message
-            f"{flat.commit.comment}\n\n{flat.commit}",
-            allow_empty=True,
-            parents=parents,
-        )
-        flat.commit.git_commit = commit
-        branch_state[flat.branch] = flat.commit
-        with open(self.state_file, "w") as f:
-            data = {}
-            for branch in ["factory", "devel"]:
-                commit = branch_state[branch]
-                if commit:
-                    data[branch] = commit.dbid
-            yaml.dump(data, f)
-
+
     def import_into_db(self):
         db = DB()
-        for project, _, api_url in self.projects:
-            self.obs.change_url(api_url)
-            self.update_db_package(db, project, self.package)
-            self.fetch_all_linked_packages(db, project, self.package)
-            # all remaining, no filtering here
-            self.find_linked_revs(db)
+        self.update_db_package(db, self.project, self.package)
+        self.fetch_all_linked_packages(db, self.project, self.package)
+
+        # all remaining, no filtering here
+        self.find_linked_revs(db)
 
         missing_users = User.missing_users(db)
         for userid in missing_users:
diff --git a/lib/test_exporter.py b/lib/test_exporter.py
index 425946d..ae7c63d 100644
--- a/lib/test_exporter.py
+++ b/lib/test_exporter.py
@@ -7,7 +7,8 @@ from lib.db_revision import DBRevision
 
 
 class TestExporter:
-    """"Helper class to export data from production DB for tests"""
+    """Helper class to export data from production DB for tests"""
+
     def __init__(self, package):
         self.package = package
 
diff --git a/lib/tree_builder.py b/lib/tree_builder.py
index f3cc11d..6bdd169 100644
--- a/lib/tree_builder.py
+++ b/lib/tree_builder.py
@@ -204,9 +204,9 @@ class TreeBuilder:
                 last_merge.merged.merged_into = None
                 last_merge.merged = None
 
-    def build(self, package):
+    def build(self, project, package):
         """Create a Factory tree (returning the top)"""
-        factory_revisions = self.revisions_chain("openSUSE:Factory", package)
+        factory_revisions = self.revisions_chain(project, package)
         self.add_merge_points(factory_revisions)
         # factory_revisions.print()
         self.prune_loose_end(factory_revisions)
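
Usage sketch of the split classes, mirroring the wiring in git-importer.py
above. The package name "zsh" and the gc interval value are placeholders for
illustration; the class names, constructor signatures and the
"openSUSE:Factory" project string are taken from the patch itself.

    import pathlib

    from lib.git_exporter import GitExporter
    from lib.importer import Importer

    URL_OBS = "https://api.opensuse.org"
    package = "zsh"                  # placeholder package name
    repodir = pathlib.Path(package)  # same default as main() in git-importer.py

    # Step 1: fetch the package's OBS revision history into the local database.
    importer = Importer(URL_OBS, "openSUSE:Factory", package)
    importer.import_into_db()

    # Step 2: rebuild the revision tree from the database and replay it as git
    # commits on the "factory" and "devel" branches.
    exporter = GitExporter(URL_OBS, "openSUSE:Factory", package, repodir)
    exporter.set_gc_interval(200)    # run `git gc` every 200 commits (class default)
    exporter.export_as_git()

The two halves communicate only through the database: import_into_db() writes
the revision history, and export_as_git() reads it back via TreeBuilder (file
contents are still downloaded from OBS while exporting), so either step can be
rerun on its own.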