import functools
import logging
import xml.etree.ElementTree as ET

from lib.binary import is_binary_or_large
from lib.db import DB
from lib.db_revision import DBRevision
from lib.git import Git
from lib.history import History
from lib.obs import OBS
from lib.obs_revision import OBSRevision
from lib.proxy_sha256 import ProxySHA256, md5, sha256
from lib.tree_builder import AbstractWalker, TreeBuilder
from lib.user import User


def _files_hash(hash_alg, dirpath):
    """Return a list of (file name, hash) for the files directly in dirpath.

    ``hash_alg`` is called with each file path and its result is used
    as the hash.  Sub-directories are skipped.
    """
    # TODO: do it async or multithreaded
    files = [f for f in dirpath.iterdir() if f.is_file()]
    return [(f.parts[-1], hash_alg(f)) for f in files]


files_md5 = functools.partial(_files_hash, md5)
files_sha256 = functools.partial(_files_hash, sha256)


class Importer:
    """Import the OBS history of a package into a database and a git repo."""

    def __init__(self, projects, package, repodir, search_ancestor, rebase_devel):
        """Set up the OBS client, the git repository and the project list.

        ``projects`` is a list of (project, branch, api_url) tuples whose
        first entry must be "openSUSE:Factory".  If OBS reports a devel
        project for the package, it is prepended with the branch name
        "devel".
        """
        # The idea is to create each commit in order, and draw the
        # same graph described by the revisions timeline.  For that we
        # need first to fetch all the revisions and sort them
        # linearly, based on the timestamp.
        #
        # After that we recreate the commits, and if one revision is a
        # request that contains a target inside the projects in the
        # "history", we create a merge commit.
        #
        # Optionally, if a flag is set, we will try to find a common
        # "Initial commit" from a reference branch (the first one in
        # "projects", that is safe to assume to be "openSUSE:Factory").
        # This is not always a good idea.  For example, in a normal
        # situation the "devel" project history is older than
        # "factory", and we can root the tree on it.  But for some
        # other projects we lost partially the "devel" history project
        # (could be moved), and "factory" is not the root.
        self.package = package
        self.search_ancestor = search_ancestor
        self.rebase_devel = rebase_devel

        self.obs = OBS()
        self.git = Git(
            repodir,
            committer="Git OBS Bridge",
            committer_email="obsbridge@suse.de",
        ).create()
        self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)

        self.history = History(self.obs, self.package)

        # Add the "devel" project
        (project, branch, api_url) = projects[0]
        assert project == "openSUSE:Factory"
        self.obs.change_url(api_url)
        devel_project = self.obs.devel_project(project, package)
        if devel_project:
            self.projects = [(devel_project, "devel", api_url)] + projects
        else:
            self.projects = projects

        # Associate the branch and api_url information per project
        self.projects_info = {
            project: (branch, api_url) for (project, branch, api_url) in self.projects
        }

    def download(self, revision):
        """Make the git working directory match the files of ``revision``.

        Binary or large files are registered as LFS entries, other files
        are downloaded (when not already present with the same name,
        size and MD5) and added.  Files present in the working directory
        but not in OBS are removed.

        Raises ``Exception`` when a downloaded file does not match the
        MD5 announced by OBS.
        """
        obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
        # Compare by exact name: the previous ``not in (".gitattributes")``
        # was a substring test against a plain string and also dropped
        # files such as "git" or "attributes" from this set.
        git_files = {
            (f.name, f.stat().st_size, md5(f))
            for f in self.git.path.iterdir()
            if f.is_file() and f.name != ".gitattributes"
        }

        # Overwrite ".gitattributes" with the default LFS attributes
        self.git.add_default_lfs_gitattributes(force=True)

        # Download each file in OBS if it is not a binary (or large)
        # file
        for name, size, file_md5 in obs_files:
            # this file creates easily 100k commits and is just useless data :(
            # unfortunately it's stored in the same meta package as the project config
            if revision.package == "_project" and name == "_staging_workflow":
                continue
            # have such files been detected as text mimetype before?
            is_text = self.proxy_sha256.is_text(name)
            if not is_text and is_binary_or_large(name, size):
                file_sha256 = self.proxy_sha256.get_or_put(
                    revision.project,
                    revision.package,
                    name,
                    revision.srcmd5,
                    file_md5,
                    size,
                )
                self.git.add_lfs(name, file_sha256["sha256"], size)
            else:
                if (name, size, file_md5) not in git_files:
                    print(f"Download {name}")
                    self.obs.download(
                        revision.project,
                        revision.package,
                        name,
                        revision.srcmd5,
                        self.git.path,
                    )
                    # Validate the MD5 of the downloaded file
                    if md5(self.git.path / name) != file_md5:
                        raise Exception(f"Download error in {name}")
                # NOTE(review): the file is added even when it was already
                # up to date -- confirm this is the intended nesting.
                self.git.add(name)

        # Remove extra files
        obs_names = {n for (n, _, _) in obs_files}
        git_names = {n for (n, _, _) in git_files}
        for name in git_names - obs_names:
            print(f"Remove {name}")
            self.git.remove(name)

    def update_db_package(self, db, project, package):
        """Import into the DB the OBS revisions newer than the latest stored.

        Revisions whose link cannot be parsed are flagged as broken; for
        the others the link target (if any) is recorded.
        """
        root = self.obs._history(project, package)
        if root is None:
            return
        latest = DBRevision.latest_revision(db, project, package)
        for r in root.findall("revision"):
            rev = OBSRevision(self.obs, self, project, package).parse(r)
            if not latest or rev.rev > latest.rev:
                dbrev = DBRevision.import_obs_rev(db, rev)
                try:
                    link = rev.read_link()
                except ET.ParseError:
                    dbrev.set_broken(db)
                    continue
                if link is not None:
                    # A missing attribute in the link means "same as here"
                    tprj = link.get("project") or project
                    tpkg = link.get("package") or package
                    dbrev.links_to(db, tprj, tpkg)

    def find_linked_revs(self, db):
        """Fill ``linked_revs`` for non-broken link revisions not yet resolved."""
        with db.cursor() as cur:
            cur.execute(
                """SELECT * from revisions WHERE id in (SELECT l.revision_id FROM links l
                   LEFT JOIN linked_revs lrevs ON lrevs.revision_id=l.revision_id
                   WHERE lrevs.id IS NULL) and broken is FALSE;"""
            )
            for row in cur.fetchall():
                rev = DBRevision(row)
                linked_rev = rev.linked_rev(db)
                if not linked_rev:
                    logging.debug(f"No link {rev}")
                    continue
                cur.execute(
                    """INSERT INTO linked_revs (revision_id, linked_id)
                       VALUES (%s,%s)""",
                    (rev.dbid, linked_rev.dbid),
                )

    def fetch_all_linked_packages(self, db, project, package):
        """Update the DB for every package that ``project/package`` links to."""
        with db.cursor() as cur:
            cur.execute(
                """SELECT DISTINCT l.project, l.package from links l
                   JOIN revisions r on r.id=l.revision_id WHERE r.project=%s AND r.package=%s""",
                (project, package),
            )
            for row in cur.fetchall():
                (lproject, lpackage) = row
                self.update_db_package(db, lproject, lpackage)

    def find_fake_revisions(self, db):
        """Run ``_find_fake_revision`` on every link target not yet considered."""
        with db.cursor() as cur:
            cur.execute(
                "SELECT * from revisions WHERE id in (SELECT linked_id from linked_revs WHERE considered=FALSE)"
            )
            for row in cur.fetchall():
                self._find_fake_revision(db, DBRevision(row))

    def _find_fake_revision(self, db, rev):
        """Insert a synthetic revision in the linking package for ``rev``.

        ``rev`` is a revision that other revisions link to.  It is marked
        as considered, and if a revision linking to its previous commit
        is found, a new "buildservice-autocommit" revision is inserted
        for the linking package, unless one was already recorded in
        ``fake_revs``.
        """
        prev = rev.previous_commit(db)
        if not prev:
            with db.cursor() as cur:
                cur.execute(
                    "UPDATE linked_revs SET considered=TRUE where linked_id=%s",
                    (rev.dbid,),
                )
            return

        with db.cursor() as cur:
            cur.execute(
                """SELECT * FROM revisions WHERE id IN
                   (SELECT revision_id from linked_revs WHERE linked_id=%s)
                   AND commit_time <= %s ORDER BY commit_time""",
                (prev.dbid, rev.commit_time),
            )
            last_linked = None
            for linked in cur.fetchall():
                linked = DBRevision(linked)
                nextrev = linked.next_commit(db)
                # Skip linking revisions that were superseded before
                # ``rev`` was committed
                if nextrev and nextrev.commit_time < rev.commit_time:
                    continue
                last_linked = linked
            # NOTE(review): placed after the loop so that ``rev`` is marked
            # once, even when no row matched -- confirm the nesting.
            cur.execute(
                "UPDATE linked_revs SET considered=TRUE where linked_id=%s",
                (rev.dbid,),
            )
        if not last_linked:
            return

        with db.cursor() as cur:
            linked = last_linked
            cur.execute(
                "SELECT 1 FROM fake_revs where revision_id=%s AND linked_id=%s",
                (rev.dbid, linked.dbid),
            )
            if cur.fetchone():
                cur.execute(
                    "UPDATE linked_revs SET considered=TRUE where linked_id=%s",
                    (rev.dbid,),
                )
                return
            # Fractional revision number derived from both revisions
            fake_rev = linked.rev + rev.rev / 1000.0
            comment = f"Updating link to change in {rev.project}/{rev.package} revision {rev.rev}"
            cur.execute(
                """INSERT INTO revisions (project,package,rev,unexpanded_srcmd5,
                   commit_time, userid, comment) VALUES(%s,%s,%s,%s,%s,%s,%s) RETURNING id""",
                (
                    linked.project,
                    linked.package,
                    fake_rev,
                    linked.unexpanded_srcmd5,
                    rev.commit_time,
                    "buildservice-autocommit",
                    comment,
                ),
            )
            new_id = cur.fetchone()[0]
            cur.execute(
                """INSERT INTO linked_revs (revision_id, linked_id) VALUES (%s,%s)""",
                (new_id, rev.dbid),
            )
            cur.execute(
                """INSERT INTO fake_revs (revision_id, linked_id) VALUES (%s,%s)""",
                (rev.dbid, linked.dbid),
            )

    def revisions_without_files(self, db):
        """Return the non-broken revisions that have no expanded srcmd5 yet."""
        with db.cursor() as cur:
            cur.execute(
                "SELECT * FROM revisions WHERE broken=FALSE AND expanded_srcmd5 IS NULL"
            )
            return [DBRevision(row) for row in cur.fetchall()]

    def export_as_git(self):
        """Build the revision tree of the package and walk it.

        The walker used here is a no-op placeholder.
        """
        db = DB()
        tree = TreeBuilder(db).build(self.package)

        class ExportWalker(AbstractWalker):
            def call(self, node, is_source):
                pass

        tree.walk(ExportWalker())

    def import_into_db(self):
        """Import revisions, links, users, file lists and requests into the DB.

        The transaction is committed once at the end.
        """
        db = DB()
        for project, _, api_url in self.projects:
            self.obs.change_url(api_url)
            self.update_db_package(db, project, self.package)
            self.fetch_all_linked_packages(db, project, self.package)
            # NOTE(review): the steps below are run once per project, with
            # that project's API URL active -- confirm the nesting.
            # all remaining, no filtering here
            self.find_linked_revs(db)

            missing_users = User.missing_users(db)
            for userid in missing_users:
                missing_user = self.obs.user(userid)
                if missing_user:
                    missing_user.import_into_db(db)

            self.find_fake_revisions(db)
            for rev in self.revisions_without_files(db):
                with db.cursor() as cur:
                    cur.execute(
                        """SELECT unexpanded_srcmd5 from revisions WHERE
                           id=(SELECT linked_id FROM linked_revs WHERE revision_id=%s)""",
                        (rev.dbid,),
                    )
                    linked_rev = cur.fetchone()
                if linked_rev:
                    linked_rev = linked_rev[0]
                dir_list = self.obs.list(
                    rev.project, rev.package, rev.unexpanded_srcmd5, linked_rev
                )
                if dir_list:
                    rev.import_dir_list(db, dir_list)
                    files_hash = rev.calculate_files_hash(db)
                    with db.cursor() as cur:
                        cur.execute(
                            "UPDATE revisions SET files_hash=%s WHERE id=%s",
                            (files_hash, rev.dbid),
                        )
                else:
                    rev.set_broken(db)
            for number in DBRevision.requests_to_fetch(db):
                self.obs.request(number).import_into_db(db)

        db.conn.commit()

    def import_all_revisions(self, gc):
        """Import every revision of the package into git, in sorted order.

        ``gc`` is the number of imported revisions between two
        ``git gc`` runs; a falsy value disables the garbage collection.
        """
        # Fetch all the requests and sort them.  Ideally we should
        # build the graph here, to avoid new commits before the merge.
        # For now we will sort them and invalidate the commits if
        # "rebase_devel" is set.
        self.history.fetch_all_revisions(self.projects)
        revisions = self.history.sort_all_revisions()

        logging.debug(f"Selected import order for {self.package}")
        for revision in revisions:
            logging.debug(revision)

        gc_cnt = gc
        for revision in revisions:
            gc_cnt -= 1
            if gc_cnt <= 0 and gc:
                self.git.gc()
                gc_cnt = gc
            self.import_revision(revision)

    def import_new_revision_with_request(self, revision, request):
        """Create a new branch as a result of a merge

        Returns False when the request does not point to a known
        revision; otherwise the branch is created on the submitted
        commit and nothing is returned.
        """
        submitted_revision = self.history.find_revision(
            request.source, request.revisionid, revision.time
        )
        if not submitted_revision:
            logging.warning(f"Request {request} does not connect to a known revision")
            return False

        if not submitted_revision.commit:
            # If the revision appointed by the request is not part of
            # the git history, we can have an ordering problem.  One
            # example is "premake4".
            self.import_revision(submitted_revision)

        assert submitted_revision.commit is not None

        project = revision.project
        branch, _ = self.projects_info[project]

        # TODO: add an empty commit marking the acceptance of the request (see discussion in PR 2858)
        self.git.branch(branch, submitted_revision.commit)
        self.git.clean()
        self.git.checkout(branch)

        logging.info(f"Create new branch based on {submitted_revision.commit}")
        revision.commit = submitted_revision.commit

    def _rebase_branch_history(self, project, revision):
        """Re-import the committed revisions that follow ``revision``.

        Stops at the first revision of the project history that has no
        commit.
        """
        branch, _ = self.projects_info[project]
        history = self.history[project]
        revision_index = history.index(revision)
        for index in range(revision_index + 1, len(history)):
            revision = history[index]
            # We are done when we have one non-committed revision
            if not revision.commit:
                return
            logging.info(f"Rebasing {revision} from {branch}")
            revision.commit = None
            self.import_revision(revision)

    def import_revision_with_request(self, revision, request):
        """Import a single revision via a merge

        Returns True when a merge commit was created, False when the
        caller has to fall back to a plain commit (unknown or empty
        submitted revision, or empty merge).
        """
        submitted_revision = self.history.find_revision(
            request.source, request.revisionid, revision.time
        )
        if not submitted_revision:
            logging.warning(f"Request {request} does not connect to a known revision")
            return False
        assert submitted_revision.commit is not None

        # TODO: detect a revision, case in point
        # Base:System/bash/284 -> rq683701 -> accept O:F/151
        #    -> autocommit Base:System/bash/285
        # Revert lead to openSUSE:Factory/bash/152
        # Base:System/286 restored the reverted code in devel project
        # rq684575 was created and accepted as O:F/153
        # But the 284-285 and the 285-286 changeset is seen as empty
        # as the revert was never in Base:System, so the
        # submitted_revision of 684575 has no commit
        if submitted_revision.commit == "EMPTY":
            logging.warning("Empty commit submitted?!")
            return False

        message = (
            f"Accepting request {revision.requestid}: {revision.comment}\n\n{revision}"
        )
        commit = self.git.merge(
            # TODO: revision.userid or request.creator?
            f"OBS User {revision.userid}",
            "null@suse.de",
            revision.time,
            message,
            submitted_revision.commit,
        )

        if commit == "EMPTY":
            logging.warning("Empty merge. Ignoring the revision and the request")
            self.git.merge_abort()
            revision.commit = commit
            return False

        if commit == "CONFLICT":
            # Resolve the conflict by taking the content from OBS
            logging.info("Merge conflict. Downloading revision")
            self.download(revision)
            message = f"CONFLICT {message}"
            commit = self.git.merge(
                f"OBS User {revision.userid}",
                "null@suse.de",
                revision.time,
                message,
                submitted_revision.commit,
                merged=True,
            )

        assert commit and commit != "CONFLICT"
        logging.info(f"Merge with {submitted_revision.commit} into {commit}")
        revision.commit = commit

        # TODO: There are more checks to do, like for example, the
        # last commit into the non-devel branch should be a merge from
        # the devel branch
        if self.rebase_devel:
            branch, _ = self.projects_info.get(request.source, (None, None))
            if branch == "devel":
                self.git.repo.references[f"refs/heads/{branch}"].set_target(commit)
                self._rebase_branch_history(request.source, submitted_revision)

        return True

    def matching_request(self, revision):
        """Return the request of ``revision`` if it has to be handled.

        Non-submit requests are returned as they are.  Submit requests
        are returned only when they come from another exported project
        and target the project of the revision; otherwise None.
        """
        request = self.obs.request(revision.requestid)
        if not request:
            return None

        # to be handled by the caller
        if request.type() != "submit":
            return request

        if request.source not in self.projects_info:
            logging.info("Request from a non exported project")
            return None

        if request.target != revision.project:
            # This seems to happen when the devel project gets
            # reinitialized (for example, SR#943593 in 7zip, or
            # SR#437901 in ColorFull)
            logging.info("Request target different from current project")
            return None

        if request.source == request.target:
            # this is not a merge, but a different way to do a
            # contribution to the (devel) project - see bindfs's rev 1
            logging.info("Request within the same project")
            return None

        return request

    def import_revision(self, revision):
        """Import a single revision into git"""
        project = revision.project
        branch, api_url = self.projects_info[project]

        logging.info(f"Importing [{revision}] to {branch}")

        self.obs.change_url(api_url)

        # Populate linkrev and replace srcmd5 from the linked
        # revision.  If the expansion fails, the revision will be ignored
        # and not imported.
        if not revision.check_expanded():
            logging.warning("Broken revision")
            revision.ignored = True
            return

        # When doing a SR, we see also a revision in the origin
        # project with the outgoing request, but without changes in
        # the project.  We can ignore them.
        #
        # If there is a request ID, it will be filtered out later,
        # when the target project is different from itself.
        if revision.userid == "autobuild" and not revision.requestid:
            logging.info("Ignoring autocommit")
            revision.ignored = True
            return

        if revision.userid == "buildservice-autocommit":
            logging.info("Ignoring autocommit")
            revision.ignored = True
            return

        # Create the reference if the branch is new.  If so return
        # True.
        new_branch = self.git.checkout(branch)

        if revision.requestid:
            request = self.matching_request(revision)
            if request:
                if request.type() == "delete":
                    # TODO: after this comes a restore, this should be collapsed
                    # before even hitting git
                    logging.info("Delete request ignored")
                    revision.ignored = True
                    return

                logging.debug(f"Found matching request: #{revision.project} #{request}")
                if new_branch:
                    self.import_new_revision_with_request(revision, request)
                    return
                if self.import_revision_with_request(revision, request):
                    return

        # Import revision as a single commit (without merging)
        self.download(revision)

        if new_branch or self.git.is_dirty():
            commit = self.git.commit(
                f"OBS User {revision.userid}",
                "null@suse.de",
                revision.time,
                # TODO: Normalize better the commit message
                f"{revision.comment}\n\n{revision}",
                # Create an empty commit only if is a new branch
                allow_empty=new_branch,
            )
            revision.commit = commit
            logging.info(f"Commit {commit}")
        else:
            logging.info("Skip empty commit")
            revision.ignored = True