git-importer/lib/importer.py
Stephan Kulow bfdade8ecf Create a flat list of commits to do from the tree
Walk the node tree and record the parents, then reverse the tree so we
can have the exact order in which to create git commits
2022-10-31 12:28:12 +01:00

616 lines
24 KiB
Python

import functools
import logging
import xml.etree.ElementTree as ET
from lib.binary import is_binary_or_large
from lib.db import DB
from lib.db_revision import DBRevision
from lib.git import Git
from lib.history import History
from lib.obs import OBS
from lib.obs_revision import OBSRevision
from lib.proxy_sha256 import ProxySHA256, md5, sha256
from lib.tree_builder import AbstractWalker, TreeBuilder, TreeNode
from lib.user import User
def _files_hash(hash_alg, dirpath):
    """Hash every regular file located directly inside a directory.

    ``hash_alg`` is called once per file with the file's path object, and
    ``dirpath`` must provide ``iterdir()`` (e.g. a ``pathlib.Path``).
    Anything that is not a regular file is skipped; sub-directories are not
    descended into.

    Returns a list of ``(file name, hash)`` tuples, in the order in which
    ``iterdir()`` yields the entries.
    """
    # TODO: do it async or multithreaded
    hashed = []
    for entry in dirpath.iterdir():
        if not entry.is_file():
            continue
        hashed.append((entry.name, hash_alg(entry)))
    return hashed
# Convenience wrappers around `_files_hash` with the hash function already
# bound: each one takes only the directory and returns (file name, hash)
# pairs, using the `md5` / `sha256` helpers imported from lib.proxy_sha256.
files_md5 = functools.partial(_files_hash, md5)
files_sha256 = functools.partial(_files_hash, sha256)
class Importer:
def __init__(self, projects, package, repodir, search_ancestor, rebase_devel):
    """Prepare the OBS/git helpers and the list of projects to import.

    ``projects`` is a list of ``(project, branch, api_url)`` tuples whose
    first entry must be "openSUSE:Factory".  If OBS reports a devel project
    for ``package``, it is prepended to that list under the branch name
    "devel", reusing the API URL of the first entry.

    Design notes:

    The idea is to create each commit in order, and to draw the same graph
    that the revisions timeline describes.  For that, all the revisions are
    fetched first and sorted linearly by timestamp.

    After that the commits are recreated; when a revision is a request
    whose target is one of the projects in the "history", a merge commit
    is created.

    Optionally, if a flag is set, a common "Initial commit" is searched in
    a reference branch (the first one in "projects", which is safe to
    assume to be "openSUSE:Factory").  This is not always a good idea.
    Normally the "devel" project history is older than "factory" and the
    tree can be rooted on it, but for some packages the "devel" history
    was partially lost (the project could have been moved) and "factory"
    is not the root.
    """
    self.package = package
    self.search_ancestor = search_ancestor
    self.rebase_devel = rebase_devel

    self.obs = OBS()
    repository = Git(
        repodir,
        committer="Git OBS Bridge",
        committer_email="obsbridge@suse.de",
    )
    self.git = repository.create()
    self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)
    self.history = History(self.obs, self.package)

    # Add the "devel" project in front of the configured ones
    reference_project, _reference_branch, reference_url = projects[0]
    assert reference_project == "openSUSE:Factory"
    self.obs.change_url(reference_url)
    devel_project = self.obs.devel_project(reference_project, package)
    if not devel_project:
        self.projects = projects
    else:
        self.projects = [(devel_project, "devel", reference_url)] + projects

    # Associate the branch and api_url information per project
    info = {}
    for name, branch_name, url in self.projects:
        info[name] = (branch_name, url)
    self.projects_info = info
def download(self, revision):
    """Make the git working tree match the file list of an OBS revision.

    For every file that OBS lists for ``revision``:

    * files flagged by ``is_binary_or_large`` (and not reported as text by
      the sha256 proxy) are not downloaded; they are registered through
      ``self.git.add_lfs`` with the sha256 obtained from the proxy;
    * every other file is downloaded unless a file with the same name,
      size and MD5 already sits in the working tree, and is then added to
      git.

    Files of the working tree that OBS does not list are removed from git.

    Raises:
        Exception: a freshly downloaded file does not have the MD5 that
            OBS announced for it.
    """
    obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)

    # Snapshot of the regular files currently in the working tree.
    # ".gitattributes" is left out because it is regenerated right below.
    # The name is compared with `!=` on purpose: `not in (".gitattributes")`
    # is a substring test against a plain string (the parentheses do not
    # make a tuple), which also hid files named e.g. "git" or ".git".
    git_files = {
        (f.name, f.stat().st_size, md5(f))
        for f in self.git.path.iterdir()
        if f.is_file() and f.name != ".gitattributes"
    }

    # Overwrite ".gitattributes" with the default LFS attributes
    self.git.add_default_lfs_gitattributes(force=True)

    # Download each file in OBS if it is not a binary (or large) file
    for name, size, file_md5 in obs_files:
        # this file creates easily 100k commits and is just useless data :(
        # unfortunately it's stored in the same meta package as the project config
        if revision.package == "_project" and name == "_staging_workflow":
            continue

        # have such files been detected as text mimetype before?
        is_text = self.proxy_sha256.is_text(name)
        if not is_text and is_binary_or_large(name, size):
            file_sha256 = self.proxy_sha256.get_or_put(
                revision.project,
                revision.package,
                name,
                revision.srcmd5,
                file_md5,
                size,
            )
            self.git.add_lfs(name, file_sha256["sha256"], size)
            continue

        if (name, size, file_md5) not in git_files:
            print(f"Download {name}")
            self.obs.download(
                revision.project,
                revision.package,
                name,
                revision.srcmd5,
                self.git.path,
            )
            # Validate the MD5 of the downloaded file
            if md5(self.git.path / name) != file_md5:
                raise Exception(f"Download error in {name}")
        # NOTE(review): the file is staged whether or not it had to be
        # downloaded, so an up-to-date but unstaged file is not missed;
        # confirm this matches the intended nesting of the original code.
        self.git.add(name)

    # Remove extra files: present in the working tree, unknown to OBS
    obs_names = {n for (n, _, _) in obs_files}
    git_names = {n for (n, _, _) in git_files}
    for name in git_names - obs_names:
        print(f"Remove {name}")
        self.git.remove(name)
def update_db_package(self, db, project, package):
root = self.obs._history(project, package)
if root is None:
return
latest = DBRevision.latest_revision(db, project, package)
for r in root.findall("revision"):
rev = OBSRevision(self.obs, self, project, package).parse(r)
if not latest or rev.rev > latest.rev:
dbrev = DBRevision.import_obs_rev(db, rev)
try:
root = rev.read_link()
except ET.ParseError:
dbrev.set_broken(db)
continue
if root is not None:
tprj = root.get("project") or project
tpkg = root.get("package") or package
dbrev.links_to(db, tprj, tpkg)
def find_linked_revs(self, db):
with db.cursor() as cur:
cur.execute(
"""SELECT * from revisions WHERE id in (SELECT l.revision_id FROM links l
LEFT JOIN linked_revs lrevs ON lrevs.revision_id=l.revision_id
WHERE lrevs.id IS NULL) and broken is FALSE;"""
)
for row in cur.fetchall():
rev = DBRevision(row)
linked_rev = rev.linked_rev(db)
if not linked_rev:
logging.debug(f"No link {rev}")
continue
cur.execute(
"""INSERT INTO linked_revs (revision_id, linked_id)
VALUES (%s,%s)""",
(rev.dbid, linked_rev.dbid),
)
def fetch_all_linked_packages(self, db, project, package):
with db.cursor() as cur:
cur.execute(
"""SELECT DISTINCT l.project, l.package from links l JOIN revisions r
on r.id=l.revision_id WHERE r.project=%s AND r.package=%s""",
(project, package),
)
for row in cur.fetchall():
(lproject, lpackage) = row
self.update_db_package(db, lproject, lpackage)
def find_fake_revisions(self, db):
with db.cursor() as cur:
cur.execute(
"SELECT * from revisions WHERE id in (SELECT linked_id from linked_revs WHERE considered=FALSE)"
)
for row in cur.fetchall():
self._find_fake_revision(db, DBRevision(row))
def _find_fake_revision(self, db, rev):
prev = rev.previous_commit(db)
if not prev:
with db.cursor() as cur:
cur.execute(
"UPDATE linked_revs SET considered=TRUE where linked_id=%s",
(rev.dbid,),
)
return
with db.cursor() as cur:
cur.execute(
"""SELECT * FROM revisions WHERE id IN
(SELECT revision_id from linked_revs WHERE linked_id=%s)
AND commit_time <= %s ORDER BY commit_time""",
(prev.dbid, rev.commit_time),
)
last_linked = None
for linked in cur.fetchall():
linked = DBRevision(linked)
nextrev = linked.next_commit(db)
if nextrev and nextrev.commit_time < rev.commit_time:
continue
last_linked = linked
cur.execute(
"UPDATE linked_revs SET considered=TRUE where linked_id=%s",
(rev.dbid,),
)
if not last_linked:
return
with db.cursor() as cur:
linked = last_linked
cur.execute(
"SELECT 1 FROM fake_revs where revision_id=%s AND linked_id=%s",
(rev.dbid, linked.dbid),
)
if cur.fetchone():
cur.execute(
"UPDATE linked_revs SET considered=TRUE where linked_id=%s",
(rev.dbid,),
)
return
fake_rev = linked.rev + rev.rev / 1000.0
comment = f"Updating link to change in {rev.project}/{rev.package} revision {rev.rev}"
cur.execute(
"""INSERT INTO revisions (project,package,rev,unexpanded_srcmd5,
commit_time, userid, comment) VALUES(%s,%s,%s,%s,%s,%s,%s) RETURNING id""",
(
linked.project,
linked.package,
fake_rev,
linked.unexpanded_srcmd5,
rev.commit_time,
"buildservice-autocommit",
comment,
),
)
new_id = cur.fetchone()[0]
cur.execute(
"""INSERT INTO linked_revs (revision_id, linked_id) VALUES (%s,%s)""",
(new_id, rev.dbid),
)
cur.execute(
"""INSERT INTO fake_revs (revision_id, linked_id) VALUES (%s,%s)""",
(rev.dbid, linked.dbid),
)
def revisions_without_files(self, db):
with db.cursor() as cur:
cur.execute(
"SELECT * FROM revisions WHERE broken=FALSE AND expanded_srcmd5 IS NULL"
)
return [DBRevision(row) for row in cur.fetchall()]
def export_as_git(self):
    """Create the git commits for the revision tree of the package.

    The tree built by ``TreeBuilder`` is walked and flattened into a list
    of ``FlatNode`` entries (branch, revision, up to two parent revisions).
    That list is then processed in reverse order, so the commits are
    created in the order in which git needs them; each entry is handed to
    ``commit_flat``.
    """
    db = DB()
    tree = TreeBuilder(db).build(self.package)

    class FlatNode:
        """One commit to create: target branch, revision and its parents."""

        def __init__(self, branch, commit, parent1=None, parent2=None) -> None:
            self.branch = branch
            self.commit = commit
            self.parent1 = parent1
            self.parent2 = parent2

        def __str__(self) -> str:
            p1_str = ""
            if self.parent1:
                p1_str = f" p1:{self.parent1.short_string()}"
            p2_str = ""
            if self.parent2:
                p2_str = f" p2:{self.parent2.short_string()}"
            return f"{self.branch} c:{self.commit.short_string()}{p1_str}{p2_str}"

    class FlatTreeWalker(AbstractWalker):
        """Walker that records every visited node as a ``FlatNode``."""

        def __init__(self) -> None:
            super().__init__()
            self.flats = []
            # remember the last merge point so we can know the parent of it
            # for the root of the sources
            self.last_merge = None

        def add(self, branch, commit, parent1=None, parent2=None):
            self.flats.append(FlatNode(branch, commit, parent1, parent2))

        def handle_source_node(self, node) -> None:
            # NOTE: an alternative that parented the node on
            # `node.parent.merged_into.revision` used to sit here behind an
            # `... and False` guard; it could never run and was dropped.
            if node.parent:
                self.add("devel", node.revision, node.parent.revision)
            elif self.last_merge:
                # Root of a source chain: hang it below the parent of the
                # last merge point seen so far.
                self.add("devel", node.revision, self.last_merge.parent.revision)
            # A source node with neither a parent nor a previous merge
            # point is not recorded.

        def call(self, node, is_source) -> None:
            if is_source:
                self.handle_source_node(node)
                return
            if not node.parent:
                self.add("factory", node.revision)
                return
            if not node.merged:
                self.add("factory", node.revision, node.parent.revision)
                return
            self.add(
                "factory", node.revision, node.parent.revision, node.merged.revision
            )
            self.last_merge = node

    ftw = FlatTreeWalker()
    tree.walk(ftw)
    for flat in reversed(ftw.flats):
        self.commit_flat(flat)
def commit_flat(self, flat):
parents = []
self.git.checkout(flat.branch)
if flat.parent1:
parents.append(flat.parent1.git_commit)
if flat.parent2:
parents.append(flat.parent2.git_commit)
commit = self.git.commit(
f"OBS User {flat.commit.userid}",
"null@suse.de",
flat.commit.commit_time,
# TODO: Normalize better the commit message
f"{flat.commit.comment}\n\n{flat.commit}",
allow_empty=True,
parents=parents,
)
flat.commit.git_commit = commit
def import_into_db(self):
    """Fill the database with everything needed to export the package.

    Steps, in order:

    1. import the revisions of the package, and of every package it links
       to, for each configured project;
    2. resolve the links into ``linked_revs``;
    3. import the users that the database is still missing;
    4. create the fake revisions for link updates;
    5. fetch the file list of every revision that has none, store the hash
       of those files, or mark the revision broken when OBS has no list;
    6. import the requests that still have to be fetched.

    The transaction is committed at the end.
    """
    db = DB()
    for project, _, api_url in self.projects:
        self.obs.change_url(api_url)
        self.update_db_package(db, project, self.package)
        self.fetch_all_linked_packages(db, project, self.package)
    # all remaining, no filtering here
    self.find_linked_revs(db)

    for userid in User.missing_users(db):
        user = self.obs.user(userid)
        if user:
            user.import_into_db(db)

    self.find_fake_revisions(db)

    linked_srcmd5_sql = (
        "SELECT unexpanded_srcmd5 from revisions WHERE\n"
        "id=(SELECT linked_id FROM linked_revs WHERE revision_id=%s)"
    )
    for rev in self.revisions_without_files(db):
        with db.cursor() as cur:
            cur.execute(linked_srcmd5_sql, (rev.dbid,))
            row = cur.fetchone()
        # srcmd5 of the link target, when the revision links somewhere
        linked_srcmd5 = row[0] if row else row

        dir_list = self.obs.list(
            rev.project, rev.package, rev.unexpanded_srcmd5, linked_srcmd5
        )
        if not dir_list:
            rev.set_broken(db)
            continue

        rev.import_dir_list(db, dir_list)
        files_hash = rev.calculate_files_hash(db)
        with db.cursor() as cur:
            cur.execute(
                "UPDATE revisions SET files_hash=%s WHERE id=%s",
                (files_hash, rev.dbid),
            )

    for number in DBRevision.requests_to_fetch(db):
        self.obs.request(number).import_into_db(db)

    db.conn.commit()
def import_all_revisions(self, gc):
# Fetch all the requests and sort them. Ideally we should
# build the graph here, to avoid new commits before the merge.
# For now we will sort them and invalidate the commits if
# "rebase_devel" is set.
self.history.fetch_all_revisions(self.projects)
revisions = self.history.sort_all_revisions()
logging.debug(f"Selected import order for {self.package}")
for revision in revisions:
logging.debug(revision)
gc_cnt = gc
for revision in revisions:
gc_cnt -= 1
if gc_cnt <= 0 and gc:
self.git.gc()
gc_cnt = gc
self.import_revision(revision)
def import_new_revision_with_request(self, revision, request):
"""Create a new branch as a result of a merge"""
submitted_revision = self.history.find_revision(
request.source, request.revisionid, revision.time
)
if not submitted_revision:
logging.warning(f"Request {request} does not connect to a known revision")
return False
if not submitted_revision.commit:
# If the revision appointed by the request is not part of
# the git history, we can have an ordering problem. One
# example is "premake4".
self.import_revision(submitted_revision)
assert submitted_revision.commit is not None
project = revision.project
branch, _ = self.projects_info[project]
# TODO: add an empty commit marking the acceptenace of the request (see discussion in PR 2858)
self.git.branch(branch, submitted_revision.commit)
self.git.clean()
self.git.checkout(branch)
logging.info(f"Create new branch based on {submitted_revision.commit}")
revision.commit = submitted_revision.commit
def _rebase_branch_history(self, project, revision):
branch, _ = self.projects_info[project]
history = self.history[project]
revision_index = history.index(revision)
for index in range(revision_index + 1, len(history)):
revision = history[index]
# We are done when we have one non-commited revision
if not revision.commit:
return
logging.info(f"Rebasing {revision} from {branch}")
revision.commit = None
self.import_revision(revision)
def import_revision_with_request(self, revision, request):
"""Import a single revision via a merge"""
submitted_revision = self.history.find_revision(
request.source, request.revisionid, revision.time
)
if not submitted_revision:
logging.warning(f"Request {request} does not connect to a known revision")
return False
assert submitted_revision.commit is not None
# TODO: detect a revision, case in point
# Base:System/bash/284 -> rq683701 -> accept O:F/151
# -> autocommit Base:System/bash/285
# Revert lead to openSUSE:Factory/bash/152
# Base:System/286 restored the reverted code in devel project
# rq684575 was created and accepted as O:F/153
# But the 284-285 and the 285-286 changeset is seen as empty
# as the revert was never in Base:System, so the
# submitted_revision of 684575 has no commit
if submitted_revision.commit == "EMPTY":
logging.warning("Empty commit submitted?!")
return False
message = (
f"Accepting request {revision.requestid}: {revision.comment}\n\n{revision}"
)
commit = self.git.merge(
# TODO: revision.userid or request.creator?
f"OBS User {revision.userid}",
"null@suse.de",
revision.time,
message,
submitted_revision.commit,
)
if commit == "EMPTY":
logging.warning("Empty merge. Ignoring the revision and the request")
self.git.merge_abort()
revision.commit = commit
return False
if commit == "CONFLICT":
logging.info("Merge conflict. Downloading revision")
self.download(revision)
message = f"CONFLICT {message}"
commit = self.git.merge(
f"OBS User {revision.userid}",
"null@suse.de",
revision.time,
message,
submitted_revision.commit,
merged=True,
)
assert commit and commit != "CONFLICT"
logging.info(f"Merge with {submitted_revision.commit} into {commit}")
revision.commit = commit
# TODO: There are more checks to do, like for example, the
# last commit into the non-devel branch should be a merge from
# the devel branch
if self.rebase_devel:
branch, _ = self.projects_info.get(request.source, (None, None))
if branch == "devel":
self.git.repo.references[f"refs/heads/{branch}"].set_target(commit)
self._rebase_branch_history(request.source, submitted_revision)
return True
def matching_request(self, revision):
request = self.obs.request(revision.requestid)
if not request:
return None
# to be handled by the caller
if request.type() != "submit":
return request
if request.source not in self.projects_info:
logging.info("Request from a non exported project")
return None
if request.target != revision.project:
# This seems to happen when the devel project gets
# reinitialized (for example, SR#943593 in 7zip, or
# SR#437901 in ColorFull)
logging.info("Request target different from current project")
return None
if request.source == request.target:
# this is not a merge, but a different way to do a
# contribution to the (devel) project - see bindfs's rev 1
logging.info("Request within the same project")
return None
return request
def import_revision(self, revision):
"""Import a single revision into git"""
project = revision.project
branch, api_url = self.projects_info[project]
logging.info(f"Importing [{revision}] to {branch}")
self.obs.change_url(api_url)
# Populate linkrev and replace srcmd5 from the linked
# revision. If the expansion fails, the revision will be ignored
# and not imported.
if not revision.check_expanded():
logging.warning(f"Broken revision")
revision.ignored = True
return
# When doing a SR, we see also a revision in the origin
# project with the outgoing request, but without changes in
# the project. We can ignore them.
#
# If there is a request ID, it will be filtered out later,
# when the target project is different from itself.
if revision.userid == "autobuild" and not revision.requestid:
logging.info("Ignoring autocommit")
revision.ignored = True
return
if revision.userid == "buildservice-autocommit":
logging.info("Ignoring autocommit")
revision.ignored = True
return
# Create the reference if the branch is new. If so return
# True.
new_branch = self.git.checkout(branch)
if revision.requestid:
request = self.matching_request(revision)
if request:
if request.type() == "delete":
# TODO: after this comes a restore, this should be collapsed
# before even hitting git
logging.info("Delete request ignored")
revision.ignored = True
return
logging.debug(f"Found matching request: #{revision.project} #{request}")
if new_branch:
self.import_new_revision_with_request(revision, request)
return
if self.import_revision_with_request(revision, request):
return
# Import revision as a single commit (without merging)
self.download(revision)
if new_branch or self.git.is_dirty():
commit = self.git.commit(
f"OBS User {revision.userid}",
"null@suse.de",
revision.time,
# TODO: Normalize better the commit message
f"{revision.comment}\n\n{revision}",
# Create an empty commit only if is a new branch
allow_empty=new_branch,
)
revision.commit = commit
logging.info(f"Commit {commit}")
else:
logging.info("Skip empty commit")
revision.ignored = True