# git-importer/lib/importer.py
# Stephan Kulow 9ed8abad2b Make database usage the default
# Some cleanup of no longer used functions
# 2022-11-01 11:23:40 +01:00
#
# 668 lines
# 26 KiB
# Python

import functools
import logging
import os
import xml.etree.ElementTree as ET
import yaml
from lib.binary import is_binary_or_large
from lib.db import DB
from lib.db_revision import DBRevision
from lib.git import Git
from lib.history import History
from lib.obs import OBS
from lib.obs_revision import OBSRevision
from lib.proxy_sha256 import ProxySHA256, md5, sha256
from lib.tree_builder import AbstractWalker, TreeBuilder, TreeNode
from lib.user import User
def _files_hash(hash_alg, dirpath):
    """Hash every regular file found directly inside a directory.

    ``hash_alg`` is called once per file with that file's path object.
    ``dirpath`` has to provide ``iterdir()``.  The result is a list of
    ``(file name, hash)`` tuples in ``iterdir()`` order; entries that
    are not regular files are left out.
    """
    # TODO: do it async or multithreaded
    entries = [entry for entry in dirpath.iterdir() if entry.is_file()]
    hashed = []
    for entry in entries:
        hashed.append((entry.name, hash_alg(entry)))
    return hashed
# Ready-made variants of _files_hash with the hash function already bound;
# both take the directory path as their only remaining argument.
files_md5 = functools.partial(_files_hash, md5)
files_sha256 = functools.partial(_files_hash, sha256)
class Importer:
def __init__(self, projects, package, repodir, search_ancestor, rebase_devel):
    """Set up the OBS client, the git repository and the project table.

    ``projects`` is a list of ``(project, branch, api_url)`` tuples whose
    first entry must be "openSUSE:Factory".  ``package`` is the package
    to import and ``repodir`` the location handed to ``Git``.
    ``search_ancestor`` and ``rebase_devel`` are stored as flags.

    Design note carried over from the original implementation: the
    revisions of all projects are fetched and ordered by timestamp,
    then replayed as commits so that git shows the same graph as the
    revision timeline.  A revision that stems from a request whose
    target is one of the known projects becomes a merge commit.
    Optionally a common "Initial commit" can be searched in the
    reference branch (the first entry of ``projects``).  That is not
    always a good idea: normally the "devel" history is older than
    "factory" and the tree can be rooted on it, but for some packages
    part of the "devel" history was lost (the project may have moved)
    and "factory" is not the root either.
    """
    self.package = package
    self.search_ancestor = search_ancestor
    self.rebase_devel = rebase_devel

    self.obs = OBS()
    self.git = Git(
        repodir,
        committer="Git OBS Bridge",
        committer_email="obsbridge@suse.de",
    ).create()
    # Progress of the flat export is remembered inside the .git directory
    self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
    self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)
    self.gc_interval = 200

    self.history = History(self.obs, self.package)

    # The reference project comes first; its devel project (if OBS
    # reports one) is put in front of the list on the "devel" branch.
    reference_project, _, reference_url = projects[0]
    assert reference_project == "openSUSE:Factory"
    self.obs.change_url(reference_url)
    devel_project = self.obs.devel_project(reference_project, package)
    self.projects = (
        [(devel_project, "devel", reference_url)] + projects
        if devel_project
        else projects
    )

    # Look-up table: project name -> (branch, api_url)
    self.projects_info = {
        name: (branch_name, url) for (name, branch_name, url) in self.projects
    }
def download(self, revision):
    """Make the git working tree match the file list of an OBS revision.

    The file list of ``revision`` is read from OBS.  Files classified as
    binary or large (and not previously seen as text) are registered as
    LFS entries via their SHA256; every other file is downloaded when no
    file with the same name, size and MD5 is already present, verified
    against the MD5 reported by OBS and added to git.  Files that exist
    in the working tree but not in the OBS list are removed afterwards.

    Raises ``Exception`` when a downloaded file does not have the
    expected MD5.
    """
    obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
    # Snapshot of the regular files currently in the working tree.  The
    # name has to be compared for equality: ``not in (".gitattributes")``
    # tests against a plain string (parentheses alone do not build a
    # tuple), which also skipped every file whose name happens to be a
    # substring of ".gitattributes".
    git_files = {
        (f.name, f.stat().st_size, md5(f))
        for f in self.git.path.iterdir()
        if f.is_file() and f.name != ".gitattributes"
    }

    # Overwrite ".gitattributes" with the default LFS attributes
    self.git.add_default_lfs_gitattributes(force=True)

    # Download each file in OBS if it is not a binary (or large) file
    for (name, size, file_md5) in obs_files:
        # this file creates easily 100k commits and is just useless data :(
        # unfortunately it's stored in the same meta package as the project config
        if revision.package == "_project" and name == "_staging_workflow":
            continue
        # have such files been detected as text mimetype before?
        is_text = self.proxy_sha256.is_text(name)
        if not is_text and is_binary_or_large(name, size):
            file_sha256 = self.proxy_sha256.get_or_put(
                revision.project,
                revision.package,
                name,
                revision.srcmd5,
                file_md5,
                size,
            )
            self.git.add_lfs(name, file_sha256["sha256"], size)
        elif (name, size, file_md5) not in git_files:
            logging.debug(f"Download {name}")
            self.obs.download(
                revision.project,
                revision.package,
                name,
                revision.srcmd5,
                self.git.path,
            )
            # Validate the MD5 of the downloaded file
            if md5(self.git.path / name) != file_md5:
                raise Exception(f"Download error in {name}")
            self.git.add(name)

    # Remove extra files
    obs_names = {n for (n, _, _) in obs_files}
    git_names = {n for (n, _, _) in git_files}
    for name in git_names - obs_names:
        logging.debug(f"Remove {name}")
        self.git.remove(name)
def set_gc_interval(self, gc):
    """Store the number of commits to create between two ``git gc`` runs.

    The value is consulted by ``export_as_git``; a value of 0 switches
    the periodic collection off there.
    """
    self.gc_interval = gc
def update_db_package(self, db, project, package):
    """Import the OBS revisions of ``project/package`` that the DB lacks.

    Only revisions with a number above the latest one already stored
    are imported.  For each imported revision the link information is
    read: an unparsable link marks the revision as broken, an existing
    link is recorded with its target (missing target attributes fall
    back to ``project`` / ``package``).  Nothing happens when OBS has no
    history for the package.
    """
    history_xml = self.obs._history(project, package)
    if history_xml is None:
        return

    latest = DBRevision.latest_revision(db, project, package)
    for element in history_xml.findall("revision"):
        obs_rev = OBSRevision(self.obs, self, project, package).parse(element)
        # Already stored revisions are left alone
        if latest and not obs_rev.rev > latest.rev:
            continue

        db_rev = DBRevision.import_obs_rev(db, obs_rev)
        try:
            link = obs_rev.read_link()
        except ET.ParseError:
            db_rev.set_broken(db)
            continue
        if link is None:
            continue

        target_project = link.get("project") or project
        target_package = link.get("package") or package
        db_rev.links_to(db, target_project, target_package)
def find_linked_revs(self, db):
    """Resolve links that have no entry in ``linked_revs`` yet.

    Every non-broken revision that has a link but no ``linked_revs`` row
    is asked for the revision it links to; when one is found the pair is
    inserted into ``linked_revs``, otherwise a debug message is logged.
    """
    # Continuation lines of the SQL literals are kept unindented so the
    # statement text stays exactly as it was.
    unresolved_query = """SELECT * from revisions WHERE id in (SELECT l.revision_id FROM links l
LEFT JOIN linked_revs lrevs ON lrevs.revision_id=l.revision_id
WHERE lrevs.id IS NULL) and broken is FALSE;"""
    insert_statement = """INSERT INTO linked_revs (revision_id, linked_id)
VALUES (%s,%s)"""

    with db.cursor() as cur:
        cur.execute(unresolved_query)
        for record in cur.fetchall():
            revision = DBRevision(record)
            target = revision.linked_rev(db)
            if target:
                cur.execute(insert_statement, (revision.dbid, target.dbid))
            else:
                logging.debug(f"No link {revision}")
def fetch_all_linked_packages(self, db, project, package):
    """Import the history of every package that ``project/package`` links to.

    The distinct link targets recorded for revisions of the given
    package are read from the database and each one is passed to
    ``update_db_package``.
    """
    targets_query = """SELECT DISTINCT l.project, l.package from links l JOIN revisions r
on r.id=l.revision_id WHERE r.project=%s AND r.package=%s"""
    with db.cursor() as cur:
        cur.execute(targets_query, (project, package))
        for linked_project, linked_package in cur.fetchall():
            self.update_db_package(db, linked_project, linked_package)
def find_fake_revisions(self, db):
    """Process every link target whose ``linked_revs`` rows are unconsidered.

    Each revision referenced as ``linked_id`` by a row with
    ``considered=FALSE`` is handed to ``_find_fake_revision``.
    """
    with db.cursor() as cur:
        cur.execute(
            "SELECT * from revisions WHERE id in (SELECT linked_id from linked_revs WHERE considered=FALSE)"
        )
        for record in cur.fetchall():
            candidate = DBRevision(record)
            self._find_fake_revision(db, candidate)
def _find_fake_revision(self, db, rev):
    """Insert a synthetic revision into a linking package for a change of its target.

    ``rev`` is a revision that is referenced as link target.  The
    revisions that linked to the commit preceding ``rev`` are looked
    up; for the last one that was not yet superseded when ``rev`` was
    committed, a new row is added to ``revisions`` in the linking
    package (committed at ``rev``'s time by "buildservice-autocommit")
    and connected to ``rev`` through ``linked_revs``.  The pair is
    remembered in ``fake_revs`` so it is created only once.  In the
    paths that do not insert anything, ``rev`` is flagged as considered
    in ``linked_revs``.
    """
    prev = rev.previous_commit(db)
    if not prev:
        # Without a preceding commit there is nothing that could have
        # linked to an older state; just flag rev as handled.
        with db.cursor() as cur:
            cur.execute(
                "UPDATE linked_revs SET considered=TRUE where linked_id=%s",
                (rev.dbid,),
            )
        return
    with db.cursor() as cur:
        # Revisions linking to the preceding commit that were committed
        # no later than rev, oldest first.
        cur.execute(
            """SELECT * FROM revisions WHERE id IN
(SELECT revision_id from linked_revs WHERE linked_id=%s)
AND commit_time <= %s ORDER BY commit_time""",
            (prev.dbid, rev.commit_time),
        )
        last_linked = None
        for linked in cur.fetchall():
            linked = DBRevision(linked)
            nextrev = linked.next_commit(db)
            # A linking revision that already had a successor before rev
            # was committed is not the one affected by rev.
            if nextrev and nextrev.commit_time < rev.commit_time:
                continue
            last_linked = linked
        cur.execute(
            "UPDATE linked_revs SET considered=TRUE where linked_id=%s",
            (rev.dbid,),
        )
    if not last_linked:
        return
    with db.cursor() as cur:
        linked = last_linked
        # Do not create the same fake revision twice.
        cur.execute(
            "SELECT 1 FROM fake_revs where revision_id=%s AND linked_id=%s",
            (rev.dbid, linked.dbid),
        )
        if cur.fetchone():
            cur.execute(
                "UPDATE linked_revs SET considered=TRUE where linked_id=%s",
                (rev.dbid,),
            )
            return
        # The fake revision number is the linking revision's number plus
        # a fraction derived from the target's revision number.
        fake_rev = linked.rev + rev.rev / 1000.0
        comment = f"Updating link to change in {rev.project}/{rev.package} revision {rev.rev}"
        cur.execute(
            """INSERT INTO revisions (project,package,rev,unexpanded_srcmd5,
commit_time, userid, comment) VALUES(%s,%s,%s,%s,%s,%s,%s) RETURNING id""",
            (
                linked.project,
                linked.package,
                fake_rev,
                linked.unexpanded_srcmd5,
                rev.commit_time,
                "buildservice-autocommit",
                comment,
            ),
        )
        new_id = cur.fetchone()[0]
        # Connect the new revision to the target it links to ...
        cur.execute(
            """INSERT INTO linked_revs (revision_id, linked_id) VALUES (%s,%s)""",
            (new_id, rev.dbid),
        )
        # ... and remember that this (target, linking revision) pair is done.
        cur.execute(
            """INSERT INTO fake_revs (revision_id, linked_id) VALUES (%s,%s)""",
            (rev.dbid, linked.dbid),
        )
def revisions_without_files(self, db):
    """Return the non-broken revisions that have no expanded srcmd5 yet.

    The rows are wrapped in ``DBRevision`` objects, in query order.
    """
    with db.cursor() as cur:
        cur.execute(
            "SELECT * FROM revisions WHERE broken=FALSE AND expanded_srcmd5 IS NULL"
        )
        rows = cur.fetchall()
        return list(map(DBRevision, rows))
def export_as_git(self):
    """Replay the revision tree of the package as git commits.

    The tree built from the database is flattened into an ordered list
    of commits for the "factory" and "devel" branches.  The state file
    written by ``commit_flat`` is consulted to find the revisions that
    are already present as branch heads, so that only the commits after
    them are created.  ``git gc`` runs once before committing starts and
    then after every ``self.gc_interval`` commits (never periodically
    when the interval is 0).
    """
    db = DB()
    tree = TreeBuilder(db).build(self.package)

    class FlatNode:
        """One commit to create: target branch, revision and up to two parents."""

        def __init__(self, branch, commit, parent1=None, parent2=None) -> None:
            self.branch = branch
            self.commit = commit
            self.parent1 = parent1
            self.parent2 = parent2

        def __str__(self) -> str:
            p1_str = ""
            if self.parent1:
                p1_str = f" p1:{self.parent1.short_string()}"
            p2_str = ""
            if self.parent2:
                p2_str = f" p2:{self.parent2.short_string()}"
            return f"{self.branch} c:{self.commit.short_string()}{p1_str}{p2_str}"

    class FlatTreeWalker(AbstractWalker):
        """While walking the tree, record the commits to do one after the other.

        The resulting FlatNodes end up in the ``flats`` list.
        """

        def __init__(self, rebase_devel) -> None:
            super().__init__()
            self.flats = []
            # the rebase_devel won't work as such as rebasing the branch needs an explicit action
            self.rebase_devel = rebase_devel
            # remember the last merge point so we can know the parent of it for the root of the sources
            self.last_merge = None

        def add(self, branch, commit, parent1=None, parent2=None):
            self.flats.append(FlatNode(branch, commit, parent1, parent2))

        def handle_source_node(self, node) -> None:
            # With rebasing, a devel commit whose parent got merged is
            # placed on top of the commit it was merged into.
            if self.rebase_devel and node.parent and node.parent.merged_into:
                self.add("devel", node.revision, node.parent.merged_into.revision)
                return
            if node.parent:
                self.add("devel", node.revision, node.parent.revision)
            elif self.last_merge:
                # Root of the sources: hang it below the last merge point
                self.add("devel", node.revision, self.last_merge.parent.revision)

        def call(self, node, is_source) -> None:
            if is_source:
                self.handle_source_node(node)
                return
            if not node.parent:
                self.add("factory", node.revision)
                return
            if not node.merged:
                self.add("factory", node.revision, node.parent.revision)
                return
            self.add(
                "factory", node.revision, node.parent.revision, node.merged.revision
            )
            self.last_merge = node

    ftw = FlatTreeWalker(self.rebase_devel)
    tree.walk(ftw)

    branch_state = {"factory": None, "devel": None}
    state_data = {}
    if os.path.exists(self.state_file):
        with open(self.state_file, "r") as f:
            state_data = yaml.safe_load(f)
        # An empty or malformed state file must not break the look-ups below
        if not isinstance(state_data, dict):
            state_data = {}

    # Walk the flat list in reverse; whenever a revision recorded in the
    # state file is met, everything collected so far is already in git
    # and only what follows remains to be committed.
    left_to_commit = []
    for flat in reversed(ftw.flats):
        found_state = False
        for branch in ["factory", "devel"]:
            if flat.commit.dbid == state_data.get(branch):
                branch_state[branch] = flat.commit
                flat.commit.git_commit = self.git.branch_head(branch)
                logging.debug(
                    f"Found {self.git.path}'s {branch} branch in state {flat}"
                )
                left_to_commit = []
                found_state = True
        if not found_state:
            left_to_commit.append(flat)

    gc_cnt = self.gc_interval
    if left_to_commit:
        self.git.gc()
    for flat in left_to_commit:
        gc_cnt -= 1
        if gc_cnt <= 0 and self.gc_interval:
            self.git.gc()
            gc_cnt = self.gc_interval
        logging.debug(f"Committing {flat}")
        self.commit_flat(db, flat, branch_state)
def limit_download(self, file):
    """Tell whether ``file`` is handled by the flat export.

    Returns True for names ending in ".spec" or ".changes" and False
    for everything else.
    """
    return file.endswith((".spec", ".changes"))
def commit_flat(self, db, flat, branch_state):
    """Create the git commit described by ``flat`` and persist the progress.

    The branch of ``flat`` is checked out, the file delta against the
    current state of that branch is applied (restricted to the files
    accepted by ``limit_download``) and a commit, possibly empty, is
    created with the parents named by ``flat``.  Afterwards
    ``branch_state`` is updated and the database ids of the branch heads
    are written to the state file.
    """
    self.git.checkout(flat.branch)
    parents = [
        parent.git_commit for parent in (flat.parent1, flat.parent2) if parent
    ]

    to_download, to_delete = flat.commit.calc_delta(db, branch_state[flat.branch])
    for name in to_delete:
        if self.limit_download(name):
            self.git.remove(name)
    for name in to_download:
        if not self.limit_download(name):
            continue
        self.obs.download(
            flat.commit.project,
            flat.commit.package,
            name,
            flat.commit.expanded_srcmd5,
            self.git.path,
        )
        self.git.add(name)

    new_commit = self.git.commit(
        f"OBS User {flat.commit.userid}",
        "null@suse.de",
        flat.commit.commit_time,
        # TODO: Normalize better the commit message
        f"{flat.commit.comment}\n\n{flat.commit}",
        allow_empty=True,
        parents=parents,
    )
    flat.commit.git_commit = new_commit
    branch_state[flat.branch] = flat.commit

    # Remember the head of each branch so a later run can resume here
    with open(self.state_file, "w") as state_fh:
        heads = {name: branch_state[name] for name in ("factory", "devel")}
        yaml.dump(
            {name: head.dbid for name, head in heads.items() if head},
            state_fh,
        )
def import_into_db(self):
    """Mirror everything needed for the export from OBS into the database.

    Steps, in order: import the revision history of the package (and of
    the packages it links to) for every configured project, resolve the
    links between revisions, import users that are still missing,
    create the fake revisions for changed link targets, fetch the file
    lists of revisions that have none (marking a revision broken when
    OBS returns no list), import the referenced requests and finally
    commit the database transaction.
    """
    db = DB()
    for project, _, api_url in self.projects:
        self.obs.change_url(api_url)
        self.update_db_package(db, project, self.package)
        self.fetch_all_linked_packages(db, project, self.package)

    # all remaining, no filtering here
    self.find_linked_revs(db)

    for userid in User.missing_users(db):
        user = self.obs.user(userid)
        if user:
            user.import_into_db(db)

    self.find_fake_revisions(db)

    for rev in self.revisions_without_files(db):
        with db.cursor() as cur:
            cur.execute(
                """SELECT unexpanded_srcmd5 from revisions WHERE
id=(SELECT linked_id FROM linked_revs WHERE revision_id=%s)""",
                (rev.dbid,),
            )
            linked_row = cur.fetchone()
        # Pass on the srcmd5 of the link target, if there is one
        linked_srcmd5 = linked_row[0] if linked_row else linked_row

        dir_list = self.obs.list(
            rev.project, rev.package, rev.unexpanded_srcmd5, linked_srcmd5
        )
        if not dir_list:
            rev.set_broken(db)
            continue

        rev.import_dir_list(db, dir_list)
        files_hash = rev.calculate_files_hash(db)
        with db.cursor() as cur:
            cur.execute(
                "UPDATE revisions SET files_hash=%s WHERE id=%s",
                (files_hash, rev.dbid),
            )

    for number in DBRevision.requests_to_fetch(db):
        self.obs.request(number).import_into_db(db)

    db.conn.commit()
def import_new_revision_with_request(self, revision, request):
    """Start a new branch at the commit that ``request`` submitted.

    The revision the request was created from is looked up in the
    history (and imported first when it has no commit yet).  The branch
    belonging to ``revision.project`` is then created at that commit and
    checked out, and ``revision`` takes over the commit.

    Returns False when the request cannot be connected to a known
    revision; otherwise returns None.
    """
    source_rev = self.history.find_revision(
        request.source, request.revisionid, revision.time
    )
    if not source_rev:
        logging.warning(f"Request {request} does not connect to a known revision")
        return False

    # If the revision appointed by the request is not part of the git
    # history, we can have an ordering problem.  One example is
    # "premake4".
    if not source_rev.commit:
        self.import_revision(source_rev)
    assert source_rev.commit is not None

    base_commit = source_rev.commit
    branch, _api_url = self.projects_info[revision.project]

    # TODO: add an empty commit marking the acceptance of the request (see discussion in PR 2858)
    self.git.branch(branch, base_commit)
    self.git.clean()
    self.git.checkout(branch)

    logging.info(f"Create new branch based on {base_commit}")
    revision.commit = base_commit
def _rebase_branch_history(self, project, revision):
    """Re-import the committed revisions that follow ``revision`` in ``project``.

    Walking the project history from the entry after ``revision``, each
    revision that already has a commit gets its commit reset and is
    imported again.  The walk ends at the first revision without a
    commit.
    """
    branch, _api_url = self.projects_info[project]
    history = self.history[project]
    first_follower = history.index(revision) + 1
    for position in range(first_follower, len(history)):
        follower = history[position]
        # We are done when we have one non-committed revision
        if not follower.commit:
            return
        logging.info(f"Rebasing {follower} from {branch}")
        follower.commit = None
        self.import_revision(follower)
def import_revision_with_request(self, revision, request):
    """Import ``revision`` as a merge of the commit submitted by ``request``.

    Returns True when a merge commit was created.  Returns False when
    the request cannot be connected to a known revision, when the
    submitted revision is marked "EMPTY", or when the merge itself turns
    out empty (the merge is aborted and ``revision.commit`` is set to
    "EMPTY").  On a conflict the files of the revision are downloaded
    and the merge is committed with that content.  With
    ``self.rebase_devel`` set and a request coming from the "devel"
    branch, that branch is moved to the merge commit and the following
    devel revisions are re-imported.
    """
    submitted = self.history.find_revision(
        request.source, request.revisionid, revision.time
    )
    if not submitted:
        logging.warning(f"Request {request} does not connect to a known revision")
        return False
    assert submitted.commit is not None

    # TODO: detect a revision, case in point
    # Base:System/bash/284 -> rq683701 -> accept O:F/151
    #   -> autocommit Base:System/bash/285
    # Revert lead to openSUSE:Factory/bash/152
    # Base:System/286 restored the reverted code in devel project
    # rq684575 was created and accepted as O:F/153
    # But the 284-285 and the 285-286 changeset is seen as empty
    # as the revert was never in Base:System, so the
    # submitted_revision of 684575 has no commit
    if submitted.commit == "EMPTY":
        logging.warning("Empty commit submitted?!")
        return False

    message = (
        f"Accepting request {revision.requestid}: {revision.comment}\n\n{revision}"
    )
    # TODO: revision.userid or request.creator?
    author = f"OBS User {revision.userid}"
    commit = self.git.merge(
        author,
        "null@suse.de",
        revision.time,
        message,
        submitted.commit,
    )

    if commit == "EMPTY":
        logging.warning("Empty merge. Ignoring the revision and the request")
        self.git.merge_abort()
        revision.commit = commit
        return False

    if commit == "CONFLICT":
        # Resolve the conflict by taking the files of the revision as is
        logging.info("Merge conflict. Downloading revision")
        self.download(revision)
        message = f"CONFLICT {message}"
        commit = self.git.merge(
            f"OBS User {revision.userid}",
            "null@suse.de",
            revision.time,
            message,
            submitted.commit,
            merged=True,
        )

    assert commit and commit != "CONFLICT"
    logging.info(f"Merge with {submitted.commit} into {commit}")
    revision.commit = commit

    # TODO: There are more checks to do, like for example, the
    # last commit into the non-devel branch should be a merge from
    # the devel branch
    if not self.rebase_devel:
        return True
    branch, _api_url = self.projects_info.get(request.source, (None, None))
    if branch == "devel":
        self.git.repo.references[f"refs/heads/{branch}"].set_target(commit)
        self._rebase_branch_history(request.source, submitted)
    return True
def matching_request(self, revision):
    """Return the request behind ``revision`` if it is relevant for the import.

    Requests that are not of type "submit" are returned unchanged for
    the caller to deal with.  A submit request is returned only when its
    source is one of the exported projects, its target is the project of
    ``revision`` and source and target differ; otherwise the reason is
    logged and None is returned.  None is also returned when OBS has no
    such request.
    """
    request = self.obs.request(revision.requestid)
    if not request:
        return None

    # to be handled by the caller
    if request.type() != "submit":
        return request

    if request.source not in self.projects_info:
        rejection = "Request from a non exported project"
    elif request.target != revision.project:
        # This seems to happen when the devel project gets
        # reinitialized (for example, SR#943593 in 7zip, or
        # SR#437901 in ColorFull)
        rejection = "Request target different from current project"
    elif request.source == request.target:
        # this is not a merge, but a different way to do a
        # contribution to the (devel) project - see bindfs's rev 1
        rejection = "Request within the same project"
    else:
        return request

    logging.info(rejection)
    return None
def import_revision(self, revision):
    """Import a single revision into git.

    The revision is skipped (``revision.ignored`` set) when it cannot be
    expanded, when it is an automatic commit, when it belongs to a
    delete request, or when it would produce an empty commit on an
    existing branch.  A revision created by a relevant request is
    imported as a new branch or as a merge; everything else is
    downloaded and committed as a plain commit.
    """
    project = revision.project
    branch, api_url = self.projects_info[project]

    logging.info(f"Importing [{revision}] to {branch}")
    self.obs.change_url(api_url)

    # Populate linkrev and replace srcmd5 from the linked
    # revision.  If the expansion fails, the revision will be ignored
    # and not imported.
    if not revision.check_expanded():
        logging.warning("Broken revision")
        revision.ignored = True
        return

    # When doing a SR, we see also a revision in the origin
    # project with the outgoing request, but without changes in
    # the project.  We can ignore them.
    #
    # If there is a request ID, it will be filtered out later,
    # when the target project is different from itself.
    is_autocommit = revision.userid == "buildservice-autocommit" or (
        revision.userid == "autobuild" and not revision.requestid
    )
    if is_autocommit:
        logging.info("Ignoring autocommit")
        revision.ignored = True
        return

    # Create the reference if the branch is new.  If so return True.
    new_branch = self.git.checkout(branch)

    request = self.matching_request(revision) if revision.requestid else None
    if request:
        if request.type() == "delete":
            # TODO: after this comes a restore, this should be collapsed
            # before even hitting git
            logging.info("Delete request ignored")
            revision.ignored = True
            return

        logging.debug(f"Found matching request: #{revision.project} #{request}")
        if new_branch:
            self.import_new_revision_with_request(revision, request)
            return
        if self.import_revision_with_request(revision, request):
            return

    # Import revision as a single commit (without merging)
    self.download(revision)

    if not (new_branch or self.git.is_dirty()):
        logging.info("Skip empty commit")
        revision.ignored = True
        return

    commit = self.git.commit(
        f"OBS User {revision.userid}",
        "null@suse.de",
        revision.time,
        # TODO: Normalize better the commit message
        f"{revision.comment}\n\n{revision}",
        # Create an empty commit only if is a new branch
        allow_empty=new_branch,
    )
    revision.commit = commit
    logging.info(f"Commit {commit}")