git-importer/git-importer.py

1291 lines
42 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
import argparse
import asyncio
import datetime
import errno
import fnmatch
import functools
import hashlib
import itertools
import logging
import os
import pathlib
import re
import shutil
import subprocess
import sys
import time
import urllib.parse
import xml.etree.ElementTree as ET
from urllib.error import HTTPError
import osc.core
import pygit2
import requests
from osclib.cache import Cache
# Add a retry wrapper for some of the HTTP actions.
def retry(func):
def wrapper(*args, **kwargs):
retry = 0
while retry < 5:
try:
return func(*args, **kwargs)
except HTTPError as e:
if 500 <= e.code <= 599:
retry += 1
logging.warning(
f"HTTPError {e.code} -- Retrying {args[0]} ({retry})"
)
# TODO: remove when move to async
time.sleep(0.5)
else:
raise
except urllib.error.URLError as e:
if e.reason.errno in (errno.ENETUNREACH, errno.EADDRNOTAVAIL):
retry += 1
logging.warning(f"URLError {e} -- Retrying {args[0]} ({retry})")
time.sleep(0.5)
else:
logging.warning(f"URLError {e.errno} uncaught")
raise
except OSError as e:
if e.errno in (errno.ENETUNREACH, errno.EADDRNOTAVAIL): # sporadically hits cloud VMs :(
retry += 1
logging.warning(f"OSError {e} -- Retrying {args[0]} ({retry})")
# TODO: remove when move to async
time.sleep(0.5)
else:
logging.warning(f"OSError {e.errno} uncaught")
raise
return wrapper
osc.core.http_GET = retry(osc.core.http_GET)
BINARY = {
".7z",
".bsp",
".bz2",
".gem",
".gz",
".jar",
".lz",
".lzma",
".obscpio",
".oxt",
".pdf",
".png",
".rpm",
".tbz",
".tbz2",
".tgz",
".ttf",
".txz",
".whl",
".xz",
".zip",
".zst",
}
LFS_SUFFIX = "filter=lfs diff=lfs merge=lfs -text"
URL_OBS = "https://api.opensuse.org"
URL_IBS = "https://api.suse.de"
# The order is relevant (from older to newer initial codebase)
PROJECTS = [
("openSUSE:Factory", "factory", URL_OBS),
# ("SUSE:SLE-12:GA", "SLE_12", URL_IBS),
# ("SUSE:SLE-12:Update", "SLE_12", URL_IBS),
# ("SUSE:SLE-12-SP1:GA", "SLE_12_SP1", URL_IBS),
# ("SUSE:SLE-12-SP1:Update", "SLE_12_SP1", URL_IBS),
# ("SUSE:SLE-12-SP2:GA", "SLE_12_SP2", URL_IBS),
# ("SUSE:SLE-12-SP2:Update", "SLE_12_SP2", URL_IBS),
# ("SUSE:SLE-12-SP3:GA", "SLE_12_SP3", URL_IBS),
# ("SUSE:SLE-12-SP3:Update", "SLE_12_SP3", URL_IBS),
# ("SUSE:SLE-12-SP4:GA", "SLE_12_SP4", URL_IBS),
# ("SUSE:SLE-12-SP4:Update", "SLE_12_SP4", URL_IBS),
# ("SUSE:SLE-12-SP5:GA", "SLE_12_SP5", URL_IBS),
# ("SUSE:SLE-12-SP5:Update", "SLE_12_SP5", URL_IBS),
# ("SUSE:SLE-15:GA", "SLE_15", URL_IBS),
# ("SUSE:SLE-15:Update", "SLE_15", URL_IBS),
# ("SUSE:SLE-15-SP1:GA", "SLE_15_SP1", URL_IBS),
# ("SUSE:SLE-15-SP1:Update", "SLE_15_SP1", URL_IBS),
# ("SUSE:SLE-15-SP2:GA", "SLE_15_SP2", URL_IBS),
# ("SUSE:SLE-15-SP2:Update", "SLE_15_SP2", URL_IBS),
# ("SUSE:SLE-15-SP3:GA", "SLE_15_SP3", URL_IBS),
# ("SUSE:SLE-15-SP3:Update", "SLE_15_SP3", URL_IBS),
# ("SUSE:SLE-15-SP4:GA", "SLE_15_SP4", URL_IBS),
# ("SUSE:SLE-15-SP4:Update", "SLE_15_SP4", URL_IBS),
]
def is_binary_or_large(filename, size):
"""Decide if is a binary file based on the extension or size"""
binary_suffix = BINARY
non_binary_suffix = {
".1",
".8",
".SUSE",
".asc",
".c",
".cabal",
".cfg",
".changes",
".conf",
".desktop",
".dif",
".diff",
".dsc",
".el",
".html",
".in",
".init",
".install",
".keyring",
".kiwi",
".logrotate",
".macros",
".md",
".obsinfo",
".pamd",
".patch",
".pl",
".pom",
".py",
".rpmlintrc",
".rules",
".script",
".service",
".sh",
".sig",
".sign",
".spec",
".sysconfig",
".test",
".txt",
".xml",
".xml",
".yml",
}
suffix = pathlib.Path(filename).suffix
if suffix in binary_suffix:
return True
if suffix in non_binary_suffix:
return False
if size >= 6 * 1024:
return True
return False
def _hash(hash_alg, file_or_path):
h = hash_alg()
def __hash(f):
while chunk := f.read(1024 * 4):
h.update(chunk)
if hasattr(file_or_path, "read"):
__hash(file_or_path)
else:
with file_or_path.open("rb") as f:
__hash(f)
return h.hexdigest()
md5 = functools.partial(_hash, hashlib.md5)
sha256 = functools.partial(_hash, hashlib.sha256)
def _files_hash(hash_alg, dirpath):
"""List of (filepath, md5) for a directory"""
# TODO: do it async or multythread
files = [f for f in dirpath.iterdir() if f.is_file()]
return [(f.parts[-1], hash_alg(f)) for f in files]
files_md5 = functools.partial(_files_hash, md5)
files_sha256 = functools.partial(_files_hash, sha256)
class Git:
"""Local git repository"""
def __init__(self, path, committer=None, committer_email=None):
self.path = pathlib.Path(path)
self.committer = committer
self.committer_email = committer_email
self.repo = None
def is_open(self):
return self.repo is not None
# TODO: Extend it to packages and files
def exists(self):
"""Check if the path is a valid git repository"""
return (self.path / ".git").exists()
def create(self):
"""Create a local git repository"""
self.path.mkdir(parents=True, exist_ok=True)
# Convert the path to string, to avoid some limitations in
# older pygit2
self.repo = pygit2.init_repository(str(self.path))
return self
def is_dirty(self):
"""Check if there is something to commit"""
assert self.is_open()
return self.repo.status()
def branches(self):
return list(self.repo.branches)
def branch(self, branch, commit=None):
if not commit:
commit = self.repo.head
else:
commit = self.repo.get(commit)
self.repo.branches.local.create(branch, commit)
def checkout(self, branch):
"""Checkout into the branch HEAD"""
new_branch = False
ref = f"refs/heads/{branch}"
if branch not in self.branches():
self.repo.references["HEAD"].set_target(ref)
new_branch = True
else:
self.repo.checkout(ref)
return new_branch
def commit(
self,
user,
user_email,
user_time,
message,
parents=None,
committer=None,
committer_email=None,
committer_time=None,
allow_empty=False,
):
"""Add all the files and create a new commit in the current HEAD"""
assert allow_empty or self.is_dirty()
if not committer:
committer = self.committer if self.committer else self.user
committer_email = (
self.committer_email if self.committer_email else self.user_email
)
committer_time = committer_time if committer_time else user_time
try:
self.repo.index.add_all()
except pygit2.GitError as e:
if not allow_empty:
raise e
self.repo.index.write()
author = pygit2.Signature(user, user_email, int(user_time.timestamp()))
committer = pygit2.Signature(
committer, committer_email, int(committer_time.timestamp())
)
if not parents:
try:
parents = [self.repo.head.target]
except pygit2.GitError as e:
parents = []
if not allow_empty:
raise e
tree = self.repo.index.write_tree()
return self.repo.create_commit(
"HEAD", author, committer, message, tree, parents
)
def merge(
self,
user,
user_email,
user_time,
message,
commit,
committer=None,
committer_email=None,
committer_time=None,
clean_on_conflict=True,
merged=False,
allow_empty=False,
):
new_branch = False
if not merged:
try:
self.repo.merge(commit)
except KeyError:
# If it is the first commit, we will have a missing
# "HEAD", but the files will be there. We can proceed
# to the commit directly.
new_branch = True
if not merged and self.repo.index.conflicts:
for conflict in self.repo.index.conflicts:
conflict = [c for c in conflict if c]
if conflict:
logging.info(f"CONFLICT {conflict[0].path}")
if clean_on_conflict:
self.clean()
# Now I miss Rust enums
return "CONFLICT"
# Some merges are empty in OBS (no changes, not sure
# why), for now we signal them
if not allow_empty and not self.is_dirty():
# I really really do miss Rust enums
return "EMPTY"
if new_branch:
parents = [commit]
else:
parents = [
self.repo.head.target,
commit,
]
commit = self.commit(
user,
user_email,
user_time,
message,
parents,
committer,
committer_email,
committer_time,
allow_empty=allow_empty,
)
return commit
def merge_abort(self):
self.repo.state_cleanup()
def last_commit(self):
try:
return self.repo.head.target
except:
return None
def gc(self):
logging.info(f"Garbage recollec and repackage {self.path}")
subprocess.run(
["git", "gc", "--auto"],
cwd=self.path,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
)
def clean(self):
for path, _ in self.repo.status().items():
logging.debug(f"Cleaning {path}")
try:
(self.path / path).unlink()
self.repo.index.remove(path)
except Exception as e:
logging.warning(f"Error removing file {path}: {e}")
def add(self, filename):
self.repo.index.add(filename)
def add_default_lfs_gitattributes(self, force=False):
if not (self.path / ".gitattributes").exists() or force:
with (self.path / ".gitattributes").open("w") as f:
content = ["## Default LFS"]
content += [f"*{b} {LFS_SUFFIX}" for b in sorted(BINARY)]
f.write("\n".join(content))
f.write("\n")
self.add(".gitattributes")
def add_specific_lfs_gitattributes(self, binaries):
self.add_default_lfs_gitattributes(force=True)
if binaries:
with (self.path / ".gitattributes").open("a") as f:
content = ["## Specific LFS patterns"]
content += [f"{b} {LFS_SUFFIX}" for b in sorted(binaries)]
f.write("\n".join(content))
f.write("\n")
self.add(".gitattributes")
def get_specific_lfs_gitattributes(self):
with (self.path / ".gitattributes").open() as f:
patterns = [
line.split()[0]
for line in f
if line.strip() and not line.startswith("#")
]
binary = {f"*{b}" for b in BINARY}
return [p for p in patterns if p not in binary]
def add_lfs(self, filename, sha256, size):
with (self.path / filename).open("w") as f:
f.write("version https://git-lfs.github.com/spec/v1\n")
f.write(f"oid sha256:{sha256}\n")
f.write(f"size {size}\n")
self.add(filename)
if not self.is_lfs_tracked(filename):
logging.debug(f"Add specific LFS file {filename}")
specific_patterns = self.get_specific_lfs_gitattributes()
specific_patterns.append(filename)
self.add_specific_lfs_gitattributes(specific_patterns)
def is_lfs_tracked(self, filename):
with (self.path / ".gitattributes").open() as f:
patterns = (
line.split()[0]
for line in f
if line.strip() and not line.startswith("#")
)
return any(fnmatch.fnmatch(filename, line) for line in patterns)
def remove(self, filename):
self.repo.index.remove(filename)
(self.path / filename).unlink()
patterns = self.get_specific_lfs_gitattributes()
if filename in patterns:
patterns.remove(filename)
self.add_specific_lfs_gitattributes(patterns)
class OBS:
def __init__(self, url=None):
if url:
self.change_url(url)
def change_url(self, url):
self.url = url
osc.conf.get_config(override_apiurl=url)
def _xml(self, url_path, **params):
url = osc.core.makeurl(self.url, [url_path], params)
logging.debug(f"GET {url}")
return ET.parse(osc.core.http_GET(url)).getroot()
def _meta(self, project, package, **params):
try:
root = self._xml(f"source/{project}/{package}/_meta", **params)
except HTTPError:
logging.error(f"Package [{project}/{package} {params}] has no meta")
return None
return root
def _history(self, project, package, **params):
try:
root = self._xml(f"source/{project}/{package}/_history", **params)
except HTTPError:
logging.error(f"Package [{project}/{package} {params}] has no history")
return None
return root
def _link(self, project, package, rev):
try:
root = self._xml(f"source/{project}/{package}/_link", rev=rev)
except HTTPError:
logging.info("Package has no link")
return None
except ET.ParseError:
logging.error(
f"Package [{project}/{package} rev={rev}] _link can't be parsed"
)
return root
def _request(self, requestid):
try:
root = self._xml(f"request/{requestid}")
except HTTPError:
logging.warning(f"Cannot fetch request {requestid}")
return None
return root
def exists(self, project, package):
root = self._meta(project, package)
if root is None:
return False
return root.get("project") == project
def devel_project(self, project, package):
root = self._meta(project, package)
devel = root.find("devel")
if devel is None:
return None
return devel.get("project")
def request(self, requestid):
root = self._request(requestid)
if root is not None:
return Request().parse(root)
def files(self, project, package, revision):
root = self._xml(f"source/{project}/{package}", rev=revision, expand=1)
return [
(e.get("name"), int(e.get("size")), e.get("md5"))
for e in root.findall("entry")
]
def _download(self, project, package, name, revision):
url = osc.core.makeurl(
self.url,
["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1},
)
return osc.core.http_GET(url)
def download(self, project, package, name, revision, dirpath):
with (dirpath / name).open("wb") as f:
f.write(self._download(project, package, name, revision).read())
class ProxySHA256:
def __init__(self, obs, url=None, enabled=True):
self.obs = obs
self.url = url if url else "http://source.dyn.cloud.suse.de"
self.enabled = enabled
self.hashes = None
self.texts = set()
def load_package(self, package):
# _project is unreachable for the proxy - due to being a fake package
if package == "_project":
self.enabled = False
self.texts = set(["_config", "_service"])
self.hashes = dict()
return
logging.info("Retrieve all previously defined SHA256")
response = requests.get(f"http://source.dyn.cloud.suse.de/package/{package}")
if response.status_code == 200:
json = response.json()
self.hashes = json["shas"]
self.texts = set(json["texts"])
def get(self, package, name, file_md5):
key = f"{file_md5}-{name}"
if self.hashes is None:
if self.enabled:
self.load_package(package)
else:
self.hashes = {}
return self.hashes.get(key, None)
def _proxy_put(self, project, package, name, revision, file_md5, size):
quoted_name = urllib.parse.quote(name)
url = f"{self.obs.url}/public/source/{project}/{package}/{quoted_name}?rev={revision}"
response = requests.put(
self.url,
data={
"hash": file_md5,
"filename": name,
"url": url,
"package": package,
},
)
if response.status_code != 200:
raise Exception(f"Redirector error on {self.url} for {url}")
key = (file_md5, name)
self.hashes[key] = {
"sha256": response.content.decode("utf-8"),
"fsize": size,
}
return self.hashes[key]
def _obs_put(self, project, package, name, revision, file_md5, size):
key = (file_md5, name)
self.hashes[key] = {
"sha256": sha256(self.obs._download(project, package, name, revision)),
"fsize": size,
}
return self.hashes[key]
def put(self, project, package, name, revision, file_md5, size):
if not self.enabled:
return self._obs_put(project, package, name, revision, file_md5, size)
return self._proxy_put(project, package, name, revision, file_md5, size)
def is_text(self, filename):
return filename in self.texts
def get_or_put(self, project, package, name, revision, file_md5, size):
result = self.get(package, name, file_md5)
if not result:
result = self.put(project, package, name, revision, file_md5, size)
# Sanity check
if result["fsize"] != size:
raise Exception(f"Redirector has different size for {name}")
return result
class Request:
def parse(self, xml):
self.requestid = int(xml.get("id"))
self.creator = xml.get("creator")
self.type_ = xml.find("action").get("type")
if self.type_ == "delete":
# not much to do
return self
self.source = xml.find("action/source").get("project")
# expanded MD5 or commit revision
self.revisionid = xml.find("action/source").get("rev")
self.target = xml.find("action/target").get("project")
self.state = xml.find("state").get("name")
# TODO: support muti-action requests
# TODO: parse review history
# TODO: add description
return self
def type(self):
return self.type_
def __str__(self):
return f"Req {self.requestid} {self.creator} {self.type_} {self.source}->{self.target} {self.state}"
def __repr__(self):
return f"[{self.__str__()}]"
class Revision:
def __init__(self, obs, history, project, package):
self.obs = obs
self.history = history
self.project = project
self.package = package
self.commit = None
self.ignored = False
def parse(self, xml):
self.rev = int(xml.get("rev"))
# Replaced in check_expanded
self.srcmd5 = xml.find("srcmd5").text
self.version = xml.find("version").text
time = int(xml.find("time").text)
self.time = datetime.datetime.fromtimestamp(time)
userid = xml.find("user")
if userid is not None:
self.userid = userid.text
else:
self.userid = "unknown"
comment = xml.find("comment")
if comment is not None:
self.comment = comment.text or ""
else:
self.comment = ""
# Populated by check_link
self.linkrev = None
self.requestid = None
requestid = xml.find("requestid")
if requestid is not None:
self.requestid = int(requestid.text)
else:
# Sometimes requestid is missing, but can be extracted
# from "comment"
matched = re.match(
r"^Copy from .* based on submit request (\d+) from user .*$",
self.comment,
)
if matched:
self.requestid = int(matched.group(1))
return self
def __str__(self):
return f"Rev {self.project}/{self.rev} Md5 {self.srcmd5} {self.time} {self.userid} {self.requestid}"
def __repr__(self):
return f"[{self.__str__()}]"
def check_link(self):
"""Add 'linkrev' attribute into the revision. Returns False if the link is invalid"""
try:
root = self.obs._xml(
f"source/{self.project}/{self.package}/_link", rev=self.srcmd5
)
except HTTPError as e:
if e.code == 404:
logging.debug("No _link for the revision")
return True
raise e
except ET.ParseError:
logging.error(
f"_link can't be parsed [{self.project}/{self.package} rev={self.srcmd5}]"
)
return False
target_project = root.get("project")
rev = self.history.find_last_rev_after_time(target_project, self.time)
if rev:
logging.debug(f"Linkrev found: {rev}")
self.linkrev = rev.srcmd5
return True
def check_expanded(self):
# Even if it's not a link we still need to check the expanded
# srcmd5 as it's possible used in submit requests
if not self.check_link():
return False
# If there is a "linkrev", "rev" is ignored
params = {"rev": self.srcmd5, "expand": "1"}
if self.linkrev:
params["linkrev"] = self.linkrev
try:
root = self.obs._xml(f"source/{self.project}/{self.package}", **params)
except HTTPError as e:
if e.code == 400:
logging.error(f"Package [{self.project}/{self.package} {params}] can't be expanded: {e}")
return False
raise e
self.srcmd5 = root.get("srcmd5")
return True
class History:
"""Store the history of revisions of a package in different
projects.
"""
def __init__(self, obs, package):
self.obs = obs
self.package = package
self.revisions = {}
def __contains__(self, project):
return project in self.revisions
def __getitem__(self, project):
return self.revisions[project]
def _extract_copypac(self, comment):
original_project = re.findall(
r"osc copypac from project:(.*) package:", comment
)
return original_project[0] if original_project else None
def _fetch_revisions(self, project, **params):
root = self.obs._history(project, self.package, **params)
if root is not None:
return [
Revision(self.obs, self, project, self.package).parse(r)
for r in root.findall("revision")
]
def fetch_revisions(self, project, follow_copypac=False):
"""Get the revision history of a package"""
if project in self:
return
revs = self._fetch_revisions(project)
self.revisions[project] = revs
# while (
# revs
# and follow_copypac
# and (copypac_project := self._extract_copypac(revs[0].comment))
# ):
# # Add the history pre-copypac
# # TODO: missing the old project name
# revs = self._fetch_revisions(copypac_project, deleted=1)
# self.revisions[project] = (
# revs + self.revisions[project]
# )
def fetch_all_revisions(self, projects):
"""Pre-populate the history"""
for project, _, api_url in projects:
self.obs.change_url(api_url)
self.fetch_revisions(project)
def sort_all_revisions(self):
"""Sort revisions for all projects, from older to newer"""
return sorted(itertools.chain(*self.revisions.values()), key=lambda x: x.time)
def find_revision(self, project, revisionid, accepted_at):
last_commited_revision = None
for r in self.revisions.get(project, []):
logging.debug(f"Find revision {revisionid} [{accepted_at}]: {r}")
if str(r.rev) == str(revisionid) or r.srcmd5 == revisionid:
if r.ignored:
logging.debug(
f"{r} fits but is ignored, returning {last_commited_revision}"
)
return last_commited_revision
else:
logging.debug(f"{r} fits")
return r
if r.time > accepted_at:
# if we can't find the right revision, we take the last
# commit. Before ~2012 the data was tracked really loosely
# (e.g. using different timezones and the state field was
# only introduced in 2016...)
logging.warning(
f"Deploying workaround for missing request revision - returning {last_commited_revision}"
)
return last_commited_revision
if r.commit:
last_commited_revision = r
logging.info("No commited revision found, returning None")
return None
def find_last_rev_after_time(self, project, time):
# revs = self.projects.get(project, [])
# return next((r for r in reversed(revs) if r.time <= time), None)
prev = None
for rev in self.revisions.get(project, []):
if rev.time > time:
return prev
if rev.time == time:
return rev
prev = rev
return prev
class Importer:
def __init__(self, projects, package, repodir, search_ancestor, rebase_devel):
# The idea is to create each commit in order, and draw the
# same graph described by the revisions timeline. For that we
# need first to fetch all the revisions and sort them
# linearly, based on the timestamp.
#
# After that we recreate the commits, and if one revision is a
# request that contains a target inside the projects in the
# "history", we create a merge commit.
#
# Optionally, if a flag is set, we will try to find a common
# "Initial commit" from a reference branch (the first one in
# "projects", that is safe to assume to be "openSUSE:Factory".
# This is not always a good idea. For example, in a normal
# situation the "devel" project history is older than
# "factory", and we can root the tree on it. But for some
# other projects we lost partially the "devel" history project
# (could be moved), and "factory" is not the root.
self.package = package
self.search_ancestor = search_ancestor
self.rebase_devel = rebase_devel
self.obs = OBS()
self.git = Git(
repodir,
committer="Git OBS Bridge",
committer_email="obsbridge@suse.de",
).create()
self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)
self.history = History(self.obs, self.package)
# Add the "devel" project
(project, branch, api_url) = projects[0]
assert project == "openSUSE:Factory"
self.obs.change_url(api_url)
devel_project = self.obs.devel_project(project, package)
if devel_project:
self.projects = [(devel_project, "devel", api_url)] + projects
else:
self.projects = projects
# Associate the branch and api_url information per project
self.projects_info = {
project: (branch, api_url) for (project, branch, api_url) in self.projects
}
def download(self, revision):
obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
git_files = {
(f.name, f.stat().st_size, md5(f))
for f in self.git.path.iterdir()
if f.is_file() and f.name not in (".gitattributes")
}
# Overwrite ".gitattributes" with the
self.git.add_default_lfs_gitattributes(force=True)
# Download each file in OBS if it is not a binary (or large)
# file
for (name, size, file_md5) in obs_files:
# this file creates easily 100k commits and is just useless data :(
# unfortunately it's stored in the same meta package as the project config
if revision.package == "_project" and name == "_staging_workflow":
continue
# have such files been detected as text mimetype before?
is_text = self.proxy_sha256.is_text(name)
if not is_text and is_binary_or_large(name, size):
file_sha256 = self.proxy_sha256.get_or_put(
revision.project,
revision.package,
name,
revision.srcmd5,
file_md5,
size,
)
self.git.add_lfs(name, file_sha256["sha256"], size)
else:
if (name, size, file_md5) not in git_files:
print(f"Download {name}")
self.obs.download(
revision.project,
revision.package,
name,
revision.srcmd5,
self.git.path,
)
# Validate the MD5 of the downloaded file
if md5(self.git.path / name) != file_md5:
raise Exception(f"Download error in {name}")
self.git.add(name)
# Remove extra files
obs_names = {n for (n, _, _) in obs_files}
git_names = {n for (n, _, _) in git_files}
for name in git_names - obs_names:
print(f"Remove {name}")
self.git.remove(name)
def import_all_revisions(self, gc):
# Fetch all the requests and sort them. Ideally we should
# build the graph here, to avoid new commits before the merge.
# For now we will sort them and invalidate the commits if
# "rebase_devel" is set.
self.history.fetch_all_revisions(self.projects)
revisions = self.history.sort_all_revisions()
logging.debug(f"Selected import order for {self.package}")
for revision in revisions:
logging.debug(revision)
gc_cnt = gc
for revision in revisions:
gc_cnt -= 1
if gc_cnt <= 0 and gc:
self.git.gc()
gc_cnt = gc
self.import_revision(revision)
def import_new_revision_with_request(self, revision, request):
"""Create a new branch as a result of a merge"""
submitted_revision = self.history.find_revision(
request.source, request.revisionid, revision.time
)
if not submitted_revision:
logging.warning(f"Request {request} does not connect to a known revision")
return False
if not submitted_revision.commit:
# If the revision appointed by the request is not part of
# the git history, we can have an ordering problem. One
# example is "premake4".
self.import_revision(submitted_revision)
assert submitted_revision.commit is not None
project = revision.project
branch, _ = self.projects_info[project]
# TODO: add an empty commit marking the acceptenace of the request (see discussion in PR 2858)
self.git.branch(branch, submitted_revision.commit)
self.git.clean()
self.git.checkout(branch)
logging.info(f"Create new branch based on {submitted_revision.commit}")
revision.commit = submitted_revision.commit
def _rebase_branch_history(self, project, revision):
branch, _ = self.projects_info[project]
history = self.history[project]
revision_index = history.index(revision)
for index in range(revision_index + 1, len(history)):
revision = history[index]
# We are done when we have one non-commited revision
if not revision.commit:
return
logging.info(f"Rebasing {revision} from {branch}")
revision.commit = None
self.import_revision(revision)
def import_revision_with_request(self, revision, request):
"""Import a single revision via a merge"""
submitted_revision = self.history.find_revision(
request.source, request.revisionid, revision.time
)
if not submitted_revision:
logging.warning(f"Request {request} does not connect to a known revision")
return False
assert submitted_revision.commit is not None
# TODO: detect a revision, case in point
# Base:System/bash/284 -> rq683701 -> accept O:F/151
# -> autocommit Base:System/bash/285
# Revert lead to openSUSE:Factory/bash/152
# Base:System/286 restored the reverted code in devel project
# rq684575 was created and accepted as O:F/153
# But the 284-285 and the 285-286 changeset is seen as empty
# as the revert was never in Base:System, so the
# submitted_revision of 684575 has no commit
if submitted_revision.commit == "EMPTY":
logging.warning("Empty commit submitted?!")
return False
message = (
f"Accepting request {revision.requestid}: {revision.comment}\n\n{revision}"
)
commit = self.git.merge(
# TODO: revision.userid or request.creator?
f"OBS User {revision.userid}",
"null@suse.de",
revision.time,
message,
submitted_revision.commit,
)
if commit == "EMPTY":
logging.warning("Empty merge. Ignoring the revision and the request")
self.git.merge_abort()
revision.commit = commit
return False
if commit == "CONFLICT":
logging.info("Merge conflict. Downloading revision")
self.download(revision)
message = f"CONFLICT {message}"
commit = self.git.merge(
f"OBS User {revision.userid}",
"null@suse.de",
revision.time,
message,
submitted_revision.commit,
merged=True,
)
assert commit and commit != "CONFLICT"
logging.info(f"Merge with {submitted_revision.commit} into {commit}")
revision.commit = commit
# TODO: There are more checks to do, like for example, the
# last commit into the non-devel branch should be a merge from
# the devel branch
if self.rebase_devel:
branch, _ = self.projects_info.get(request.source, (None, None))
if branch == "devel":
self.git.repo.references[f"refs/heads/{branch}"].set_target(commit)
self._rebase_branch_history(request.source, submitted_revision)
return True
def matching_request(self, revision):
request = self.obs.request(revision.requestid)
if not request:
return None
# to be handled by the caller
if request.type() != "submit":
return request
if request.source not in self.projects_info:
logging.info("Request from a non exported project")
return None
if request.target != revision.project:
# This seems to happen when the devel project gets
# reinitialized (for example, SR#943593 in 7zip, or
# SR#437901 in ColorFull)
logging.info("Request target different from current project")
return None
if request.source == request.target:
# this is not a merge, but a different way to do a
# contribution to the (devel) project - see bindfs's rev 1
logging.info("Request within the same project")
return None
return request
def import_revision(self, revision):
"""Import a single revision into git"""
project = revision.project
branch, api_url = self.projects_info[project]
logging.info(f"Importing [{revision}] to {branch}")
self.obs.change_url(api_url)
# Populate linkrev and replace srcmd5 from the linked
# revision. If the expansion fails, the revision will be ignored
# and not imported.
if not revision.check_expanded():
logging.warning(f"Broken revision")
revision.ignored = True
return
# When doing a SR, we see also a revision in the origin
# project with the outgoing request, but without changes in
# the project. We can ignore them.
#
# If there is a request ID, it will be filtered out later,
# when the target project is different from itself.
if revision.userid == "autobuild" and not revision.requestid:
logging.info("Ignoring autocommit")
revision.ignored = True
return
if revision.userid == "buildservice-autocommit":
logging.info("Ignoring autocommit")
revision.ignored = True
return
# Create the reference if the branch is new. If so return
# True.
new_branch = self.git.checkout(branch)
if revision.requestid:
request = self.matching_request(revision)
if request:
if request.type() == "delete":
# TODO: after this comes a restore, this should be collapsed
# before even hitting git
logging.info("Delete request ignored")
revision.ignored = True
return
logging.debug(f"Found matching request: #{revision.project} #{request}")
if new_branch:
self.import_new_revision_with_request(revision, request)
return
if self.import_revision_with_request(revision, request):
return
# Import revision as a single commit (without merging)
self.download(revision)
if new_branch or self.git.is_dirty():
commit = self.git.commit(
f"OBS User {revision.userid}",
"null@suse.de",
revision.time,
# TODO: Normalize better the commit message
f"{revision.comment}\n\n{revision}",
# Create an empty commit only if is a new branch
allow_empty=new_branch,
)
revision.commit = commit
logging.info(f"Commit {commit}")
else:
logging.info("Skip empty commit")
revision.ignored = True
def main():
parser = argparse.ArgumentParser(description="OBS history importer into git")
parser.add_argument("package", help="OBS package name")
parser.add_argument(
"-r",
"--repodir",
required=False,
type=pathlib.Path,
help="Local git repository directory",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
help="If the repository directory exists, remove it",
)
parser.add_argument(
"-a",
"--search-ancestor",
action="store_true",
help="Search closest ancestor candidate for initial commit",
)
parser.add_argument(
"-d",
"--rebase-devel",
action="store_true",
help="The devel project with be rebased after a merge",
)
parser.add_argument(
"-g",
"--gc",
metavar="N",
type=int,
default=200,
help="Garbage recollect and pack the git history each N commits",
)
parser.add_argument(
"--level",
"-l",
default="INFO",
help="logging level",
)
args = parser.parse_args()
if args.level:
numeric_level = getattr(logging, args.level.upper(), None)
if not isinstance(numeric_level, int):
print(f"Invalid log level: {args.level}")
sys.exit(-1)
logging.basicConfig(level=numeric_level)
if numeric_level == logging.DEBUG:
osc.conf.config["debug"] = True
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True
if not args.repodir:
args.repodir = pathlib.Path(args.package)
if args.repodir.exists() and not args.force:
print(f"Repository {args.repodir} already present")
sys.exit(-1)
elif args.repodir.exists() and args.force:
logging.info(f"Removing old repository {args.repodir}")
shutil.rmtree(args.repodir)
Cache.init()
# TODO: use a CLI parameter to describe the projects
importer = Importer(
PROJECTS, args.package, args.repodir, args.search_ancestor, args.rebase_devel
)
importer.import_all_revisions(args.gc)
if __name__ == "__main__":
main()