Split GitExporter out of Importer class
This commit is contained in:
parent
9de0d6e6c5
commit
c4654dd896
@ -6,16 +6,17 @@ import pathlib
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
import osc.core
|
import osc.core
|
||||||
|
from lib.git_exporter import GitExporter
|
||||||
|
|
||||||
from lib.test_exporter import TestExporter
|
|
||||||
from lib.importer import Importer
|
from lib.importer import Importer
|
||||||
|
from lib.test_exporter import TestExporter
|
||||||
|
|
||||||
URL_OBS = "https://api.opensuse.org"
|
URL_OBS = "https://api.opensuse.org"
|
||||||
URL_IBS = "https://api.suse.de"
|
URL_IBS = "https://api.suse.de"
|
||||||
|
|
||||||
# The order is relevant (from older to newer initial codebase)
|
# The order is relevant (from older to newer initial codebase)
|
||||||
|
# TODO: do something with these; for now we look only at openSUSE:Factory
|
||||||
PROJECTS = [
|
PROJECTS = [
|
||||||
("openSUSE:Factory", "factory", URL_OBS),
|
|
||||||
# ("SUSE:SLE-12:GA", "SLE_12", URL_IBS),
|
# ("SUSE:SLE-12:GA", "SLE_12", URL_IBS),
|
||||||
# ("SUSE:SLE-12:Update", "SLE_12", URL_IBS),
|
# ("SUSE:SLE-12:Update", "SLE_12", URL_IBS),
|
||||||
# ("SUSE:SLE-12-SP1:GA", "SLE_12_SP1", URL_IBS),
|
# ("SUSE:SLE-12-SP1:GA", "SLE_12_SP1", URL_IBS),
|
||||||
@ -92,11 +93,11 @@ def main():
|
|||||||
if not args.repodir:
|
if not args.repodir:
|
||||||
args.repodir = pathlib.Path(args.package)
|
args.repodir = pathlib.Path(args.package)
|
||||||
|
|
||||||
# TODO: use a CLI parameter to describe the projects
|
importer = Importer(URL_OBS, "openSUSE:Factory", args.package)
|
||||||
importer = Importer(PROJECTS, args.package, args.repodir)
|
|
||||||
importer.set_gc_interval(args.gc)
|
|
||||||
importer.import_into_db()
|
importer.import_into_db()
|
||||||
importer.export_as_git()
|
exporter = GitExporter(URL_OBS, "openSUSE:Factory", args.package, args.repodir)
|
||||||
|
exporter.set_gc_interval(args.gc)
|
||||||
|
exporter.export_as_git()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
167
lib/git_exporter.py
Normal file
167
lib/git_exporter.py
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from lib.binary import is_binary_or_large
|
||||||
|
import yaml
|
||||||
|
from lib.db import DB
|
||||||
|
from lib.git import Git
|
||||||
|
from lib.obs import OBS
|
||||||
|
from lib.proxy_sha256 import ProxySHA256, md5
|
||||||
|
from lib.tree_builder import TreeBuilder
|
||||||
|
|
||||||
|
class GitExporter:
    """Replay the revisions of one OBS package from the DB into a git repository.

    The exporter walks the flattened revision tree produced by
    ``TreeBuilder`` and creates one git commit per entry.  The database id
    of the last revision committed on each branch is stored in
    ``.git/_flat_state.yaml`` so that a later run only commits what is
    still missing.
    """

    # Branches whose last exported revision is tracked in the state file.
    _BRANCHES = ("factory", "devel")
    # Written by add_default_lfs_gitattributes(); never compared against OBS.
    _GITATTRIBUTES = ".gitattributes"

    def __init__(self, api_url, project, package, repodir):
        """Set up the OBS client and create the target git repository.

        Args:
            api_url: URL of the OBS API the client is pointed at.
            project: OBS project whose revision tree is exported.
            package: name of the package to export.
            repodir: location handed to ``Git`` for the repository.
        """
        self.obs = OBS()
        self.project = project
        self.package = package
        # TODO: Store the api url in the revision
        self.obs.change_url(api_url)
        self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)
        self.git = Git(
            repodir,
            committer="Git OBS Bridge",
            committer_email="obsbridge@suse.de",
        ).create()
        self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
        # Commits between two periodic "git gc" runs; 0 disables them.
        self.gc_interval = 200

    @staticmethod
    def _is_tracked_file(path):
        """Return True if *path* is a regular file that is synced with OBS.

        The name is compared for equality.  The previous
        ``name not in (".gitattributes")`` tested against a plain string,
        i.e. it was a substring check that also skipped files such as
        ``a`` or ``git``.
        """
        return path.is_file() and path.name != GitExporter._GITATTRIBUTES

    def download(self, revision):
        """Bring the working tree in sync with the files of *revision*.

        Binary or large files are registered as LFS pointers, everything
        else is downloaded from OBS unless an identical copy (same name,
        size and MD5) is already present.  Files in the working tree that
        OBS does not list are removed.

        Raises:
            Exception: if a downloaded file does not match the MD5
                announced by OBS.
        """
        # Materialise once: the listing is walked twice below.
        obs_files = list(
            self.obs.files(revision.project, revision.package, revision.srcmd5)
        )
        git_files = {
            (f.name, f.stat().st_size, md5(f))
            for f in self.git.path.iterdir()
            if self._is_tracked_file(f)
        }

        # Overwrite ".gitattributes" with the default LFS attributes
        self.git.add_default_lfs_gitattributes(force=True)

        # Download each file in OBS if it is not a binary (or large)
        # file
        for (name, size, file_md5) in obs_files:
            # this file creates easily 100k commits and is just useless data :(
            # unfortunately it's stored in the same meta package as the project config
            if revision.package == "_project" and name == "_staging_workflow":
                continue

            # have such files been detected as text mimetype before?
            is_text = self.proxy_sha256.is_text(name)
            if not is_text and is_binary_or_large(name, size):
                file_sha256 = self.proxy_sha256.get_or_put(
                    revision.project,
                    revision.package,
                    name,
                    revision.srcmd5,
                    file_md5,
                    size,
                )
                self.git.add_lfs(name, file_sha256["sha256"], size)
                continue

            if (name, size, file_md5) in git_files:
                # An identical copy is already in the working tree.
                continue

            logging.debug(f"Download {name}")
            self.obs.download(
                revision.project,
                revision.package,
                name,
                revision.srcmd5,
                self.git.path,
            )
            # Validate the MD5 of the downloaded file
            if md5(self.git.path / name) != file_md5:
                raise Exception(f"Download error in {name}")
            # NOTE(review): assumes add() belongs to the "file was
            # (re)downloaded" branch - confirm against the original nesting.
            self.git.add(name)

        # Remove extra files
        obs_names = {n for (n, _, _) in obs_files}
        git_names = {n for (n, _, _) in git_files}
        for name in git_names - obs_names:
            logging.debug(f"Remove {name}")
            self.git.remove(name)

    def set_gc_interval(self, gc):
        """Set how many commits are created between two ``git gc`` runs."""
        self.gc_interval = gc

    def _load_state(self):
        """Return the persisted ``branch -> revision dbid`` mapping.

        A missing state file, or one that does not contain a mapping
        (for instance an empty file), yields an empty dict.
        """
        if not os.path.exists(self.state_file):
            return {}
        with open(self.state_file, "r", encoding="utf-8") as f:
            state = yaml.safe_load(f)
        return state if isinstance(state, dict) else {}

    def _save_state(self, branch_state):
        """Persist the dbid of the last committed revision of each branch."""
        # Build the payload before opening the file, so a failure here
        # cannot leave a truncated state file behind.
        data = {
            branch: branch_state[branch].dbid
            for branch in self._BRANCHES
            if branch_state[branch]
        }
        with open(self.state_file, "w", encoding="utf-8") as f:
            yaml.dump(data, f)

    def export_as_git(self):
        """Commit every revision that is not yet part of the repository.

        The flat revision list is walked in reverse.  Whenever an entry
        matches the persisted state of a branch, everything collected so
        far is dropped, so only entries visited after the last match are
        committed.
        """
        db = DB()
        tree = TreeBuilder(db).build(self.project, self.package)
        flats = tree.as_flat_list()

        branch_state = {branch: None for branch in self._BRANCHES}
        state_data = self._load_state()

        left_to_commit = []
        for flat in reversed(flats):
            found_state = False
            for branch in self._BRANCHES:
                if flat.commit.dbid == state_data.get(branch):
                    branch_state[branch] = flat.commit
                    flat.commit.git_commit = self.git.branch_head(branch)
                    logging.debug(
                        f"Found {self.git.path}'s {branch} branch in state {flat}"
                    )
                    # Everything collected up to here is already exported.
                    left_to_commit = []
                    found_state = True
            if not found_state:
                left_to_commit.append(flat)

        gc_cnt = self.gc_interval
        if left_to_commit:
            self.git.gc()
        for flat in left_to_commit:
            gc_cnt -= 1
            # A gc_interval of 0 turns the periodic gc off.
            if gc_cnt <= 0 and self.gc_interval:
                self.git.gc()
                gc_cnt = self.gc_interval
            logging.debug(f"Committing {flat}")
            self.commit_flat(db, flat, branch_state)

    def limit_download(self, file):
        """Return True if *file* is a ``.spec`` or ``.changes`` file.

        ``commit_flat`` only downloads or removes files for which this
        returns True.
        """
        return file.endswith((".spec", ".changes"))

    def commit_flat(self, db, flat, branch_state):
        """Create the git commit for *flat* and record it as the branch state.

        Args:
            db: database handle passed on to ``calc_delta``.
            flat: entry of the flat revision list (commit, branch, parents).
            branch_state: maps each branch to its last committed revision;
                updated in place and written to the state file.
        """
        self.git.checkout(flat.branch)

        parents = [
            parent.git_commit for parent in (flat.parent1, flat.parent2) if parent
        ]

        to_download, to_delete = flat.commit.calc_delta(db, branch_state[flat.branch])
        for file in to_delete:
            if self.limit_download(file):
                self.git.remove(file)
        for file in to_download:
            if not self.limit_download(file):
                continue
            self.obs.download(
                flat.commit.project,
                flat.commit.package,
                file,
                flat.commit.expanded_srcmd5,
                self.git.path,
            )
            self.git.add(file)

        git_commit = self.git.commit(
            f"OBS User {flat.commit.userid}",
            "null@suse.de",
            flat.commit.commit_time,
            # TODO: Normalize better the commit message
            f"{flat.commit.comment}\n\n{flat.commit}",
            allow_empty=True,
            parents=parents,
        )
        flat.commit.git_commit = git_commit
        branch_state[flat.branch] = flat.commit
        self._save_state(branch_state)
|
207
lib/importer.py
207
lib/importer.py
@ -1,122 +1,23 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
from lib.binary import is_binary_or_large
|
|
||||||
from lib.db import DB
|
from lib.db import DB
|
||||||
from lib.db_revision import DBRevision
|
from lib.db_revision import DBRevision
|
||||||
from lib.git import Git
|
|
||||||
from lib.obs import OBS
|
from lib.obs import OBS
|
||||||
from lib.obs_revision import OBSRevision
|
from lib.obs_revision import OBSRevision
|
||||||
from lib.proxy_sha256 import ProxySHA256, md5
|
|
||||||
from lib.tree_builder import AbstractWalker, TreeBuilder
|
|
||||||
from lib.user import User
|
from lib.user import User
|
||||||
|
|
||||||
|
|
||||||
class Importer:
|
class Importer:
|
||||||
def __init__(self, projects, package, repodir):
|
def __init__(self, api_url, project, package):
|
||||||
# The idea is to create each commit in order, and draw the
|
# Import a Factory package into the database
|
||||||
# same graph described by the revisions timeline. For that we
|
|
||||||
# need first to fetch all the revisions and sort them
|
|
||||||
# linearly, based on the timestamp.
|
|
||||||
#
|
|
||||||
# After that we recreate the commits, and if one revision is a
|
|
||||||
# request that contains a target inside the projects in the
|
|
||||||
# "history", we create a merge commit.
|
|
||||||
#
|
|
||||||
# Optionally, if a flag is set, we will try to find a common
|
|
||||||
# "Initial commit" from a reference branch (the first one in
|
|
||||||
# "projects", that is safe to assume to be "openSUSE:Factory".
|
|
||||||
# This is not always a good idea. For example, in a normal
|
|
||||||
# situation the "devel" project history is older than
|
|
||||||
# "factory", and we can root the tree on it. But for some
|
|
||||||
# other projects we lost partially the "devel" history project
|
|
||||||
# (could be moved), and "factory" is not the root.
|
|
||||||
|
|
||||||
self.package = package
|
self.package = package
|
||||||
|
self.project = project
|
||||||
|
|
||||||
self.obs = OBS()
|
self.obs = OBS()
|
||||||
self.git = Git(
|
|
||||||
repodir,
|
|
||||||
committer="Git OBS Bridge",
|
|
||||||
committer_email="obsbridge@suse.de",
|
|
||||||
).create()
|
|
||||||
self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
|
|
||||||
self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)
|
|
||||||
self.gc_interval = 200
|
|
||||||
|
|
||||||
# Add the "devel" project
|
|
||||||
(project, branch, api_url) = projects[0]
|
|
||||||
assert project == "openSUSE:Factory"
|
assert project == "openSUSE:Factory"
|
||||||
self.obs.change_url(api_url)
|
self.obs.change_url(api_url)
|
||||||
devel_project = self.obs.devel_project(project, package)
|
|
||||||
if devel_project:
|
|
||||||
self.projects = [(devel_project, "devel", api_url)] + projects
|
|
||||||
else:
|
|
||||||
self.projects = projects
|
|
||||||
|
|
||||||
# Associate the branch and api_url information per project
|
|
||||||
self.projects_info = {
|
|
||||||
project: (branch, api_url) for (project, branch, api_url) in self.projects
|
|
||||||
}
|
|
||||||
|
|
||||||
def download(self, revision):
|
|
||||||
obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
|
|
||||||
git_files = {
|
|
||||||
(f.name, f.stat().st_size, md5(f))
|
|
||||||
for f in self.git.path.iterdir()
|
|
||||||
if f.is_file() and f.name not in (".gitattributes")
|
|
||||||
}
|
|
||||||
|
|
||||||
# Overwrite ".gitattributes" with the
|
|
||||||
self.git.add_default_lfs_gitattributes(force=True)
|
|
||||||
|
|
||||||
# Download each file in OBS if it is not a binary (or large)
|
|
||||||
# file
|
|
||||||
for (name, size, file_md5) in obs_files:
|
|
||||||
# this file creates easily 100k commits and is just useless data :(
|
|
||||||
# unfortunately it's stored in the same meta package as the project config
|
|
||||||
if revision.package == "_project" and name == "_staging_workflow":
|
|
||||||
continue
|
|
||||||
# have such files been detected as text mimetype before?
|
|
||||||
is_text = self.proxy_sha256.is_text(name)
|
|
||||||
if not is_text and is_binary_or_large(name, size):
|
|
||||||
file_sha256 = self.proxy_sha256.get_or_put(
|
|
||||||
revision.project,
|
|
||||||
revision.package,
|
|
||||||
name,
|
|
||||||
revision.srcmd5,
|
|
||||||
file_md5,
|
|
||||||
size,
|
|
||||||
)
|
|
||||||
self.git.add_lfs(name, file_sha256["sha256"], size)
|
|
||||||
else:
|
|
||||||
if (name, size, file_md5) not in git_files:
|
|
||||||
logging.debug(f"Download {name}")
|
|
||||||
self.obs.download(
|
|
||||||
revision.project,
|
|
||||||
revision.package,
|
|
||||||
name,
|
|
||||||
revision.srcmd5,
|
|
||||||
self.git.path,
|
|
||||||
)
|
|
||||||
# Validate the MD5 of the downloaded file
|
|
||||||
if md5(self.git.path / name) != file_md5:
|
|
||||||
raise Exception(f"Download error in {name}")
|
|
||||||
self.git.add(name)
|
|
||||||
|
|
||||||
# Remove extra files
|
|
||||||
obs_names = {n for (n, _, _) in obs_files}
|
|
||||||
git_names = {n for (n, _, _) in git_files}
|
|
||||||
for name in git_names - obs_names:
|
|
||||||
logging.debug(f"Remove {name}")
|
|
||||||
self.git.remove(name)
|
|
||||||
|
|
||||||
def set_gc_interval(self, gc):
|
|
||||||
self.gc_interval = gc
|
|
||||||
|
|
||||||
def update_db_package(self, db, project, package):
|
def update_db_package(self, db, project, package):
|
||||||
root = self.obs._history(project, package)
|
root = self.obs._history(project, package)
|
||||||
if root is None:
|
if root is None:
|
||||||
@ -248,100 +149,14 @@ class Importer:
|
|||||||
)
|
)
|
||||||
return [DBRevision(row) for row in cur.fetchall()]
|
return [DBRevision(row) for row in cur.fetchall()]
|
||||||
|
|
||||||
def export_as_git(self):
|
|
||||||
db = DB()
|
|
||||||
tree = TreeBuilder(db).build(self.package)
|
|
||||||
flats = tree.as_flat_list()
|
|
||||||
|
|
||||||
branch_state = {"factory": None, "devel": None}
|
|
||||||
state_data = dict()
|
|
||||||
if os.path.exists(self.state_file):
|
|
||||||
with open(self.state_file, "r") as f:
|
|
||||||
state_data = yaml.safe_load(f)
|
|
||||||
if type(state_data) != dict:
|
|
||||||
state_data = {}
|
|
||||||
left_to_commit = []
|
|
||||||
for flat in reversed(flats):
|
|
||||||
found_state = False
|
|
||||||
for branch in ["factory", "devel"]:
|
|
||||||
if flat.commit.dbid == state_data.get(branch):
|
|
||||||
branch_state[branch] = flat.commit
|
|
||||||
flat.commit.git_commit = self.git.branch_head(branch)
|
|
||||||
logging.debug(
|
|
||||||
f"Found {self.git.path}'s {branch} branch in state {flat}"
|
|
||||||
)
|
|
||||||
left_to_commit = []
|
|
||||||
found_state = True
|
|
||||||
if not found_state:
|
|
||||||
left_to_commit.append(flat)
|
|
||||||
|
|
||||||
gc_cnt = self.gc_interval
|
|
||||||
if len(left_to_commit) > 0:
|
|
||||||
self.git.gc()
|
|
||||||
for flat in left_to_commit:
|
|
||||||
gc_cnt -= 1
|
|
||||||
if gc_cnt <= 0 and self.gc_interval:
|
|
||||||
self.git.gc()
|
|
||||||
gc_cnt = self.gc_interval
|
|
||||||
logging.debug(f"Committing {flat}")
|
|
||||||
self.commit_flat(db, flat, branch_state)
|
|
||||||
|
|
||||||
def limit_download(self, file):
|
|
||||||
if file.endswith(".spec") or file.endswith(".changes"):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def commit_flat(self, db, flat, branch_state):
|
|
||||||
parents = []
|
|
||||||
self.git.checkout(flat.branch)
|
|
||||||
if flat.parent1:
|
|
||||||
parents.append(flat.parent1.git_commit)
|
|
||||||
if flat.parent2:
|
|
||||||
parents.append(flat.parent2.git_commit)
|
|
||||||
to_download, to_delete = flat.commit.calc_delta(db, branch_state[flat.branch])
|
|
||||||
for file in to_delete:
|
|
||||||
if not self.limit_download(file):
|
|
||||||
continue
|
|
||||||
self.git.remove(file)
|
|
||||||
for file in to_download:
|
|
||||||
if not self.limit_download(file):
|
|
||||||
continue
|
|
||||||
self.obs.download(
|
|
||||||
flat.commit.project,
|
|
||||||
flat.commit.package,
|
|
||||||
file,
|
|
||||||
flat.commit.expanded_srcmd5,
|
|
||||||
self.git.path,
|
|
||||||
)
|
|
||||||
self.git.add(file)
|
|
||||||
|
|
||||||
commit = self.git.commit(
|
|
||||||
f"OBS User {flat.commit.userid}",
|
|
||||||
"null@suse.de",
|
|
||||||
flat.commit.commit_time,
|
|
||||||
# TODO: Normalize better the commit message
|
|
||||||
f"{flat.commit.comment}\n\n{flat.commit}",
|
|
||||||
allow_empty=True,
|
|
||||||
parents=parents,
|
|
||||||
)
|
|
||||||
flat.commit.git_commit = commit
|
|
||||||
branch_state[flat.branch] = flat.commit
|
|
||||||
with open(self.state_file, "w") as f:
|
|
||||||
data = {}
|
|
||||||
for branch in ["factory", "devel"]:
|
|
||||||
commit = branch_state[branch]
|
|
||||||
if commit:
|
|
||||||
data[branch] = commit.dbid
|
|
||||||
yaml.dump(data, f)
|
|
||||||
|
|
||||||
def import_into_db(self):
|
def import_into_db(self):
|
||||||
db = DB()
|
db = DB()
|
||||||
for project, _, api_url in self.projects:
|
self.update_db_package(db, self.project, self.package)
|
||||||
self.obs.change_url(api_url)
|
self.fetch_all_linked_packages(db, self.project, self.package)
|
||||||
self.update_db_package(db, project, self.package)
|
|
||||||
self.fetch_all_linked_packages(db, project, self.package)
|
# all remaining, no filtering here
|
||||||
# all remaining, no filtering here
|
self.find_linked_revs(db)
|
||||||
self.find_linked_revs(db)
|
|
||||||
|
|
||||||
missing_users = User.missing_users(db)
|
missing_users = User.missing_users(db)
|
||||||
for userid in missing_users:
|
for userid in missing_users:
|
||||||
|
@ -7,7 +7,8 @@ from lib.db_revision import DBRevision
|
|||||||
|
|
||||||
|
|
||||||
class TestExporter:
|
class TestExporter:
|
||||||
""""Helper class to export data from production DB for tests"""
|
""" "Helper class to export data from production DB for tests"""
|
||||||
|
|
||||||
def __init__(self, package):
|
def __init__(self, package):
|
||||||
self.package = package
|
self.package = package
|
||||||
|
|
||||||
|
@ -204,9 +204,9 @@ class TreeBuilder:
|
|||||||
last_merge.merged.merged_into = None
|
last_merge.merged.merged_into = None
|
||||||
last_merge.merged = None
|
last_merge.merged = None
|
||||||
|
|
||||||
def build(self, package):
|
def build(self, project, package):
|
||||||
"""Create a Factory tree (returning the top)"""
|
"""Create a Factory tree (returning the top)"""
|
||||||
factory_revisions = self.revisions_chain("openSUSE:Factory", package)
|
factory_revisions = self.revisions_chain(project, package)
|
||||||
self.add_merge_points(factory_revisions)
|
self.add_merge_points(factory_revisions)
|
||||||
# factory_revisions.print()
|
# factory_revisions.print()
|
||||||
self.prune_loose_end(factory_revisions)
|
self.prune_loose_end(factory_revisions)
|
||||||
|
Loading…
Reference in New Issue
Block a user