main #36

Merged
dirkmueller merged 10 commits from adamm/git-importer:main into main 2025-09-12 12:46:26 +02:00
9 changed files with 163 additions and 51 deletions

View File

@@ -119,7 +119,7 @@ def main():
importer = Importer(URL_OBS, args.project, args.packages)
importer.import_into_db()
for package in args.packages:
if not importer.package_with_scmsync(package):
if not importer.package_with_scmsync(args.project, package):
export_package(args.project, package, args.repodir, args.cachedir, args.gc)
else:
logging.debug(f"{args.project}/{package} has scmsync links - skipping export")

View File

@@ -204,6 +204,11 @@ class DBRevision:
and self.package == "_project"
):
continue
# do not import _service:* files as those are created by OBS on source imports
if entry.get("name")[0:9] == "_service:":
continue
cur.execute(
"""INSERT INTO files (name, md5, size, mtime, revision_id)
VALUES (%s,%s,%s,%s,%s)""",

View File

@@ -20,7 +20,7 @@ class FlatTreeWalker(AbstractWalker):
def __init__(self, rebase_devel=False) -> None:
super().__init__()
self.flats = []
self.flats:list[FlatNode] = []
# the rebase_devel won't work as such as rebasing the branch needs an explicit action
self.rebase_devel = rebase_devel
# remember the last merge point so we can know the parent of it for the root of the sources

View File

@@ -160,6 +160,12 @@ class Git:
.strip()
)
def branch_commit(self, branch="HEAD"):
try:
Review

Using try … except as a substitute for if … then … else is always a code smell. Couldn’t you find some civilized way how to decide when to run that command without just trying it blindly?

Using `try … except` as a substitute for `if … then … else` is always a code smell. Couldn’t you find some civilized way how to decide when to run that command without just trying it blindly?
return (self.git_run(["cat-file", "commit", branch], stdout=subprocess.PIPE).stdout.decode("utf-8").strip())
except:
return ''
def set_branch_head(self, branch, commit):
return self.git_run(["update-ref", f"refs/heads/{branch}", commit])

View File

@@ -1,8 +1,12 @@
import logging
import os
from urllib.parse import parse_qs
import psycopg
from urllib3.util import url
import yaml
from lib import db
from lib.binary import is_binary_or_large
from lib.db import DB
from lib.git import Git
@@ -12,6 +16,12 @@ from lib.proxy_sha256 import ProxySHA256
from lib.tree_builder import TreeBuilder
from lib.user import User
def is_number(s):
try:
float(s)
return True
except ValueError:
return False
class GitExporter:
def __init__(self, api_url, project, package, repodir, cachedir):
@@ -30,24 +40,76 @@ class GitExporter:
else:
self.git.create()
self.git.add_gitea_remote(package)
self.state_file = os.path.join(self.git.path, ".git", "_flat_state.yaml")
self.gc_interval = 200
self.cachedir = cachedir
def set_gc_interval(self, gc):
self.gc_interval = gc
def check_repo_state(self, flats, branch_state):
def reconstruct_state(self, flats):
state_data = dict()
if os.path.exists(self.state_file):
with open(self.state_file) as f:
state_data = yaml.safe_load(f)
if not isinstance(state_data, dict):
state_data = {}
prefix = "OBS-URL: "
for line in self.git.branch_commit("factory").splitlines():
if line.startswith(prefix):
u = url.parse_url(line.strip(prefix))
if u.path != f"/package/show/openSUSE:Factory/{self.package}" or "rev=" not in u.query:
continue
v = parse_qs(u.query)
rev = v['rev'][0]
with self.db.cursor() as cur:
try:
if is_number(rev):
cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND rev=%s", ('openSUSE:Factory', self.package, rev,))
else:
cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND expanded_srcmd5=%s", ('openSUSE:Factory', self.package, rev, rev))
except psycopg.Error as e:
logging.error(e)
self.db.conn.rollback()
row = cur.fetchone()
if not row:
return state_data
state_data['factory'] = row[0]
try:
print("devel reconstruct")
d = self.devel_rev(flats)
if d is not None:
prj = d.commit.project
for line in self.git.branch_commit("devel").splitlines():
if line.startswith(prefix):
u = url.parse_url(line.strip(prefix))
if u.path != f"/package/show/{prj}/{self.package}" or u.query is None or "rev=" not in u.query:
continue
v = parse_qs(u.query)
rev = v['rev'][0]
try:
with self.db.cursor() as cur:
logging.debug(f"finding id for ({prj, self.package, rev}")
if is_number(rev):
cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND rev=%s", (prj, self.package, rev,))
else:
cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND expanded_srcmd5=%s", (prj, self.package, rev,))
row = cur.fetchone()
if not row:
logging.info(" ** cannot find revision for devel branch:", rev)
return state_data
state_data['devel'] = row[0]
except psycopg.Error as e:
logging.error(e)
self.db.conn.rollback()
if state_data['factory'] is not None:
state_data['devel'] = state_data['factory']
except:
if state_data['factory'] is not None:
state_data['devel'] = state_data['factory']
return state_data
def check_repo_state(self, flats, branch_state, branch):
state_data = self.reconstruct_state(flats)
logging.debug(f"state data: {state_data}")
left_to_commit = []
for flat in reversed(flats):
found_state = False
for branch in ["factory"]:
if flat.commit.dbid == state_data.get(branch):
branch_state[branch] = flat.commit
flat.commit.git_commit = self.git.branch_head(branch)
@@ -60,19 +122,24 @@ class GitExporter:
left_to_commit.append(flat)
return left_to_commit
def devel_rev(self, tree):
for flat in tree:
if flat.branch == "devel":
return flat
return None
def export_as_git(self):
if os.getenv("CHECK_ALL_LFS"):
LFSOid.check_all(self.db, self.package)
tree = TreeBuilder(self.db).build(self.project, self.package)
flats = tree.as_flat_list()
added_commits = False
branch_state = {"factory": None, "devel": None}
left_to_commit = self.check_repo_state(flats, branch_state)
if not left_to_commit:
if tree == None: # eg. python-M2Crypto errors
Review

??? As a maintainer of M2Crypto (both upstream and in openSUSE), I am curious about this comment? What’s the problem with it?

??? As a maintainer of M2Crypto (both upstream and in openSUSE), I am curious about this comment? What’s the problem with it?
Review

thats when the ssl connection terminates, because the server is overloaded. no issue in m2crypto.

thats when the ssl connection terminates, because the server is overloaded. no issue in m2crypto.
return
logging.info(f"Commiting into {self.git.path}")
flats = tree.as_flat_list()
branch_state = {"factory": None, "devel": None}
left_to_commit = self.check_repo_state(flats, branch_state, "factory")
logging.info(f"Commiting into {self.git.path} {len(left_to_commit)} into factory")
self.run_gc()
users = dict()
@@ -85,13 +152,40 @@ class GitExporter:
self.run_gc()
logging.debug(f"Committing {flat}")
self.commit_flat(flat, branch_state)
added_commits = True
# export the devel_tree head commits based on the devel branch
if self.project == "openSUSE:Factory":
devel_head = self.devel_rev(flats)
flat_devel = None
if devel_head is not None:
logging.debug(f"building devel revisions chain for {devel_head.commit.project} / {self.package}")
flat_devel = TreeBuilder(self.db).revisions_chain(devel_head.commit.project, self.package).as_flat_list()
for f in flat_devel:
f.branch = "devel"
if flat_devel is not None:
left_to_commit = self.check_repo_state(flat_devel, branch_state, "devel")
logging.debug(branch_state)
logging.debug(f"appending {len(left_to_commit)} items on top of devel")
for flat in left_to_commit:
if flat.commit.userid not in users:
users[flat.commit.userid] = User.find(self.db, flat.commit.userid)
flat.user = users[flat.commit.userid]
self.gc_cnt -= 1
if self.gc_cnt <= 0 and self.gc_interval:
self.run_gc()
logging.debug(f"Committing {flat}")
self.commit_flat(flat, branch_state)
added_commits = True
# make sure that we create devel branch
if not branch_state["devel"]:
logging.debug("force creating devel")
self.git.set_branch_head("devel", self.git.branch_head("factory"))
self.git.push(force=True)
#if added_commits:
# self.git.push(force=True)
def run_gc(self):
self.gc_cnt = self.gc_interval
@@ -103,6 +197,10 @@ class GitExporter:
return not self.proxy_sha256.is_text(package, filename)
def commit_file(self, flat, file, size, md5):
# don't export imported _service: files, if any
if file.name[0:9] == '_service:':
return
# have such files been detected as text mimetype before?
if self.is_lfs_file(flat.commit.package, file.name, size):
file_sha256 = self.proxy_sha256.get_or_put(
@@ -172,10 +270,3 @@ class GitExporter:
)
flat.commit.git_commit = commit
branch_state[flat.branch] = flat.commit
with open(self.state_file, "w") as f:
data = {}
for branch in ["factory", "devel"]:
commit = branch_state[branch]
if commit:
data[branch] = commit.dbid
yaml.dump(data, f)

View File

@@ -42,6 +42,8 @@ class Importer:
def update_db_package(self, project, package):
root = self.obs._history(project, package)
if root is None:
if self.project == "openSUSE:Factory" and project == self.project:
exit(10)
return
latest = DBRevision.max_rev(self.db, project, package)
for r in root.findall("revision"):
@@ -217,7 +219,7 @@ class Importer:
logging.debug(f"Refresh {project}/{package}")
self.refreshed_packages.add(key)
if self.has_scmsync(project) or self.has_scmsync(key):
self.packages_with_scmsync.add(package)
self.packages_with_scmsync.add((project, package))
logging.debug(f"{project}/{package} already in Git - skipping")
return
self.update_db_package(project, package)
@@ -268,15 +270,12 @@ class Importer:
return self.scmsync_cache[key]
root = self.obs._meta(key)
scmsync = None
scmsync_exists = False
if root and root.find('scmsync') is not None:
scmsync = root.find('scmsync').text
if scmsync:
scmsync_exists = scmsync.startswith('https://src.opensuse.org/pool/')
if root is not None:
scmsync_exists = root.find('scmsync') is not None
self.scmsync_cache[key] = scmsync_exists
return scmsync_exists
def package_with_scmsync(self, package):
return package in self.packages_with_scmsync
def package_with_scmsync(self, project, package):
return (project, package) in self.packages_with_scmsync

View File

@@ -148,12 +148,21 @@ class OBS:
]
def _download(self, project, package, name, revision):
try:
url = osc.core.makeurl(
self.url,
["source", project, package, name],
{"rev": revision, "expand": 1},
)
return osc.core.http_GET(url)
except HTTPError as e:
if e.status == 404:
url = osc.core.makeurl(
self.url,
["source", project, package, name],
{"rev": revision, "expand": 1, "deleted": 1},
)
return osc.core.http_GET(url)
def download(
self,
@@ -189,7 +198,7 @@ class OBS:
try:
root = self._xml(f"source/{project}/{package}", **params)
except HTTPError as e:
if e.code == 400:
if e.code == 400 or e.code == 404:
logging.error(
f"Package [{project}/{package} {params}] can't be expanded: {e}"
)

View File

@@ -50,12 +50,12 @@ class ProxySHA256:
sha = hashlib.sha256()
while True:
buffer = fin.read(10000)
if not buffer:
break
sha.update(buffer)
# only guess from the first 10K
if not mimetype:
mimetype = self.mime.from_buffer(buffer)
if not buffer:
break
sha.update(buffer)
fin.close()
LFSOid(self.db).add(
project, package, name, revision, sha.hexdigest(), size, mimetype, file_md5

View File

@@ -216,6 +216,8 @@ class TreeBuilder:
def build(self, project, package):
"""Create a Factory tree (returning the top)"""
factory_revisions = self.revisions_chain(project, package)
if factory_revisions == None:
return None
self.add_merge_points(factory_revisions)
# factory_revisions.print()
self.prune_loose_end(factory_revisions)