28 Commits

Author SHA1 Message Date
5a28f62fb9 reconstruct state data
If the state file is missing, we can reconstruct which parts were
exported based on revision ids

Also, packages could have branches in Git, but not be in Git. We
need to check (project,package) tuple for this and not just abort
based on the package name alone.
2025-08-09 18:09:35 +02:00
17888407df Include all the devel commits
If commits in devel are younger than factory, we should
probably export them too....
2025-08-09 18:09:35 +02:00
d5eb5c0db6 Don't export service files from overlay 2025-08-09 18:09:35 +02:00
5a6a55868f Look at deleted packages if they are not found 2025-08-09 18:09:35 +02:00
5d55a8c2fe Empty files should still have a mimetype assigned 2025-08-09 18:09:35 +02:00
32d1924a0d Ignore OBS service generated files
_service: files should not be imported. They are created
by OBS service and will be re-created by OBS during scm sync

Also, not allowed in Factory by policy anyway. So this affects
devel and home projects only
2025-08-09 18:09:35 +02:00
7684388193 re-enable LFS checks 2025-08-09 18:09:35 +02:00
4ef980d1c8 fix bug 2025-02-21 17:24:23 +01:00
fb80d0c105 Revert "Only stop importing when it isn't a jengelh repository"
This reverts commit 44b4d690db.

this breaks detection of scmsync projects. Jan can live with not synced
git for a few weeks
2025-02-21 13:24:40 +01:00
Dirk Müller
44b4d690db Only stop importing when it isn't a jengelh repository 2024-12-02 09:34:49 +01:00
Dirk Müller
a69e861614 Switch the operating organization on the "pool" 2024-12-02 09:33:52 +01:00
Dirk Müller
1da740bd8b Strip multibuild flavors from monitoring 2024-09-09 09:34:12 +02:00
Dirk Mueller
b3107ba3bf Merge pull request 'Stop importing/exporting scmsync packages/projects' (#32) from adamm/git-importer:option_for_non_factory into main
Reviewed-on: importers/git-importer#32
Reviewed-by: Dirk Mueller <dirkmueller@noreply@src.opensuse.org>
2024-09-03 12:40:00 +02:00
86f82325d8 Stop importing/exporting scmsync packages/projects
Also, allow other-than Factory projects
2024-08-08 10:35:53 +02:00
Dirk Mueller
39ba616226 Merge pull request 'Add ability to specify non-Factory' (#31) from adamm/git-importer:option_for_non_factory into main
Reviewed-on: importers/git-importer#31
Reviewed-by: Dirk Mueller <dirkmueller@noreply@src.opensuse.org>
2024-08-07 18:27:11 +02:00
531dbc7c1b Add ability to specify non-Factory
This is important for devel-project only imports
non-factory is still blocked by assert
2024-08-07 16:55:05 +02:00
Dirk Müller
1318f9e0c4 Remove devel branch import
this for yet undefined reason screws up systemd history import
2024-08-07 09:47:54 +02:00
Dirk Müller
d563076d9e add explicit conversion to string to fix the concatenation 2024-08-07 09:47:18 +02:00
b11b3f1adb Add and remove literal files
pathspec in git has special characters that we should not trigger.
Treat every filespec as literal
2024-08-01 16:53:46 +02:00
Dirk Müller
479738d4b2 ruff format run 2024-07-10 10:34:20 +02:00
Adam Majer
2d04136ca5 Make sure we create devel branch, when no diff to Factory 2024-06-13 15:36:59 +02:00
Adam Majer
40ad64ddff Ignore .osc directory 2024-06-10 18:13:51 +02:00
Adam Majer
6bd5d72100 New branch is empty
New branches must be born empty
2024-06-10 17:06:15 +02:00
Dirk Müller
022ae5ab58 remember failed tasks in a separate directory 2024-06-10 17:04:43 +02:00
Dirk Müller
2ff8ed76d0 Reconnect to the AMQP bus when the connection breaks down 2024-06-10 17:04:25 +02:00
Dirk Müller
5f228dc046 enable robust push 2024-05-17 21:47:35 +02:00
Dirk Müller
4e07d8272e don't loop over failed packages 2024-05-17 21:47:15 +02:00
Dirk Müller
2a3475ab6e Create with sha256 enabled 2024-05-17 20:39:55 +02:00
11 changed files with 289 additions and 111 deletions

View File

@@ -42,8 +42,8 @@ PROJECTS = [
]
def export_package(package, repodir, cachedir, gc):
exporter = GitExporter(URL_OBS, "openSUSE:Factory", package, repodir, cachedir)
def export_package(project, package, repodir, cachedir, gc):
exporter = GitExporter(URL_OBS, project, package, repodir, cachedir)
exporter.set_gc_interval(gc)
exporter.export_as_git()
@@ -51,6 +51,12 @@ def export_package(package, repodir, cachedir, gc):
def main():
parser = argparse.ArgumentParser(description="OBS history importer into git")
parser.add_argument("packages", help="OBS package names", nargs="*")
parser.add_argument(
"-p",
"--project",
default="openSUSE:Factory",
help="Project to import/export, default is openSUSE:Factory",
)
parser.add_argument(
"-r",
"--repodir",
@@ -110,10 +116,13 @@ def main():
if not args.cachedir:
args.cachedir = pathlib.Path("~/.cache/git-import/").expanduser()
importer = Importer(URL_OBS, "openSUSE:Factory", args.packages)
importer = Importer(URL_OBS, args.project, args.packages)
importer.import_into_db()
for package in args.packages:
export_package(package, args.repodir, args.cachedir, args.gc)
if not importer.package_with_scmsync(args.project, package):
export_package(args.project, package, args.repodir, args.cachedir, args.gc)
else:
logging.debug(f"{args.project}/{package} has scmsync links - skipping export")
if __name__ == "__main__":

View File

@@ -204,6 +204,11 @@ class DBRevision:
and self.package == "_project"
):
continue
# do not import _service:* files as those are created by OBS on source imports
if entry.get("name")[0:9] == "_service:":
continue
cur.execute(
"""INSERT INTO files (name, md5, size, mtime, revision_id)
VALUES (%s,%s,%s,%s,%s)""",

View File

@@ -20,7 +20,7 @@ class FlatTreeWalker(AbstractWalker):
def __init__(self, rebase_devel=False) -> None:
super().__init__()
self.flats = []
self.flats:list[FlatNode] = []
# the rebase_devel won't work as such as rebasing the branch needs an explicit action
self.rebase_devel = rebase_devel
# remember the last merge point so we can know the parent of it for the root of the sources

View File

@@ -48,6 +48,7 @@ class Git:
def open(self):
if not self.exists():
self.git_run(["init", "--object-format=sha256", "-b", "factory"])
self.git_run(["config", "lfs.allowincompletepush", "true"])
def is_dirty(self):
"""Check if there is something to commit"""
@@ -85,7 +86,7 @@ class Git:
"""Checkout into the branch HEAD"""
new_branch = False
if branch not in self.branches():
self.git_run(["branch", "-q", branch, "HEAD"])
self.git_run(["switch", "-q", "--orphan", branch])
new_branch = True
else:
ref = f"refs/heads/{branch}"
@@ -159,6 +160,12 @@ class Git:
.strip()
)
def branch_commit(self, branch="HEAD"):
try:
return (self.git_run(["cat-file", "commit", branch], stdout=subprocess.PIPE).stdout.decode("utf-8").strip())
except:
return ''
def set_branch_head(self, branch, commit):
return self.git_run(["update-ref", f"refs/heads/{branch}", commit])
@@ -180,7 +187,13 @@ class Git:
# logging.warning(f"Error removing file {path}: {e}")
def add(self, filename):
self.git_run(["add", filename])
self.git_run(["add", ":(literal)" + str(filename)])
def add_default_gitignore(self):
if not (self.path / ".gitignore").exists():
with (self.path / ".gitignore").open("w") as f:
f.write(".osc\n")
self.add(".gitignore")
def add_default_lfs_gitattributes(self, force=False):
if not (self.path / ".gitattributes").exists() or force:
@@ -235,7 +248,7 @@ class Git:
def remove(self, file: pathlib.Path):
self.git_run(
["rm", "-q", "-f", "--ignore-unmatch", file.name],
["rm", "-q", "-f", "--ignore-unmatch", ":(literal)" + file.name],
)
patterns = self.get_specific_lfs_gitattributes()
if file.name in patterns:
@@ -244,7 +257,7 @@ class Git:
def add_gitea_remote(self, package):
repo_name = package.replace("+", "_")
org_name = "rpm"
org_name = "pool"
if not os.getenv("GITEA_TOKEN"):
logging.warning("Not adding a remote due to missing $GITEA_TOKEN")
@@ -253,7 +266,7 @@ class Git:
url = f"https://src.opensuse.org/api/v1/org/{org_name}/repos"
response = requests.post(
url,
data={"name": repo_name},
data={"name": repo_name, "object_format_name": "sha256"},
headers={"Authorization": f"token {os.getenv('GITEA_TOKEN')}"},
timeout=10,
)
@@ -271,13 +284,11 @@ class Git:
["remote"],
stdout=subprocess.PIPE,
).stdout.decode("utf-8"):
logger.warning("Not pushing to remote because no 'origin' configured")
logging.warning("Not pushing to remote because no 'origin' configured")
return
cmd = ["push"]
if force:
cmd.append("-f")
cmd.append("origin")
cmd.append("refs/heads/factory")
cmd.append("refs/heads/devel")
cmd += ["origin", "--all"]
self.git_run(cmd)

View File

@@ -1,10 +1,12 @@
import logging
import os
from urllib.parse import parse_qs
import psycopg
from urllib3.util import url
import yaml
from hashlib import md5
from pathlib import Path
from lib import db
from lib.binary import is_binary_or_large
from lib.db import DB
from lib.git import Git
@@ -14,6 +16,12 @@ from lib.proxy_sha256 import ProxySHA256
from lib.tree_builder import TreeBuilder
from lib.user import User
def is_number(s):
try:
float(s)
return True
except ValueError:
return False
class GitExporter:
def __init__(self, api_url, project, package, repodir, cachedir):
@@ -39,42 +47,107 @@ class GitExporter:
def set_gc_interval(self, gc):
self.gc_interval = gc
def check_repo_state(self, flats, branch_state):
def reconstruct_state(self, flats):
state_data = dict()
prefix = "OBS-URL: "
for line in self.git.branch_commit("factory").splitlines():
if line.startswith(prefix):
u = url.parse_url(line.strip(prefix))
if u.path != f"/package/show/openSUSE:Factory/{self.package}" or "rev=" not in u.query:
continue
v = parse_qs(u.query)
rev = v['rev'][0]
with self.db.cursor() as cur:
try:
if is_number(rev):
cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND rev=%s", ('openSUSE:Factory', self.package, rev,))
else:
cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND expanded_srcmd5=%s", ('openSUSE:Factory', self.package, rev, rev))
except psycopg.Error as e:
logging.error(e)
self.db.conn.rollback()
row = cur.fetchone()
if not row:
return state_data
state_data['factory'] = row[0]
try:
print("devel reconstruct")
d = self.devel_rev(flats)
if d is not None:
prj = d.commit.project
for line in self.git.branch_commit("devel").splitlines():
if line.startswith(prefix):
u = url.parse_url(line.strip(prefix))
if u.path != f"/package/show/{prj}/{self.package}" or u.query is None or "rev=" not in u.query:
continue
v = parse_qs(u.query)
rev = v['rev'][0]
try:
with self.db.cursor() as cur:
logging.debug(f"finding id for ({prj, self.package, rev}")
if is_number(rev):
cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND rev=%s", (prj, self.package, rev,))
else:
cur.execute("SELECT id FROM revisions WHERE project=%s AND package=%s AND expanded_srcmd5=%s", (prj, self.package, rev,))
row = cur.fetchone()
if not row:
logging.info(" ** cannot find revision for devel branch:", rev)
return state_data
state_data['devel'] = row[0]
except psycopg.Error as e:
logging.error(e)
self.db.conn.rollback()
if state_data['factory'] is not None:
state_data['devel'] = state_data['factory']
except:
if state_data['factory'] is not None:
state_data['devel'] = state_data['factory']
return state_data
def check_repo_state(self, flats, branch_state, branch):
state_data = dict()
if os.path.exists(self.state_file):
with open(self.state_file) as f:
state_data = yaml.safe_load(f)
if not isinstance(state_data, dict):
state_data = {}
else:
state_data = self.reconstruct_state(flats)
logging.debug(f"state data: {state_data}")
left_to_commit = []
for flat in reversed(flats):
found_state = False
for branch in ["factory", "devel"]:
if flat.commit.dbid == state_data.get(branch):
branch_state[branch] = flat.commit
flat.commit.git_commit = self.git.branch_head(branch)
logging.debug(
f"Found {self.git.path}'s {branch} branch in state {flat}"
)
left_to_commit = []
found_state = True
if flat.commit.dbid == state_data.get(branch):
branch_state[branch] = flat.commit
flat.commit.git_commit = self.git.branch_head(branch)
logging.debug(
f"Found {self.git.path}'s {branch} branch in state {flat}"
)
left_to_commit = []
found_state = True
if not found_state:
left_to_commit.append(flat)
return left_to_commit
def devel_rev(self, tree):
for flat in tree:
if flat.branch == "devel":
return flat
return None
def export_as_git(self):
if os.getenv("CHECK_ALL_LFS"):
LFSOid.check_all(self.db, self.package)
tree = TreeBuilder(self.db).build(self.project, self.package)
flats = tree.as_flat_list()
added_commits = False
branch_state = {"factory": None, "devel": None}
left_to_commit = self.check_repo_state(flats, branch_state)
if not left_to_commit:
if tree == None: # eg. python-M2Crypto errors
return
logging.info(f"Commiting into {self.git.path}")
flats = tree.as_flat_list()
branch_state = {"factory": None, "devel": None}
left_to_commit = self.check_repo_state(flats, branch_state, "factory")
logging.info(f"Commiting into {self.git.path} {len(left_to_commit)} into factory")
self.run_gc()
users = dict()
@@ -87,8 +160,40 @@ class GitExporter:
self.run_gc()
logging.debug(f"Committing {flat}")
self.commit_flat(flat, branch_state)
added_commits = True
self.git.push(force=True)
# export the devel_tree head commits based on the devel branch
if self.project == "openSUSE:Factory":
devel_head = self.devel_rev(flats)
flat_devel = None
if devel_head is not None:
logging.debug(f"building devel revisions chain for {devel_head.commit.project} / {self.package}")
flat_devel = TreeBuilder(self.db).revisions_chain(devel_head.commit.project, self.package).as_flat_list()
for f in flat_devel:
f.branch = "devel"
if flat_devel is not None:
left_to_commit = self.check_repo_state(flat_devel, branch_state, "devel")
logging.debug(branch_state)
logging.debug(f"appending {len(left_to_commit)} items on top of devel")
for flat in left_to_commit:
if flat.commit.userid not in users:
users[flat.commit.userid] = User.find(self.db, flat.commit.userid)
flat.user = users[flat.commit.userid]
self.gc_cnt -= 1
if self.gc_cnt <= 0 and self.gc_interval:
self.run_gc()
logging.debug(f"Committing {flat}")
self.commit_flat(flat, branch_state)
added_commits = True
# make sure that we create devel branch
if not branch_state["devel"]:
logging.debug("force creating devel")
self.git.set_branch_head("devel", self.git.branch_head("factory"))
#if added_commits:
# self.git.push(force=True)
def run_gc(self):
self.gc_cnt = self.gc_interval
@@ -100,6 +205,10 @@ class GitExporter:
return not self.proxy_sha256.is_text(package, filename)
def commit_file(self, flat, file, size, md5):
# don't export imported _service: files, if any
if file.name[0:9] == '_service:':
return
# have such files been detected as text mimetype before?
if self.is_lfs_file(flat.commit.package, file.name, size):
file_sha256 = self.proxy_sha256.get_or_put(
@@ -135,12 +244,6 @@ class GitExporter:
return True
return flat.parent1 == branch_state[flat.branch]
def file_md5(self, file):
m = md5()
with open(file, 'rb') as f:
m.update(f.read())
return m.hexdigest()
def commit_flat(self, flat, branch_state):
parents = []
self.git.checkout(flat.branch)
@@ -158,41 +261,13 @@ class GitExporter:
# create file if not existant
self.git.add_default_lfs_gitattributes(force=False)
self.git.add_default_gitignore()
new_files = flat.commit.files_list()
cur_files = os.listdir(self.git.path)
for cf in cur_files:
if cf[0] == '.':
continue
found = False
for nf in new_files:
if nf['name'] == cf:
found = True
break
if found:
# check if file is modified
file_path = self.git.path.joinpath(cf)
stat = file_path.stat()
if stat.st_size != nf['size'] or self.file_md5(file_path) != nf['md5']:
logging.debug(f"updating {file_path.name}")
self.commit_file(flat, Path(cf), nf['size'], nf['md5'])
else:
logging.debug(f"leaving {file_path.name}")
else:
# file not exist in new commit
self.git.remove(Path(cf))
# new files?
for file in new_files:
found = False
for cf in cur_files:
if file['name'] == cf:
found = True
break
if not found:
self.commit_file(flat, Path(file['name']), file['size'], file['md5'])
to_download, to_delete = flat.commit.calc_delta(branch_state[flat.branch])
for file in to_delete:
self.git.remove(file)
for file, size, md5 in to_download:
self.commit_file(flat, file, size, md5)
commit = self.git.commit(
flat.user.realname,

View File

@@ -26,19 +26,24 @@ class Importer:
# Import multiple Factory packages into the database
self.packages = packages
self.project = project
self.scmsync_cache = dict()
self.packages_with_scmsync = set()
self.db = DB()
self.obs = OBS(api_url)
assert project == "openSUSE:Factory"
assert not self.has_scmsync(project)
self.refreshed_packages = set()
self.gone_packages_set = None
def import_request(self, number):
self.obs.request(number).import_into_db(self.db)
def update_db_package(self, project, package):
root = self.obs._history(project, package)
if root is None:
if self.project == "openSUSE:Factory" and project == self.project:
exit(10)
return
latest = DBRevision.max_rev(self.db, project, package)
for r in root.findall("revision"):
@@ -213,6 +218,10 @@ class Importer:
return
logging.debug(f"Refresh {project}/{package}")
self.refreshed_packages.add(key)
if self.has_scmsync(project) or self.has_scmsync(key):
self.packages_with_scmsync.add((project, package))
logging.debug(f"{project}/{package} already in Git - skipping")
return
self.update_db_package(project, package)
self.fetch_all_linked_packages(project, package)
@@ -255,3 +264,18 @@ class Importer:
for line in f.readlines():
self.gone_packages_set.add(line.strip())
return key in self.gone_packages_set
def has_scmsync(self, key):
if key in self.scmsync_cache:
return self.scmsync_cache[key]
root = self.obs._meta(key)
scmsync_exists = False
if root is not None:
scmsync_exists = root.find('scmsync') is not None
self.scmsync_cache[key] = scmsync_exists
return scmsync_exists
def package_with_scmsync(self, project, package):
return (project, package) in self.packages_with_scmsync

View File

@@ -73,11 +73,11 @@ class OBS:
logging.debug(f"GET {url}")
return ET.parse(osc.core.http_GET(url)).getroot()
def _meta(self, project, package, **params):
def _meta(self, key, **params):
try:
root = self._xml(f"source/{project}/{package}/_meta", **params)
root = self._xml(f"source/{key}/_meta", **params)
except HTTPError:
logging.error(f"Package [{project}/{package} {params}] has no meta")
logging.error(f"Project/Package [{key} {params}] has no meta")
return None
return root
@@ -118,13 +118,13 @@ class OBS:
return root
def exists(self, project, package):
root = self._meta(project, package)
root = self._meta(f"{project}/{package}")
if root is None:
return False
return root.get("project") == project
def devel_project(self, project, package):
root = self._meta(project, package)
root = self._meta(f"{project}/{package}")
devel = root.find("devel")
if devel is None:
return None
@@ -148,12 +148,21 @@ class OBS:
]
def _download(self, project, package, name, revision):
url = osc.core.makeurl(
self.url,
["source", project, package, name],
{"rev": revision, "expand": 1},
)
return osc.core.http_GET(url)
try:
url = osc.core.makeurl(
self.url,
["source", project, package, name],
{"rev": revision, "expand": 1},
)
return osc.core.http_GET(url)
except HTTPError as e:
if e.status == 404:
url = osc.core.makeurl(
self.url,
["source", project, package, name],
{"rev": revision, "expand": 1, "deleted": 1},
)
return osc.core.http_GET(url)
def download(
self,
@@ -189,7 +198,7 @@ class OBS:
try:
root = self._xml(f"source/{project}/{package}", **params)
except HTTPError as e:
if e.code == 400:
if e.code == 400 or e.code == 404:
logging.error(
f"Package [{project}/{package} {params}] can't be expanded: {e}"
)

View File

@@ -50,12 +50,12 @@ class ProxySHA256:
sha = hashlib.sha256()
while True:
buffer = fin.read(10000)
if not buffer:
break
sha.update(buffer)
# only guess from the first 10K
if not mimetype:
mimetype = self.mime.from_buffer(buffer)
if not buffer:
break
sha.update(buffer)
fin.close()
LFSOid(self.db).add(
project, package, name, revision, sha.hexdigest(), size, mimetype, file_md5

View File

@@ -216,6 +216,8 @@ class TreeBuilder:
def build(self, project, package):
"""Create a Factory tree (returning the top)"""
factory_revisions = self.revisions_chain(project, package)
if factory_revisions == None:
return None
self.add_merge_points(factory_revisions)
# factory_revisions.print()
self.prune_loose_end(factory_revisions)

View File

@@ -2,36 +2,60 @@
import json
from pathlib import Path
import pika
import sys
import random
import time
MY_TASKS_DIR = Path(__file__).parent / "tasks"
connection = pika.BlockingConnection(pika.URLParameters("amqps://opensuse:opensuse@rabbit.opensuse.org"))
channel = connection.channel()
channel.exchange_declare(exchange='pubsub', exchange_type='topic', passive=True, durable=True)
def listen_events():
connection = pika.BlockingConnection(
pika.URLParameters("amqps://opensuse:opensuse@rabbit.opensuse.org")
)
channel = connection.channel()
result = channel.queue_declare("", exclusive=True)
queue_name = result.method.queue
channel.exchange_declare(
exchange="pubsub", exchange_type="topic", passive=True, durable=False
)
channel.queue_bind(exchange='pubsub',
queue=queue_name,routing_key='#')
result = channel.queue_declare("", exclusive=True)
queue_name = result.method.queue
print(' [*] Waiting for logs. To exit press CTRL+C')
channel.queue_bind(
exchange="pubsub", queue=queue_name, routing_key="opensuse.obs.package.commit"
)
def callback(ch, method, properties, body):
if method.routing_key not in ("opensuse.obs.package.commit",):
return
body = json.loads(body)
if 'project' in body and 'package' in body and body['project'] == 'openSUSE:Factory':
if '/' in body['package']:
print(" [*] Waiting for logs. To exit press CTRL+C")
def callback(ch, method, properties, body):
if method.routing_key not in ("opensuse.obs.package.commit",):
return
body = json.loads(body)
if (
"project" in body
and "package" in body
and body["project"] == "openSUSE:Factory"
):
# Strip multibuild flavors
package = body["package"].partition(':')[0]
if "/" in package:
return
(MY_TASKS_DIR / body['package']).touch()
print(" [x] %r:%r" % (method.routing_key, body['package']))
(MY_TASKS_DIR / package).touch()
print(" [x] %r:%r" % (method.routing_key, body["package"]))
channel.basic_consume(queue_name,
callback,
auto_ack=True)
channel.basic_consume(queue_name, callback, auto_ack=True)
channel.start_consuming()
channel.start_consuming()
def main():
while True:
try:
listen_events()
except (pika.exceptions.ConnectionClosed, pika.exceptions.AMQPHeartbeatTimeout):
time.sleep(random.randint(10, 100))
if __name__ == "__main__":
main()

19
update-tasks.sh Executable file
View File

@@ -0,0 +1,19 @@
#!/bin/bash
#
cd /space/dmueller/git-importer
source credentials.sh
while true; do
for i in $PWD/tasks/*; do
if test -f "$i"; then
echo "$(date): Importing $(basename $i)"
if ! python3 ./git-importer.py -c repos/.cache $(basename $i); then
mkdir -p $PWD/failed-tasks
mv -f $i $PWD/failed-tasks
fi
rm -f $i
fi
done
inotifywait -q -e create $PWD/tasks
done