12 Commits

Author SHA1 Message Date
5da7861c2a Switch to sha-256 git repo and use git tools again 2024-04-09 11:40:26 +02:00
c9e07e536f Try to fetch the element as deleted if initial access failed
The referenced object might already be deleted by the time the request fails,
and setting deleted=0 is rejected by the API.
So retry with deleted=1 if and only if the previous access failed.
2023-12-07 18:30:36 +01:00
dc0f33354e Failing to LFS register should abort the import 2023-12-07 18:29:56 +01:00
56cbe0a125 Avoid multi-threading races on import
There seem to be races when using DB cursors from multiple threads, as
revealed by import issues after switching to a newer computer that has
performance and energy-efficient cores.

As this is not particularly performance critical, convert to single-threaded
use, which makes it work again (see the sketch after the commit list).
2023-11-28 23:36:44 +01:00
4353f015c8 Switch to localhost:9999 which is provided via a ssh tunnel
The port is no longer directly exposed, so we need to tunnel it over SSH.
2023-11-22 14:39:55 +01:00
9cbe0899bc Remove unused import 2023-06-19 13:19:52 +02:00
9e80a64fe0 Change hostname references from gitea.opensuse.org to src.opensuse.org 2023-06-19 10:59:56 +02:00
12001b1640 Commit local changes 2023-04-18 22:31:38 +02:00
3797ea178a Merge pull request 'Add a list of packages no longer existing' (#22) from add_gone into main
Reviewed-on: https://gitea.opensuse.org/importers/git-importer/pulls/22
2023-02-09 10:23:35 +01:00
999dcabcfa Add a list of packages no longer existing
I made this a file rather than an automatically maintained DB because, for now,
adding an entry there should be done manually - OBS being OBS, packages might
look gone for a brief moment and reappear the day after.
2022-12-02 11:00:31 +01:00
9962673eff Merge pull request 'Add force push for the devel branch' (#21) from add_force into main
Reviewed-on: https://gitea.opensuse.org/importers/git-importer/pulls/21
2022-12-02 09:35:40 +01:00
7b20c03256 Add force push for the devel branch
As devel branches can change in case of Factory reverts, we need to force
push. The factory branch shouldn't be affected, so we don't force push there.
2022-12-02 09:12:11 +01:00
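
Commit 56cbe0a125 replaces the importer's thread pool with plain loops, since the commit message attributes the races to DB cursors being driven from multiple threads over one shared connection. A minimal before/after sketch with illustrative names (an importer object exposing packages, project and refresh_package()), not the exact code from the diff below:

import concurrent.futures

def racy_import(importer):
    # before: eight workers all drive the importer's single DB connection
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(importer.refresh_package, importer.project, package)
            for package in importer.packages
        ]
        concurrent.futures.wait(futures)

def sequential_import(importer):
    # after: not performance critical, so do the same work in one thread
    for package in importer.packages:
        importer.refresh_package(importer.project, package)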
8 changed files with 1551 additions and 116 deletions

gone-packages.txt (new file, 1355 additions)

File diff suppressed because it is too large

View File

@@ -4,7 +4,6 @@ import os
import pathlib
import subprocess
import pygit2
import requests
from lib.binary import BINARY
@@ -20,11 +19,6 @@ class Git:
self.committer = committer
self.committer_email = committer_email
self.repo = None
def is_open(self):
return self.repo is not None
def exists(self):
"""Check if the path is a valid git repository"""
return (self.path / ".git").exists()
@@ -35,35 +29,60 @@ class Git:
self.open()
def open(self):
# Convert the path to string, to avoid some limitations in
# older pygit2
self.repo = pygit2.init_repository(str(self.path))
subprocess.run(
['git', 'init', '--object-format=sha256', '-b', 'factory'],
cwd=self.path,
check=True,
)
def is_dirty(self):
"""Check if there is something to commit"""
assert self.is_open()
return self.repo.status()
status_str = subprocess.run(
['git', 'status', '--porcelain=2'],
cwd=self.path,
stdout=subprocess.PIPE,
check=True
).stdout.decode('utf-8')
return len(list(filter(None, status_str.split('\n')))) > 0
def branches(self):
return list(self.repo.branches)
br = subprocess.run(
['git', 'for-each-ref', '--format=%(refname:short)', 'refs/heads/'],
cwd=self.path,
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').split()
if len(br) == 0:
br.append('factory') # unborn branch?
return br
def branch(self, branch, commit=None):
if not commit:
commit = self.repo.head
else:
commit = self.repo.get(commit)
self.repo.branches.local.create(branch, commit)
def branch(self, branch, commit='HEAD'):
commit = subprocess.run(
['git', 'rev-parse', '--verify', '--end-of-options', commit + '^{commit}'],
cwd=self.path,
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').strip()
return subprocess.run(['git', 'branch', branch, commit], cwd=self.path, check=True)
def checkout(self, branch):
"""Checkout into the branch HEAD"""
new_branch = False
ref = f"refs/heads/{branch}"
if branch not in self.branches():
self.repo.references["HEAD"].set_target(ref)
subprocess.run(
['git', 'branch', '-q', branch, 'HEAD'],
cwd=self.path,
check=True
)
new_branch = True
else:
self.repo.checkout(ref)
ref = f"refs/heads/{branch}"
if (self.path/'.git'/ref).exists():
subprocess.run(
['git', 'checkout', '-q', branch],
cwd=self.path,
check=True
)
return new_branch
def commit(
@@ -87,30 +106,62 @@ class Git:
committer_time = committer_time if committer_time else user_time
if self.is_dirty():
self.repo.index.add_all()
subprocess.run(
["git", "add", "--all", "."],
cwd=self.path,
check=True,
)
self.repo.index.write()
author = pygit2.Signature(user, user_email, int(user_time.timestamp()))
committer = pygit2.Signature(
committer, committer_email, int(committer_time.timestamp())
tree_id = subprocess.run(
['git', 'write-tree'],
cwd=self.path,
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').strip()
parent_array = []
if isinstance(parents, list):
for parent in filter(None, parents):
parent_array = parent_array + ['-p', parent]
elif isinstance(parents, str):
parent_array = ['-p', parents]
commit_id = subprocess.run(
['git', 'commit-tree'] + parent_array + [tree_id],
cwd=self.path,
env={
"GIT_AUTHOR_NAME": user,
"GIT_AUTHOR_EMAIL": user_email,
"GIT_AUTHOR_DATE": f"{int(user_time.timestamp())} +0000",
"GIT_COMMITTER_NAME": committer,
"GIT_COMMITTER_EMAIL": committer_email,
"GIT_COMMITTER_DATE": f"{int(committer_time.timestamp())} +0000",
},
input=message.encode('utf-8'),
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').rstrip()
subprocess.run(
['git', 'reset', '--soft', commit_id],
cwd=self.path,
check=True,
)
return commit_id
tree = self.repo.index.write_tree()
return self.repo.create_commit(
"HEAD", author, committer, message, tree, parents
)
def last_commit(self):
try:
return self.repo.head.target
except:
return None
def branch_head(self, branch):
return self.repo.references["refs/heads/" + branch].target
def branch_head(self, branch='HEAD'):
return subprocess.run(
['git', 'rev-parse', '--verify', '--end-of-options', branch],
cwd=self.path,
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').strip()
def set_branch_head(self, branch, commit):
self.repo.references["refs/heads/" + branch].set_target(commit)
return subprocess.run(
['git', 'branch', '-f', branch, commit],
cwd=self.path,
check=True,
)
def gc(self):
logging.debug(f"Garbage recollect and repackage {self.path}")
@@ -121,17 +172,21 @@ class Git:
stderr=subprocess.STDOUT,
)
def clean(self):
for path, _ in self.repo.status().items():
logging.debug(f"Cleaning {path}")
try:
(self.path / path).unlink()
self.repo.index.remove(path)
except Exception as e:
logging.warning(f"Error removing file {path}: {e}")
# def clean(self):
# for path, _ in self.repo.status().items():
# logging.debug(f"Cleaning {path}")
# try:
# (self.path / path).unlink()
# self.repo.index.remove(path)
# except Exception as e:
# logging.warning(f"Error removing file {path}: {e}")
def add(self, filename):
self.repo.index.add(filename)
subprocess.run(
['git', 'add', filename],
cwd=self.path,
check=True,
)
def add_default_lfs_gitattributes(self, force=False):
if not (self.path / ".gitattributes").exists() or force:
@@ -185,9 +240,11 @@ class Git:
return any(fnmatch.fnmatch(filename, line) for line in patterns)
def remove(self, file: pathlib.Path):
self.repo.index.remove(file.name)
(self.path / file).unlink()
subprocess.run(
['git', 'rm', '-q', '--ignore-unmatch', file.name],
cwd=self.path,
check=True,
)
patterns = self.get_specific_lfs_gitattributes()
if file.name in patterns:
patterns.remove(file.name)
@@ -201,7 +258,7 @@
logging.warning("Not adding a remote due to missing $GITEA_TOKEN")
return
url = f"https://gitea.opensuse.org/api/v1/org/{org_name}/repos"
url = f"https://src.opensuse.org/api/v1/org/{org_name}/repos"
response = requests.post(
url,
data={"name": repo_name},
@@ -212,16 +269,23 @@
# 201 Created
if response.status_code not in (201, 409):
print(response.text)
url = f"gitea@gitea.opensuse.org:{org_name}/{repo_name}.git"
self.repo.remotes.create("origin", url)
url = f"gitea@src.opensuse.org:{org_name}/{repo_name}.git"
subprocess.run(
['git', 'remote', 'add', 'origin', url],
cwd=self.path,
check=True,
)
def push(self):
remo = self.repo.remotes["origin"]
def push(self, force=False):
cmd = ['git', 'push']
if force:
cmd.append('-f')
cmd.append('origin')
cmd.append('refs/heads/factory')
cmd.append('refs/heads/devel')
subprocess.run(
cmd,
cwd=self.path,
check=True,
)
keypair = pygit2.KeypairFromAgent("gitea")
callbacks = pygit2.RemoteCallbacks(credentials=keypair)
refspecs = ["refs/heads/factory"]
if "refs/heads/devel" in self.repo.references:
refspecs.append("refs/heads/devel")
remo.push(refspecs, callbacks=callbacks)
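
The new Git.commit() above builds commits with git plumbing instead of pygit2: write the staged index as a tree, create the commit object with commit-tree, then move the branch with reset --soft. A self-contained sketch of that flow, with illustrative names and a simplified signature (one identity for author and committer, timestamps as Unix epochs), not the exact method from the diff:

import subprocess

def plumbing_commit(repo_path, message, author, author_email, timestamp, parents=()):
    """Create a commit from whatever is currently staged, without 'git commit'."""
    def git(*args, **kwargs):
        return subprocess.run(["git", *args], cwd=repo_path, check=True,
                              stdout=subprocess.PIPE, **kwargs)

    # 1. Store the index as a tree object and get its id
    tree_id = git("write-tree").stdout.decode("utf-8").strip()

    # 2. Create the commit object; identity and dates are passed via the
    #    environment, mirroring the approach in the diff above
    parent_args = []
    for parent in parents:
        parent_args += ["-p", parent]
    env = {
        "GIT_AUTHOR_NAME": author,
        "GIT_AUTHOR_EMAIL": author_email,
        "GIT_AUTHOR_DATE": f"{timestamp} +0000",
        "GIT_COMMITTER_NAME": author,
        "GIT_COMMITTER_EMAIL": author_email,
        "GIT_COMMITTER_DATE": f"{timestamp} +0000",
    }
    commit_id = git("commit-tree", *parent_args, tree_id,
                    env=env, input=message.encode("utf-8")).stdout.decode("utf-8").strip()

    # 3. Point the current branch at the new commit without touching the worktree
    git("reset", "--soft", commit_id)
    return commit_id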

View File

@@ -86,7 +86,7 @@ class GitExporter:
logging.debug(f"Committing {flat}")
self.commit_flat(flat, branch_state)
self.git.push()
self.git.push(force=True)
def run_gc(self):
self.gc_cnt = self.gc_interval

View File

@@ -1,5 +1,5 @@
import concurrent.futures
import logging
import pathlib
import xml.etree.ElementTree as ET
from lib.db import DB
@@ -31,6 +31,7 @@ class Importer:
self.obs = OBS(api_url)
assert project == "openSUSE:Factory"
self.refreshed_packages = set()
self.gone_packages_set = None
def import_request(self, number):
self.obs.request(number).import_into_db(self.db)
@@ -105,7 +106,7 @@ class Importer:
with self.db.cursor() as cur:
cur.execute(
"""SELECT * FROM revisions WHERE id IN
(SELECT revision_id from linked_revs WHERE linked_id=%s)
(SELECT revision_id from linked_revs WHERE linked_id=%s)
AND commit_time <= %s ORDER BY commit_time""",
(prev.dbid, rev.commit_time),
)
@@ -138,7 +139,7 @@ class Importer:
fake_rev = linked.rev + rev.rev / 1000.0
comment = f"Updating link to change in {rev.project}/{rev.package} revision {int(rev.rev)}"
cur.execute(
"""INSERT INTO revisions (project,package,rev,unexpanded_srcmd5,
"""INSERT INTO revisions (project,package,rev,unexpanded_srcmd5,
commit_time, userid, comment, api_url) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) RETURNING id""",
(
linked.project,
@@ -161,10 +162,12 @@ class Importer:
(rev.dbid, linked.dbid),
)
def revisions_without_files(self):
def revisions_without_files(self, package):
logging.debug(f"revisions_without_files({package})")
with self.db.cursor() as cur:
cur.execute(
"SELECT * FROM revisions WHERE broken=FALSE AND expanded_srcmd5 IS NULL"
"SELECT * FROM revisions WHERE package=%s AND broken=FALSE AND expanded_srcmd5 IS NULL",
(package, )
)
return [DBRevision(self.db, row) for row in cur.fetchall()]
@@ -178,11 +181,11 @@ class Importer:
linked_rev = cur.fetchone()
if linked_rev:
linked_rev = linked_rev[0]
list = self.obs.list(
obs_dir_list = self.obs.list(
rev.project, rev.package, rev.unexpanded_srcmd5, linked_rev
)
if list:
rev.import_dir_list(list)
if obs_dir_list:
rev.import_dir_list(obs_dir_list)
md5 = rev.calculate_files_hash()
with self.db.cursor() as cur:
cur.execute(
@@ -196,53 +199,43 @@ class Importer:
self.find_linked_revs()
self.find_fake_revisions()
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
fs = [
executor.submit(import_rev, self, rev)
for rev in self.revisions_without_files()
]
concurrent.futures.wait(fs)
for package in self.packages:
for rev in self.revisions_without_files(package):
print(f"rev {rev} is without files")
self.import_rev(rev)
def refresh_package(self, project, package):
key = f"{project}/{package}"
if key in self.refreshed_packages:
# refreshing once is good enough
return
if self.package_gone(key):
return
logging.debug(f"Refresh {project}/{package}")
self.refreshed_packages.add(key)
self.update_db_package(project, package)
self.fetch_all_linked_packages(project, package)
def import_into_db(self):
for package in self.packages:
refresh_package(self, self.project, package)
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
fs = [
executor.submit(refresh_package, self, self.project, package)
for package in self.packages
]
concurrent.futures.wait(fs)
self.db.conn.commit()
self.db.conn.commit()
for number in DBRevision.requests_to_fetch(self.db):
self.import_request(number)
fs = [
executor.submit(import_request, self, number)
for number in DBRevision.requests_to_fetch(self.db)
]
concurrent.futures.wait(fs)
self.db.conn.commit()
self.db.conn.commit()
with self.db.cursor() as cur:
cur.execute(
"""SELECT DISTINCT source_project,source_package FROM requests
WHERE id IN (SELECT request_id FROM revisions WHERE project=%s and package = ANY(%s));""",
(self.project, self.packages),
)
for project, package in cur.fetchall():
self.refresh_package(project, package)
with self.db.cursor() as cur:
cur.execute(
"""SELECT DISTINCT source_project,source_package FROM requests
WHERE id IN (SELECT request_id FROM revisions WHERE project=%s and package = ANY(%s));""",
(self.project, self.packages),
)
fs = [
executor.submit(refresh_package, self, project, package)
for project, package in cur.fetchall()
]
concurrent.futures.wait(fs)
self.db.conn.commit()
missing_users = User.missing_users(self.db)
@@ -254,3 +247,11 @@ class Importer:
self.fill_file_lists()
self.db.conn.commit()
def package_gone(self, key):
if not self.gone_packages_set:
self.gone_packages_set = set()
with open(pathlib.Path(__file__).parent.parent / "gone-packages.txt") as f:
for line in f.readlines():
self.gone_packages_set.add(line.strip())
return key in self.gone_packages_set

View File

@@ -68,7 +68,7 @@ class LFSOid:
row = cur.fetchone()
lfs_oid_id = row[0]
cur.execute(
"""INSERT INTO lfs_oid_in_package (package,filename,lfs_oid_id)
"""INSERT INTO lfs_oid_in_package (package,filename,lfs_oid_id)
VALUES (%s,%s,%s)""",
(package, filename, lfs_oid_id),
)
@@ -83,7 +83,7 @@ class LFSOid:
self.register()
def check(self):
url = f"http://gitea.opensuse.org:9999/check/{self.sha256}/{self.size}"
url = f"http://localhost:9999/check/{self.sha256}/{self.size}"
response = requests.get(
url,
timeout=10,
@@ -127,12 +127,13 @@ class LFSOid:
"size": self.size,
}
url = "http://gitea.opensuse.org:9999/register"
url = "http://localhost:9999/register"
response = requests.post(
url,
json=data,
timeout=10,
)
response.raise_for_status()
logging.info(f"Register LFS returned {response.status_code}")
@@ -167,7 +168,7 @@ if __name__ == "__main__":
cur.execute(
"""
CREATE TEMPORARY TABLE lfs_oid_in_revision (
revision_id INTEGER,
revision_id INTEGER,
lfs_oid_id INTEGER NOT NULL,
name VARCHAR(255) NOT NULL
)
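
Commit 4353f015c8 expects the LFS helper contacted above to be reachable on localhost:9999 through an SSH tunnel instead of an exposed port. A hedged sketch of opening such a tunnel from Python; the SSH host name is a placeholder and not taken from this repository, while the remote port 9999 matches the old gitea.opensuse.org:9999 URLs:

import subprocess

def open_lfs_tunnel(ssh_host="src.opensuse.org", port=9999):
    # Roughly: ssh -N -L 9999:localhost:9999 <ssh_host>
    # -N: run no remote command, -L: forward the local port to the remote service
    return subprocess.Popen(
        ["ssh", "-N", "-L", f"{port}:localhost:{port}", ssh_host]
    )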

View File

@@ -148,12 +148,28 @@ class OBS:
]
def _download(self, project, package, name, revision):
# the object might be deleted but we can only pass deleted=1
# if it is actually deleted
deleted = 0
while deleted < 2:
url = osc.core.makeurl(
self.url,
["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1, "deleted": deleted if deleted else ()},
)
try:
osc.core.http_request("HEAD", url)
break
except Exception:
pass
deleted += 1
url = osc.core.makeurl(
self.url,
["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1},
)
return osc.core.http_GET(url)
self.url,
["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1, "deleted": 1 if deleted else ()},
)
return osc.core.http_request("GET", url)
def download(
self,
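
The hunk above interleaves the old single-request _download() with its replacement from commit c9e07e536f. A hedged reconstruction of how the method reads after the change, assuming the osc.core and urllib.parse imports the OBS class already uses: probe with a HEAD request first, and pass deleted=1 only when the plain access failed, since the API rejects deleted=0.

import urllib.parse
import osc.core

# method of the OBS class shown above
def _download(self, project, package, name, revision):
    # the object might be deleted, but deleted=1 may only be passed
    # if it really is deleted
    deleted = 0
    while deleted < 2:
        url = osc.core.makeurl(
            self.url,
            ["source", project, package, urllib.parse.quote(name)],
            {"rev": revision, "expand": 1, "deleted": deleted if deleted else ()},
        )
        try:
            osc.core.http_request("HEAD", url)
            break
        except Exception:
            pass
        deleted += 1
    url = osc.core.makeurl(
        self.url,
        ["source", project, package, urllib.parse.quote(name)],
        {"rev": revision, "expand": 1, "deleted": 1 if deleted else ()},
    )
    return osc.core.http_request("GET", url)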

View File

@@ -7,8 +7,6 @@ except:
print("Install python3-python-magic, not python3-magic")
raise
import requests
from lib.db import DB
from lib.lfs_oid import LFSOid
from lib.obs import OBS

View File

@@ -114,7 +114,7 @@ class TreeBuilder:
candidates.append(node)
if node.merged_into:
# we can't have candidates that are crossing previous merges
# see https://gitea.opensuse.org/importers/git-importer/issues/14
# see https://src.opensuse.org/importers/git-importer/issues/14
candidates = []
node = node.parent
if candidates: