12 Commits

Author SHA1 Message Date
Adam Majer
5da7861c2a Switch to sha-256 git repo and use git tools again 2024-04-09 11:40:26 +02:00
Dirk Müller
c9e07e536f Try to fetch the element as deleted if initial access failed
The reference to the object might already be deleted by the time the
request fails. Also, setting deleted=0 is rejected by the API.
So try with deleted=1 if and only if the previous access failed.
2023-12-07 18:30:36 +01:00
Dirk Müller
dc0f33354e Failing to LFS register should abort the import 2023-12-07 18:29:56 +01:00
Dirk Müller
56cbe0a125 Avoid multi-threading races on import
There seem to be races when using db cursors from multiple threads, as
found via import issues after switching to a newer computer that has
performance and energy-efficient cores.

As this is not particularly performance critical, convert to single
threaded use which makes it work again
2023-11-28 23:36:44 +01:00
Dirk Müller
4353f015c8 Switch to localhost:9999 which is provided via a ssh tunnel
The port is no longer directly exposed, so we need to ssh tunnel it
2023-11-22 14:39:55 +01:00
Dirk Müller
9cbe0899bc Remove unused import 2023-06-19 13:19:52 +02:00
Dirk Müller
9e80a64fe0 Change hostname references from gitea.opensuse.org to src.opensuse.org 2023-06-19 10:59:56 +02:00
Dirk Müller
12001b1640 Commit local changes 2023-04-18 22:31:38 +02:00
Stephan Kulow
3797ea178a Merge pull request 'Add a list of packages no longer existing' (#22) from add_gone into main
Reviewed-on: https://gitea.opensuse.org/importers/git-importer/pulls/22
2023-02-09 10:23:35 +01:00
Stephan Kulow
999dcabcfa Add a list of packages no longer existing
I made this a file and not an automatically maintained DB, as I think
for now adding an entry in there should be done manually - OBS being OBS,
packages might look gone for a brief moment and reappear the day after.
2022-12-02 11:00:31 +01:00
9962673eff Merge pull request 'Add force push for the devel branch' (#21) from add_force into main
Reviewed-on: https://gitea.opensuse.org/importers/git-importer/pulls/21
2022-12-02 09:35:40 +01:00
Stephan Kulow
7b20c03256 Add force push for the devel branch
As devel branches can change in case of factory reverts we need to force
push. Factory branch shouldn't be affected, so not force pushing there
2022-12-02 09:12:11 +01:00
8 changed files with 1551 additions and 116 deletions

1355
gone-packages.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,6 @@ import os
import pathlib import pathlib
import subprocess import subprocess
import pygit2
import requests import requests
from lib.binary import BINARY from lib.binary import BINARY
@@ -20,11 +19,6 @@ class Git:
self.committer = committer self.committer = committer
self.committer_email = committer_email self.committer_email = committer_email
self.repo = None
def is_open(self):
return self.repo is not None
def exists(self): def exists(self):
"""Check if the path is a valid git repository""" """Check if the path is a valid git repository"""
return (self.path / ".git").exists() return (self.path / ".git").exists()
@@ -35,35 +29,60 @@ class Git:
self.open() self.open()
def open(self): def open(self):
# Convert the path to string, to avoid some limitations in subprocess.run(
# older pygit2 ['git', 'init', '--object-format=sha256', '-b', 'factory'],
self.repo = pygit2.init_repository(str(self.path)) cwd=self.path,
check=True,
)
def is_dirty(self): def is_dirty(self):
"""Check if there is something to commit""" """Check if there is something to commit"""
assert self.is_open() status_str = subprocess.run(
['git', 'status', '--porcelain=2'],
return self.repo.status() cwd=self.path,
stdout=subprocess.PIPE,
check=True
).stdout.decode('utf-8')
return len(list(filter(None, status_str.split('\n')))) > 0
def branches(self): def branches(self):
return list(self.repo.branches) br=subprocess.run(
['git', 'for-each-ref', '--format=%(refname:short)', 'refs/heads/'],
cwd=self.path,
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').split()
if len(br) == 0:
br.append('factory') # unborn branch?
return br
def branch(self, branch, commit=None): def branch(self, branch, commit='HEAD'):
if not commit: commit = subprocess.run(
commit = self.repo.head ['git', 'rev-parse', '--verify', '--end-of-options', commit + '^{commit}'],
else: cwd=self.path,
commit = self.repo.get(commit) check=True,
self.repo.branches.local.create(branch, commit) stdout=subprocess.PIPE
).stdout.decode('utf-8').strip()
return subprocess.run(['git', 'branch', branch, commit], check=True)
def checkout(self, branch): def checkout(self, branch):
"""Checkout into the branch HEAD""" """Checkout into the branch HEAD"""
new_branch = False new_branch = False
ref = f"refs/heads/{branch}"
if branch not in self.branches(): if branch not in self.branches():
self.repo.references["HEAD"].set_target(ref) subprocess.run(
['git', 'branch', '-q', branch, 'HEAD'],
cwd=self.path,
check=True
)
new_branch = True new_branch = True
else: else:
self.repo.checkout(ref) ref = f"refs/heads/{branch}"
if (self.path/'.git'/ref).exists():
subprocess.run(
['git', 'checkout', '-q', branch],
cwd=self.path,
check=True
)
return new_branch return new_branch
def commit( def commit(
@@ -87,30 +106,62 @@ class Git:
committer_time = committer_time if committer_time else user_time committer_time = committer_time if committer_time else user_time
if self.is_dirty(): if self.is_dirty():
self.repo.index.add_all() subprocess.run(
["git", "add", "--all", "."],
cwd=self.path,
check=True,
)
self.repo.index.write() tree_id = subprocess.run(
author = pygit2.Signature(user, user_email, int(user_time.timestamp())) ['git', 'write-tree'],
committer = pygit2.Signature( cwd=self.path,
committer, committer_email, int(committer_time.timestamp()) check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').strip()
parent_array = []
if isinstance(parents, list):
for parent in filter(None, parents):
parent_array = parent_array + ['-p', parent]
elif isinstance(parents, str):
parents_array = ['-p', parents]
commit_id = subprocess.run(
['git', 'commit-tree'] + parent_array + [tree_id],
cwd=self.path,
env={
"GIT_AUTHOR_NAME": user,
"GIT_AUTHOR_EMAIL": user_email,
"GIT_AUTHOR_DATE": f"{int(user_time.timestamp())} +0000",
"GIT_COMMITTER_NAME": committer,
"GIT_COMMITTER_EMAIL": committer_email,
"GIT_COMMITTER_DATE": f"{int(committer_time.timestamp())} +0000",
},
input=message.encode('utf-8'),
check=True,
stdout=subprocess.PIPE
).stdout.decode('utf-8').rstrip()
subprocess.run(
['git', 'reset', '--soft', commit_id],
cwd=self.path,
check=True,
) )
return commit_id
tree = self.repo.index.write_tree() def branch_head(self, branch='HEAD'):
return self.repo.create_commit( return subprocess.run(
"HEAD", author, committer, message, tree, parents ['git', 'rev-parse', '--verify', '--end-of-options', branch],
) cwd=self.path,
check=True,
def last_commit(self): stdout=subprocess.PIPE
try: ).stdout.decode('utf-8').strip()
return self.repo.head.target
except:
return None
def branch_head(self, branch):
return self.repo.references["refs/heads/" + branch].target
def set_branch_head(self, branch, commit): def set_branch_head(self, branch, commit):
self.repo.references["refs/heads/" + branch].set_target(commit) return subprocess.run(
['git', 'branch', '-f', branch, commit],
cwd=self.path,
check=True,
)
def gc(self): def gc(self):
logging.debug(f"Garbage recollect and repackage {self.path}") logging.debug(f"Garbage recollect and repackage {self.path}")
@@ -121,17 +172,21 @@ class Git:
stderr=subprocess.STDOUT, stderr=subprocess.STDOUT,
) )
def clean(self): # def clean(self):
for path, _ in self.repo.status().items(): # for path, _ in self.repo.status().items():
logging.debug(f"Cleaning {path}") # logging.debug(f"Cleaning {path}")
try: # try:
(self.path / path).unlink() # (self.path / path).unlink()
self.repo.index.remove(path) # self.repo.index.remove(path)
except Exception as e: # except Exception as e:
logging.warning(f"Error removing file {path}: {e}") # logging.warning(f"Error removing file {path}: {e}")
def add(self, filename): def add(self, filename):
self.repo.index.add(filename) subprocess.run(
['git', 'add', filename],
cwd=self.path,
check=True,
)
def add_default_lfs_gitattributes(self, force=False): def add_default_lfs_gitattributes(self, force=False):
if not (self.path / ".gitattributes").exists() or force: if not (self.path / ".gitattributes").exists() or force:
@@ -185,9 +240,11 @@ class Git:
return any(fnmatch.fnmatch(filename, line) for line in patterns) return any(fnmatch.fnmatch(filename, line) for line in patterns)
def remove(self, file: pathlib.Path): def remove(self, file: pathlib.Path):
self.repo.index.remove(file.name) subprocess.run(
(self.path / file).unlink() ['git', 'rm', '-q', '--ignore-unmatch', file.name],
cwd=self.path,
check=True,
)
patterns = self.get_specific_lfs_gitattributes() patterns = self.get_specific_lfs_gitattributes()
if file.name in patterns: if file.name in patterns:
patterns.remove(file.name) patterns.remove(file.name)
@@ -201,7 +258,7 @@ class Git:
logging.warning("Not adding a remote due to missing $GITEA_TOKEN") logging.warning("Not adding a remote due to missing $GITEA_TOKEN")
return return
url = f"https://gitea.opensuse.org/api/v1/org/{org_name}/repos" url = f"https://src.opensuse.org/api/v1/org/{org_name}/repos"
response = requests.post( response = requests.post(
url, url,
data={"name": repo_name}, data={"name": repo_name},
@@ -212,16 +269,23 @@ class Git:
# 201 Created # 201 Created
if response.status_code not in (201, 409): if response.status_code not in (201, 409):
print(response.data) print(response.data)
url = f"gitea@gitea.opensuse.org:{org_name}/{repo_name}.git" url = f"gitea@src.opensuse.org:{org_name}/{repo_name}.git"
self.repo.remotes.create("origin", url) subprocess.run(
['git', 'remote', 'add', 'origin', url],
cwd=self.path,
check=True,
)
def push(self): def push(self, force=False):
remo = self.repo.remotes["origin"] cmd = ['git', 'push'];
if force:
cmd.append('-f')
cmd.append('origin')
cmd.append('refs/heads/factory');
cmd.append('refs/heads/devel');
subprocess.run(
cmd,
cwd=self.path,
check=True,
)
keypair = pygit2.KeypairFromAgent("gitea")
callbacks = pygit2.RemoteCallbacks(credentials=keypair)
refspecs = ["refs/heads/factory"]
if "refs/heads/devel" in self.repo.references:
refspecs.append("refs/heads/devel")
remo.push(refspecs, callbacks=callbacks)

View File

@@ -86,7 +86,7 @@ class GitExporter:
logging.debug(f"Committing {flat}") logging.debug(f"Committing {flat}")
self.commit_flat(flat, branch_state) self.commit_flat(flat, branch_state)
self.git.push() self.git.push(force=True)
def run_gc(self): def run_gc(self):
self.gc_cnt = self.gc_interval self.gc_cnt = self.gc_interval

View File

@@ -1,5 +1,5 @@
import concurrent.futures
import logging import logging
import pathlib
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from lib.db import DB from lib.db import DB
@@ -31,6 +31,7 @@ class Importer:
self.obs = OBS(api_url) self.obs = OBS(api_url)
assert project == "openSUSE:Factory" assert project == "openSUSE:Factory"
self.refreshed_packages = set() self.refreshed_packages = set()
self.gone_packages_set = None
def import_request(self, number): def import_request(self, number):
self.obs.request(number).import_into_db(self.db) self.obs.request(number).import_into_db(self.db)
@@ -161,10 +162,12 @@ class Importer:
(rev.dbid, linked.dbid), (rev.dbid, linked.dbid),
) )
def revisions_without_files(self): def revisions_without_files(self, package):
logging.debug(f"revisions_without_files({package})")
with self.db.cursor() as cur: with self.db.cursor() as cur:
cur.execute( cur.execute(
"SELECT * FROM revisions WHERE broken=FALSE AND expanded_srcmd5 IS NULL" "SELECT * FROM revisions WHERE package=%s AND broken=FALSE AND expanded_srcmd5 IS NULL",
(package, )
) )
return [DBRevision(self.db, row) for row in cur.fetchall()] return [DBRevision(self.db, row) for row in cur.fetchall()]
@@ -178,11 +181,11 @@ class Importer:
linked_rev = cur.fetchone() linked_rev = cur.fetchone()
if linked_rev: if linked_rev:
linked_rev = linked_rev[0] linked_rev = linked_rev[0]
list = self.obs.list( obs_dir_list = self.obs.list(
rev.project, rev.package, rev.unexpanded_srcmd5, linked_rev rev.project, rev.package, rev.unexpanded_srcmd5, linked_rev
) )
if list: if obs_dir_list:
rev.import_dir_list(list) rev.import_dir_list(obs_dir_list)
md5 = rev.calculate_files_hash() md5 = rev.calculate_files_hash()
with self.db.cursor() as cur: with self.db.cursor() as cur:
cur.execute( cur.execute(
@@ -196,53 +199,43 @@ class Importer:
self.find_linked_revs() self.find_linked_revs()
self.find_fake_revisions() self.find_fake_revisions()
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: for package in self.packages:
fs = [ for rev in self.revisions_without_files(package):
executor.submit(import_rev, self, rev) print(f"rev {rev} is without files")
for rev in self.revisions_without_files() self.import_rev(rev)
]
concurrent.futures.wait(fs)
def refresh_package(self, project, package): def refresh_package(self, project, package):
key = f"{project}/{package}" key = f"{project}/{package}"
if key in self.refreshed_packages: if key in self.refreshed_packages:
# refreshing once is good enough # refreshing once is good enough
return return
if self.package_gone(key):
return
logging.debug(f"Refresh {project}/{package}") logging.debug(f"Refresh {project}/{package}")
self.refreshed_packages.add(key) self.refreshed_packages.add(key)
self.update_db_package(project, package) self.update_db_package(project, package)
self.fetch_all_linked_packages(project, package) self.fetch_all_linked_packages(project, package)
def import_into_db(self): def import_into_db(self):
for package in self.packages:
refresh_package(self, self.project, package)
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: self.db.conn.commit()
fs = [
executor.submit(refresh_package, self, self.project, package)
for package in self.packages
]
concurrent.futures.wait(fs)
self.db.conn.commit() for number in DBRevision.requests_to_fetch(self.db):
self.import_request(number)
fs = [ self.db.conn.commit()
executor.submit(import_request, self, number)
for number in DBRevision.requests_to_fetch(self.db)
]
concurrent.futures.wait(fs)
self.db.conn.commit() with self.db.cursor() as cur:
cur.execute(
"""SELECT DISTINCT source_project,source_package FROM requests
WHERE id IN (SELECT request_id FROM revisions WHERE project=%s and package = ANY(%s));""",
(self.project, self.packages),
)
for project, package in cur.fetchall():
self.refresh_package(project, package)
with self.db.cursor() as cur:
cur.execute(
"""SELECT DISTINCT source_project,source_package FROM requests
WHERE id IN (SELECT request_id FROM revisions WHERE project=%s and package = ANY(%s));""",
(self.project, self.packages),
)
fs = [
executor.submit(refresh_package, self, project, package)
for project, package in cur.fetchall()
]
concurrent.futures.wait(fs)
self.db.conn.commit() self.db.conn.commit()
missing_users = User.missing_users(self.db) missing_users = User.missing_users(self.db)
@@ -254,3 +247,11 @@ class Importer:
self.fill_file_lists() self.fill_file_lists()
self.db.conn.commit() self.db.conn.commit()
def package_gone(self, key):
if not self.gone_packages_set:
self.gone_packages_set = set()
with open(pathlib.Path(__file__).parent.parent / "gone-packages.txt") as f:
for line in f.readlines():
self.gone_packages_set.add(line.strip())
return key in self.gone_packages_set

View File

@@ -83,7 +83,7 @@ class LFSOid:
self.register() self.register()
def check(self): def check(self):
url = f"http://gitea.opensuse.org:9999/check/{self.sha256}/{self.size}" url = f"http://localhost:9999/check/{self.sha256}/{self.size}"
response = requests.get( response = requests.get(
url, url,
timeout=10, timeout=10,
@@ -127,12 +127,13 @@ class LFSOid:
"size": self.size, "size": self.size,
} }
url = "http://gitea.opensuse.org:9999/register" url = "http://localhost:9999/register"
response = requests.post( response = requests.post(
url, url,
json=data, json=data,
timeout=10, timeout=10,
) )
response.raise_for_status()
logging.info(f"Register LFS returned {response.status_code}") logging.info(f"Register LFS returned {response.status_code}")

View File

@@ -148,12 +148,28 @@ class OBS:
] ]
def _download(self, project, package, name, revision): def _download(self, project, package, name, revision):
# the object might be deleted but we can only pass deleted=1
# if it is actually deleted
deleted = 0
while deleted < 2:
url = osc.core.makeurl(
self.url,
["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1, "deleted": deleted if deleted else ()},
)
try:
osc.core.http_request("HEAD", url)
break
except Exception:
pass
deleted += 1
url = osc.core.makeurl( url = osc.core.makeurl(
self.url, self.url,
["source", project, package, urllib.parse.quote(name)], ["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1}, {"rev": revision, "expand": 1, "deleted": 1 if deleted else ()},
) )
return osc.core.http_GET(url) return osc.core.http_request("GET", url)
def download( def download(
self, self,

View File

@@ -7,8 +7,6 @@ except:
print("Install python3-python-magic, not python3-magic") print("Install python3-python-magic, not python3-magic")
raise raise
import requests
from lib.db import DB from lib.db import DB
from lib.lfs_oid import LFSOid from lib.lfs_oid import LFSOid
from lib.obs import OBS from lib.obs import OBS

View File

@@ -114,7 +114,7 @@ class TreeBuilder:
candidates.append(node) candidates.append(node)
if node.merged_into: if node.merged_into:
# we can't have candidates that are crossing previous merges # we can't have candidates that are crossing previous merges
# see https://gitea.opensuse.org/importers/git-importer/issues/14 # see https://src.opensuse.org/importers/git-importer/issues/14
candidates = [] candidates = []
node = node.parent node = node.parent
if candidates: if candidates: