Move modules into lib subdir

Stephan Kulow
2022-10-17 19:54:47 +02:00
parent 88322fffae
commit a82562b794
11 changed files with 29 additions and 25 deletions

lib/binary.py Normal file

@@ -0,0 +1,86 @@
import pathlib
BINARY = {
".7z",
".bsp",
".bz2",
".gem",
".gz",
".jar",
".lz",
".lzma",
".obscpio",
".oxt",
".pdf",
".png",
".rpm",
".tbz",
".tbz2",
".tgz",
".ttf",
".txz",
".whl",
".xz",
".zip",
".zst",
}
def is_binary_or_large(filename, size):
"""Decide if is a binary file based on the extension or size"""
binary_suffix = BINARY
non_binary_suffix = {
".1",
".8",
".SUSE",
".asc",
".c",
".cabal",
".cfg",
".changes",
".conf",
".desktop",
".dif",
".diff",
".dsc",
".el",
".html",
".in",
".init",
".install",
".keyring",
".kiwi",
".logrotate",
".macros",
".md",
".obsinfo",
".pamd",
".patch",
".pl",
".pom",
".py",
".rpmlintrc",
".rules",
".script",
".service",
".sh",
".sig",
".sign",
".spec",
".sysconfig",
".test",
".txt",
".xml",
".xml",
".yml",
}
suffix = pathlib.Path(filename).suffix
if suffix in binary_suffix:
return True
if suffix in non_binary_suffix:
return False
if size >= 6 * 1024:
return True
return False
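# A quick sanity check of the heuristic (hypothetical file names):
#   is_binary_or_large("bash-5.1.tar.gz", 123) -> True  (".gz" is in BINARY)
#   is_binary_or_large("bash.spec", 50 * 1024) -> False (".spec" is always text)
#   is_binary_or_large("COPYING", 10 * 1024)   -> True  (unknown suffix, >= 6 KiB)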

lib/config.py Normal file

@@ -0,0 +1,21 @@
from configparser import ConfigParser
def config(filename="database.ini", section="production"):
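    """Read connection parameters from an INI file.
    A minimal database.ini is assumed to look like (hypothetical values):
        [production]
        host=localhost
        dbname=git_importer
        user=importer
        password=secret
    """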
# create a parser
parser = ConfigParser()
# read config file
parser.read(filename)
    # read the requested section (default: "production")
db = {}
if parser.has_section(section):
params = parser.items(section)
for param in params:
db[param[0]] = param[1]
else:
raise Exception(
"Section {0} not found in the {1} file".format(section, filename)
)
return db

lib/db.py Normal file

@@ -0,0 +1,86 @@
import psycopg2
from lib.config import config
class DB:
def __init__(self):
self.connect()
def connect(self):
try:
# read the connection parameters
params = config()
# connect to the PostgreSQL server
self.conn = psycopg2.connect(**params)
except (Exception, psycopg2.DatabaseError) as error:
print(error)
raise error
def schema_version(self):
# create a cursor
cur = self.conn.cursor()
# execute a statement
try:
cur.execute("SELECT MAX(version) from scheme")
except psycopg2.errors.UndefinedTable as error:
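            # The failed query aborted the current transaction, so reset
            # the connection before reporting an empty schema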
cur.close()
self.close()
self.connect()
return 0
db_version = cur.fetchone()
cur.close()
return db_version[0]
def close(self):
if self.conn is not None:
self.conn.close()
self.conn = None
def create_tables(self):
"""Create the tables if not existing - assumes connected"""
commands = (
"""
CREATE TABLE scheme (
id SERIAL PRIMARY KEY,
version SMALLINT NOT NULL
)
""",
"INSERT INTO scheme (version) VALUES(1)",
""" CREATE TABLE revisions (
id SERIAL PRIMARY KEY,
project VARCHAR(255) NOT NULL,
package VARCHAR(255) NOT NULL,
rev VARCHAR(255) NOT NULL,
unexpanded_srcmd5 VARCHAR(255) NOT NULL,
commit_time timestamp NOT NULL,
userid VARCHAR(255) NOT NULL,
comment VARCHAR(255)
)
""",
)
if self.schema_version() > 0:
return
try:
cur = self.conn.cursor()
# create table one by one
for command in commands:
cur.execute(command)
# close communication with the PostgreSQL database server
cur.close()
# commit the changes
self.conn.commit()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
self.close()
raise error
if __name__ == "__main__":
db = DB()
db.create_tables()

lib/git.py Normal file

@@ -0,0 +1,263 @@
import fnmatch
import logging
import pathlib
import subprocess
import pygit2
from lib.binary import BINARY
LFS_SUFFIX = "filter=lfs diff=lfs merge=lfs -text"
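# With the suffix above, a ".gitattributes" entry reads e.g.:
#   *.rpm filter=lfs diff=lfs merge=lfs -text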
class Git:
"""Local git repository"""
def __init__(self, path, committer=None, committer_email=None):
self.path = pathlib.Path(path)
self.committer = committer
self.committer_email = committer_email
self.repo = None
def is_open(self):
return self.repo is not None
# TODO: Extend it to packages and files
def exists(self):
"""Check if the path is a valid git repository"""
return (self.path / ".git").exists()
def create(self):
"""Create a local git repository"""
self.path.mkdir(parents=True, exist_ok=True)
# Convert the path to string, to avoid some limitations in
# older pygit2
self.repo = pygit2.init_repository(str(self.path))
return self
def is_dirty(self):
"""Check if there is something to commit"""
assert self.is_open()
return self.repo.status()
def branches(self):
return list(self.repo.branches)
def branch(self, branch, commit=None):
if not commit:
commit = self.repo.head
else:
commit = self.repo.get(commit)
self.repo.branches.local.create(branch, commit)
def checkout(self, branch):
"""Checkout into the branch HEAD"""
new_branch = False
ref = f"refs/heads/{branch}"
if branch not in self.branches():
self.repo.references["HEAD"].set_target(ref)
new_branch = True
else:
self.repo.checkout(ref)
return new_branch
def commit(
self,
user,
user_email,
user_time,
message,
parents=None,
committer=None,
committer_email=None,
committer_time=None,
allow_empty=False,
):
"""Add all the files and create a new commit in the current HEAD"""
assert allow_empty or self.is_dirty()
if not committer:
            committer = self.committer if self.committer else user
            committer_email = (
                self.committer_email if self.committer_email else user_email
            )
committer_time = committer_time if committer_time else user_time
try:
self.repo.index.add_all()
except pygit2.GitError as e:
if not allow_empty:
raise e
self.repo.index.write()
author = pygit2.Signature(user, user_email, int(user_time.timestamp()))
committer = pygit2.Signature(
committer, committer_email, int(committer_time.timestamp())
)
if not parents:
try:
parents = [self.repo.head.target]
except pygit2.GitError as e:
parents = []
if not allow_empty:
raise e
tree = self.repo.index.write_tree()
return self.repo.create_commit(
"HEAD", author, committer, message, tree, parents
)
def merge(
self,
user,
user_email,
user_time,
message,
commit,
committer=None,
committer_email=None,
committer_time=None,
clean_on_conflict=True,
merged=False,
allow_empty=False,
):
new_branch = False
if not merged:
try:
self.repo.merge(commit)
except KeyError:
# If it is the first commit, we will have a missing
# "HEAD", but the files will be there. We can proceed
# to the commit directly.
new_branch = True
if not merged and self.repo.index.conflicts:
for conflict in self.repo.index.conflicts:
conflict = [c for c in conflict if c]
if conflict:
logging.info(f"CONFLICT {conflict[0].path}")
if clean_on_conflict:
self.clean()
# Now I miss Rust enums
return "CONFLICT"
# Some merges are empty in OBS (no changes, not sure
# why), for now we signal them
if not allow_empty and not self.is_dirty():
# I really really do miss Rust enums
return "EMPTY"
if new_branch:
parents = [commit]
else:
parents = [
self.repo.head.target,
commit,
]
commit = self.commit(
user,
user_email,
user_time,
message,
parents,
committer,
committer_email,
committer_time,
allow_empty=allow_empty,
)
return commit
def merge_abort(self):
self.repo.state_cleanup()
def last_commit(self):
try:
return self.repo.head.target
        except pygit2.GitError:
return None
def gc(self):
logging.info(f"Garbage recollec and repackage {self.path}")
subprocess.run(
["git", "gc", "--auto"],
cwd=self.path,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
)
def clean(self):
for path, _ in self.repo.status().items():
logging.debug(f"Cleaning {path}")
try:
(self.path / path).unlink()
self.repo.index.remove(path)
except Exception as e:
logging.warning(f"Error removing file {path}: {e}")
def add(self, filename):
self.repo.index.add(filename)
def add_default_lfs_gitattributes(self, force=False):
if not (self.path / ".gitattributes").exists() or force:
with (self.path / ".gitattributes").open("w") as f:
content = ["## Default LFS"]
content += [f"*{b} {LFS_SUFFIX}" for b in sorted(BINARY)]
f.write("\n".join(content))
f.write("\n")
self.add(".gitattributes")
def add_specific_lfs_gitattributes(self, binaries):
self.add_default_lfs_gitattributes(force=True)
if binaries:
with (self.path / ".gitattributes").open("a") as f:
content = ["## Specific LFS patterns"]
content += [f"{b} {LFS_SUFFIX}" for b in sorted(binaries)]
f.write("\n".join(content))
f.write("\n")
self.add(".gitattributes")
def get_specific_lfs_gitattributes(self):
with (self.path / ".gitattributes").open() as f:
patterns = [
line.split()[0]
for line in f
if line.strip() and not line.startswith("#")
]
binary = {f"*{b}" for b in BINARY}
return [p for p in patterns if p not in binary]
def add_lfs(self, filename, sha256, size):
with (self.path / filename).open("w") as f:
f.write("version https://git-lfs.github.com/spec/v1\n")
f.write(f"oid sha256:{sha256}\n")
f.write(f"size {size}\n")
self.add(filename)
if not self.is_lfs_tracked(filename):
logging.debug(f"Add specific LFS file {filename}")
specific_patterns = self.get_specific_lfs_gitattributes()
specific_patterns.append(filename)
self.add_specific_lfs_gitattributes(specific_patterns)
def is_lfs_tracked(self, filename):
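        # True when any ".gitattributes" pattern matches, e.g. a hypothetical
        # "foo-1.0.tar.gz" is matched by the default "*.gz" entry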
with (self.path / ".gitattributes").open() as f:
patterns = (
line.split()[0]
for line in f
if line.strip() and not line.startswith("#")
)
return any(fnmatch.fnmatch(filename, line) for line in patterns)
def remove(self, filename):
self.repo.index.remove(filename)
(self.path / filename).unlink()
patterns = self.get_specific_lfs_gitattributes()
if filename in patterns:
patterns.remove(filename)
self.add_specific_lfs_gitattributes(patterns)

lib/history.py Normal file

@@ -0,0 +1,106 @@
import itertools
import logging
import re
from lib.revision import Revision
class History:
"""Store the history of revisions of a package in different
projects.
"""
def __init__(self, obs, package):
self.obs = obs
self.package = package
self.revisions = {}
def __contains__(self, project):
return project in self.revisions
def __getitem__(self, project):
return self.revisions[project]
def _extract_copypac(self, comment):
original_project = re.findall(
r"osc copypac from project:(.*) package:", comment
)
return original_project[0] if original_project else None
def _fetch_revisions(self, project, **params):
root = self.obs._history(project, self.package, **params)
if root is not None:
return [
Revision(self.obs, self, project, self.package).parse(r)
for r in root.findall("revision")
]
def fetch_revisions(self, project, follow_copypac=False):
"""Get the revision history of a package"""
if project in self:
return
revs = self._fetch_revisions(project)
self.revisions[project] = revs
# while (
# revs
# and follow_copypac
# and (copypac_project := self._extract_copypac(revs[0].comment))
# ):
# # Add the history pre-copypac
# # TODO: missing the old project name
# revs = self._fetch_revisions(copypac_project, deleted=1)
# self.revisions[project] = (
# revs + self.revisions[project]
# )
def fetch_all_revisions(self, projects):
"""Pre-populate the history"""
for project, _, api_url in projects:
self.obs.change_url(api_url)
self.fetch_revisions(project)
def sort_all_revisions(self):
"""Sort revisions for all projects, from older to newer"""
return sorted(itertools.chain(*self.revisions.values()), key=lambda x: x.time)
    def find_revision(self, project, revisionid, accepted_at):
        last_committed_revision = None
        for r in self.revisions.get(project, []):
            logging.debug(f"Find revision {revisionid} [{accepted_at}]: {r}")
            if str(r.rev) == str(revisionid) or r.srcmd5 == revisionid:
                if r.ignored:
                    logging.debug(
                        f"{r} fits but is ignored, returning {last_committed_revision}"
                    )
                    return last_committed_revision
                else:
                    logging.debug(f"{r} fits")
                    return r
            if r.time > accepted_at:
                # if we can't find the right revision, we take the last
                # commit. Before ~2012 the data was tracked really loosely
                # (e.g. using different timezones and the state field was
                # only introduced in 2016...)
                logging.warning(
                    f"Deploying workaround for missing request revision - returning {last_committed_revision}"
                )
                return last_committed_revision
            if r.commit:
                last_committed_revision = r
        logging.info("No committed revision found, returning None")
        return None
def find_last_rev_after_time(self, project, time):
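        # Return the newest revision at or before "time": an exact
        # timestamp match wins, otherwise the last one older than "time"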
# revs = self.projects.get(project, [])
# return next((r for r in reversed(revs) if r.time <= time), None)
prev = None
for rev in self.revisions.get(project, []):
if rev.time > time:
return prev
if rev.time == time:
return rev
prev = rev
return prev

lib/importer.py Normal file

@@ -0,0 +1,354 @@
import functools
import logging
from lib.binary import is_binary_or_large
from lib.git import Git
from lib.history import History
from lib.obs import OBS
from lib.proxy_sha256 import ProxySHA256, md5, sha256
def _files_hash(hash_alg, dirpath):
"""List of (filepath, md5) for a directory"""
# TODO: do it async or multythread
files = [f for f in dirpath.iterdir() if f.is_file()]
return [(f.parts[-1], hash_alg(f)) for f in files]
files_md5 = functools.partial(_files_hash, md5)
files_sha256 = functools.partial(_files_hash, sha256)
class Importer:
def __init__(self, projects, package, repodir, search_ancestor, rebase_devel):
# The idea is to create each commit in order, and draw the
# same graph described by the revisions timeline. For that we
# need first to fetch all the revisions and sort them
# linearly, based on the timestamp.
#
# After that we recreate the commits, and if one revision is a
# request that contains a target inside the projects in the
# "history", we create a merge commit.
#
# Optionally, if a flag is set, we will try to find a common
# "Initial commit" from a reference branch (the first one in
# "projects", that is safe to assume to be "openSUSE:Factory".
# This is not always a good idea. For example, in a normal
# situation the "devel" project history is older than
# "factory", and we can root the tree on it. But for some
        # other projects we partially lost the "devel" project history
        # (it could have been moved), and "factory" is not the root.
self.package = package
self.search_ancestor = search_ancestor
self.rebase_devel = rebase_devel
self.obs = OBS()
self.git = Git(
repodir,
committer="Git OBS Bridge",
committer_email="obsbridge@suse.de",
).create()
self.proxy_sha256 = ProxySHA256(self.obs, enabled=True)
self.history = History(self.obs, self.package)
# Add the "devel" project
(project, branch, api_url) = projects[0]
assert project == "openSUSE:Factory"
self.obs.change_url(api_url)
devel_project = self.obs.devel_project(project, package)
if devel_project:
self.projects = [(devel_project, "devel", api_url)] + projects
else:
self.projects = projects
# Associate the branch and api_url information per project
self.projects_info = {
project: (branch, api_url) for (project, branch, api_url) in self.projects
}
def download(self, revision):
obs_files = self.obs.files(revision.project, revision.package, revision.srcmd5)
git_files = {
(f.name, f.stat().st_size, md5(f))
for f in self.git.path.iterdir()
            if f.is_file() and f.name != ".gitattributes"
}
# Overwrite ".gitattributes" with the
self.git.add_default_lfs_gitattributes(force=True)
# Download each file in OBS if it is not a binary (or large)
# file
for (name, size, file_md5) in obs_files:
# this file creates easily 100k commits and is just useless data :(
# unfortunately it's stored in the same meta package as the project config
if revision.package == "_project" and name == "_staging_workflow":
continue
            # Has this file been detected as having a text MIME type before?
is_text = self.proxy_sha256.is_text(name)
if not is_text and is_binary_or_large(name, size):
file_sha256 = self.proxy_sha256.get_or_put(
revision.project,
revision.package,
name,
revision.srcmd5,
file_md5,
size,
)
self.git.add_lfs(name, file_sha256["sha256"], size)
else:
if (name, size, file_md5) not in git_files:
print(f"Download {name}")
self.obs.download(
revision.project,
revision.package,
name,
revision.srcmd5,
self.git.path,
)
# Validate the MD5 of the downloaded file
if md5(self.git.path / name) != file_md5:
raise Exception(f"Download error in {name}")
self.git.add(name)
# Remove extra files
obs_names = {n for (n, _, _) in obs_files}
git_names = {n for (n, _, _) in git_files}
for name in git_names - obs_names:
print(f"Remove {name}")
self.git.remove(name)
def import_all_revisions(self, gc):
# Fetch all the requests and sort them. Ideally we should
# build the graph here, to avoid new commits before the merge.
# For now we will sort them and invalidate the commits if
# "rebase_devel" is set.
self.history.fetch_all_revisions(self.projects)
revisions = self.history.sort_all_revisions()
logging.debug(f"Selected import order for {self.package}")
for revision in revisions:
logging.debug(revision)
gc_cnt = gc
for revision in revisions:
gc_cnt -= 1
if gc_cnt <= 0 and gc:
self.git.gc()
gc_cnt = gc
self.import_revision(revision)
def import_new_revision_with_request(self, revision, request):
"""Create a new branch as a result of a merge"""
submitted_revision = self.history.find_revision(
request.source, request.revisionid, revision.time
)
if not submitted_revision:
logging.warning(f"Request {request} does not connect to a known revision")
return False
if not submitted_revision.commit:
# If the revision appointed by the request is not part of
# the git history, we can have an ordering problem. One
# example is "premake4".
self.import_revision(submitted_revision)
assert submitted_revision.commit is not None
project = revision.project
branch, _ = self.projects_info[project]
        # TODO: add an empty commit marking the acceptance of the request (see discussion in PR 2858)
self.git.branch(branch, submitted_revision.commit)
self.git.clean()
self.git.checkout(branch)
logging.info(f"Create new branch based on {submitted_revision.commit}")
revision.commit = submitted_revision.commit
def _rebase_branch_history(self, project, revision):
branch, _ = self.projects_info[project]
history = self.history[project]
revision_index = history.index(revision)
for index in range(revision_index + 1, len(history)):
revision = history[index]
            # We are done when we hit the first non-committed revision
if not revision.commit:
return
logging.info(f"Rebasing {revision} from {branch}")
revision.commit = None
self.import_revision(revision)
def import_revision_with_request(self, revision, request):
"""Import a single revision via a merge"""
submitted_revision = self.history.find_revision(
request.source, request.revisionid, revision.time
)
if not submitted_revision:
logging.warning(f"Request {request} does not connect to a known revision")
return False
assert submitted_revision.commit is not None
# TODO: detect a revision, case in point
# Base:System/bash/284 -> rq683701 -> accept O:F/151
# -> autocommit Base:System/bash/285
# Revert lead to openSUSE:Factory/bash/152
# Base:System/286 restored the reverted code in devel project
# rq684575 was created and accepted as O:F/153
# But the 284-285 and the 285-286 changeset is seen as empty
# as the revert was never in Base:System, so the
# submitted_revision of 684575 has no commit
if submitted_revision.commit == "EMPTY":
logging.warning("Empty commit submitted?!")
return False
message = (
f"Accepting request {revision.requestid}: {revision.comment}\n\n{revision}"
)
commit = self.git.merge(
# TODO: revision.userid or request.creator?
f"OBS User {revision.userid}",
"null@suse.de",
revision.time,
message,
submitted_revision.commit,
)
if commit == "EMPTY":
logging.warning("Empty merge. Ignoring the revision and the request")
self.git.merge_abort()
revision.commit = commit
return False
if commit == "CONFLICT":
logging.info("Merge conflict. Downloading revision")
self.download(revision)
message = f"CONFLICT {message}"
commit = self.git.merge(
f"OBS User {revision.userid}",
"null@suse.de",
revision.time,
message,
submitted_revision.commit,
merged=True,
)
assert commit and commit != "CONFLICT"
logging.info(f"Merge with {submitted_revision.commit} into {commit}")
revision.commit = commit
# TODO: There are more checks to do, like for example, the
# last commit into the non-devel branch should be a merge from
# the devel branch
if self.rebase_devel:
branch, _ = self.projects_info.get(request.source, (None, None))
if branch == "devel":
self.git.repo.references[f"refs/heads/{branch}"].set_target(commit)
self._rebase_branch_history(request.source, submitted_revision)
return True
def matching_request(self, revision):
request = self.obs.request(revision.requestid)
if not request:
return None
# to be handled by the caller
if request.type() != "submit":
return request
if request.source not in self.projects_info:
logging.info("Request from a non exported project")
return None
if request.target != revision.project:
# This seems to happen when the devel project gets
# reinitialized (for example, SR#943593 in 7zip, or
# SR#437901 in ColorFull)
logging.info("Request target different from current project")
return None
if request.source == request.target:
# this is not a merge, but a different way to do a
# contribution to the (devel) project - see bindfs's rev 1
logging.info("Request within the same project")
return None
return request
def import_revision(self, revision):
"""Import a single revision into git"""
project = revision.project
branch, api_url = self.projects_info[project]
logging.info(f"Importing [{revision}] to {branch}")
self.obs.change_url(api_url)
# Populate linkrev and replace srcmd5 from the linked
# revision. If the expansion fails, the revision will be ignored
# and not imported.
if not revision.check_expanded():
logging.warning(f"Broken revision")
revision.ignored = True
return
# When doing a SR, we see also a revision in the origin
# project with the outgoing request, but without changes in
# the project. We can ignore them.
#
# If there is a request ID, it will be filtered out later,
# when the target project is different from itself.
if revision.userid == "autobuild" and not revision.requestid:
logging.info("Ignoring autocommit")
revision.ignored = True
return
if revision.userid == "buildservice-autocommit":
logging.info("Ignoring autocommit")
revision.ignored = True
return
# Create the reference if the branch is new. If so return
# True.
new_branch = self.git.checkout(branch)
if revision.requestid:
request = self.matching_request(revision)
if request:
if request.type() == "delete":
# TODO: after this comes a restore, this should be collapsed
# before even hitting git
logging.info("Delete request ignored")
revision.ignored = True
return
logging.debug(f"Found matching request: #{revision.project} #{request}")
if new_branch:
self.import_new_revision_with_request(revision, request)
return
if self.import_revision_with_request(revision, request):
return
# Import revision as a single commit (without merging)
self.download(revision)
if new_branch or self.git.is_dirty():
commit = self.git.commit(
f"OBS User {revision.userid}",
"null@suse.de",
revision.time,
# TODO: Normalize better the commit message
f"{revision.comment}\n\n{revision}",
# Create an empty commit only if is a new branch
allow_empty=new_branch,
)
revision.commit = commit
logging.info(f"Commit {commit}")
else:
logging.info("Skip empty commit")
revision.ignored = True

lib/obs.py Normal file

@@ -0,0 +1,142 @@
import errno
import logging
import time
import urllib.parse
import xml.etree.ElementTree as ET
from urllib.error import HTTPError
import osc.core
from lib.request import Request
# Add a retry wrapper for some of the HTTP actions.
def retry(func):
def wrapper(*args, **kwargs):
retry = 0
while retry < 5:
try:
return func(*args, **kwargs)
except HTTPError as e:
if 500 <= e.code <= 599:
retry += 1
logging.warning(
f"HTTPError {e.code} -- Retrying {args[0]} ({retry})"
)
                    # TODO: remove when moving to async
time.sleep(0.5)
else:
raise
except urllib.error.URLError as e:
if e.reason.errno in (errno.ENETUNREACH, errno.EADDRNOTAVAIL):
retry += 1
logging.warning(f"URLError {e} -- Retrying {args[0]} ({retry})")
time.sleep(0.5)
else:
logging.warning(f"URLError {e.errno} uncaught")
raise
except OSError as e:
if e.errno in (
errno.ENETUNREACH,
errno.EADDRNOTAVAIL,
): # sporadically hits cloud VMs :(
retry += 1
logging.warning(f"OSError {e} -- Retrying {args[0]} ({retry})")
                    # TODO: remove when moving to async
time.sleep(0.5)
else:
logging.warning(f"OSError {e.errno} uncaught")
raise
return wrapper
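# Monkey-patch osc's low-level GET so every request made through
# osc.core benefits from the retry logic above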
osc.core.http_GET = retry(osc.core.http_GET)
class OBS:
def __init__(self, url=None):
if url:
self.change_url(url)
def change_url(self, url):
self.url = url
osc.conf.get_config(override_apiurl=url)
def _xml(self, url_path, **params):
url = osc.core.makeurl(self.url, [url_path], params)
logging.debug(f"GET {url}")
return ET.parse(osc.core.http_GET(url)).getroot()
def _meta(self, project, package, **params):
try:
root = self._xml(f"source/{project}/{package}/_meta", **params)
except HTTPError:
logging.error(f"Package [{project}/{package} {params}] has no meta")
return None
return root
def _history(self, project, package, **params):
try:
root = self._xml(f"source/{project}/{package}/_history", **params)
except HTTPError:
logging.error(f"Package [{project}/{package} {params}] has no history")
return None
return root
def _link(self, project, package, rev):
try:
root = self._xml(f"source/{project}/{package}/_link", rev=rev)
except HTTPError:
logging.info("Package has no link")
return None
        except ET.ParseError:
            logging.error(
                f"Package [{project}/{package} rev={rev}] _link can't be parsed"
            )
            return None
        return root
def _request(self, requestid):
try:
root = self._xml(f"request/{requestid}")
except HTTPError:
logging.warning(f"Cannot fetch request {requestid}")
return None
return root
def exists(self, project, package):
root = self._meta(project, package)
if root is None:
return False
return root.get("project") == project
def devel_project(self, project, package):
        root = self._meta(project, package)
        if root is None:
            return None
        devel = root.find("devel")
if devel is None:
return None
return devel.get("project")
def request(self, requestid):
root = self._request(requestid)
if root is not None:
return Request().parse(root)
def files(self, project, package, revision):
root = self._xml(f"source/{project}/{package}", rev=revision, expand=1)
return [
(e.get("name"), int(e.get("size")), e.get("md5"))
for e in root.findall("entry")
]
def _download(self, project, package, name, revision):
url = osc.core.makeurl(
self.url,
["source", project, package, urllib.parse.quote(name)],
{"rev": revision, "expand": 1},
)
return osc.core.http_GET(url)
def download(self, project, package, name, revision, dirpath):
with (dirpath / name).open("wb") as f:
f.write(self._download(project, package, name, revision).read())

lib/proxy_sha256.py Normal file

@@ -0,0 +1,106 @@
import functools
import hashlib
import logging
import urllib
import requests
def _hash(hash_alg, file_or_path):
h = hash_alg()
def __hash(f):
while chunk := f.read(1024 * 4):
h.update(chunk)
if hasattr(file_or_path, "read"):
__hash(file_or_path)
else:
with file_or_path.open("rb") as f:
__hash(f)
return h.hexdigest()
md5 = functools.partial(_hash, hashlib.md5)
sha256 = functools.partial(_hash, hashlib.sha256)
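# Both helpers accept an open binary file object or anything with an
# .open() method (e.g. a pathlib.Path) and hash it in 4 KiB chunks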
class ProxySHA256:
def __init__(self, obs, url=None, enabled=True):
self.obs = obs
self.url = url if url else "http://source.dyn.cloud.suse.de"
self.enabled = enabled
self.hashes = None
self.texts = set()
def load_package(self, package):
# _project is unreachable for the proxy - due to being a fake package
if package == "_project":
self.enabled = False
self.texts = set(["_config", "_service"])
self.hashes = dict()
return
logging.info("Retrieve all previously defined SHA256")
response = requests.get(f"http://source.dyn.cloud.suse.de/package/{package}")
if response.status_code == 200:
json = response.json()
self.hashes = json["shas"]
self.texts = set(json["texts"])
def get(self, package, name, file_md5):
key = f"{file_md5}-{name}"
if self.hashes is None:
if self.enabled:
self.load_package(package)
else:
self.hashes = {}
return self.hashes.get(key, None)
def _proxy_put(self, project, package, name, revision, file_md5, size):
quoted_name = urllib.parse.quote(name)
url = f"{self.obs.url}/public/source/{project}/{package}/{quoted_name}?rev={revision}"
response = requests.put(
self.url,
data={
"hash": file_md5,
"filename": name,
"url": url,
"package": package,
},
)
if response.status_code != 200:
raise Exception(f"Redirector error on {self.url} for {url}")
        key = f"{file_md5}-{name}"
self.hashes[key] = {
"sha256": response.content.decode("utf-8"),
"fsize": size,
}
return self.hashes[key]
def _obs_put(self, project, package, name, revision, file_md5, size):
        key = f"{file_md5}-{name}"
self.hashes[key] = {
"sha256": sha256(self.obs._download(project, package, name, revision)),
"fsize": size,
}
return self.hashes[key]
def put(self, project, package, name, revision, file_md5, size):
if not self.enabled:
return self._obs_put(project, package, name, revision, file_md5, size)
return self._proxy_put(project, package, name, revision, file_md5, size)
def is_text(self, filename):
return filename in self.texts
def get_or_put(self, project, package, name, revision, file_md5, size):
result = self.get(package, name, file_md5)
if not result:
result = self.put(project, package, name, revision, file_md5, size)
# Sanity check
if result["fsize"] != size:
raise Exception(f"Redirector has different size for {name}")
return result

lib/request.py Normal file

@@ -0,0 +1,31 @@
class Request:
def parse(self, xml):
self.requestid = int(xml.get("id"))
self.creator = xml.get("creator")
self.type_ = xml.find("action").get("type")
if self.type_ == "delete":
# not much to do
return self
self.source = xml.find("action/source").get("project")
# expanded MD5 or commit revision
self.revisionid = xml.find("action/source").get("rev")
self.target = xml.find("action/target").get("project")
self.state = xml.find("state").get("name")
        # TODO: support multi-action requests
# TODO: parse review history
# TODO: add description
return self
def type(self):
return self.type_
def __str__(self):
return f"Req {self.requestid} {self.creator} {self.type_} {self.source}->{self.target} {self.state}"
def __repr__(self):
return f"[{self.__str__()}]"

lib/revision.py Normal file

@@ -0,0 +1,109 @@
import datetime
import logging
import re
import xml.etree.ElementTree as ET
from urllib.error import HTTPError
class Revision:
def __init__(self, obs, history, project, package):
self.obs = obs
self.history = history
self.project = project
self.package = package
self.commit = None
self.ignored = False
def parse(self, xml):
self.rev = int(xml.get("rev"))
# Replaced in check_expanded
self.srcmd5 = xml.find("srcmd5").text
time = int(xml.find("time").text)
self.time = datetime.datetime.fromtimestamp(time)
userid = xml.find("user")
if userid is not None:
self.userid = userid.text
else:
self.userid = "unknown"
comment = xml.find("comment")
if comment is not None:
self.comment = comment.text or ""
else:
self.comment = ""
# Populated by check_link
self.linkrev = None
self.requestid = None
requestid = xml.find("requestid")
if requestid is not None:
self.requestid = int(requestid.text)
else:
# Sometimes requestid is missing, but can be extracted
# from "comment"
matched = re.match(
r"^Copy from .* based on submit request (\d+) from user .*$",
self.comment,
)
if matched:
self.requestid = int(matched.group(1))
return self
def __str__(self):
return f"Rev {self.project}/{self.rev} Md5 {self.srcmd5} {self.time} {self.userid} {self.requestid}"
def __repr__(self):
return f"[{self.__str__()}]"
def check_link(self):
"""Add 'linkrev' attribute into the revision. Returns False if the link is invalid"""
try:
root = self.obs._xml(
f"source/{self.project}/{self.package}/_link", rev=self.srcmd5
)
except HTTPError as e:
if e.code == 404:
logging.debug("No _link for the revision")
return True
raise e
except ET.ParseError:
logging.error(
f"_link can't be parsed [{self.project}/{self.package} rev={self.srcmd5}]"
)
return False
target_project = root.get("project")
rev = self.history.find_last_rev_after_time(target_project, self.time)
if rev:
logging.debug(f"Linkrev found: {rev}")
self.linkrev = rev.srcmd5
return True
def check_expanded(self):
# Even if it's not a link we still need to check the expanded
        # srcmd5, as it may be used in submit requests
if not self.check_link():
return False
# If there is a "linkrev", "rev" is ignored
params = {"rev": self.srcmd5, "expand": "1"}
if self.linkrev:
params["linkrev"] = self.linkrev
try:
root = self.obs._xml(f"source/{self.project}/{self.package}", **params)
except HTTPError as e:
if e.code == 400:
logging.error(
f"Package [{self.project}/{self.package} {params}] can't be expanded: {e}"
)
return False
raise e
self.srcmd5 = root.get("srcmd5")
return True