from __future__ import annotations import logging import os import sys import requests from lib.binary import is_text_mimetype from lib.db import DB # no need for this class yet, so just leave the migration code here class LFSOid: def __init__(self, db: DB) -> None: self.db = db self.dbid = None self.project = None self.package = None self.filename = None self.revision = None self.sha = None self.size = None self.mimetype = None self.file_md5 = None @staticmethod def check_all(db, package): with db.cursor() as cur: cur.execute( "SELECT lfs_oid_id FROM lfs_oid_in_package WHERE package=%s ORDER BY lfs_oid_id DESC limit 10 ", (package,), ) for row in cur.fetchall(): oid = LFSOid(db).set_from_dbid(row[0]) if not oid.check(): oid.register() def add( self, project: str, package: str, filename: str, revision: str, sha256: str, size: int, mimetype: str, file_md5: str, ) -> None: with self.db.cursor() as cur: # we UPDATE here so the return functions. conflicts are likely as we look for filename/md5 but conflict on sha256 cur.execute( """INSERT INTO lfs_oids (project,package,filename,rev,sha256,size,mimetype,file_md5) VALUES (%s,%s,%s,%s,%s,%s,%s,%s) ON CONFLICT (sha256,size) DO UPDATE SET mimetype=EXCLUDED.mimetype RETURNING id""", ( project, package, filename, revision, sha256, size, mimetype, file_md5, ), ) row = cur.fetchone() lfs_oid_id = row[0] cur.execute( """INSERT INTO lfs_oid_in_package (package,filename,lfs_oid_id) VALUES (%s,%s,%s)""", (package, filename, lfs_oid_id), ) if is_text_mimetype(mimetype): cur.execute( "INSERT INTO text_files (package,filename) VALUES (%s,%s)", (package, filename), ) self.db.conn.commit() self.set_from_dbid(lfs_oid_id) if not self.check(): self.register() def check(self): return True url = f"http://localhost:9999/check/{self.sha256}/{self.size}" response = requests.get( url, timeout=10, ) return response.status_code == 200 def set_from_dbid(self, dbid: int) -> LFSOid: with self.db.cursor() as cur: cur.execute("SELECT * from lfs_oids where id=%s", (dbid,)) row = cur.fetchone() self.set_from_row(row) assert self.dbid == dbid return self def set_from_row(self, row: list) -> LFSOid: ( self.dbid, self.project, self.package, self.filename, self.revision, self.sha256, self.size, self.mimetype, self.file_md5, ) = row return self def register(self): if not os.getenv("GITEA_REGISTER_SECRET"): logging.info("Not registering LFS due to missing secret") return data = { "secret": os.getenv("GITEA_REGISTER_SECRET"), "project": self.project, "package": self.package, "filename": self.filename, "rev": self.revision, "sha256": self.sha256, "size": self.size, } url = "http://localhost:9999/register" response = requests.post( url, json=data, timeout=10, ) response.raise_for_status() logging.info(f"Register LFS returned {response.status_code}") if __name__ == "__main__": """ Import the old data - it only makes sense on a DB with previously scanned revisions curl -s https://stephan.kulow.org/git_lfs.csv.xz | xz -cd | PYTHONPATH=$PWD /usr/bin/python3 lib/lfs_oid.py """ db = DB() logging.basicConfig(level=logging.DEBUG) with db.cursor() as cur: while True: line = sys.stdin.readline() if not line: break ( project, package, filename, rev, sha256, size, mimetype, md5, ) = line.strip().split("\t") cur.execute( """INSERT INTO lfs_oids (project,package,filename,rev,sha256,size,mimetype,file_md5) VALUES (%s,%s,%s,%s,%s,%s,%s,%s) ON CONFLICT DO NOTHING""", (project, package, filename, rev, sha256, size, mimetype, md5), ) cur.execute( """ CREATE TEMPORARY TABLE lfs_oid_in_revision ( revision_id INTEGER, lfs_oid_id INTEGER NOT NULL, name VARCHAR(255) NOT NULL ) """ ) cur.execute( """INSERT INTO lfs_oid_in_revision (revision_id, lfs_oid_id, name) SELECT revision_id,lfs_oids.id,files.name FROM lfs_oids JOIN files ON files.md5=lfs_oids.file_md5""" ) cur.execute( """INSERT INTO text_files (package,filename) SELECT DISTINCT r.package, lfs_oid_in_revision.name FROM lfs_oids JOIN lfs_oid_in_revision on lfs_oid_in_revision.lfs_oid_id=lfs_oids.id JOIN revisions r ON r.id=lfs_oid_in_revision.revision_id WHERE lfs_oids.mimetype like 'text/%' ON CONFLICT DO NOTHING""" ) cur.execute( """INSERT INTO lfs_oid_in_package (lfs_oid_id, package, filename) SELECT DISTINCT lfs_oids.id,r.package, lfs_oid_in_revision.name FROM lfs_oids JOIN lfs_oid_in_revision on lfs_oid_in_revision.lfs_oid_id=lfs_oids.id JOIN revisions r ON r.id=lfs_oid_in_revision.revision_id""" ) db.conn.commit()