git-importer/lib/lfs_oid.py

195 lines
6.3 KiB
Python
Raw Permalink Normal View History

from __future__ import annotations
import logging
import os
import sys
import requests
2022-11-11 16:22:18 +01:00
from lib.binary import is_text_mimetype
from lib.db import DB
# no need for this class yet, so just leave the migration code here
class LFSOid:
def __init__(self, db: DB) -> None:
self.db = db
self.dbid = None
self.project = None
self.package = None
self.filename = None
self.revision = None
self.sha = None
self.size = None
self.mimetype = None
self.file_md5 = None
@staticmethod
def check_all(db, package):
with db.cursor() as cur:
cur.execute(
"SELECT lfs_oid_id FROM lfs_oid_in_package WHERE package=%s ORDER BY lfs_oid_id DESC limit 10 ",
(package,),
)
for row in cur.fetchall():
oid = LFSOid(db).set_from_dbid(row[0])
if not oid.check():
oid.register()
def add(
self,
project: str,
package: str,
filename: str,
revision: str,
sha256: str,
size: int,
mimetype: str,
file_md5: str,
) -> None:
with self.db.cursor() as cur:
# we UPDATE here so the return functions. conflicts are likely as we look for filename/md5 but conflict on sha256
cur.execute(
"""INSERT INTO lfs_oids (project,package,filename,rev,sha256,size,mimetype,file_md5)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
ON CONFLICT (sha256,size) DO UPDATE SET mimetype=EXCLUDED.mimetype
RETURNING id""",
(
project,
package,
filename,
revision,
sha256,
size,
mimetype,
file_md5,
),
)
row = cur.fetchone()
lfs_oid_id = row[0]
cur.execute(
"""INSERT INTO lfs_oid_in_package (package,filename,lfs_oid_id)
VALUES (%s,%s,%s)""",
(package, filename, lfs_oid_id),
)
2022-11-11 16:22:18 +01:00
if is_text_mimetype(mimetype):
cur.execute(
"INSERT INTO text_files (package,filename) VALUES (%s,%s)",
(package, filename),
)
self.db.conn.commit()
2022-11-11 16:22:18 +01:00
self.set_from_dbid(lfs_oid_id)
if not self.check():
self.register()
def check(self):
url = f"http://localhost:9999/check/{self.sha256}/{self.size}"
response = requests.get(
url,
timeout=10,
)
return response.status_code == 200
def set_from_dbid(self, dbid: int) -> LFSOid:
with self.db.cursor() as cur:
cur.execute("SELECT * from lfs_oids where id=%s", (dbid,))
row = cur.fetchone()
self.set_from_row(row)
assert self.dbid == dbid
return self
def set_from_row(self, row: list) -> LFSOid:
(
self.dbid,
self.project,
self.package,
self.filename,
self.revision,
self.sha256,
self.size,
self.mimetype,
self.file_md5,
) = row
return self
def register(self):
if not os.getenv("GITEA_REGISTER_SECRET"):
logging.info("Not registering LFS due to missing secret")
return
data = {
"secret": os.getenv("GITEA_REGISTER_SECRET"),
"project": self.project,
"package": self.package,
"filename": self.filename,
"rev": self.revision,
"sha256": self.sha256,
"size": self.size,
}
url = "http://localhost:9999/register"
response = requests.post(
url,
json=data,
timeout=10,
)
response.raise_for_status()
logging.info(f"Register LFS returned {response.status_code}")
if __name__ == "__main__":
"""
Import the old data - it only makes sense on a DB with previously scanned revisions
curl -s https://stephan.kulow.org/git_lfs.csv.xz | xz -cd | PYTHONPATH=$PWD /usr/bin/python3 lib/lfs_oid.py
"""
db = DB()
logging.basicConfig(level=logging.DEBUG)
with db.cursor() as cur:
while True:
line = sys.stdin.readline()
if not line:
break
(
project,
package,
filename,
rev,
sha256,
size,
mimetype,
md5,
) = line.strip().split("\t")
cur.execute(
"""INSERT INTO lfs_oids (project,package,filename,rev,sha256,size,mimetype,file_md5)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s) ON CONFLICT DO NOTHING""",
(project, package, filename, rev, sha256, size, mimetype, md5),
)
cur.execute(
"""
CREATE TEMPORARY TABLE lfs_oid_in_revision (
revision_id INTEGER,
lfs_oid_id INTEGER NOT NULL,
name VARCHAR(255) NOT NULL
)
"""
)
cur.execute(
"""INSERT INTO lfs_oid_in_revision (revision_id, lfs_oid_id, name)
SELECT revision_id,lfs_oids.id,files.name FROM lfs_oids JOIN files ON files.md5=lfs_oids.file_md5"""
)
cur.execute(
"""INSERT INTO text_files (package,filename)
SELECT DISTINCT r.package, lfs_oid_in_revision.name FROM lfs_oids
JOIN lfs_oid_in_revision on lfs_oid_in_revision.lfs_oid_id=lfs_oids.id
JOIN revisions r ON r.id=lfs_oid_in_revision.revision_id
WHERE lfs_oids.mimetype like 'text/%' ON CONFLICT DO NOTHING"""
)
cur.execute(
"""INSERT INTO lfs_oid_in_package (lfs_oid_id, package, filename)
SELECT DISTINCT lfs_oids.id,r.package, lfs_oid_in_revision.name FROM lfs_oids
JOIN lfs_oid_in_revision on lfs_oid_in_revision.lfs_oid_id=lfs_oids.id
JOIN revisions r ON r.id=lfs_oid_in_revision.revision_id"""
)
db.conn.commit()