Migrate the ProxySHA256 data into postgresql DB

The calculation of the sha256 and the mimetype is local due to that
This commit is contained in:
Stephan Kulow
2022-11-07 19:40:19 +01:00
parent e1b32999f0
commit 3e1fbaa1c3
6 changed files with 269 additions and 107 deletions

View File

@@ -1,114 +1,142 @@
import functools
import hashlib
import logging
import urllib
import os
try:
import magic
except:
print("Install python3-python-magic, not python3-magic")
raise
import requests
def _hash(hash_alg, file_or_path):
h = hash_alg()
def __hash(f):
while chunk := f.read(1024 * 4):
h.update(chunk)
if hasattr(file_or_path, "read"):
__hash(file_or_path)
else:
with file_or_path.open("rb") as f:
__hash(f)
return h.hexdigest()
md5 = functools.partial(_hash, hashlib.md5)
sha256 = functools.partial(_hash, hashlib.sha256)
from lib.db import DB
from lib.obs import OBS
class ProxySHA256:
def __init__(self, obs, url=None, enabled=True):
def __init__(self, obs: OBS, db: DB):
self.obs = obs
self.url = url if url else "http://source.dyn.cloud.suse.de"
self.enabled = enabled
self.db = db
self.hashes = None
self.texts = None
def load_package(self, package):
# _project is unreachable for the proxy - due to being a fake package
if package == "_project":
self.enabled = False
self.texts = set(["_config", "_service", "_staging_workflow"])
self.hashes = dict()
return
logging.debug("Retrieve all previously defined SHA256")
response = requests.get(
f"http://source.dyn.cloud.suse.de/package/{package}", timeout=50
)
if response.status_code == 200:
json = response.json()
self.hashes = json["shas"]
self.texts = set(json["texts"])
self.mime = None
def get(self, package, name, file_md5):
key = f"{file_md5}-{name}"
if self.hashes is None:
if self.enabled:
self.load_package(package)
else:
self.hashes = {}
return self.hashes.get(key, None)
self.load_hashes(package)
key = f"{file_md5}-{name}"
ret = self.hashes.get(key)
return ret
def _proxy_put(self, project, package, name, revision, file_md5, size):
quoted_name = urllib.parse.quote(name)
url = f"{self.obs.url}/public/source/{project}/{package}/{quoted_name}?rev={revision}"
response = requests.put(
self.url,
data={
"hash": file_md5,
"filename": name,
"url": url,
"package": package,
},
timeout=10,
)
if response.status_code != 200:
raise Exception(f"Redirector error on {self.url} for {url}")
key = (file_md5, name)
self.hashes[key] = {
"sha256": response.content.decode("utf-8"),
"fsize": size,
}
return self.hashes[key]
def _obs_put(self, project, package, name, revision, file_md5, size):
key = (file_md5, name)
self.hashes[key] = {
"sha256": sha256(self.obs._download(project, package, name, revision)),
"fsize": size,
}
return self.hashes[key]
def load_hashes(self, package):
with self.db.cursor() as cur:
cur.execute(
"""SELECT lfs_oids.file_md5,lop.filename,lfs_oids.sha256,lfs_oids.size
FROM lfs_oid_in_package lop
JOIN lfs_oids ON lfs_oids.id=lop.lfs_oid_id
WHERE lop.package=%s""",
(package,),
)
self.hashes = {
f"{row[0]}-{row[1]}": (row[2], row[3]) for row in cur.fetchall()
}
def put(self, project, package, name, revision, file_md5, size):
if not self.enabled:
return self._obs_put(project, package, name, revision, file_md5, size)
return self._proxy_put(project, package, name, revision, file_md5, size)
if not self.mime:
self.mime = magic.Magic(mime=True)
mimetype = None
logging.debug(f"Add LFS for {project}/{package}/{name}")
fin = self.obs._download(project, package, name, revision)
sha = hashlib.sha256()
while True:
buffer = fin.read(10000)
if not buffer:
break
sha.update(buffer)
# only guess from the first 10K
if not mimetype:
mimetype = self.mime.from_buffer(buffer)
fin.close()
sha = sha.hexdigest()
with self.db.cursor() as cur:
# we UPDATE here so the return functions. conflicts are likely as we look for filename/md5 but conflict on sha256
cur.execute(
"""INSERT INTO lfs_oids (project,package,filename,rev,sha256,size,mimetype,file_md5)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
ON CONFLICT (sha256,size) DO UPDATE SET mimetype=EXCLUDED.mimetype
RETURNING id""",
(
project,
package,
name,
revision,
sha,
size,
mimetype,
file_md5,
),
)
row = cur.fetchone()
lfs_oid_id = row[0]
cur.execute(
"""INSERT INTO lfs_oid_in_package (package,filename,lfs_oid_id)
VALUES (%s,%s,%s)""",
(package, name, lfs_oid_id),
)
if mimetype.startswith("text/"):
cur.execute(
"INSERT INTO text_files (package,filename) VALUES (%s,%s)",
(package, name),
)
self.db.conn.commit()
if os.getenv("GITEA_REGISTER_SECRET"):
data = {
"secret": os.getenv("GITEA_REGISTER_SECRET"),
"project": project,
"package": package,
"filename": name,
"rev": revision,
"sha256": sha,
"size": size,
}
url = "http://gitea.opensuse.org:9999/register"
response = requests.post(
url,
data=data,
timeout=10,
)
logging.debug(f"Registered {response.status_code}")
else:
logging.info("Not registering LFS due to missing secret")
# reset
self.hashes = None
self.texts = None
return self.get(package, name, file_md5)
def is_text(self, package, filename):
if self.texts is None:
if self.enabled:
self.load_package(package)
else:
self.texts = set()
self.load_texts(package)
return filename in self.texts
def load_texts(self, package):
self.texts = set()
with self.db.cursor() as cur:
cur.execute("SELECT filename from text_files where package=%s", (package,))
for row in cur.fetchall():
self.texts.add(row[0])
def get_or_put(self, project, package, name, revision, file_md5, size):
result = self.get(package, name, file_md5)
if not result:
result = self.put(project, package, name, revision, file_md5, size)
# Sanity check
if result["fsize"] != size:
raise Exception(f"Redirector has different size for {name}")
sha256, db_size = result
assert db_size == size
return result
return sha256