Create fake revisions for every commit in the base of a linked package

This is a rather complex operation, but whenever a package changes in
Factory, the inherited package gets a shadow revision consisting of the
3-way merge. If this happens due to a request being accepted, this is
actually in most cases also committed by 'buildservice-autocommit',
so we're making sure this is always happening (and are actually
duplicating revisions in cases that we filter out later as empty
commits).

To differentiate the fake revisions from the real revisions, I add
a fractional part to the revision number.
This commit is contained in:
Stephan Kulow 2022-10-24 20:21:53 +02:00
parent d17e60a608
commit 2784171f75
8 changed files with 24958 additions and 18353 deletions

View File

@ -1,3 +1,5 @@
sudo zypper in python3-psycopg2 sudo zypper in python3-psycopg2
sudo su - postgres sudo su - postgres
# `createdb -O <LOCAL_USER> imported_git` # `createdb -O <LOCAL_USER> imported_git`
To reset the database, drop table scheme

View File

@ -1,4 +1,7 @@
import logging
import psycopg2 import psycopg2
from psycopg2.extras import LoggingConnection
from lib.config import config from lib.config import config
@ -14,7 +17,9 @@ class DB:
# read the connection parameters # read the connection parameters
params = config(section=self.config_section) params = config(section=self.config_section)
# connect to the PostgreSQL server # connect to the PostgreSQL server
self.conn = psycopg2.connect(**params) self.conn = psycopg2.connect(connection_factory=LoggingConnection, **params)
logger = logging.getLogger(__name__)
self.conn.initialize(logger)
except (Exception, psycopg2.DatabaseError) as error: except (Exception, psycopg2.DatabaseError) as error:
print(error) print(error)
@ -96,6 +101,7 @@ class DB:
"UPDATE scheme SET version=4", "UPDATE scheme SET version=4",
) )
schemes[5] = ( schemes[5] = (
"""DROP TABLE IF EXISTS files""",
""" """
CREATE TABLE files ( CREATE TABLE files (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
@ -109,6 +115,7 @@ class DB:
"UPDATE scheme SET version=5", "UPDATE scheme SET version=5",
) )
schemes[6] = ( schemes[6] = (
"""DROP TABLE IF EXISTS requests""",
""" """
CREATE TABLE requests ( CREATE TABLE requests (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
@ -154,6 +161,7 @@ class DB:
"UPDATE scheme SET version=12", "UPDATE scheme SET version=12",
) )
schemes[13] = ( schemes[13] = (
"""DROP TABLE IF EXISTS linked_revs""",
""" """
CREATE TABLE users ( CREATE TABLE users (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
@ -174,6 +182,30 @@ class DB:
""", """,
"UPDATE scheme SET version=14", "UPDATE scheme SET version=14",
) )
schemes[14] = (
"ALTER TABLE revisions ALTER COLUMN rev TYPE real USING rev::real",
"UPDATE scheme SET version=14",
)
schemes[15] = (
"""DROP TABLE IF EXISTS fake_revs""",
"""
CREATE TABLE fake_revs (
id SERIAL PRIMARY KEY,
revision_id INTEGER NOT NULL,
linked_id INTEGER NOT NULL
)
""",
"create index revs_linked on fake_revs (revision_id,linked_id)",
"UPDATE scheme SET version=15",
)
schemes[16] = (
"ALTER TABLE revisions ADD COLUMN files_hash VARCHAR(40)",
"UPDATE scheme SET version=16",
)
schemes[17] = (
"ALTER TABLE linked_revs ADD COLUMN considered BOOLEAN DEFAULT FALSE",
"UPDATE scheme SET version=17",
)
schema_version = self.schema_version() schema_version = self.schema_version()
if (schema_version + 1) not in schemes: if (schema_version + 1) not in schemes:
return return

View File

@ -19,10 +19,10 @@ class DBRevision:
self.expanded_srcmd5, self.expanded_srcmd5,
self.request_number, self.request_number,
self.request_id, self.request_id,
self.files_hash,
) = row ) = row
self.rev = int(self.rev) self.rev = float(self.rev)
self._files = None self._files = None
self._hash = None
def __str__(self): def __str__(self):
return f"Rev {self.project}/{self.package}/{self.rev} Md5 {self.unexpanded_srcmd5} {self.commit_time} {self.userid} {self.request_number}" return f"Rev {self.project}/{self.package}/{self.rev} Md5 {self.unexpanded_srcmd5} {self.commit_time} {self.userid} {self.request_number}"
@ -52,6 +52,7 @@ class DBRevision:
"comment": self.comment, "comment": self.comment,
"broken": self.broken, "broken": self.broken,
"expanded_srcmd5": self.expanded_srcmd5, "expanded_srcmd5": self.expanded_srcmd5,
"files_hash": self.files_hash,
"files": self.files_list(db), "files": self.files_list(db),
} }
if self.request_id: if self.request_id:
@ -92,7 +93,8 @@ class DBRevision:
(project, package, str(rev)), (project, package, str(rev)),
) )
row = cur.fetchone() row = cur.fetchone()
return DBRevision(row) if row:
return DBRevision(row)
@staticmethod @staticmethod
def latest_revision(db, project, package): def latest_revision(db, project, package):
@ -103,7 +105,7 @@ class DBRevision:
) )
max = cur.fetchone()[0] max = cur.fetchone()[0]
if max: if max:
return DBRevision.fetch_revision(db, project, package, int(max)) return DBRevision.fetch_revision(db, project, package, max)
return None return None
@staticmethod @staticmethod
@ -144,7 +146,6 @@ class DBRevision:
with db.cursor() as cur: with db.cursor() as cur:
cur.execute("UPDATE revisions SET broken=TRUE where id=%s", (self.dbid,)) cur.execute("UPDATE revisions SET broken=TRUE where id=%s", (self.dbid,))
def import_dir_list(self, db, xml): def import_dir_list(self, db, xml):
with db.cursor() as cur: with db.cursor() as cur:
cur.execute( cur.execute(
@ -164,9 +165,13 @@ class DBRevision:
), ),
) )
def files_hash(self, db): def previous_commit(self, db):
if self._hash: return self.fetch_revision(db, self.project, self.package, int(self.rev) - 1)
return self._hash
def next_commit(self, db):
return self.fetch_revision(db, self.project, self.package, int(self.rev) + 1)
def calculate_files_hash(self, db):
m = md5() m = md5()
for file_dict in self.files_list(db): for file_dict in self.files_list(db):
m.update( m.update(
@ -178,8 +183,7 @@ class DBRevision:
+ str(file_dict["size"]) + str(file_dict["size"])
).encode("utf-8") ).encode("utf-8")
) )
self._hash = m.hexdigest() return m.hexdigest()
return self._hash
def files_list(self, db): def files_list(self, db):
if self._files: if self._files:
@ -194,7 +198,7 @@ class DBRevision:
return self._files return self._files
@staticmethod @staticmethod
def requests_to_fetch(self, db, project, package): def requests_to_fetch(db, project, package):
with db.cursor() as cur: with db.cursor() as cur:
cur.execute( cur.execute(
"""SELECT request_number FROM revisions revs LEFT JOIN requests """SELECT request_number FROM revisions revs LEFT JOIN requests
@ -209,8 +213,9 @@ class DBRevision:
"""Used in test cases to read a revision from fixtures into the test database""" """Used in test cases to read a revision from fixtures into the test database"""
with db.cursor() as cur: with db.cursor() as cur:
cur.execute( cur.execute(
"""INSERT INTO revisions (project, package, rev, unexpanded_srcmd5, expanded_srcmd5, commit_time, userid, comment, broken) """INSERT INTO revisions (project, package, rev, unexpanded_srcmd5, expanded_srcmd5,
VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id""", commit_time, userid, comment, broken, files_hash)
VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id""",
( (
rev_dict["project"], rev_dict["project"],
rev_dict["package"], rev_dict["package"],
@ -221,6 +226,7 @@ class DBRevision:
rev_dict["userid"], rev_dict["userid"],
rev_dict["comment"], rev_dict["comment"],
rev_dict["broken"], rev_dict["broken"],
rev_dict["files_hash"],
), ),
) )
rev_id = cur.fetchone()[0] rev_id = cur.fetchone()[0]

View File

@ -13,7 +13,10 @@ class Exporter:
def run(self): def run(self):
db = DB() db = DB()
cur = db.cursor() cur = db.cursor()
cur.execute("SELECT * from revisions where package=%s", (self.package,)) cur.execute(
"SELECT * from revisions where package=%s ORDER BY project,rev",
(self.package,),
)
data = {"revisions": []} data = {"revisions": []}
for row in cur.fetchall(): for row in cur.fetchall():
data["revisions"].append(DBRevision(row).as_dict(db)) data["revisions"].append(DBRevision(row).as_dict(db))

View File

@ -2,6 +2,8 @@ import functools
import logging import logging
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import psycopg2
from lib.binary import is_binary_or_large from lib.binary import is_binary_or_large
from lib.db import DB from lib.db import DB
from lib.db_revision import DBRevision from lib.db_revision import DBRevision
@ -165,6 +167,18 @@ class Importer:
(rev.dbid, linked_rev.dbid), (rev.dbid, linked_rev.dbid),
) )
def calculate_file_hashes(self, db):
    """Store a files hash for every non-broken revision that lacks one.

    Selects all rows of ``revisions`` whose ``files_hash`` is NULL and whose
    ``broken`` flag is FALSE, computes the hash through
    ``DBRevision.calculate_files_hash`` and writes it back to the row.

    Args:
        db: database wrapper providing ``cursor()``.
    """
    # Use the cursor as a context manager (as the rest of the file does) so
    # it is closed even when hashing or the UPDATE raises.
    with db.cursor() as cur:
        cur.execute(
            "SELECT * from revisions where files_hash IS NULL AND broken is FALSE"
        )
        # fetchall() materializes the result set, so the same cursor can be
        # reused for the UPDATE statements inside the loop.
        for row in cur.fetchall():
            rev = DBRevision(row)
            files_hash = rev.calculate_files_hash(db)
            cur.execute(
                "UPDATE revisions SET files_hash=%s WHERE id=%s",
                (files_hash, rev.dbid),
            )
def fetch_all_linked_packages(self, db, project, package): def fetch_all_linked_packages(self, db, project, package):
cur = db.cursor() cur = db.cursor()
cur.execute( cur.execute(
@ -176,19 +190,86 @@ class Importer:
(lproject, lpackage) = row (lproject, lpackage) = row
self.update_db_package(db, lproject, lpackage) self.update_db_package(db, lproject, lpackage)
def find_fake_revisions(self, db):
    """Create fake revisions in linking packages for unconsidered base commits.

    Picks every revision that is referenced by a ``linked_revs`` row with
    ``considered=FALSE`` and has no ``fake_revs`` entry yet, ordered by
    project, package and rev, and processes each one in turn.

    Args:
        db: database wrapper providing ``cursor()``.
    """
    with db.cursor() as cur:
        cur.execute(
            """SELECT * from revisions WHERE
            id in (SELECT revision_id from linked_revs WHERE considered=FALSE) AND
            id not in (SELECT revision_id FROM fake_revs) ORDER by project,package,rev"""
        )
        # fetchall() materializes the rows before any further statements run.
        rows = cur.fetchall()
    for row in rows:
        self._find_fake_revision(db, DBRevision(row))

def _find_fake_revision(self, db, rev):
    """Insert at most one fake revision for the single revision ``rev``.

    The ``linked_revs`` rows of ``rev`` are marked as considered whether or
    not a fake revision ends up being created.
    """
    prev = rev.previous_commit(db)
    # One cursor, closed on exit, replaces the per-iteration cursor that the
    # previous implementation opened and never closed.
    with db.cursor() as cur:
        if not prev:
            # Without a predecessor there is nothing to compare against.
            cur.execute(
                "UPDATE linked_revs SET considered=TRUE where revision_id=%s",
                (rev.dbid,),
            )
            return

        cur.execute(
            """SELECT * from revisions where id in
            (SELECT revision_id from linked_revs WHERE linked_id=%s)
            AND commit_time <= %s ORDER BY commit_time""",
            (prev.dbid, rev.commit_time),
        )
        last_linked = None
        for linked_row in cur.fetchall():
            candidate = DBRevision(linked_row)
            nextrev = candidate.next_commit(db)
            # Skip candidates that were superseded before rev was committed.
            if nextrev and nextrev.commit_time < rev.commit_time:
                continue
            last_linked = candidate

        cur.execute(
            "UPDATE linked_revs SET considered=TRUE where revision_id=%s",
            (rev.dbid,),
        )
        if not last_linked:
            return
        linked = last_linked

        cur.execute(
            "SELECT 1 FROM fake_revs where revision_id=%s AND linked_id=%s",
            (rev.dbid, linked.dbid),
        )
        if cur.fetchone():
            # Already recorded; considered=TRUE was set just above, so no
            # second UPDATE is needed.
            return

        # The fractional part distinguishes the fake revision from real ones.
        # NOTE(review): rev.rev / 1000.0 reaches 1.0 once rev.rev >= 1000, and
        # the column is of type "real" -- confirm that range and precision
        # are sufficient.
        fake_rev = linked.rev + rev.rev / 1000.0
        comment = f"Updating link to change in {rev.project}/{rev.package} revision {rev.rev}"
        cur.execute(
            """INSERT INTO revisions (project,package,rev,unexpanded_srcmd5,
            commit_time, userid, comment) VALUES(%s,%s,%s,%s,%s,%s,%s) RETURNING id""",
            (
                linked.project,
                linked.package,
                fake_rev,
                linked.unexpanded_srcmd5,
                rev.commit_time,
                "buildservice-autocommit",
                comment,
            ),
        )
        new_id = cur.fetchone()[0]
        cur.execute(
            """INSERT INTO linked_revs (revision_id, linked_id) VALUES (%s,%s)""",
            (new_id, rev.dbid),
        )
        cur.execute(
            """INSERT INTO fake_revs (revision_id, linked_id) VALUES (%s,%s)""",
            (rev.dbid, linked.dbid),
        )
def import_into_db(self): def import_into_db(self):
db = DB() db = DB()
for project, _, api_url in self.projects: for project, _, api_url in self.projects:
self.obs.change_url(api_url) self.obs.change_url(api_url)
self.update_db_package(db, project, self.package) self.update_db_package(db, project, self.package)
with db.cursor() as cur: self.fetch_all_linked_packages(db, project, self.package)
cur.execute( # all remaining, no filtering here
"SELECT DISTINCT l.project, l.package from links l join revisions r on r.id=l.revision_id WHERE r.project=%s AND r.package=%s", self.find_linked_revs(db)
(project, self.package), self.find_fake_revisions(db)
)
for row in cur.fetchall():
(lproject, lpackage) = row
self.update_db_package(db, lproject, lpackage)
missing_users = User.missing_users(db) missing_users = User.missing_users(db)
for userid in missing_users: for userid in missing_users:
@ -203,7 +284,7 @@ class Importer:
with db.cursor() as cur: with db.cursor() as cur:
cur.execute( cur.execute(
"""SELECT unexpanded_srcmd5 from revisions WHERE """SELECT unexpanded_srcmd5 from revisions WHERE
id=(SELECT linked_id FROM linked_revs WHERE revision_id=%s""", id=(SELECT linked_id FROM linked_revs WHERE revision_id=%s)""",
(rev.dbid,), (rev.dbid,),
) )
linked_rev = cur.fetchone() linked_rev = cur.fetchone()
@ -217,8 +298,8 @@ class Importer:
else: else:
rev.set_broken(db) rev.set_broken(db)
for number in DBRevision.requests_to_fetch(db, project, self.package): for number in DBRevision.requests_to_fetch(db, project, self.package):
self.obs.request(number).import_into_db(db) self.obs.request(number).import_into_db(db)
db.conn.commit() db.conn.commit()
TreeBuilder(db).build(self.package) TreeBuilder(db).build(self.package)

View File

@ -56,7 +56,7 @@ class OBSRevision:
return self return self
def __str__(self): def __str__(self):
return f"Rev {self.project}/{self.rev} Md5 {self.srcmd5} {self.time} {self.userid} {self.request_number}" return f"Rev {self.project}/{self.package}/{self.rev}.0 Md5 {self.srcmd5} {self.time} {self.userid} {self.request_number}"
def __repr__(self): def __repr__(self):
return f"[{self.__str__()}]" return f"[{self.__str__()}]"

View File

@ -12,10 +12,10 @@ class TreeBuilder:
ret = [] ret = []
prev = None prev = None
for rev in revisions: for rev in revisions:
print(rev, rev.files_hash(self.db)) print(rev, rev.files_hash)
if rev.broken: if rev.broken:
continue continue
if prev and prev.files_hash(self.db) == rev.files_hash(self.db): if prev and prev.files_hash == rev.files_hash:
continue continue
ret.append(rev) ret.append(rev)
prev = rev prev = rev
@ -25,7 +25,7 @@ class TreeBuilder:
factory_revisions = self.filtered_revisions("openSUSE:Factory", package) factory_revisions = self.filtered_revisions("openSUSE:Factory", package)
source_revisions = dict() source_revisions = dict()
for rev in factory_revisions: for rev in factory_revisions:
print(rev, rev.files_hash(self.db)) print(rev, rev.files_hash)
if rev.request_id: if rev.request_id:
req = Request.find(self.db, rev.request_id) req = Request.find(self.db, rev.request_id)
print(" ", req) print(" ", req)
@ -35,5 +35,5 @@ class TreeBuilder:
req.source_project, req.source_package req.source_project, req.source_package
) )
for rev2 in source_revisions.get(key): for rev2 in source_revisions.get(key):
if rev2.files_hash(self.db) == rev.files_hash(self.db): if rev2.files_hash == rev.files_hash:
print(" ", rev2) print(" ", rev2)

File diff suppressed because it is too large Load Diff