From 1c846e963c328d79c6a19c469a063113ba900306 Mon Sep 17 00:00:00 2001 From: Stephan Kulow Date: Wed, 26 Oct 2022 21:47:39 +0200 Subject: [PATCH 1/3] Import files of revisions more reliable and faster --- lib/db.py | 4 ++++ lib/importer.py | 14 +++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/db.py b/lib/db.py index da9c837..23e54bd 100644 --- a/lib/db.py +++ b/lib/db.py @@ -206,6 +206,10 @@ class DB: "ALTER TABLE linked_revs ADD COLUMN considered BOOLEAN DEFAULT FALSE", "UPDATE scheme SET version=18", ) + schemes[19] = ( + "CREATE INDEX ON files(revision_id);", + "UPDATE scheme SET version=19", + ) schema_version = self.schema_version() if (schema_version + 1) not in schemes: return diff --git a/lib/importer.py b/lib/importer.py index 14c3a9f..89e8aac 100644 --- a/lib/importer.py +++ b/lib/importer.py @@ -267,6 +267,13 @@ class Importer: (rev.dbid, linked.dbid), ) + def revisions_without_files(self, db): + with db.cursor() as cur: + cur.execute( + "SELECT * FROM revisions WHERE broken=FALSE AND expanded_srcmd5 IS NULL" + ) + return [DBRevision(row) for row in cur.fetchall()] + def import_into_db(self): db = DB() for project, _, api_url in self.projects: @@ -283,10 +290,7 @@ class Importer: if missing_user: missing_user.import_into_db(db) - for rev in DBRevision.all_revisions(db, project, self.package): - # TODO move into SELECT - if rev.broken or rev.expanded_srcmd5: - continue + for rev in self.revisions_without_files(db): with db.cursor() as cur: cur.execute( """SELECT unexpanded_srcmd5 from revisions WHERE @@ -297,7 +301,7 @@ class Importer: if linked_rev: linked_rev = linked_rev[0] list = self.obs.list( - project, self.package, rev.unexpanded_srcmd5, linked_rev + rev.project, rev.package, rev.unexpanded_srcmd5, linked_rev ) if list: rev.import_dir_list(db, list) From 9bd6643e8a188540692ac4c592d6426bcf382e60 Mon Sep 17 00:00:00 2001 From: Stephan Kulow Date: Wed, 26 Oct 2022 22:18:16 +0200 Subject: [PATCH 2/3] Fix files_hash 
calculation and fake revisions --- lib/importer.py | 27 +++++++++------------------ lib/tree_builder.py | 2 +- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/lib/importer.py b/lib/importer.py index 89e8aac..65adb44 100644 --- a/lib/importer.py +++ b/lib/importer.py @@ -167,18 +167,6 @@ class Importer: (rev.dbid, linked_rev.dbid), ) - def calculate_file_hashes(self, db): - with db.cursor() as cur: - cur.execute( - "SELECT * from revisions where files_hash IS NULL AND broken is FALSE" - ) - for row in cur.fetchall(): - rev = DBRevision(row) - md5 = rev.calculate_files_hash(db) - cur.execute( - "UPDATE revisions SET files_hash=%s WHERE id=%s", (md5, rev.dbid) - ) - def fetch_all_linked_packages(self, db, project, package): with db.cursor() as cur: cur.execute( @@ -193,12 +181,10 @@ class Importer: def find_fake_revisions(self, db): with db.cursor() as cur: cur.execute( - """SELECT * from revisions WHERE - id in (SELECT revision_id from linked_revs WHERE considered=FALSE) AND - id not in (SELECT revision_id FROM fake_revs) ORDER by project,package,rev""" + "SELECT * from revisions WHERE id in (SELECT revision_id from linked_revs WHERE considered=FALSE)" ) for row in cur.fetchall(): - DBRevision(row) + self._find_fake_revision(db, DBRevision(row)) def _find_fake_revision(self, db, rev): prev = rev.previous_commit(db) @@ -211,7 +197,7 @@ class Importer: return with db.cursor() as cur: cur.execute( - """SELECT * from revisions where id in + """SELECT * FROM revisions WHERE id IN (SELECT revision_id from linked_revs WHERE linked_id=%s) AND commit_time <= %s ORDER BY commit_time""", (prev.dbid, rev.commit_time), @@ -305,13 +291,18 @@ class Importer: ) if list: rev.import_dir_list(db, list) + md5 = rev.calculate_files_hash(db) + with db.cursor() as cur: + cur.execute( + "UPDATE revisions SET files_hash=%s WHERE id=%s", + (md5, rev.dbid), + ) else: rev.set_broken(db) for number in DBRevision.requests_to_fetch(db, project, self.package):
self.obs.request(number).import_into_db(db) - self.calculate_file_hashes(db) db.conn.commit() TreeBuilder(db).build(self.package) diff --git a/lib/tree_builder.py b/lib/tree_builder.py index cc2a2e8..9779d57 100644 --- a/lib/tree_builder.py +++ b/lib/tree_builder.py @@ -38,4 +38,4 @@ class TreeBuilder: if rev2.commit_time > rev.commit_time: continue if rev2.files_hash == rev.files_hash: - print(" ", rev2) + print(" ", rev2, rev2.files_hash) From 38e4996280fd50ba7d5ddc3afc9937db70665d5e Mon Sep 17 00:00:00 2001 From: Stephan Kulow Date: Thu, 27 Oct 2022 07:33:46 +0200 Subject: [PATCH 3/3] Fix the fake revisions after introduction of cache The way I stored it in the linked_revs was the wrong way --- lib/db_revision.py | 5 ++-- lib/importer.py | 58 +++++++++++++++++++++++----------------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/lib/db_revision.py b/lib/db_revision.py index 06c3c98..eaebae6 100644 --- a/lib/db_revision.py +++ b/lib/db_revision.py @@ -200,13 +200,12 @@ class DBRevision: return self._files @staticmethod - def requests_to_fetch(db, project, package): + def requests_to_fetch(db): with db.cursor() as cur: cur.execute( """SELECT request_number FROM revisions revs LEFT JOIN requests reqs ON reqs.number=revs.request_number WHERE reqs.id is null AND - revs.request_number IS NOT NULL and project=%s AND package=%s;""", - (project, package), + revs.request_number IS NOT NULL""", ) return [row[0] for row in cur.fetchall()] diff --git a/lib/importer.py b/lib/importer.py index 65adb44..249d5a2 100644 --- a/lib/importer.py +++ b/lib/importer.py @@ -181,7 +181,7 @@ class Importer: def find_fake_revisions(self, db): with db.cursor() as cur: cur.execute( - "SELECT * from revisions WHERE id in (SELECT revision_id from linked_revs WHERE considered=FALSE)" + "SELECT * from revisions WHERE id in (SELECT linked_id from linked_revs WHERE considered=FALSE)" ) for row in cur.fetchall(): self._find_fake_revision(db, DBRevision(row)) @@ -268,40 
+268,40 @@ class Importer: self.fetch_all_linked_packages(db, project, self.package) # all remaining, no filtering here self.find_linked_revs(db) - self.find_fake_revisions(db) - missing_users = User.missing_users(db) - for userid in missing_users: - missing_user = self.obs.user(userid) - if missing_user: - missing_user.import_into_db(db) + missing_users = User.missing_users(db) + for userid in missing_users: + missing_user = self.obs.user(userid) + if missing_user: + missing_user.import_into_db(db) - for rev in self.revisions_without_files(db): + self.find_fake_revisions(db) + for rev in self.revisions_without_files(db): + with db.cursor() as cur: + cur.execute( + """SELECT unexpanded_srcmd5 from revisions WHERE + id=(SELECT linked_id FROM linked_revs WHERE revision_id=%s)""", + (rev.dbid,), + ) + linked_rev = cur.fetchone() + if linked_rev: + linked_rev = linked_rev[0] + list = self.obs.list( + rev.project, rev.package, rev.unexpanded_srcmd5, linked_rev + ) + if list: + rev.import_dir_list(db, list) + md5 = rev.calculate_files_hash(db) with db.cursor() as cur: cur.execute( - """SELECT unexpanded_srcmd5 from revisions WHERE - id=(SELECT linked_id FROM linked_revs WHERE revision_id=%s)""", - (rev.dbid,), + "UPDATE revisions SET files_hash=%s WHERE id=%s", + (md5, rev.dbid), ) - linked_rev = cur.fetchone() - if linked_rev: - linked_rev = linked_rev[0] - list = self.obs.list( - rev.project, rev.package, rev.unexpanded_srcmd5, linked_rev - ) - if list: - rev.import_dir_list(db, list) - md5 = rev.calculate_files_hash(db) - with db.cursor() as cur: - cur.execute( - "UPDATE revisions SET files_hash=%s WHERE id=%s", - (md5, rev.dbid), - ) - else: - rev.set_broken(db) + else: + rev.set_broken(db) - for number in DBRevision.requests_to_fetch(db, project, self.package): - self.obs.request(number).import_into_db(db) + for number in DBRevision.requests_to_fetch(db): + self.obs.request(number).import_into_db(db) db.conn.commit() TreeBuilder(db).build(self.package)