#!/usr/bin/python3
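"""repo_checker: review bot which mirrors the repositories involved in a
project or a staged request and runs installability and dependency-cycle
checks against them."""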
import cmdln
from collections import namedtuple
import hashlib
from lxml import etree as ET
import os
from osc.core import show_results_meta
import pipes
import re
import subprocess
import sys
import tempfile
from osclib.cache_manager import CacheManager
from osclib.conf import Config
from osclib.conf import str2bool
from osclib.core import BINARY_REGEX
from osclib.core import builddepinfo
from osclib.core import depends_on
from osclib.core import devel_project_fallback
from osclib.core import fileinfo_ext_all
from osclib.core import package_binary_list
from osclib.core import project_meta_revision
from osclib.core import project_pseudometa_file_ensure
from osclib.core import project_pseudometa_file_load
from osclib.core import project_pseudometa_package
from osclib.core import repository_path_search
from osclib.core import repository_path_expand
from osclib.core import repositories_states
from osclib.core import repository_arch_state
from osclib.core import repositories_published
from osclib.core import target_archs
from osclib.memoize import memoize
from osclib.util import sha1_short
import ReviewBot

CACHEDIR = CacheManager.directory('repository-meta')
SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))

CheckResult = namedtuple('CheckResult', ('success', 'comment'))

INSTALL_REGEX = r"^(?:can't install (.*?)|found conflict of (.*?) with (.*?)):$"
InstallSection = namedtuple('InstallSection', ('binaries', 'text'))

ERROR_REPO_SPECIFIED = 'a repository must be specified via OSRT:Config main-repo for {}'

class RepoChecker(ReviewBot.ReviewBot):
    def __init__(self, *args, **kwargs):
        ReviewBot.ReviewBot.__init__(self, *args, **kwargs)

        # ReviewBot options.
        self.request_default_return = True
        self.comment_handler = True

        # RepoChecker options.
        self.force = False

    def project_only(self, project, post_comments=False):
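        """Check a project's main repository and record the results.

        Results are stored in the project pseudometa file and, with
        post_comments, also posted as comments on the affected packages.
        """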
        repository = self.project_repository(project)
        if not repository:
            self.logger.error(ERROR_REPO_SPECIFIED.format(project))
            return

        config = Config.get(self.apiurl, project)
        arch_whitelist = config.get('repo_checker-arch-whitelist')

        repository_pairs = repository_path_expand(self.apiurl, project, repository)
        state_hash = self.repository_state(repository_pairs, False)
        # post_comments is passed by keyword since whitelist precedes it in
        # the repository_check() signature.
        self.repository_check(repository_pairs, state_hash, False,
                              post_comments=bool(post_comments), arch_whitelist=arch_whitelist)

    def package_comments(self, project, repository):
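        """Post or update a comment on each package with installation issues.

        Depending on repo_checker-package-comment-devel the comment is placed
        on the package in the checked project or on its devel package.
        """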
        self.logger.info('{} package comments'.format(len(self.package_results)))

        for package, sections in self.package_results.items():
            if str2bool(Config.get(self.apiurl, project).get('repo_checker-package-comment-devel', 'False')):
                bot_name_suffix = project
                comment_project, comment_package = devel_project_fallback(self.apiurl, project, package)
                if comment_project is None or comment_package is None:
                    self.logger.warning('unable to find devel project for {}'.format(package))
                    continue

                message = 'The version of this package in [`{project}`](/package/show/{project}/{package}) ' \
                          'has installation issues and may not be installable:'.format(
                              project=project, package=package)
            else:
                bot_name_suffix = repository
                comment_project = project
                comment_package = package
                message = 'This package has installation issues and may not be installable from the `{}` ' \
                          'repository:'.format(repository)

            # Sort sections by text to group binaries together.
            sections = sorted(sections, key=lambda s: s.text)
            message += '\n\n<pre>\n{}\n</pre>'.format(
                '\n'.join([section.text for section in sections]).strip())

            # Generate a hash based on the binaries involved and the number of
            # sections. This eliminates version or release changes from causing
            # an update to the comment while still updating on relevant changes.
            binaries = set()
            for section in sections:
                binaries.update(section.binaries)
            info = ';'.join(['::'.join(sorted(binaries)), str(len(sections))])
            # hashlib requires bytes (Python 3).
            reference = hashlib.sha1(info.encode('utf-8')).hexdigest()[:7]

            # Post comment on package in order to notify maintainers.
            self.comment_write(state='seen', result=reference, bot_name_suffix=bot_name_suffix,
                               project=comment_project, package=comment_package, message=message)

    def target_archs(self, project, repository, arch_whitelist=None):
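        """Return the architectures to check for the given repository,
        optionally reduced to a space-separated whitelist.
        """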
        archs = target_archs(self.apiurl, project, repository)

        # Check for arch whitelist and use intersection.
        if arch_whitelist:
            archs = list(set(arch_whitelist.split(' ')).intersection(set(archs)))

        # Trick to prioritize x86_64.
        return sorted(archs, reverse=True)

    @memoize(ttl=60, session=True, add_invalidate=True)
    def mirror(self, project, repository, arch):
        """Call bs_mirrorfull script to mirror packages."""
        directory = os.path.join(CACHEDIR, project, repository, arch)
        if not os.path.exists(directory):
            os.makedirs(directory)

        script = os.path.join(SCRIPT_PATH, 'bs_mirrorfull')
        path = '/'.join((project, repository, arch))
        url = '{}/public/build/{}'.format(self.apiurl, path)
        parts = ['LC_ALL=C', 'perl', script, '--nodebug', url, directory]
        parts = [pipes.quote(part) for part in parts]

        self.logger.info('mirroring {}'.format(path))
        if os.system(' '.join(parts)):
            raise Exception('failed to mirror {}'.format(path))

        return directory

    def simulated_merge_ignore(self, override_pair, overridden_pair, arch):
        """Determine the binaries to ignore in order to simulate overrides in the overridden layer."""
        _, binary_map = package_binary_list(self.apiurl, override_pair[0], override_pair[1], arch)
        packages = set(binary_map.values())

        binaries, _ = package_binary_list(self.apiurl, overridden_pair[0], overridden_pair[1], arch)
        for binary in binaries:
            if binary.package in packages:
                yield binary.name

    @memoize(session=True)
    def binary_list_existing_problem(self, project, repository):
        """Determine which binaries are mentioned in repo_checker output."""
        binaries = set()

        filename = self.project_pseudometa_file_name(project, repository)
        content = project_pseudometa_file_load(self.apiurl, project, filename)
        if not content:
            self.logger.warning('no project_only run from which to extract existing problems')
            return binaries

        sections = self.install_check_parse(content)
        for section in sections:
            for binary in section.binaries:
                match = re.match(BINARY_REGEX, binary)
                if match:
                    binaries.add(match.group('name'))

        return binaries

    def binary_whitelist(self, override_pair, overridden_pair, arch):
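        """Combine the problems already present in the overridden layer with
        any repo_checker-binary-whitelist entries from the staging project
        configuration.
        """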
        whitelist = self.binary_list_existing_problem(overridden_pair[0], overridden_pair[1])

        staging = Config.get(self.apiurl, overridden_pair[0]).get('staging')
        if staging:
            additions = self.staging_api(staging).get_prj_pseudometa(
                override_pair[0]).get('config', {})
            prefix = 'repo_checker-binary-whitelist'
            for key in [prefix, '-'.join([prefix, arch])]:
                whitelist.update(additions.get(key, '').split(' '))

        return set(filter(None, whitelist))

    def install_check(self, target_project_pair, arch, directories,
                      ignore=None, whitelist=(), parse=False, no_filter=False):
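        """Run repo_checker.pl against the mirrored directories.

        Returns a CheckResult; on failure its comment holds the
        markdown-formatted output, and with parse=True the problems are also
        grouped per package for later comment posting.
        """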
        self.logger.info('install check: start (ignore:{}, whitelist:{}, parse:{}, no_filter:{})'.format(
            bool(ignore), len(whitelist), parse, no_filter))

        # Text mode so str items can be written directly (Python 3).
        with tempfile.NamedTemporaryFile(mode='w') as ignore_file:
            # Print ignored rpms on separate lines in ignore file.
            if ignore:
                for item in ignore:
                    ignore_file.write(item + '\n')
                ignore_file.flush()

            # Invoke repo_checker.pl to perform an install check.
            script = os.path.join(SCRIPT_PATH, 'repo_checker.pl')
            parts = ['LC_ALL=C', 'perl', script, arch, ','.join(directories),
                     '-f', ignore_file.name, '-w', ','.join(whitelist)]
            if no_filter:
                parts.append('--no-filter')

            parts = [pipes.quote(part) for part in parts]
            p = subprocess.Popen(' '.join(parts), shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE, close_fds=True)
            stdout, stderr = p.communicate()
            # Popen with pipes returns bytes; decode once for the str-based
            # parsing and formatting below.
            stdout = stdout.decode('utf-8')
            stderr = stderr.decode('utf-8')

        if p.returncode:
            self.logger.info('install check: failed')
            if p.returncode == 126:
                self.logger.warning('mirror cache reset due to corruption')
                self._invalidate_all()
            elif parse:
                # Parse output for later consumption for posting comments.
                sections = self.install_check_parse(stdout)
                self.install_check_sections_group(
                    target_project_pair[0], target_project_pair[1], arch, sections)

            # Format output as markdown comment.
            parts = []

            stdout = stdout.strip()
            if stdout:
                parts.append('<pre>\n' + stdout + '\n' + '</pre>\n')
            stderr = stderr.strip()
            if stderr:
                parts.append('<pre>\n' + stderr + '\n' + '</pre>\n')

            pseudometa_project, pseudometa_package = project_pseudometa_package(
                self.apiurl, target_project_pair[0])
            filename = self.project_pseudometa_file_name(target_project_pair[0], target_project_pair[1])
            path = ['package', 'view_file', pseudometa_project, pseudometa_package, filename]
            header = '### [install check & file conflicts](/{})\n\n'.format('/'.join(path))
            return CheckResult(False, header + ('\n' + ('-' * 80) + '\n\n').join(parts))

        self.logger.info('install check: passed')
        return CheckResult(True, None)

    def install_check_sections_group(self, project, repository, arch, sections):
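        """Group parsed install check sections by the package that builds the
        affected binaries and collect them in self.package_results.
        """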
        _, binary_map = package_binary_list(self.apiurl, project, repository, arch)

        for section in sections:
            # If switching to creating bugs, it likely makes sense to join
            # packages to form the grouping key and create shared bugs for
            # conflicts.
            # The check for b in binary_map was added after encountering:
            # https://lists.opensuse.org/opensuse-buildservice/2017-08/msg00035.html
            # Under normal circumstances this should never occur.
            packages = set([binary_map[b] for b in section.binaries if b in binary_map])
            for package in packages:
                self.package_results.setdefault(package, [])
                self.package_results[package].append(section)

    def install_check_parse(self, output):
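        """Split raw repo_checker.pl output into InstallSection chunks.

        A section starts at a line matching INSTALL_REGEX and extends over the
        indented detail lines that follow. Illustrative input (hypothetical
        binary name):

            can't install foo-1.0-1.1.x86_64:
              nothing provides libbar needed by foo

        yields InstallSection(binaries=['foo-1.0-1.1.x86_64'], text=<both lines>).
        """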
        section = None
        text = None

        # Loop over lines and parse into chunks assigned to binaries.
        for line in output.splitlines(True):
            if line.startswith(' '):
                if section:
                    text += line
            else:
                if section:
                    yield InstallSection(section, text)

                match = re.match(INSTALL_REGEX, line)
                if match:
                    # Remove empty groups since regex matches different patterns.
                    binaries = [b for b in match.groups() if b is not None]
                    section = binaries
                    text = line
                else:
                    section = None

        if section:
            yield InstallSection(section, text)

    def cycle_check(self, project, repository, arch, cycle_packages):
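        """Report packages involved in dependency cycles not on the allowed list.

        cycle_packages is a ';'-separated list of allowed cycles, each a
        ','-separated list of package names (illustrative value:
        'gcc,binutils;python,cmake'); any listed package is never reported.
        """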
        self.logger.info('cycle check: start %s/%s/%s' % (project, repository, arch))
        comment = []

        allowed_cycles = []
        if cycle_packages:
            for comma_list in cycle_packages.split(';'):
                allowed_cycles.append(comma_list.split(','))

        depinfo = builddepinfo(self.apiurl, project, repository, arch, order=False)
        for cycle in depinfo.findall('cycle'):
            for package in cycle.findall('package'):
                package = package.text
                allowed = False
                for acycle in allowed_cycles:
                    if package in acycle:
                        allowed = True
                        break
                if not allowed:
                    cycled = [p.text for p in cycle.findall('package')]
                    comment.append('Package {} appears in cycle {}'.format(package, '/'.join(cycled)))

        if len(comment):
            # New cycles, post comment.
            self.logger.info('cycle check: failed')
            return CheckResult(False, '\n'.join(comment) + '\n')

        self.logger.info('cycle check: passed')
        return CheckResult(True, None)

    def result_comment(self, repository, arch, results, comment):
        """Generate comment from results."""
        comment.append('## {}/{}\n'.format(repository, arch))
        for result in results.values():
            if not result.success:
                comment.append(result.comment)

    def project_pseudometa_file_name(self, project, repository):
        filename = 'repo_checker'

        main_repo = Config.get(self.apiurl, project).get('main-repo')
        if not main_repo:
            filename += '.' + repository

        return filename

    @memoize(ttl=60, session=True)
    def repository_state(self, repository_pairs, simulate_merge):
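        """Hash the state of the repository stack, including the project meta
        revision when simulating a merge, to detect relevant changes.
        """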
        archs = self.target_archs(repository_pairs[0][0], repository_pairs[0][1])
        states = repositories_states(self.apiurl, repository_pairs, archs)
        if simulate_merge:
            states.append(str(project_meta_revision(self.apiurl, repository_pairs[0][0])))
        return sha1_short(states)

    @memoize(ttl=60, session=True)
    def repository_state_last(self, project, repository, simulate_merge):
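        """Return the state hash from the previous run: the last bot comment
        when simulating a merge, otherwise the first line of the project
        pseudometa file.
        """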
        if simulate_merge:
            comments = self.comment_api.get_comments(project_name=project)
            _, info = self.comment_api.comment_find(comments, '::'.join([self.bot_name, repository]))
            if info:
                return info.get('build')
        else:
            filename = self.project_pseudometa_file_name(project, repository)
            content = project_pseudometa_file_load(self.apiurl, project, filename)
            if content:
                return content.splitlines()[0]

        return None

    @memoize(session=True)
    def repository_check(self, repository_pairs, state_hash, simulate_merge,
                         whitelist=None, arch_whitelist=None, post_comments=False,
                         cycle_packages=None):
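        """Run the cycle and install checks against a repository stack.

        Returns True when all checks pass and all layers are published, False
        when a check fails, and None when no final result can be given yet
        (unchanged state, unpublished layers, or no relevant architectures).
        """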
        comment = []
        project, repository = repository_pairs[0]
        self.logger.info('checking {}/{}@{}[{}]'.format(
            project, repository, state_hash, len(repository_pairs)))

        archs = self.target_archs(project, repository, arch_whitelist)
        new_pairs = []
        for pair in repository_pairs:
            has_all = True
            for arch in archs:
                if not repository_arch_state(self.apiurl, pair[0], pair[1], arch):
                    has_all = False
                    break
            # Ignore repositories only inherited for config.
            if has_all:
                new_pairs.append(pair)
        repository_pairs = new_pairs

        published = repositories_published(self.apiurl, repository_pairs, archs)

        if not self.force:
            if state_hash == self.repository_state_last(project, repository, simulate_merge):
                self.logger.info('{} build unchanged'.format(project))
                # TODO keep track of skipped count for cycle summary
                return None

            # For submit style requests, want to process if top layer is done,
            # but not mark review as final until all layers are published.
            if published is not True and (not simulate_merge or published[0] == project):
                # Require all layers to be published except when the top layer
                # is published in a simulate merge (allows quicker feedback with
                # potentially incorrect results for staging).
                self.logger.info('{}/{} not published'.format(published[0], published[1]))
                return None

        # Drop non-published repository information and thus reduce to boolean.
        published = published is True

        if not simulate_merge:
            # Top of pseudometa file.
            comment.append(state_hash)

            if post_comments:
                # Stores parsed install_check() results grouped by package.
                self.package_results = {}

        if not len(archs):
            self.logger.debug('{} has no relevant architectures'.format(project))
            return None

        result = True
        for arch in archs:
            directories = []
            for pair_project, pair_repository in repository_pairs:
                directories.append(self.mirror(pair_project, pair_repository, arch))

            if simulate_merge:
                ignore = self.simulated_merge_ignore(repository_pairs[0], repository_pairs[1], arch)
                if not whitelist:
                    whitelist = self.binary_whitelist(repository_pairs[0], repository_pairs[1], arch)

                results = {
                    'cycle': self.cycle_check(repository_pairs[0][0], repository_pairs[0][1], arch, cycle_packages),
                    'install': self.install_check(
                        repository_pairs[1], arch, directories, ignore, whitelist),
                }
            else:
                # Only products themselves will want no-filter or perhaps
                # projects working on cleaning up a product.
                no_filter = str2bool(Config.get(self.apiurl, project).get('repo_checker-no-filter'))
                results = {
                    'cycle': CheckResult(True, None),
                    'install': self.install_check(repository_pairs[0], arch, directories,
                                                  parse=post_comments, no_filter=no_filter),
                }

            if not all(check.success for check in results.values()):
                # Not all checks passed, build comment.
                result = False
                self.result_comment(repository, arch, results, comment)

        if simulate_merge:
            info_extra = {'build': state_hash}
            if not result:
                # Some checks in group did not pass, post comment.
                # Avoid identical comments with different build hash during
                # target project build phase. Once published, update regardless.
                self.comment_write(state='seen', result='failed', project=project,
                                   message='\n'.join(comment).strip(), identical=True,
                                   info_extra=info_extra, info_extra_identical=published,
                                   bot_name_suffix=repository)
            else:
                # Post passed comment only if a failed comment was posted previously.
                text = 'Previously reported problems have been resolved.'
                self.comment_write(state='done', result='passed', project=project,
                                   message=text, identical=True, only_replace=True,
                                   info_extra=info_extra, bot_name_suffix=repository)
        else:
            text = '\n'.join(comment).strip()
            if not self.dryrun:
                filename = self.project_pseudometa_file_name(project, repository)
                project_pseudometa_file_ensure(
                    self.apiurl, project, filename, text + '\n', 'repo_checker project_only run')
            else:
                print(text)

            if post_comments:
                self.package_comments(project, repository)

        if result and not published:
            # Wait for the complete stack to build before positive result.
            self.logger.debug('demoting result from accept to ignore due to non-published layer')
            result = None

        return result

    @memoize(session=True)
    def project_repository(self, project):
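        """Return the repository to check for a project: the configured
        main-repo, otherwise a repository chaining to openSUSE:Factory
        snapshot or standard.
        """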
        repository = Config.get(self.apiurl, project).get('main-repo')
        if not repository:
            self.logger.debug('no main-repo defined for {}'.format(project))

            search_project = 'openSUSE:Factory'
            for search_repository in ('snapshot', 'standard'):
                repository = repository_path_search(
                    self.apiurl, project, search_project, search_repository)

                if repository:
                    self.logger.debug('found chain to {}/{} via {}'.format(
                        search_project, search_repository, repository))
                    break

        return repository

    def staging_build_failure_check(self, api, staging):
        # This check is only utilized to avoid the case of staging changes
        # after a review succeeds, which would otherwise not be re-reviewed
        # once the changes needed to resolve the build failure are performed.
        # This is one of a variety of cases in which this can occur, but rather
        # than fix the real issue this is re-instated to work around a common
        # case. (see #1712)
        status = api.project_status(staging, True)
        # Corrupted requests may reference non-existent projects and will
        # thus return a None status which should be considered not ready.
        if not status or (str(status['overall_state']) == 'failed' and len(status['broken_packages']) > 0):
            return False
        return True

    @memoize(ttl=60, session=True)
    def request_repository_pairs(self, request, action):
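        """Determine the (project, repository) pairs to check for a request.

        Returns a list of pairs when ready, True to skip the review, False to
        decline it, and None when the request cannot be evaluated yet.
        """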
        if str2bool(Config.get(self.apiurl, action.tgt_project).get('repo_checker-project-skip', 'False')):
            # Do not change message as this should only occur in requests
            # targeting multiple projects such as in maintenance workflow in
            # which the message should be set by other actions.
            self.logger.debug('skipping review of action targeting {}'.format(action.tgt_project))
            return True

        repository = self.project_repository(action.tgt_project)
        if not repository:
            self.review_messages['declined'] = ERROR_REPO_SPECIFIED.format(action.tgt_project)
            return False

        repository_pairs = []
        # Assumes maintenance_release target project has staging disabled.
        staging = Config.get(self.apiurl, action.tgt_project).get('staging')
        if staging:
            api = self.staging_api(staging)
            stage_info = api.packages_staged.get(action.tgt_package)
            if not stage_info or str(stage_info['rq_id']) != str(request.reqid):
                self.logger.info('{} not staged'.format(request.reqid))
                return None

            if not self.force and not self.staging_build_failure_check(api, stage_info['prj']):
                self.logger.info('{} not ready due to staging build failure(s)'.format(request.reqid))
                return None

            repository_pairs.extend(repository_path_expand(self.apiurl, stage_info['prj'], repository))
        else:
            # Find a repository which links to target project "main" repository.
            repository = repository_path_search(
                self.apiurl, action.src_project, action.tgt_project, repository)
            if not repository:
                self.review_messages['declined'] = ERROR_REPO_SPECIFIED.format(action.tgt_project)
                return False

            repository_pairs.extend(repository_path_expand(self.apiurl, action.src_project, repository))

        return repository_pairs

    def check_action_submit(self, request, action):
        repository_pairs = self.request_repository_pairs(request, action)
        if not isinstance(repository_pairs, list):
            return repository_pairs

        # Use project_only results by default as reference.
        whitelist = None
        config = Config.get(self.apiurl, action.tgt_project)
        staging = config.get('staging')
        arch_whitelist = config.get('repo_checker-arch-whitelist')
        cycle_packages = config.get('repo_checker-allowed-in-cycles')
        if staging:
            api = self.staging_api(staging)
            if not api.is_adi_project(repository_pairs[0][0]):
                # For "leaky" ring packages in letter stagings, where the
                # repository setup does not include the target project, that
                # are not intended to have all run-time dependencies satisfied.
                whitelist = set(config.get('repo_checker-binary-whitelist-ring', '').split(' '))

        state_hash = self.repository_state(repository_pairs, True)
        if not self.repository_check(repository_pairs, state_hash, True,
                                     arch_whitelist=arch_whitelist,
                                     whitelist=whitelist,
                                     cycle_packages=cycle_packages):
            return None

        self.review_messages['accepted'] = 'cycle and install check passed'
        return True

    def check_action_delete_package(self, request, action):
        # TODO Ignore tgt_project packages that depend on this package and are
        # part of the ignore list, and instead look at output from staging for
        # those.

        built_binaries = set()
        revdeps = set()
        for fileinfo in fileinfo_ext_all(self.apiurl, action.tgt_project, 'standard', 'x86_64', action.tgt_package):
            built_binaries.add(fileinfo.find('name').text)
            for requiredby in fileinfo.findall('provides_ext/requiredby[@name]'):
                revdeps.add(requiredby.get('name'))
        runtime_deps = sorted(revdeps - built_binaries)

        what_depends_on = depends_on(self.apiurl, action.tgt_project, 'standard', [action.tgt_package], True)

        # Filter out the dependency on the package itself (happens with e.g.
        # java bootstrapping itself with a previous build).
        if action.tgt_package in what_depends_on:
            what_depends_on.remove(action.tgt_package)

        if len(what_depends_on):
            self.logger.warning('{} is still a build requirement of:\n\n- {}'.format(
                action.tgt_package, '\n- '.join(sorted(what_depends_on))))

        if len(runtime_deps):
            self.logger.warning('{} provides runtime dependencies to:\n\n- {}'.format(
                action.tgt_package, '\n- '.join(runtime_deps)))

        if len(self.comment_handler.lines):
            self.comment_write(state='seen', result='failed')
            return None

        repository_pairs = self.request_repository_pairs(request, action)
        if not isinstance(repository_pairs, list):
            return repository_pairs

        state_hash = self.repository_state(repository_pairs, True)
        if not self.repository_check(repository_pairs, state_hash, True):
            return None

        self.review_messages['accepted'] = 'cycle and install check passed'
        return True

    def check_action_maintenance_release(self, request, action):
        # No reason to special case patchinfo since same source and target
        # projects which is all that repo_checker cares about.
        repository_pairs = self.request_repository_pairs(request, action)
        if not isinstance(repository_pairs, list):
            return repository_pairs

        state_hash = self.repository_state(repository_pairs, True)
        if not self.repository_check(repository_pairs, state_hash, True):
            return None

        self.review_messages['accepted'] = 'cycle and install check passed'
        return True


class CommandLineInterface(ReviewBot.CommandLineInterface):
    def __init__(self, *args, **kwargs):
        # Forward args unpacked; passing them as plain positional arguments
        # would hand the parent the tuple/dict objects instead.
        ReviewBot.CommandLineInterface.__init__(self, *args, **kwargs)
        self.clazz = RepoChecker

    def get_optparser(self):
        parser = ReviewBot.CommandLineInterface.get_optparser(self)
        parser.add_option('--force', action='store_true', help='force review even if project is not ready')
        return parser

    def setup_checker(self):
        bot = ReviewBot.CommandLineInterface.setup_checker(self)
        bot.force = self.options.force
        return bot

    @cmdln.option('--post-comments', action='store_true', help='post comments to packages with issues')
    def do_project_only(self, subcmd, opts, project):
        self.checker.check_requests()  # Needed to properly init ReviewBot.
        self.checker.project_only(project, opts.post_comments)


if __name__ == '__main__':
    app = CommandLineInterface()
    sys.exit(app.main())