Provide a cache for expensive and cacheable staging requests.

The two slowest staging API calls request information that rarely changes.
Caching those results typically makes the commands execute more than twice
as fast. Caching almost all GET requests goes further and can yield
improvements of an order of magnitude or more.

In contrast to osclib/memoize.py, this cache operates at the HTTP request
level (see the usage sketch after this list). This has several advantages:

- It caches the expensive part (i.e. the HTTP request). A number of
  functions in osc.core and elsewhere make the same API request but process
  the result differently, which with memoize would still require multiple
  API calls.
- It handles cases where a loader function uses class attributes as input
  and output and thus has no relevant method parameters or return value. An
  important example is StagingAPI._generate_ring_packages().
- Storage is project-aware, which allows caches to be deleted when a project
  is known to have changed.
- Due to that project awareness, the OBS /statistics/latest_updated API call
  can be used to determine which projects need to be expired.
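
A minimal usage sketch (the apiurl below is illustrative; Cache.init() and
Cache.delete_all() are the entry points added by this commit):

    import osc.core
    from osclib.cache import Cache

    Cache.init()  # wraps osc.core.http_request() with the caching version

    # Any GET issued through osc.core now consults the cache first, e.g.:
    url = osc.core.makeurl('https://api.opensuse.org', ['source'])
    osc.core.http_GET(url)  # first call hits the network and is cached
    osc.core.http_GET(url)  # second call within the TTL is served from disk

    Cache.delete_all()  # what `osc staging --wipe-cache` does before running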

The cache file structure is as follows; responses with a project context are
stored beneath the project directory, while project-less responses sit
directly beneath the hostname (see the example after the list):

- hostname(apiurl)
  - project
    - sha1(url)
  - sha1(url)
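
For example, a cached GET of a project's _meta ends up at a path roughly like
the following (hostname and project are illustrative):

    ~/.cache/osc-plugin-factory/api.opensuse.org/openSUSE:Factory/<sha1 of full URL>

while a project-less request such as /source is stored directly beneath the
hostname directory.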

See Cache.PATTERNS to change the time to live (TTL) or to add new patterns
to be cached.
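
As a hypothetical illustration (the /distributions path below is not part of
this commit), an additional read-mostly path could be cached by registering a
pattern before Cache.init() compiles them:

    from osclib.cache import Cache

    # Hypothetical: the distribution list changes very rarely, cache for 12h.
    Cache.PATTERNS['/distributions$'] = Cache.TTL_LONG
    Cache.init()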
Author: Jimmy Berry
Date:   2017-01-09 21:49:50 -06:00
Commit: cff5befed3 (parent 72d99f6041)

7 changed files with 294 additions and 0 deletions


@@ -35,6 +35,7 @@ from osclib.list_command import ListCommand
from osclib.obslock import OBSLock
from osclib.select_command import SelectCommand
from osclib.stagingapi import StagingAPI
from osclib.cache import Cache
from osclib.unselect_command import UnselectCommand
from osclib.repair_command import RepairCommand
@@ -89,6 +90,8 @@ def _full_project_name(self, project):
              help='do not cleanup remaining packages in staging projects after accept')
@cmdln.option('--no-bootstrap', dest='bootstrap', action='store_false', default=True,
              help='do not update bootstrap-copy when freezing')
@cmdln.option('--wipe-cache', dest='wipe_cache', action='store_true', default=False,
              help='wipe GET request cache before executing')
def do_staging(self, subcmd, opts, *args):
    """${cmd_name}: Commands to work with staging projects
@@ -165,6 +168,9 @@ def do_staging(self, subcmd, opts, *args):
    opts.verbose = False
    Config(opts.project)
    if opts.wipe_cache:
        Cache.delete_all()
    with OBSLock(opts.apiurl, opts.project):
        api = StagingAPI(opts.apiurl, opts.project)

osclib/cache.py (new file, 273 lines)

@@ -0,0 +1,273 @@
from __future__ import print_function
import datetime
import hashlib
import os
import osc.core
import re
import shutil
import sys
import urlparse
from StringIO import StringIO
from osc import conf
from osc.core import urlopen
from time import time

try:
    from xml.etree import cElementTree as ET
except ImportError:
    import cElementTree as ET


def http_request(method, url, headers={}, data=None, file=None):
    """
    Wrapper for osc.core.http_request() to provide GET request caching.
    """
    if method == 'GET':
        ret = Cache.get(url)
        if ret:
            return ret
    else:
        # Logically, seems to make more sense after real call, but practically
        # it should not matter and makes the apitests happy when dealing with
        # request acceptance which causes a GET to determine target project.
        Cache.delete(url)

    ret = osc.core._http_request(method, url, headers, data, file)
    if method == 'GET':
        ret = Cache.put(url, ret)

    return ret
class Cache(object):
    """
    Provide a cache implementation for osc.core.http_request().

    The cache takes a list of regular expression patterns and a time to live
    (TTL) for matching API paths. In addition to the TTL, the project context
    is taken into account, when available, in order to expire all caches
    related to a project when the remote server indicates a change was made
    more recently than the local cache reflects. This provides a fairly robust
    cache that can handle multiple users changing the same projects.

    Paths that can update without user interaction, or that do not cause the
    project updated timestamp to change, cannot safely be cached for lengthy
    periods. Such paths include anything related to build status. When a
    source package is updated, the linked packages do not trigger an update of
    their project, so sources cannot be reliably cached for too long either.

    Any paths without a project context will be cleared when updated through
    this cache, but obviously not when updated by other contributors.
    """

    CACHE_DIR = os.path.expanduser('~/.cache/osc-plugin-factory')
    TTL_LONG = 12 * 60 * 60
    TTL_SHORT = 5 * 60
    TTL_DUPLICATE = 3
    PATTERNS = {
        # Group members cannot be guaranteed, but change rarely.
        '/group/[^/?]+$': TTL_SHORT,
        # Clear target project cache upon request acceptance.
        '/request/(\d+)\?.*newstate=accepted': TTL_DUPLICATE,
        "/search/package\?match=\[@project='([^']+)'\]$": TTL_LONG,
        # Potentially expire the latest_updated since it will be the only way
        # to tell after an adi staging is removed. For now just cache the
        # calls that occur in rapid succession.
        "/search/project/id\?match=starts-with\(@name,'([^']+)\:'\)$": TTL_DUPLICATE,
        # List of all projects may change, but relevant ones rarely.
        '/source$': TTL_LONG,
        # Sources will be expired with the project; could be done at the
        # package level.
        '/source/([^/?]+)(?:\?.*)?$': TTL_LONG,
        # Project will be marked changed when packages are added/removed.
        '/source/([^/]+)/_meta$': TTL_LONG,
        '/source/([^/]+)/(?:[^/]+)/(?:_meta|_link)$': TTL_LONG,
        # Handles clearing the local cache on package deletes. Lots of queries
        # like updating project info, comments, and package additions.
        '/source/([^/]+)/(?:[^/?]+)(?:\?[^/]+)?$': TTL_LONG,
        # Presumably users are not interweaving changes in short windows.
        '/statistics/latest_updated': TTL_SHORT,
    }

    last_updated = {}
    @staticmethod
    def init():
        Cache.patterns = []
        for pattern in Cache.PATTERNS:
            Cache.patterns.append(re.compile(pattern))

        # Replace http_request with the wrapper function, which needs a stored
        # reference to the original function to call.
        if not hasattr(osc.core, '_http_request'):
            osc.core._http_request = osc.core.http_request
            osc.core.http_request = http_request

    @staticmethod
    def get(url):
        match, project = Cache.match(url)
        if match:
            path = Cache.path(url, project, include_file=True)
            ttl = Cache.PATTERNS[match]

            if project:
                # Given a project context, check to see if the project has
                # been updated remotely more recently than the local cache.
                apiurl, _ = Cache.spliturl(url)
                Cache.last_updated_load(apiurl)

                # Use the project last updated timestamp if available,
                # otherwise the oldest record indicates the longest period
                # that can be guaranteed to have no changes.
                if project in Cache.last_updated[apiurl]:
                    unchanged_since = Cache.last_updated[apiurl][project]
                else:
                    unchanged_since = Cache.last_updated[apiurl]['__oldest']

                now = datetime.datetime.utcnow()
                unchanged_since = datetime.datetime.strptime(unchanged_since, '%Y-%m-%dT%H:%M:%SZ')
                history_span = now - unchanged_since

                # Treat a non-existent cache as brand new for the sake of the
                # history span check since it behaves as desired.
                age = 0
                directory = Cache.path(url, project)
                if os.path.exists(directory):
                    age = time() - os.path.getmtime(directory)

                # If the history span is shorter than the allowed cache life
                # and the age of the current cache is older than the history
                # span with no changes, the cache cannot be guaranteed.
                # For example:
                #   ttl = 1 day
                #   history_span = 0.5 day
                #   age = 0.75 day
                # Cannot be guaranteed.
                ttl_delta = datetime.timedelta(seconds=ttl)
                age_delta = datetime.timedelta(seconds=age)
                if history_span < ttl_delta and age_delta > history_span:
                    Cache.delete_project(apiurl, project)

            if os.path.exists(path) and time() - os.path.getmtime(path) <= ttl:
                if conf.config['debug']: print('CACHE_GET', url, file=sys.stderr)
                return urlopen('file://' + path)
            else:
                reason = '(' + ('expired' if os.path.exists(path) else 'does not exist') + ')'
                if conf.config['debug']: print('CACHE_MISS', url, reason, file=sys.stderr)

        return None
    @staticmethod
    def put(url, data):
        match, project = Cache.match(url)
        if match:
            path = Cache.path(url, project, include_file=True, makedirs=True)

            # Since urlopen does not return a seekable stream it cannot be
            # reset after writing to the cache. As such a wrapper must be
            # used. This could be replaced with urlopen('file://...') to be
            # consistent, but until the need arises StringIO has less overhead.
            text = data.read()
            data = StringIO(text)

            if conf.config['debug']: print('CACHE_PUT', url, project, file=sys.stderr)
            f = open(path, 'w')
            f.write(text)
            f.close()

        return data

    @staticmethod
    def delete(url):
        match, project = Cache.match(url)
        if match:
            path = Cache.path(url, project, include_file=True)

            # Rather than wait for the last updated statistics to expire,
            # remove the project cache if applicable.
            if project:
                apiurl, _ = Cache.spliturl(url)
                if project.isdigit():
                    # Clear target project cache upon request acceptance.
                    project = osc.core.get_request(apiurl, project).actions[0].tgt_project
                Cache.delete_project(apiurl, project)

            if os.path.exists(path):
                if conf.config['debug']: print('CACHE_DELETE', url, file=sys.stderr)
                os.remove(path)

        # Also delete the version without a query. This does not handle other
        # variations using different query strings. Handy for PUT with ?force=1.
        o = urlparse.urlsplit(url)
        if o.query != '':
            url_plain = urlparse.SplitResult(o.scheme, o.netloc, o.path, '', o.fragment).geturl()
            Cache.delete(url_plain)
    @staticmethod
    def delete_project(apiurl, project):
        path = Cache.path(apiurl, project)
        if os.path.exists(path):
            if conf.config['debug']: print('CACHE_DELETE_PROJECT', apiurl, project, file=sys.stderr)
            shutil.rmtree(path)

    @staticmethod
    def delete_all():
        if os.path.exists(Cache.CACHE_DIR):
            shutil.rmtree(Cache.CACHE_DIR)

    @staticmethod
    def match(url):
        apiurl, path = Cache.spliturl(url)
        for pattern in Cache.patterns:
            match = pattern.match(path)
            if match:
                return (pattern.pattern,
                        match.group(1) if len(match.groups()) > 0 else None)

        return (False, None)

    @staticmethod
    def spliturl(url):
        o = urlparse.urlsplit(url)
        apiurl = urlparse.SplitResult(o.scheme, o.netloc, '', '', '').geturl()
        path = urlparse.SplitResult('', '', o.path, o.query, '').geturl()
        return (apiurl, path)

    @staticmethod
    def path(url, project, include_file=False, makedirs=False):
        parts = [Cache.CACHE_DIR]

        o = urlparse.urlsplit(url)
        parts.append(o.hostname)

        if project:
            parts.append(project)

        directory = os.path.join(*parts)
        if not os.path.exists(directory) and makedirs:
            os.makedirs(directory)

        if include_file:
            parts.append(hashlib.sha1(url).hexdigest())
            return os.path.join(*parts)

        return directory

    @staticmethod
    def last_updated_load(apiurl):
        if apiurl in Cache.last_updated:
            return

        url = osc.core.makeurl(apiurl, ['statistics', 'latest_updated'], {'limit': 5000})
        root = ET.parse(osc.core.http_GET(url)).getroot()

        last_updated = {}
        for entity in root:
            # Entities represent either a project or a package.
            key = 'name' if entity.tag == 'project' else 'project'
            if entity.attrib[key] not in last_updated:
                last_updated[entity.attrib[key]] = entity.attrib['updated']

            # Keep track of the last entry to indicate the covered timespan.
            last_updated['__oldest'] = entity.attrib['updated']

        Cache.last_updated[apiurl] = last_updated


@@ -36,6 +36,7 @@ from osc.core import http_GET
from osc.core import http_POST
from osc.core import http_PUT
from osclib.cache import Cache
from osclib.comments import CommentAPI
from osclib.memoize import memoize
@@ -78,6 +79,8 @@ class StagingAPI(object):
        else:
            self.rings = []
        Cache.init()

    @property
    def ring_packages(self):


@@ -26,6 +26,7 @@ import httpretty
import osc
import urlparse
import sys
from osclib.cache import Cache
sys.path.append(".")
from check_tags_in_requests import TagChecker
@@ -41,6 +42,7 @@ class TestTagChecker(unittest.TestCase):
        Initialize the configuration
        """
        Cache.last_updated[APIURL] = {'__oldest': '2016-12-18T11:49:37Z'}
        httpretty.reset()
        httpretty.enable()


@@ -21,6 +21,7 @@ import httpretty
import osc
import re
import urlparse
from osclib.cache import Cache
from check_source_in_factory import FactorySourceChecker
@@ -39,6 +40,7 @@ class TestFactorySourceAccept(unittest.TestCase):
        Initialize the configuration
        """
        Cache.last_updated[APIURL] = {'__oldest': '2016-12-18T11:49:37Z'}
        httpretty.reset()
        httpretty.enable()


@@ -0,0 +1,3 @@
<latest_updated>
<package name="notreal" project="notreal" updated="2016-12-18T11:49:37Z"/>
</latest_updated>


@@ -23,6 +23,7 @@ import xml.etree.cElementTree as ET
import httpretty
import osc
from osclib.cache import Cache
APIURL = 'http://localhost'
@@ -107,6 +108,7 @@ class OBS(object):
        if not OBS._self:
            OBS._self = super(OBS, cls).__new__(cls, *args, **kwargs)
            Cache.delete_all()
        httpretty.reset()
        httpretty.enable()
@@ -121,6 +123,9 @@ class OBS(object):
        """Instance constructor."""
        self.fixtures = fixtures
        if not hasattr(Cache, '_CACHE_DIR'):
            Cache._CACHE_DIR = True
            Cache.CACHE_DIR += '-test'
        httpretty.enable()
        oscrc = os.path.join(fixtures, 'oscrc')