Optimize searching access logs

* Use binary search for parsing accessed packages
* Write packages statistics to InfluxDB
This commit is contained in:
Witek Bedyk 2020-07-24 16:55:29 +02:00
parent adc3750f2b
commit 9d4a45e576
22 changed files with 585977 additions and 19 deletions

View File

@ -1,6 +1,8 @@
#!/usr/bin/php #!/usr/bin/php
<?php <?php
include 'utils.php';
use InfluxDB\Point; use InfluxDB\Point;
use InfluxDB\Database; use InfluxDB\Database;
@ -11,7 +13,6 @@ const LANGLEY = 'http://langley.suse.de/pub/pontifex%s-opensuse.suse.de';
const VHOST = 'download.opensuse.org'; const VHOST = 'download.opensuse.org';
const FILENAME = 'download.opensuse.org-%s-access_log.xz'; const FILENAME = 'download.opensuse.org-%s-access_log.xz';
const IPV6_PREFIX = 'ipv6.'; const IPV6_PREFIX = 'ipv6.';
const PRODUCT_PATTERN = '/^(10\.[2-3]|11\.[0-4]|12\.[1-3]|13\.[1-2]|42\.[1-3]|15\.[0-1]|tumbleweed)$/';
$begin = new DateTime(); $begin = new DateTime();
// Skip the current day since the logs are incomplete and not compressed yet. // Skip the current day since the logs are incomplete and not compressed yet.
@ -224,6 +225,9 @@ function aggregate($intervals, &$merged, $date, $date_previous, $data, $tags = [
if ($prefix == 'access') { if ($prefix == 'access') {
$summary = summarize_product_plus_key($merged[$interval]['data']['total_image_product']); $summary = summarize_product_plus_key($merged[$interval]['data']['total_image_product']);
$count += write_summary_product_plus_key($interval, $date_previous, $summary, 'image'); $count += write_summary_product_plus_key($interval, $date_previous, $summary, 'image');
$summary = summarize_product_plus_key($merged[$interval]['data']['total_package_product']);
$count += write_summary_product_plus_key($interval, $date_previous, $summary, 'package');
} }
error_log("[$prefix] [$interval] [{$merged[$interval]['value']}] wrote $count points at " . error_log("[$prefix] [$interval] [{$merged[$interval]['value']}] wrote $count points at " .
@ -364,11 +368,6 @@ function summarize_product_plus_key($data)
return $summary; return $summary;
} }
function product_filter($product)
{
return (bool) preg_match(PRODUCT_PATTERN, $product);
}
function date_period_reversed($begin, $interval, $end) function date_period_reversed($begin, $interval, $end)
{ {
$interval = DateInterval::createFromDateString($interval); $interval = DateInterval::createFromDateString($interval);

View File

@ -1,6 +1,8 @@
#!/usr/bin/php #!/usr/bin/php
<?php <?php
include 'utils.php';
const REGEX_LINE = '/\S+ \S+ \S+ \[([^:]+:\d+:\d+:\d+ [^\]]+)\] "(\S+)(?: (\S+) \S+)?" (\S+) (\S+) "[^"]*" "[^"]*" .* size:(\S+) \S+(?: +"?(\S+-\S+-\S+-\S+-[^\s"]+|-)"? "?(dvd|ftp|-)"?)?/'; const REGEX_LINE = '/\S+ \S+ \S+ \[([^:]+:\d+:\d+:\d+ [^\]]+)\] "(\S+)(?: (\S+) \S+)?" (\S+) (\S+) "[^"]*" "[^"]*" .* size:(\S+) \S+(?: +"?(\S+-\S+-\S+-\S+-[^\s"]+|-)"? "?(dvd|ftp|-)"?)?/';
const REGEX_PRODUCT = '#/(?:(tumbleweed)|distribution/(?:leap/)?(\d+\.\d+)|openSUSE(?:_|:/)(?:leap(?:_|:/))?(factory|tumbleweed|\d+\.\d+))#i'; const REGEX_PRODUCT = '#/(?:(tumbleweed)|distribution/(?:leap/)?(\d+\.\d+)|openSUSE(?:_|:/)(?:leap(?:_|:/))?(factory|tumbleweed|\d+\.\d+))#i';
const REGEX_IMAGE = '#(?:/(?:iso|live)/[^/]+-(DVD|NET|GNOME-Live|KDE-Live|Rescue-CD|Kubic-DVD)-[^/]+\.iso(?:\.torrent)?|/jeos/[^/]+-(JeOS)\.[^/]+\.(?:qcow2|vhdx|vmdk|vmx)$)#'; const REGEX_IMAGE = '#(?:/(?:iso|live)/[^/]+-(DVD|NET|GNOME-Live|KDE-Live|Rescue-CD|Kubic-DVD)-[^/]+\.iso(?:\.torrent)?|/jeos/[^/]+-(JeOS)\.[^/]+\.(?:qcow2|vhdx|vmdk|vmx)$)#';
@ -12,23 +14,68 @@ $total_product = [];
$unique_product = []; $unique_product = [];
$total_image_product = []; $total_image_product = [];
$total_package_product = []; $total_package_product = [];
$fallback_packages = get_packages_list('tumbleweed');
$packages_file = '15.2_packages'; function exception_error_handler($severity, $message, $file, $line) {
$packages = file($packages_file, FILE_IGNORE_NEW_LINES); if (!(error_reporting() & $severity)) {
$packages = array_map('ltrim', $packages); // This error code is not included in error_reporting
rsort($packages); return;
}
throw new ErrorException($message, 0, $severity, $file, $line);
}
set_error_handler("exception_error_handler");
/**
 * Load the list of known package names for a product.
 *
 * Reads "packages/<product>" (relative to the current working directory),
 * one package name per line, strips surrounding whitespace, drops blank
 * entries and sorts the names in byte order.
 *
 * @param string $product Product identifier, used as the file name.
 * @return array|null Sorted package names, or null when the list cannot be read.
 */
function get_packages_list($product) {
    $packages_file = "packages/" . $product;
    $packages = false;
    try {
        // Guard with is_file() so a missing list is handled the same way
        // whether or not a warning-to-exception error handler is installed.
        if (is_file($packages_file)) {
            $packages = file($packages_file, FILE_IGNORE_NEW_LINES);
        }
    } catch (ErrorException $e) {
        $packages = false;
    }
    if ($packages === false) {
        echo 'Has not found packages file for ', $product, ". Using fallback.\n";
        return null;
    }
    // Blank entries must go: an empty string is a prefix of every name and
    // would turn every lookup into a (bogus) hit.
    $packages = array_filter(array_map('trim', $packages), 'strlen');
    // SORT_STRING keeps the order consistent with strncmp(), which the
    // binary search over this list relies on.
    sort($packages, SORT_STRING);
    return $packages;
}
// Find a substring at the beginning of a string from an array of substrings // Find a substring at the beginning of a string from an array of substrings
// $substrings - array of possible substrings (needles) // $substrings - array of possible substrings (needles)
// $string - examined string (haystack) // $string - examined string (haystack)
// Returns the first match // Returns the first match
function find_substring($substrings, $string) { function find_substring($substrings, $string) {
foreach ($substrings as $sub) { $result_index = binary_string_search($substrings, 0, count($substrings) - 1, $string);
if(stripos($string, $sub) === 0) { if ($result_index >= 0)
return $sub; return check_next_element($substrings, $string, $result_index, $substrings[$result_index]);
} else
} return NULL;
return NULL; }
/**
 * Extend a prefix match to the longest matching entry further down the list.
 *
 * Starting after $index, walks forward through $substrings:
 *  - an entry that is itself a prefix of $string (case-insensitive) becomes
 *    the new best match;
 *  - an entry that merely starts with the current match and still compares
 *    below $string is skipped, since a better match may follow it;
 *  - anything else ends the scan.
 *
 * @param array  $substrings Candidate prefixes; assumed sorted ascending — confirm with caller.
 * @param string $string     The examined string.
 * @param int    $index      Position of the current match in $substrings.
 * @param string $match      The current best match.
 * @return string The best match found.
 */
function check_next_element($substrings, $string, $index, $match) {
    $index = (int) $index;
    $last = count($substrings) - 1;
    // Stop at the final element instead of reading past the end of the list.
    while ($index < $last) {
        $candidate = $substrings[$index + 1];
        if (stripos($string, $candidate) === 0) {
            $match = $candidate;
        } elseif (stripos($candidate, $match) !== 0 ||
                  strncmp($candidate, $string, strlen($string)) >= 0) {
            break;
        }
        $index++;
    }
    return $match;
}
/**
 * Binary search for an entry of $haystack that is a prefix of $needle.
 *
 * Each probed entry is compared with the same-length head of $needle using
 * strncmp() (case-sensitive, byte-wise).
 *
 * @param array  $haystack Candidate prefixes; assumed sorted in byte order — confirm with caller.
 * @param int    $start    First index of the range to search (inclusive).
 * @param int    $end      Last index of the range to search (inclusive).
 * @param string $needle   The string whose prefix is looked up.
 * @return int Index of a matching entry, or -1 when none matches. A negative
 *             value is used because `false >= 0` is true in PHP, so a boolean
 *             miss would pass a `>= 0` check.
 */
function binary_string_search($haystack, $start, $end, $needle) {
    while ($start <= $end) {
        // Cast so the result is usable as an integer array index.
        $mid_index = (int) floor(($start + $end) / 2);
        $candidate = $haystack[$mid_index];
        $comparison = strncmp($candidate, $needle, strlen($candidate));
        if ($comparison === 0) {
            return $mid_index;
        }
        if ($comparison > 0) {
            $end = $mid_index - 1;
        } else {
            $start = $mid_index + 1;
        }
    }
    return -1;
}
$file = $argc == 2 ? $argv[1] : 'php://stdin'; $file = $argc == 2 ? $argv[1] : 'php://stdin';
@ -42,6 +89,8 @@ while (($line = fgets($handle)) !== false) {
// Only interested in GET or HEAD requests, others are invalid. // Only interested in GET or HEAD requests, others are invalid.
if ($match[2] != 'GET' && $match[2] != 'HEAD') continue; if ($match[2] != 'GET' && $match[2] != 'HEAD') continue;
// Not interested in errors.
if ($match[4] >= '400') continue;
$total++; $total++;
// Attempt to determine for which product was the request. // Attempt to determine for which product was the request.
@ -53,11 +102,19 @@ while (($line = fgets($handle)) !== false) {
$values = array_filter($match_product); $values = array_filter($match_product);
$product = str_replace('factory', 'tumbleweed', strtolower(next($values))); $product = str_replace('factory', 'tumbleweed', strtolower(next($values)));
if (!isset($total_product[$product])) $total_product[$product] = 0; if (!isset($total_product[$product])) {
$total_product[$product] = 0;
if (product_filter($product)) {
$packages[$product] = get_packages_list($product);
if (is_null($packages[$product])) {
$packages[$product] = &$fallback_packages;
}
}
}
$total_product[$product] += 1; $total_product[$product] += 1;
if (preg_match(REGEX_RPM_NAME, $match[3], $match_rpm_name)) { if (product_filter($product) && preg_match(REGEX_RPM_NAME, $match[3], $match_rpm_name)) {
$package = find_substring($packages, $match_rpm_name[1]); $package = find_substring($packages[$product], $match_rpm_name[1]);
if ($package) { if ($package) {
if (!isset($total_package_product[$product])) $total_package_product[$product] = []; if (!isset($total_package_product[$product])) $total_package_product[$product] = [];
if (!isset($total_package_product[$product][$package])) $total_package_product[$product][$package] = 0; if (!isset($total_package_product[$product][$package])) $total_package_product[$product][$package] = 0;

5431
metrics/access/packages/10.2 Normal file

File diff suppressed because it is too large Load Diff

6770
metrics/access/packages/10.3 Normal file

File diff suppressed because it is too large Load Diff

7360
metrics/access/packages/11.0 Normal file

File diff suppressed because it is too large Load Diff

13500
metrics/access/packages/11.1 Normal file

File diff suppressed because it is too large Load Diff

15069
metrics/access/packages/11.2 Normal file

File diff suppressed because it is too large Load Diff

15573
metrics/access/packages/11.3 Normal file

File diff suppressed because it is too large Load Diff

17559
metrics/access/packages/11.4 Normal file

File diff suppressed because it is too large Load Diff

20157
metrics/access/packages/12.1 Normal file

File diff suppressed because it is too large Load Diff

23596
metrics/access/packages/12.2 Normal file

File diff suppressed because it is too large Load Diff

32531
metrics/access/packages/12.3 Normal file

File diff suppressed because it is too large Load Diff

35175
metrics/access/packages/13.1 Normal file

File diff suppressed because it is too large Load Diff

38377
metrics/access/packages/13.2 Normal file

File diff suppressed because it is too large Load Diff

53189
metrics/access/packages/15.0 Normal file

File diff suppressed because it is too large Load Diff

53396
metrics/access/packages/15.1 Normal file

File diff suppressed because it is too large Load Diff

58047
metrics/access/packages/15.2 Normal file

File diff suppressed because it is too large Load Diff

37901
metrics/access/packages/42.1 Normal file

File diff suppressed because it is too large Load Diff

42681
metrics/access/packages/42.2 Normal file

File diff suppressed because it is too large Load Diff

49369
metrics/access/packages/42.3 Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

8
metrics/access/utils.php Normal file
View File

@ -0,0 +1,8 @@
<?php
// Product identifiers that are tracked: openSUSE 10.2-10.3, 11.0-11.4,
// 12.1-12.3, 13.1-13.2, Leap 42.1-42.3, Leap 15.0-15.2 and tumbleweed.
// 15.2 is included because a packages/15.2 list ships with the access metrics.
// NOTE(review): this also admits 15.2 wherever else product_filter() is used — confirm.
const PRODUCT_PATTERN = '/^(10\.[2-3]|11\.[0-4]|12\.[1-3]|13\.[1-2]|42\.[1-3]|15\.[0-2]|tumbleweed)$/';

/**
 * Tell whether a product identifier is one of the tracked products.
 *
 * @param string $product Product identifier, e.g. "15.1" or "tumbleweed".
 * @return bool True when $product matches PRODUCT_PATTERN exactly.
 */
function product_filter($product)
{
    return preg_match(PRODUCT_PATTERN, $product) === 1;
}