From f999eaa4d31b710aa9ff5ec993c6d9084936ff9c Mon Sep 17 00:00:00 2001 From: Witek Bedyk Date: Fri, 26 Jan 2024 10:22:41 +0100 Subject: [PATCH] Add IPv4 and IPv6 handling in one file Access logs for both protocols are now stored in a single file. The change refactors the code to handle it properly. Upgrade InfluxDB client. --- metrics/access/aggregate.php | 145 ++++++++++++++++++++++++++--------- metrics/access/ingest.php | 70 ++++++++++------- 2 files changed, 150 insertions(+), 65 deletions(-) diff --git a/metrics/access/aggregate.php b/metrics/access/aggregate.php index d7140b86..d93d40c6 100755 --- a/metrics/access/aggregate.php +++ b/metrics/access/aggregate.php @@ -1,11 +1,12 @@ #!/usr/bin/php sub(date_interval_create_from_date_string('1 day')); $source_map = [ 'ipv4' => [ + // the first item defines the starting date for aggregation '2023-01-01' => false, - //'2014-04-14' => sprintf(LANGLEY, 2) . '/' . VHOST, - //'2017-12-04' => sprintf(LANGLEY, 3) . '/' . VHOST, - '2023-06-18' => BACKUP . '/' . VHOST, - $begin->format('Y-m-d') => PONTIFEX . '/' . VHOST, + '2023-11-13' => DOWNLOAD_OPENSUSE_ORG . '/' . VHOST, 'filename' => FILENAME, ], 'ipv6' => [ '2012-12-31' => false, - //'2017-12-04' => sprintf(LANGLEY, 3) . '/' . IPV6_PREFIX . VHOST, - '2023-06-18' => BACKUP . '/' . IPV6_PREFIX . VHOST, - $begin->format('Y-m-d') => PONTIFEX . '/' . IPV6_PREFIX . VHOST, + '2023-11-13' => DOWNLOAD_OPENSUSE_ORG . '/' . IPV6_PREFIX . VHOST, 'filename' => IPV6_PREFIX . FILENAME, ], + 'ipv4+6' => [ + '2023-11-13' => false, + $begin->format('Y-m-d') => DOWNLOAD_OPENSUSE_ORG . '/' . VHOST, + 'filename' => FILENAME, + ], ]; $end = new DateTime(key($source_map['ipv4'])); // decide about adding one day +$migration_date = new DateTime(key($source_map['ipv4+6'])); $period_reversed = date_period_reversed($end, '1 day', $begin); error_log('begin: ' . 
$begin->format('Y-m-d')); @@ -53,9 +56,11 @@ function cache_init() foreach (PROTOCOLS as $protocol) { mkdir("$CACHE_DIR/$protocol", 0755, true); } + mkdir("$CACHE_DIR/ipv4+6", 00755, true); // Avoid packaging mess while still automating, but not ideal. - passthru('cd ' . escapeshellarg($CACHE_DIR) . ' && composer require influxdb/influxdb-php ~1'); + passthru('cd ' . escapeshellarg($CACHE_DIR) . + ' && composer require influxdata/influxdb-client-php:~3.4 guzzlehttp/guzzle'); } require "$CACHE_DIR/vendor/autoload.php"; @@ -68,9 +73,10 @@ function ingest_all($period_reversed, $source_map) $found = []; // Walk backwards until found in cache. foreach ($period_reversed as $date) { - $date_string = $date->format('Y-m-d'); + $date_string = print_date($date); + $protocols_on_day = get_protocols($date); - foreach (PROTOCOLS as $protocol) { + foreach ($protocols_on_day as $protocol) { if (!empty($found[$protocol])) continue; if (isset($source_map[$protocol][$date_string])) $source[$protocol] = $source_map[$protocol][$date_string]; @@ -78,7 +84,7 @@ function ingest_all($period_reversed, $source_map) // Skip date+protocol if no source is available. 
if (empty($source[$protocol])) continue; - $cache_file = "$CACHE_DIR/$protocol/$date_string.json"; + $cache_file = get_cache_file($protocol, $date); if (file_exists($cache_file)) { error_log("[$date_string] [$protocol] found"); $found[$protocol] = true; @@ -88,7 +94,8 @@ function ingest_all($period_reversed, $source_map) } } - if (count($found) == count(PROTOCOLS)) { + // Stop when all cache files were found + if (count($found) == count($protocols_on_day)) { error_log('ingest initialization complete'); break; } @@ -98,6 +105,47 @@ function ingest_all($period_reversed, $source_map) subprocess_wait(1, 1); } +function print_date($date) +{ + return $date->format('Y-m-d'); +} + +// Logs before migration date have been kept in separate files for IPv4 and IPv6 addresses +function has_separate_protocol_logs($date) +{ + global $migration_date; + if ($date > $migration_date) + return false; + else + return true; +} + +function get_cache_file($protocol, $date) +{ + global $CACHE_DIR; + if (has_separate_protocol_logs($date)) + return "$CACHE_DIR/$protocol/" . print_date($date) . ".json"; + else + return "$CACHE_DIR/ipv4+6/" . print_date($date) . ".json"; +} + +function get_cache_files($date) +{ + $files = []; + foreach (get_protocols($date) as $protocol) + array_push($files, get_cache_file($protocol, $date)); + + return $files; +} + +function get_protocols($date) +{ + if (has_separate_protocol_logs($date)) + return PROTOCOLS; + else + return array("ipv4+6"); +} + function ingest($date, $source, $filename, $destination) { $url = implode('/', [ @@ -107,7 +155,7 @@ function ingest($date, $source, $filename, $destination) sprintf($filename, $date->format('Ymd')), ]); $command = implode(' ', [ - 'curl -s', + 'curl -s --digest --netrc', escapeshellarg($url), '| xzcat', '| ' . __DIR__ . 
'/ingest.php', @@ -159,14 +207,14 @@ function aggregate_all($period) $merged_protocol = []; $date_previous = null; foreach ($period as $date) { - $date_string = $date->format('Y-m-d'); + $date_string = print_date($date); $data = null; foreach (PROTOCOLS as $protocol) { - $cache_file = "$CACHE_DIR/$protocol/$date_string.json"; + $cache_file = get_cache_file($protocol, $date); if (!file_exists($cache_file) or !filesize($cache_file)) continue; - error_log("[$date_string] [$protocol] load cache"); + error_log("[$date_string]" . (has_separate_protocol_logs($date) ? " [$protocol]" : "") . " load cache"); $data_new = json_decode(file_get_contents($cache_file), true); if (!$data_new) { error_log('ERROR: failed to load ' . $cache_file); @@ -174,17 +222,31 @@ function aggregate_all($period) exit(1); } + if (isset($data_new[$protocol])) { + // new cache files have 'ipv4' and 'ipv6' array keys + $data_protocol = $data_new[$protocol]; + // we don't want to count 'total_invalid' and 'bytes' twice + if ($data) { + $data_protocol['total_invalid'] = 0; + $data_protocol['bytes'] = 0; + } else { + $data_protocol['total_invalid'] = $data_new['total_invalid']; + $data_protocol['bytes'] = $data_new['bytes']; + } + } + else + $data_protocol = $data_new; if (!isset($merged_protocol[$protocol])) $merged_protocol[$protocol] = []; - $data_new['days'] = 1; - normalize($data_new); - aggregate($intervals, $merged_protocol[$protocol], $date, $date_previous, $data_new, + $data_protocol['days'] = 1; + normalize($data_protocol); + aggregate($intervals, $merged_protocol[$protocol], $date, $date_previous, $data_protocol, ['protocol' => $protocol], 'protocol'); if ($data) { - merge($data, $data_new); + merge($data, $data_protocol); $data['days'] = 1; } else { - $data = $data_new; + $data = $data_protocol; } } @@ -213,12 +275,12 @@ function aggregate_all($period) function aggregate($intervals, &$merged, $date, $date_previous, $data, $tags = [], $prefix = 'access') { foreach ($intervals as $interval => 
$format) { - if ($interval == 'FQ') { + if ($interval === 'FQ') { $value = format_FQ($date); if (isset($date_previous)) $value_previous = format_FQ($date_previous); } - elseif ($interval == 'FY') { + elseif ($interval === 'FY') { $value = format_FY($date); if (isset($date_previous)) $value_previous = format_FY($date_previous); @@ -231,7 +293,7 @@ function aggregate($intervals, &$merged, $date, $date_previous, $data, $tags = [ if (!isset($merged[$interval]) || $value != $merged[$interval]['value']) { if (!empty($merged[$interval]['data'])) { $summary = summarize($merged[$interval]['data']); - if ($prefix == 'protocol') { + if ($prefix === 'protocol') { $summary = ['-' => $summary['-']]; } $flavors = []; @@ -248,7 +310,7 @@ function aggregate($intervals, &$merged, $date, $date_previous, $data, $tags = [ $count += write_flavors($interval, $date_previous, $flavors); } - if ($prefix == 'access') { + if ($prefix === 'access') { $summary = summarize_product_plus_key($merged[$interval]['data']['total_image_product']); $count += write_summary_product_plus_key($interval, $date_previous, $summary, 'image'); } @@ -313,7 +375,7 @@ function merge(&$data1, $data2) if (empty($data1['total_product'][$product])) $data1['total_product'][$product] = 0; - $data1['total_product'][$product] += $data2['total_product'][$product]; + $data1['total_product'][$product] += $total; } merge_unique_products($data1['unique_product'], $data2['unique_product']); @@ -451,8 +513,7 @@ function write_summary($interval, DateTime $value, $summary, $tags = [], $prefix $measurement = $prefix . '_' . 
$interval; $points = []; foreach ($summary as $product => $fields) { - $points[] = new Point($measurement, null, - ['product' => $product] + $tags, $fields, $value->getTimestamp()); + $points[] = new Point($measurement, ['product' => $product] + $tags, $fields, $value->getTimestamp()); } write($points); return count($points); @@ -465,7 +526,8 @@ function write_flavors($interval, DateTime $value, $flavors) foreach ($flavors as $product => $unique_flavors) { foreach($unique_flavors as $flavor => $unique_count) { $tags = ['product' => $product, 'flavor' => $flavor]; - $points[] = new Point($measurement, $unique_count, $tags, [], $value->getTimestamp()); + $fields = ['value' => $unique_count]; + $points[] = new Point($measurement, $tags, $fields, $value->getTimestamp()); } } write($points); @@ -478,7 +540,7 @@ function write_summary_product_plus_key($interval, DateTime $date, $summary, $pr $points = []; foreach ($summary as $product => $pairs) { foreach ($pairs as $key => $value) { - $points[] = new Point($measurement, null, + $points[] = new Point($measurement, ['product' => $product, 'key' => $key], ['value' => $value], $date->getTimestamp()); } } @@ -488,13 +550,20 @@ function write_summary_product_plus_key($interval, DateTime $date, $summary, $pr function write($points) { - static $database = null; + static $client; + static $writeApi; - if (!$database) { - $database = InfluxDB\Client::fromDSN('influxdb://0.0.0.0:8086/osrt_access'); - // $database->drop(); - // $database->create(); + if (!$client) { + $client = new Client([ + "url" => "http://localhost:8086", + "token" => "", + "bucket" => "osrt_access/autogen", + "org" => "-", + "precision" => InfluxDB2\Model\WritePrecision::S + ]); + $writeApi = $client->createWriteApi(); } - if (!$database->writePoints($points, Database::PRECISION_SECONDS)) die('failed to write points'); + if (!is_null($writeApi->write($points))) + die('failed to write points'); } diff --git a/metrics/access/ingest.php 
b/metrics/access/ingest.php index d331f2bb..15c1d464 100755 --- a/metrics/access/ingest.php +++ b/metrics/access/ingest.php @@ -4,16 +4,21 @@ const REGEX_LINE = '/(\S+) \S+ \S+ \[([^:]+:\d+:\d+:\d+ [^\]]+)\] "(\S+)(?: (\S+) \S+)?" (\S+) (\S+) "[^"]*" "[^"]*" .* (?:size:|want:- give:- \d+ )(\S+) \S+(?: +"?(\S+-\S+-\S+-\S+-[^\s"]+|-)"? "?(dvd|ftp|mini|usb-[^"]*|livecd-[^"]*|appliance-?[^"]*|-)"?)?/'; const REGEX_PRODUCT = '#/(?:(tumbleweed)|distribution/(?:leap/)?(\d+\.\d+)|openSUSE(?:_|:/)(?:leap(?:_|:/))?(factory|tumbleweed|\d+\.\d+))#i'; const REGEX_IMAGE = '#(?:/(?:iso|live)/[^/]+-(DVD|NET|GNOME-Live|KDE-Live|Rescue-CD|Kubic-DVD)-[^/]+\.iso(?:\.torrent)?|/jeos/[^/]+-(JeOS)\.[^/]+\.(?:qcow2|vhdx|vmdk|vmx)$)#'; +const REGEX_IPV4 = '/^((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.?\b){4}$/'; +const PROTOCOLS = ['ipv4', 'ipv6']; -$total = 0; $total_invalid = 0; -$total_product = []; -$unique_product = []; -$total_image_product = []; +foreach (PROTOCOLS as $protocol) { + $total[$protocol] = 0; + $total_product[$protocol] = []; + $unique_product[$protocol] = []; + $total_image_product[$protocol] = []; +} $file = $argc == 2 ? $argv[1] : 'php://stdin'; $handle = fopen($file, 'r'); while (($line = fgets($handle)) !== false) { + $protocol = ''; if (!preg_match(REGEX_LINE, $line, $match)) { error_log('[failed to parse] ' . rtrim($line)); $total_invalid++; @@ -24,7 +29,12 @@ while (($line = fgets($handle)) !== false) { if ($match[3] != 'GET' && $match[3] != 'HEAD') continue; // Not interested on errors. if ($match[5] >= '400') continue; - $total++; + + if (preg_match(REGEX_IPV4, $match[1])) + $protocol = 'ipv4'; + else + $protocol = 'ipv6'; + $total[$protocol]++; // Attempt to determine for which product was the request. 
if (!preg_match(REGEX_PRODUCT, $match[4], $match_product)) { @@ -35,47 +45,53 @@ while (($line = fgets($handle)) !== false) { $values = array_filter($match_product); $product = str_replace('factory', 'tumbleweed', strtolower(next($values))); - if (!isset($total_product[$product])) $total_product[$product] = 0; - $total_product[$product] += 1; + if (!isset($total_product[$protocol][$product])) $total_product[$protocol][$product] = 0; + $total_product[$protocol][$product] += 1; if (count($match) == 10 && $match[8] != '-') { $uuid = $match[8]; - if (!isset($unique_product[$product])) $unique_product[$product] = []; - if (!isset($unique_product[$product][$uuid])) { - $unique_product[$product][$uuid] = [ + if (!isset($unique_product[$protocol][$product])) $unique_product[$protocol][$product] = []; + if (!isset($unique_product[$protocol][$product][$uuid])) { + $unique_product[$protocol][$product][$uuid] = [ 'count' => 0, 'flavor' => $match[9], 'ip' => $match[1], ]; } - $unique_product[$product][$uuid]['count'] += 1; + $unique_product[$protocol][$product][$uuid]['count'] += 1; } if (preg_match(REGEX_IMAGE, $match[4], $match_image)) { // Remove empty match groups and select non-all match. $values = array_filter($match_image); $image = next($values); - if (!isset($total_image_product[$product])) $total_image_product[$product] = []; - if (!isset($total_image_product[$product][$image])) $total_image_product[$product][$image] = 0; - $total_image_product[$product][$image] += 1; + if (!isset($total_image_product[$protocol][$product])) $total_image_product[$protocol][$product] = []; + if (!isset($total_image_product[$protocol][$product][$image])) $total_image_product[$protocol][$product][$image] = 0; + $total_image_product[$protocol][$product][$image] += 1; } } $position = ftell($handle); fclose($handle); error_log('processed ' . number_format($position) . ' bytes'); -error_log('found ' . number_format($total) . ' requests across ' . - number_format(count($total_product)) . 
' products'); +error_log('found ' . number_format(array_sum($total)) . ' requests across ' . + number_format(array_sum(array_map('count', $total_product))) . ' products'); -ksort($total_product); -ksort($unique_product); -if ($position) { - echo json_encode([ - 'total' => $total, - 'total_product' => $total_product, - 'unique_product' => $unique_product, - 'total_image_product' => $total_image_product, - 'total_invalid' => $total_invalid, - 'bytes' => $position, - ]) . "\n"; // JSON_PRETTY_PRINT for debugging. +$output = [ + 'total_invalid' => $total_invalid, + 'bytes' => $position +]; +foreach (PROTOCOLS as $protocol) { + ksort($total_product[$protocol]); + ksort($unique_product[$protocol]); + $output[$protocol] = [ + 'total' => $total[$protocol], + 'total_product' => $total_product[$protocol], + 'unique_product' => $unique_product[$protocol], + 'total_image_product' => $total_image_product[$protocol] + ]; +} + +if ($position) { + echo json_encode($output) . "\n"; // JSON_PRETTY_PRINT for debugging. }