SHA256
3
0
forked from pool/zlib
zlib/zlib-1.2.12-add-vectorized-longest_match-for-power.patch

339 lines
13 KiB
Diff
Raw Normal View History

From aecdff0646c7e188b48f6db285d8d63a74f246c1 Mon Sep 17 00:00:00 2001
From: Matheus Castanho <msc@linux.ibm.com>
Date: Tue, 29 Oct 2019 18:04:11 -0300
Subject: [PATCH] Add vectorized longest_match for Power
This commit introduces an optimized version of the longest_match
function for Power processors. It uses VSX instructions to match
16 bytes at a time on each comparison, instead of one by one.
Author: Matheus Castanho <msc@linux.ibm.com>
---
CMakeLists.txt | 3 +-
Makefile.in | 8 +
configure | 4 +-
contrib/power/longest_match_power9.c | 194 +++++++++++++++++++++++++
contrib/power/longest_match_resolver.c | 15 ++
contrib/power/power.h | 2 +
deflate.c | 13 ++
7 files changed, 236 insertions(+), 3 deletions(-)
create mode 100644 contrib/power/longest_match_power9.c
create mode 100644 contrib/power/longest_match_resolver.c
Index: zlib-1.2.12/CMakeLists.txt
===================================================================
--- zlib-1.2.12.orig/CMakeLists.txt
+++ zlib-1.2.12/CMakeLists.txt
@@ -199,7 +199,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
if(POWER9)
add_definitions(-DZ_POWER9)
- set(ZLIB_POWER9 )
+ set(ZLIB_POWER9
+ contrib/power/longest_match_power9.c)
set_source_files_properties(
${ZLIB_POWER9}
Index: zlib-1.2.12/Makefile.in
===================================================================
--- zlib-1.2.12.orig/Makefile.in
+++ zlib-1.2.12/Makefile.in
@@ -189,6 +189,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32-
deflate.o: $(SRCDIR)deflate.c
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
+longest_match_power9.o: $(SRCDIR)contrib/power/longest_match_power9.c
+ $(CC) $(CFLAGS) -mcpu=power9 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/longest_match_power9.c
+
slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c
$(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c
@@ -259,6 +262,11 @@ deflate.lo: $(SRCDIR)deflate.c
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
-@mv objs/deflate.o $@
+longest_match_power9.lo: $(SRCDIR)contrib/power/longest_match_power9.c
+ -@mkdir objs 2>/dev/null || test -d objs
+ $(CC) $(SFLAGS) -mcpu=power9 $(ZINC) -DPIC -c -o objs/longest_match_power9.o $(SRCDIR)contrib/power/longest_match_power9.c
+ -@mv objs/longest_match_power9.o $@
+
slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c
Index: zlib-1.2.12/configure
===================================================================
--- zlib-1.2.12.orig/configure
+++ zlib-1.2.12/configure
@@ -915,8 +915,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
if tryboth $CC -c $CFLAGS -mcpu=power9 $test.c; then
POWER9="-DZ_POWER9"
- PIC_OBJC="${PIC_OBJC}"
- OBJC="${OBJC}"
+ PIC_OBJC="$PIC_OBJC longest_match_power9.lo"
+ OBJC="$OBJC longest_match_power9.o"
echo "Checking for -mcpu=power9 support... Yes." | tee -a configure.log
else
echo "Checking for -mcpu=power9 support... No." | tee -a configure.log
Index: zlib-1.2.12/contrib/power/longest_match_power9.c
===================================================================
--- /dev/null
+++ zlib-1.2.12/contrib/power/longest_match_power9.c
@@ -0,0 +1,194 @@
+/* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "../../deflate.h"
+
+local inline int vec_match OF((Bytef* scan, Bytef* match))
+ __attribute__((always_inline));
+
+local inline int vec_match(Bytef* scan, Bytef* match)
+{
+ vector unsigned char vscan, vmatch, vc;
+ int len;
+
+ vscan = *((vector unsigned char *) scan);
+ vmatch = *((vector unsigned char *) match);
+
+ /* Compare 16 bytes at a time.
+ * Each byte of vc will be either all ones or all zeroes,
+ * depending on the result of the comparison
+ */
+ vc = (vector unsigned char) vec_cmpne(vscan,vmatch);
+
+ /* Since the index of matching bytes will contain only zeroes
+ * on vc (since we used cmpne), counting the number of consecutive
+ * bytes where LSB == 0 is the same as counting the length of the match.
+ *
+ * There was an issue in the way the vec_cnttz_lsbb builtin was implemented
+ * that got fixed on GCC 12, but now we have to use different builtins
+ * depending on the compiler version. To avoid that, let's use inline asm to
+ * generate the exact instruction we need.
+ */
+ #ifdef __LITTLE_ENDIAN__
+ asm volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc));
+ #else
+ asm volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc));
+ #endif
+
+ return len;
+}
+
+uInt ZLIB_INTERNAL _longest_match_power9(deflate_state *s, IPos cur_match)
+{
+ unsigned chain_length = s->max_chain_length;/* max hash chain length */
+ register Bytef *scan = s->window + s->strstart; /* current string */
+ register Bytef *match; /* matched string */
+ register int len; /* length of current match */
+ int best_len = (int)s->prev_length; /* best match length so far */
+ int nice_match = s->nice_match; /* stop if match long enough */
+ int mbytes; /* matched bytes inside loop */
+ IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
+ s->strstart - (IPos)MAX_DIST(s) : 0;
+ /* Stop when cur_match becomes <= limit. To simplify the code,
+ * we prevent matches with the string of window index 0.
+ */
+ Posf *prev = s->prev;
+ uInt wmask = s->w_mask;
+
+#if (MAX_MATCH == 258)
+ /* Compare the last two bytes at once. */
+ register Bytef *strend2 = s->window + s->strstart + MAX_MATCH - 2;
+ register ush scan_end = *(ushf*)(scan+best_len-1);
+#else
+ register Bytef *strend = s->window + s->strstart + MAX_MATCH;
+ register Byte scan_end1 = scan[best_len-1];
+ register Byte scan_end = scan[best_len];
+#endif
+
+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+ * It is easy to get rid of this optimization if necessary.
+ */
+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+ /* Do not waste too much time if we already have a good match: */
+ if (s->prev_length >= s->good_match) {
+ chain_length >>= 2;
+ }
+ /* Do not look for matches beyond the end of the input. This is necessary
+ * to make deflate deterministic.
+ */
+ if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead;
+
+ Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+ do {
+ Assert(cur_match < s->strstart, "no future");
+ match = s->window + cur_match;
+
+ /* Skip to next match if the match length cannot increase
+ * or if the match length is less than 2. Note that the checks below
+ * for insufficient lookahead only occur occasionally for performance
+ * reasons. Therefore uninitialized memory will be accessed, and
+ * conditional jumps will be made that depend on those values.
+ * However the length of the match is limited to the lookahead, so
+ * the output of deflate is not affected by the uninitialized values.
+ */
+
+/* MAX_MATCH - 2 should be a multiple of 16 for this optimization to work. */
+#if (MAX_MATCH == 258)
+
+ /* Compare ending (2 bytes) and beginning of potential match.
+ *
+ * On Power processors, loading a 16-byte vector takes only 1 extra
+ * cycle compared to a regular byte load. So instead of comparing the
+ * first two bytes and then the rest later if they match, we can compare
+ * the first 16 at once, and when we have a match longer than 2, we will
+ * already have the result of comparing the first 16 bytes saved in mbytes.
+ */
+ if (*(ushf*)(match+best_len-1) != scan_end ||
+ (mbytes = vec_match(scan,match)) < 3) continue;
+
+ scan += mbytes;
+ match += mbytes;
+
+ /* In case when we may have a match longer than 16, we perform further
+ * comparisons in chunks of 16 and keep going while all bytes match.
+ */
+ while(mbytes == 16) {
+ mbytes = vec_match(scan,match);
+ scan += mbytes;
+ match += mbytes;
+
+ /* We also have to limit the maximum match based on MAX_MATCH.
+ * Since we are comparing 16 bytes at a time and MAX_MATCH == 258 (to
+ * comply with default implementation), we should stop comparing when
+ * we have matched 256 bytes, which happens when scan == strend2.
+ * In this ("rare") case, we have to check the remaining 2 bytes
+ * individually using common load and compare operations.
+ */
+ if(scan >= strend2) {
+ if(*scan == *match) {
+ if(*++scan == *++match)
+ scan++;
+ }
+ break;
+ }
+ }
+
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+ len = (MAX_MATCH - 2) - (int)(strend2 - scan);
+ scan = strend2 - (MAX_MATCH - 2);
+
+#else /* MAX_MATCH == 258 */
+
+ if (match[best_len] != scan_end ||
+ match[best_len-1] != scan_end1 ||
+ *match != *scan ||
+ *++match != scan[1]) continue;
+
+ /* The check at best_len-1 can be removed because it will be made
+ * again later. (This heuristic is not always a win.)
+ * It is not necessary to compare scan[2] and match[2] since they
+ * are always equal when the other bytes match, given that
+ * the hash keys are equal and that HASH_BITS >= 8.
+ */
+ scan += 2, match++;
+ Assert(*scan == *match, "match[2]?");
+
+ /* We check for insufficient lookahead only every 8th comparison;
+ * the 256th check will be made at strstart+258.
+ */
+ do {
+ } while (*++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ *++scan == *++match && *++scan == *++match &&
+ scan < strend);
+
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+ len = MAX_MATCH - (int)(strend - scan);
+ scan = strend - MAX_MATCH;
+
+#endif /* MAX_MATCH == 258 */
+
+ if (len > best_len) {
+ s->match_start = cur_match;
+ best_len = len;
+ if (len >= nice_match) break;
+#if (MAX_MATCH == 258)
+ scan_end = *(ushf*)(scan+best_len-1);
+#else
+ scan_end1 = scan[best_len-1];
+ scan_end = scan[best_len];
+#endif
+ }
+ } while ((cur_match = prev[cur_match & wmask]) > limit
+ && --chain_length != 0);
+
+ if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
+ return s->lookahead;
+}
Index: zlib-1.2.12/contrib/power/longest_match_resolver.c
===================================================================
--- /dev/null
+++ zlib-1.2.12/contrib/power/longest_match_resolver.c
@@ -0,0 +1,15 @@
+/* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../gcc/zifunc.h"
+#include "power.h"
+
+Z_IFUNC(longest_match) {
+#ifdef Z_POWER9
+ if (__builtin_cpu_supports("arch_3_00"))
+ return _longest_match_power9;
+#endif
+
+ return longest_match_default;
+}
Index: zlib-1.2.12/contrib/power/power.h
===================================================================
--- zlib-1.2.12.orig/contrib/power/power.h
+++ zlib-1.2.12/contrib/power/power.h
@@ -10,4 +10,6 @@ uLong _adler32_power8(uLong adler, const
unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
+uInt _longest_match_power9(deflate_state *s, IPos cur_match);
+
void _slide_hash_power8(deflate_state *s);
Index: zlib-1.2.12/deflate.c
===================================================================
--- zlib-1.2.12.orig/deflate.c
+++ zlib-1.2.12/deflate.c
@@ -1309,6 +1309,14 @@ local void lm_init (s)
/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
* match.S. The code will be functionally equivalent.
*/
+
+#ifdef Z_POWER_OPT
+/* Rename function so resolver can use its symbol. The default version will be
+ * returned by the resolver if the host has no support for an optimized version.
+ */
+#define longest_match longest_match_default
+#endif /* Z_POWER_OPT */
+
local uInt longest_match(s, pcur_match)
deflate_state *s;
IPos pcur_match; /* current match */
@@ -1454,6 +1462,11 @@ local uInt longest_match(s, pcur_match)
}
#endif /* ASMV */
+#ifdef Z_POWER_OPT
+#undef longest_match
+#include "contrib/power/longest_match_resolver.c"
+#endif /* Z_POWER_OPT */
+
#else /* FASTEST */
/* ---------------------------------------------------------------------------