From aecdff0646c7e188b48f6db285d8d63a74f246c1 Mon Sep 17 00:00:00 2001
From: Matheus Castanho
Date: Tue, 29 Oct 2019 18:04:11 -0300
Subject: [PATCH] Add vectorized longest_match for Power

This commit introduces an optimized version of the longest_match
function for Power processors. It uses VSX instructions to match
16 bytes at a time on each comparison, instead of one by one.

Author: Matheus Castanho
---
 CMakeLists.txt                         |   3 +-
 Makefile.in                            |   8 +
 configure                              |   4 +-
 contrib/power/longest_match_power9.c   | 194 +++++++++++++++++++++++++
 contrib/power/longest_match_resolver.c |  15 ++
 contrib/power/power.h                  |   2 +
 deflate.c                              |  13 ++
 7 files changed, 236 insertions(+), 3 deletions(-)
 create mode 100644 contrib/power/longest_match_power9.c
 create mode 100644 contrib/power/longest_match_resolver.c

Index: zlib-1.2.12/CMakeLists.txt
===================================================================
--- zlib-1.2.12.orig/CMakeLists.txt
+++ zlib-1.2.12/CMakeLists.txt
@@ -199,7 +199,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
 
     if(POWER9)
         add_definitions(-DZ_POWER9)
-        set(ZLIB_POWER9 )
+        set(ZLIB_POWER9
+          contrib/power/longest_match_power9.c)
 
         set_source_files_properties(
           ${ZLIB_POWER9}
Index: zlib-1.2.12/Makefile.in
===================================================================
--- zlib-1.2.12.orig/Makefile.in
+++ zlib-1.2.12/Makefile.in
@@ -189,6 +189,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32-
 deflate.o: $(SRCDIR)deflate.c
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
 
+longest_match_power9.o: $(SRCDIR)contrib/power/longest_match_power9.c
+	$(CC) $(CFLAGS) -mcpu=power9 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/longest_match_power9.c
+
 slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c
 	$(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c
 
@@ -259,6 +262,11 @@ deflate.lo: $(SRCDIR)deflate.c
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
 	-@mv objs/deflate.o $@
 
+longest_match_power9.lo: $(SRCDIR)contrib/power/longest_match_power9.c
+	-@mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) -mcpu=power9 $(ZINC) -DPIC -c -o objs/longest_match_power9.o $(SRCDIR)contrib/power/longest_match_power9.c
+	-@mv objs/longest_match_power9.o $@
+
 slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c
 	-@mkdir objs 2>/dev/null || test -d objs
 	$(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c
Index: zlib-1.2.12/configure
===================================================================
--- zlib-1.2.12.orig/configure
+++ zlib-1.2.12/configure
@@ -915,8 +915,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
 
   if tryboth $CC -c $CFLAGS -mcpu=power9 $test.c; then
     POWER9="-DZ_POWER9"
-    PIC_OBJC="${PIC_OBJC}"
-    OBJC="${OBJC}"
+    PIC_OBJC="$PIC_OBJC longest_match_power9.lo"
+    OBJC="$OBJC longest_match_power9.o"
     echo "Checking for -mcpu=power9 support... Yes." | tee -a configure.log
   else
     echo "Checking for -mcpu=power9 support... No." | tee -a configure.log
Index: zlib-1.2.12/contrib/power/longest_match_power9.c
===================================================================
--- /dev/null
+++ zlib-1.2.12/contrib/power/longest_match_power9.c
@@ -0,0 +1,194 @@
+/* Copyright (C) 2019 Matheus Castanho , IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "../../deflate.h"
+
+local inline int vec_match OF((Bytef* scan, Bytef* match))
+    __attribute__((always_inline));
+
+local inline int vec_match(Bytef* scan, Bytef* match)
+{
+    vector unsigned char vscan, vmatch, vc;
+    int len;
+
+    vscan  = *((vector unsigned char *) scan);
+    vmatch = *((vector unsigned char *) match);
+
+    /* Compare 16 bytes at a time.
+     * Each byte of vc will be either all ones or all zeroes,
+     * depending on the result of the comparison
+     */
+    vc = (vector unsigned char) vec_cmpne(vscan,vmatch);
+
+    /* Since the index of matching bytes will contain only zeroes
+     * on vc (since we used cmpne), counting the number of consecutive
+     * bytes where LSB == 0 is the same as counting the length of the match.
+     *
+     * There was an issue in the way the vec_cnttz_lsbb builtin was implemented
+     * that got fixed on GCC 12, but now we have to use different builtins
+     * depending on the compiler version. To avoid that, let's use inline asm to
+     * generate the exact instruction we need.
+     */
+    #ifdef __LITTLE_ENDIAN__
+    asm volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc));
+    #else
+    asm volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc));
+    #endif
+
+    return len;
+}
+
+uInt ZLIB_INTERNAL _longest_match_power9(deflate_state *s, IPos cur_match)
+{
+    unsigned chain_length = s->max_chain_length;/* max hash chain length */
+    register Bytef *scan = s->window + s->strstart; /* current string */
+    register Bytef *match;                      /* matched string */
+    register int len;                           /* length of current match */
+    int best_len = (int)s->prev_length;         /* best match length so far */
+    int nice_match = s->nice_match;             /* stop if match long enough */
+    int mbytes;                                 /* matched bytes inside loop */
+    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
+        s->strstart - (IPos)MAX_DIST(s) : 0;
+    /* Stop when cur_match becomes <= limit. To simplify the code,
+     * we prevent matches with the string of window index 0.
+     */
+    Posf *prev = s->prev;
+    uInt wmask = s->w_mask;
+
+#if (MAX_MATCH == 258)
+    /* Compare the last two bytes at once. */
+    register Bytef *strend2 = s->window + s->strstart + MAX_MATCH - 2;
+    register ush scan_end   = *(ushf*)(scan+best_len-1);
+#else
+    register Bytef *strend  = s->window + s->strstart + MAX_MATCH;
+    register Byte scan_end1 = scan[best_len-1];
+    register Byte scan_end  = scan[best_len];
+#endif
+
+    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+     * It is easy to get rid of this optimization if necessary.
+     */
+    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+    /* Do not waste too much time if we already have a good match: */
+    if (s->prev_length >= s->good_match) {
+        chain_length >>= 2;
+    }
+    /* Do not look for matches beyond the end of the input. This is necessary
+     * to make deflate deterministic.
+     */
+    if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead;
+
+    Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+    do {
+        Assert(cur_match < s->strstart, "no future");
+        match = s->window + cur_match;
+
+        /* Skip to next match if the match length cannot increase
+         * or if the match length is less than 2. Note that the checks below
+         * for insufficient lookahead only occur occasionally for performance
+         * reasons. Therefore uninitialized memory will be accessed, and
+         * conditional jumps will be made that depend on those values.
+         * However the length of the match is limited to the lookahead, so
+         * the output of deflate is not affected by the uninitialized values.
+         */
+
+/* MAX_MATCH - 2 should be a multiple of 16 for this optimization to work. */
+#if (MAX_MATCH == 258)
+
+        /* Compare ending (2 bytes) and beginning of potential match.
+         *
+         * On Power processors, loading a 16-byte vector takes only 1 extra
+         * cycle compared to a regular byte load. So instead of comparing the
+         * first two bytes and then the rest later if they match, we can compare
+         * the first 16 at once, and when we have a match longer than 2, we will
+         * already have the result of comparing the first 16 bytes saved in mbytes.
+         */
+        if (*(ushf*)(match+best_len-1) != scan_end ||
+             (mbytes = vec_match(scan,match)) < 3) continue;
+
+        scan  += mbytes;
+        match += mbytes;
+
+        /* In case when we may have a match longer than 16, we perform further
+         * comparisons in chunks of 16 and keep going while all bytes match.
+         */
+        while(mbytes == 16) {
+            mbytes = vec_match(scan,match);
+            scan += mbytes;
+            match += mbytes;
+
+            /* We also have to limit the maximum match based on MAX_MATCH.
+             * Since we are comparing 16 bytes at a time and MAX_MATCH == 258 (to
+             * comply with default implementation), we should stop comparing when
+             * we have matched 256 bytes, which happens when scan == strend2.
+             * In this ("rare") case, we have to check the remaining 2 bytes
+             * individually using common load and compare operations.
+             */
+            if(scan >= strend2) {
+                if(*scan == *match) {
+                    if(*++scan == *++match)
+                        scan++;
+                }
+                break;
+            }
+        }
+
+        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+        len = (MAX_MATCH - 2) - (int)(strend2 - scan);
+        scan = strend2 - (MAX_MATCH - 2);
+
+#else /* MAX_MATCH == 258 */
+
+        if (match[best_len]   != scan_end  ||
+            match[best_len-1] != scan_end1 ||
+            *match            != *scan     ||
+            *++match          != scan[1])      continue;
+
+        /* The check at best_len-1 can be removed because it will be made
+         * again later. (This heuristic is not always a win.)
+         * It is not necessary to compare scan[2] and match[2] since they
+         * are always equal when the other bytes match, given that
+         * the hash keys are equal and that HASH_BITS >= 8.
+         */
+        scan += 2, match++;
+        Assert(*scan == *match, "match[2]?");
+
+        /* We check for insufficient lookahead only every 8th comparison;
+         * the 256th check will be made at strstart+258.
+         */
+        do {
+        } while (*++scan == *++match && *++scan == *++match &&
+                 *++scan == *++match && *++scan == *++match &&
+                 *++scan == *++match && *++scan == *++match &&
+                 *++scan == *++match && *++scan == *++match &&
+                 scan < strend);
+
+        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+        len = MAX_MATCH - (int)(strend - scan);
+        scan = strend - MAX_MATCH;
+
+#endif /* MAX_MATCH == 258 */
+
+        if (len > best_len) {
+            s->match_start = cur_match;
+            best_len = len;
+            if (len >= nice_match) break;
+#if (MAX_MATCH == 258)
+            scan_end = *(ushf*)(scan+best_len-1);
+#else
+            scan_end1  = scan[best_len-1];
+            scan_end   = scan[best_len];
+#endif
+        }
+    } while ((cur_match = prev[cur_match & wmask]) > limit
+             && --chain_length != 0);
+
+    if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
+    return s->lookahead;
+}
Index: zlib-1.2.12/contrib/power/longest_match_resolver.c
===================================================================
--- /dev/null
+++ zlib-1.2.12/contrib/power/longest_match_resolver.c
@@ -0,0 +1,15 @@
+/* Copyright (C) 2019 Matheus Castanho , IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../gcc/zifunc.h"
+#include "power.h"
+
+Z_IFUNC(longest_match) {
+#ifdef Z_POWER9
+    if (__builtin_cpu_supports("arch_3_00"))
+        return _longest_match_power9;
+#endif
+
+    return longest_match_default;
+}
Index: zlib-1.2.12/contrib/power/power.h
===================================================================
--- zlib-1.2.12.orig/contrib/power/power.h
+++ zlib-1.2.12/contrib/power/power.h
@@ -10,4 +10,6 @@ uLong _adler32_power8(uLong adler, const
 
 unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
 
+uInt _longest_match_power9(deflate_state *s, IPos cur_match);
+
 void _slide_hash_power8(deflate_state *s);
Index: zlib-1.2.12/deflate.c
===================================================================
--- zlib-1.2.12.orig/deflate.c
+++ zlib-1.2.12/deflate.c
@@ -1309,6 +1309,14 @@ local void lm_init (s)
 /* For 80x86 and 680x0, an optimized version will be provided in match.asm or
  * match.S. The code will be functionally equivalent.
  */
+
+#ifdef Z_POWER_OPT
+/* Rename function so resolver can use its symbol. The default version will be
+ * returned by the resolver if the host has no support for an optimized version.
+ */
+#define longest_match longest_match_default
+#endif /* Z_POWER_OPT */
+
 local uInt longest_match(s, pcur_match)
     deflate_state *s;
     IPos pcur_match;                             /* current match */
@@ -1454,6 +1462,11 @@ local uInt longest_match(s, pcur_match)
 }
 #endif /* ASMV */
 
+#ifdef Z_POWER_OPT
+#undef longest_match
+#include "contrib/power/longest_match_resolver.c"
+#endif /* Z_POWER_OPT */
+
 #else /* FASTEST */
 
 /* ---------------------------------------------------------------------------