From 4a8d89ae49aa17d1634a2816c8d159f533a07eae Mon Sep 17 00:00:00 2001 From: Matheus Castanho Date: Wed, 27 Nov 2019 10:18:10 -0300 Subject: [PATCH] Add optimized slide_hash for Power Considerable time is spent on deflate.c:slide_hash() during deflate. This commit introduces a new slide_hash function that uses VSX vector instructions to slide 8 hash elements at a time, instead of just one as the standard code does. The choice between the optimized and default versions is made only on the first call to the function, enabling a fallback to standard behavior if the host processor does not support VSX instructions, so the same binary can be used for multiple Power processor versions. Author: Matheus Castanho --- CMakeLists.txt | 3 +- Makefile.in | 8 ++++ configure | 4 +- contrib/power/power.h | 3 ++ contrib/power/slide_hash_power8.c | 63 +++++++++++++++++++++++++++++ contrib/power/slide_hash_resolver.c | 15 +++++++ deflate.c | 12 ++++++ 7 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 contrib/power/slide_hash_power8.c create mode 100644 contrib/power/slide_hash_resolver.c Index: zlib-1.2.13/CMakeLists.txt =================================================================== --- zlib-1.2.13.orig/CMakeLists.txt +++ zlib-1.2.13/CMakeLists.txt @@ -174,7 +174,8 @@ if(CMAKE_COMPILER_IS_GNUCC) add_definitions(-DZ_POWER8) set(ZLIB_POWER8 contrib/power/adler32_power8.c - contrib/power/crc32_z_power8.c) + contrib/power/crc32_z_power8.c + contrib/power/slide_hash_power8.c) set_source_files_properties( ${ZLIB_POWER8} Index: zlib-1.2.13/Makefile.in =================================================================== --- zlib-1.2.13.orig/Makefile.in +++ zlib-1.2.13/Makefile.in @@ -185,6 +185,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32- deflate.o: $(SRCDIR)deflate.c $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c +slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c + $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ 
$(SRCDIR)contrib/power/slide_hash_power8.c + infback.o: $(SRCDIR)infback.c $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c @@ -252,6 +255,11 @@ deflate.lo: $(SRCDIR)deflate.c $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c -@mv objs/deflate.o $@ +slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c + -@mv objs/slide_hash_power8.o $@ + infback.lo: $(SRCDIR)infback.c -@mkdir objs 2>/dev/null || test -d objs $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c Index: zlib-1.2.13/configure =================================================================== --- zlib-1.2.13.orig/configure +++ zlib-1.2.13/configure @@ -898,8 +898,8 @@ if tryboth $CC -c $CFLAGS $test.c; then if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then POWER8="-DZ_POWER8" - PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo" - OBJC="${OBJC} adler32_power8.o crc32_z_power8.o" + PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo slide_hash_power8.lo" + OBJC="${OBJC} adler32_power8.o crc32_z_power8.o slide_hash_power8.o" echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log else echo "Checking for -mcpu=power8 support... No." 
| tee -a configure.log Index: zlib-1.2.13/contrib/power/power.h =================================================================== --- zlib-1.2.13.orig/contrib/power/power.h +++ zlib-1.2.13/contrib/power/power.h @@ -4,7 +4,10 @@ */ #include "../../zconf.h" #include "../../zutil.h" +#include "../../deflate.h" uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len); unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t); + +void _slide_hash_power8(deflate_state *s); Index: zlib-1.2.13/contrib/power/slide_hash_power8.c =================================================================== --- /dev/null +++ zlib-1.2.13/contrib/power/slide_hash_power8.c @@ -0,0 +1,63 @@ + /* Copyright (C) 2019 Matheus Castanho , IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <altivec.h> +#include "../../deflate.h" + +local inline void slide_hash_power8_loop OF((deflate_state *s, + unsigned n_elems, Posf *table_end)) __attribute__((always_inline)); + +local void slide_hash_power8_loop( + deflate_state *s, + unsigned n_elems, + Posf *table_end) +{ + vector unsigned short vw, vm, *vp; + unsigned chunks; + + /* Each vector register (chunk) corresponds to 128 bits == 8 Posf, + * so instead of processing each of the n_elems in the hash table + * individually, we can do it in chunks of 8 with vector instructions. + * + * This function is only called from _slide_hash_power8(), and both calls + * pass n_elems as a power of 2 higher than 2^7, as defined by + * deflateInit2_(), so n_elems will always be a multiple of 8. */ + chunks = n_elems >> 3; + Assert(n_elems % 8 == 0, "Weird hash table size!"); + + /* This type casting is safe since s->w_size is always <= 64KB + * as defined by deflateInit2_() and Posf == unsigned short */ + vw[0] = (Posf) s->w_size; + vw = vec_splat(vw,0); + + vp = (vector unsigned short *) table_end; + + do { + /* Processing 8 elements at a time */ + vp--; + vm = *vp; + + /* This is equivalent to: m >= w_size ? 
m - w_size : 0 + * Since we are using a saturated unsigned subtraction, any + * values that are < w_size will be set to 0, while the others + * will be subtracted by w_size. */ + *vp = vec_subs(vm,vw); + } while (--chunks); +}; + +void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s) +{ + unsigned n; + Posf *p; + + n = s->hash_size; + p = &s->head[n]; + slide_hash_power8_loop(s,n,p); + +#ifndef FASTEST + n = s->w_size; + p = &s->prev[n]; + slide_hash_power8_loop(s,n,p); +#endif +} Index: zlib-1.2.13/contrib/power/slide_hash_resolver.c =================================================================== --- /dev/null +++ zlib-1.2.13/contrib/power/slide_hash_resolver.c @@ -0,0 +1,15 @@ +/* Copyright (C) 2019 Matheus Castanho , IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../gcc/zifunc.h" +#include "power.h" + +Z_IFUNC(slide_hash) { +#ifdef Z_POWER8 + if (__builtin_cpu_supports("arch_2_07")) + return _slide_hash_power8; +#endif + + return slide_hash_default; +} Index: zlib-1.2.13/deflate.c =================================================================== --- zlib-1.2.13.orig/deflate.c +++ zlib-1.2.13/deflate.c @@ -204,6 +204,13 @@ local const config configuration_table[1 (unsigned)(s->hash_size - 1)*sizeof(*s->head)); \ } while (0) +#ifdef Z_POWER_OPT +/* Rename function so resolver can use its symbol. The default version will be + * returned by the resolver if the host has no support for an optimized version. + */ +#define slide_hash slide_hash_default +#endif /* Z_POWER_OPT */ + /* =========================================================================== * Slide the hash table when sliding the window down (could be avoided with 32 * bit values at the expense of memory usage). 
We slide even when level == 0 to @@ -235,6 +242,11 @@ local void slide_hash(s) #endif } +#ifdef Z_POWER_OPT +#undef slide_hash +#include "contrib/power/slide_hash_resolver.c" +#endif /* Z_POWER_OPT */ + /* ========================================================================= */ int ZEXPORT deflateInit_(strm, level, version, stream_size) z_streamp strm;