218 lines
7.9 KiB
Diff
218 lines
7.9 KiB
Diff
|
From 4a8d89ae49aa17d1634a2816c8d159f533a07eae Mon Sep 17 00:00:00 2001
|
||
|
From: Matheus Castanho <msc@linux.ibm.com>
|
||
|
Date: Wed, 27 Nov 2019 10:18:10 -0300
|
||
|
Subject: [PATCH] Add optimized slide_hash for Power
|
||
|
|
||
|
Considerable time is spent on deflate.c:slide_hash() during
|
||
|
deflate. This commit introduces a new slide_hash function that
|
||
|
uses VSX vector instructions to slide 8 hash elements at a time,
|
||
|
instead of just one as the standard code does.
|
||
|
|
||
|
The choice between the optimized and default versions is made only
|
||
|
on the first call to the function, enabling a fallback to standard
|
||
|
behavior if the host processor does not support VSX instructions,
|
||
|
so the same binary can be used for multiple Power processor
|
||
|
versions.
|
||
|
|
||
|
Author: Matheus Castanho <msc@linux.ibm.com>
|
||
|
---
|
||
|
CMakeLists.txt | 3 +-
|
||
|
Makefile.in | 8 ++++
|
||
|
configure | 4 +-
|
||
|
contrib/power/power.h | 3 ++
|
||
|
contrib/power/slide_hash_power8.c | 63 +++++++++++++++++++++++++++++
|
||
|
contrib/power/slide_hash_resolver.c | 15 +++++++
|
||
|
deflate.c | 12 ++++++
|
||
|
7 files changed, 105 insertions(+), 3 deletions(-)
|
||
|
create mode 100644 contrib/power/slide_hash_power8.c
|
||
|
create mode 100644 contrib/power/slide_hash_resolver.c
|
||
|
|
||
|
Index: zlib-1.2.13/CMakeLists.txt
|
||
|
===================================================================
|
||
|
--- zlib-1.2.13.orig/CMakeLists.txt
|
||
|
+++ zlib-1.2.13/CMakeLists.txt
|
||
|
@@ -174,7 +174,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
|
||
|
add_definitions(-DZ_POWER8)
|
||
|
set(ZLIB_POWER8
|
||
|
contrib/power/adler32_power8.c
|
||
|
- contrib/power/crc32_z_power8.c)
|
||
|
+ contrib/power/crc32_z_power8.c
|
||
|
+ contrib/power/slide_hash_power8.c)
|
||
|
|
||
|
set_source_files_properties(
|
||
|
${ZLIB_POWER8}
|
||
|
Index: zlib-1.2.13/Makefile.in
|
||
|
===================================================================
|
||
|
--- zlib-1.2.13.orig/Makefile.in
|
||
|
+++ zlib-1.2.13/Makefile.in
|
||
|
@@ -185,6 +185,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32-
|
||
|
deflate.o: $(SRCDIR)deflate.c
|
||
|
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
|
||
|
|
||
|
+slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c
|
||
|
+ $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c
|
||
|
+
|
||
|
infback.o: $(SRCDIR)infback.c
|
||
|
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c
|
||
|
|
||
|
@@ -252,6 +255,11 @@ deflate.lo: $(SRCDIR)deflate.c
|
||
|
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
|
||
|
-@mv objs/deflate.o $@
|
||
|
|
||
|
+slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c
|
||
|
+ -@mkdir objs 2>/dev/null || test -d objs
|
||
|
+ $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c
|
||
|
+ -@mv objs/slide_hash_power8.o $@
|
||
|
+
|
||
|
infback.lo: $(SRCDIR)infback.c
|
||
|
-@mkdir objs 2>/dev/null || test -d objs
|
||
|
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c
|
||
|
Index: zlib-1.2.13/configure
|
||
|
===================================================================
|
||
|
--- zlib-1.2.13.orig/configure
|
||
|
+++ zlib-1.2.13/configure
|
||
|
@@ -898,8 +898,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
|
||
|
|
||
|
if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then
|
||
|
POWER8="-DZ_POWER8"
|
||
|
- PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo"
|
||
|
- OBJC="${OBJC} adler32_power8.o crc32_z_power8.o"
|
||
|
+ PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo slide_hash_power8.lo"
|
||
|
+ OBJC="${OBJC} adler32_power8.o crc32_z_power8.o slide_hash_power8.o"
|
||
|
echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log
|
||
|
else
|
||
|
echo "Checking for -mcpu=power8 support... No." | tee -a configure.log
|
||
|
Index: zlib-1.2.13/contrib/power/power.h
|
||
|
===================================================================
|
||
|
--- zlib-1.2.13.orig/contrib/power/power.h
|
||
|
+++ zlib-1.2.13/contrib/power/power.h
|
||
|
@@ -4,7 +4,10 @@
|
||
|
*/
|
||
|
#include "../../zconf.h"
|
||
|
#include "../../zutil.h"
|
||
|
+#include "../../deflate.h"
|
||
|
|
||
|
uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len);
|
||
|
|
||
|
unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
|
||
|
+
|
||
|
+void _slide_hash_power8(deflate_state *s);
|
||
|
Index: zlib-1.2.13/contrib/power/slide_hash_power8.c
|
||
|
===================================================================
|
||
|
--- /dev/null
|
||
|
+++ zlib-1.2.13/contrib/power/slide_hash_power8.c
|
||
|
@@ -0,0 +1,63 @@
|
||
|
+ /* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||
|
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||
|
+ */
|
||
|
+
|
||
|
+#include <altivec.h>
|
||
|
+#include "../../deflate.h"
|
||
|
+
|
||
|
+local inline void slide_hash_power8_loop OF((deflate_state *s,
|
||
|
+ unsigned n_elems, Posf *table_end)) __attribute__((always_inline));
|
||
|
+
|
||
|
+local void slide_hash_power8_loop(
|
||
|
+ deflate_state *s,
|
||
|
+ unsigned n_elems,
|
||
|
+ Posf *table_end)
|
||
|
+{
|
||
|
+ vector unsigned short vw, vm, *vp;
|
||
|
+ unsigned chunks;
|
||
|
+
|
||
|
+ /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
|
||
|
+ * so instead of processing each of the n_elems in the hash table
|
||
|
+ * individually, we can do it in chunks of 8 with vector instructions.
|
||
|
+ *
|
||
|
+ * This function is only called from _slide_hash_power8(), and both calls
|
||
|
+ * pass n_elems as a power of 2 higher than 2^7, as defined by
|
||
|
+ * deflateInit2_(), so n_elems will always be a multiple of 8. */
|
||
|
+ chunks = n_elems >> 3;
|
||
|
+ Assert(n_elems % 8 == 0, "Weird hash table size!");
|
||
|
+
|
||
|
+ /* This type casting is safe since s->w_size is always <= 32KB
|
||
|
+ * as defined by deflateInit2_() and Posf == unsigned short */
|
||
|
+ vw[0] = (Posf) s->w_size;
|
||
|
+ vw = vec_splat(vw,0);
|
||
|
+
|
||
|
+ vp = (vector unsigned short *) table_end;
|
||
|
+
|
||
|
+ do {
|
||
|
+ /* Processing 8 elements at a time */
|
||
|
+ vp--;
|
||
|
+ vm = *vp;
|
||
|
+
|
||
|
+ /* This is equivalent to: m >= w_size ? m - w_size : 0
|
||
|
+ * Since we are using a saturated unsigned subtraction, any
|
||
|
+ * values that are < w_size will be set to 0, while the others
|
||
|
+ * will have w_size subtracted from them. */
|
||
|
+ *vp = vec_subs(vm,vw);
|
||
|
+ } while (--chunks);
|
||
|
+};
|
||
|
+
|
||
|
+void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s)
|
||
|
+{
|
||
|
+ unsigned n;
|
||
|
+ Posf *p;
|
||
|
+
|
||
|
+ n = s->hash_size;
|
||
|
+ p = &s->head[n];
|
||
|
+ slide_hash_power8_loop(s,n,p);
|
||
|
+
|
||
|
+#ifndef FASTEST
|
||
|
+ n = s->w_size;
|
||
|
+ p = &s->prev[n];
|
||
|
+ slide_hash_power8_loop(s,n,p);
|
||
|
+#endif
|
||
|
+}
|
||
|
Index: zlib-1.2.13/contrib/power/slide_hash_resolver.c
|
||
|
===================================================================
|
||
|
--- /dev/null
|
||
|
+++ zlib-1.2.13/contrib/power/slide_hash_resolver.c
|
||
|
@@ -0,0 +1,15 @@
|
||
|
+/* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||
|
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||
|
+ */
|
||
|
+
|
||
|
+#include "../gcc/zifunc.h"
|
||
|
+#include "power.h"
|
||
|
+
|
||
|
+Z_IFUNC(slide_hash) {
|
||
|
+#ifdef Z_POWER8
|
||
|
+ if (__builtin_cpu_supports("arch_2_07"))
|
||
|
+ return _slide_hash_power8;
|
||
|
+#endif
|
||
|
+
|
||
|
+ return slide_hash_default;
|
||
|
+}
|
||
|
Index: zlib-1.2.13/deflate.c
|
||
|
===================================================================
|
||
|
--- zlib-1.2.13.orig/deflate.c
|
||
|
+++ zlib-1.2.13/deflate.c
|
||
|
@@ -204,6 +204,13 @@ local const config configuration_table[1
|
||
|
(unsigned)(s->hash_size - 1)*sizeof(*s->head)); \
|
||
|
} while (0)
|
||
|
|
||
|
+#ifdef Z_POWER_OPT
|
||
|
+/* Rename function so resolver can use its symbol. The default version will be
|
||
|
+ * returned by the resolver if the host has no support for an optimized version.
|
||
|
+ */
|
||
|
+#define slide_hash slide_hash_default
|
||
|
+#endif /* Z_POWER_OPT */
|
||
|
+
|
||
|
/* ===========================================================================
|
||
|
* Slide the hash table when sliding the window down (could be avoided with 32
|
||
|
* bit values at the expense of memory usage). We slide even when level == 0 to
|
||
|
@@ -235,6 +242,11 @@ local void slide_hash(s)
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
+#ifdef Z_POWER_OPT
|
||
|
+#undef slide_hash
|
||
|
+#include "contrib/power/slide_hash_resolver.c"
|
||
|
+#endif /* Z_POWER_OPT */
|
||
|
+
|
||
|
/* ========================================================================= */
|
||
|
int ZEXPORT deflateInit_(strm, level, version, stream_size)
|
||
|
z_streamp strm;
|