zlib/zlib-1.2.12-add-optimized-slide_hash-for-power.patch

From 4a8d89ae49aa17d1634a2816c8d159f533a07eae Mon Sep 17 00:00:00 2001
From: Matheus Castanho <msc@linux.ibm.com>
Date: Wed, 27 Nov 2019 10:18:10 -0300
Subject: [PATCH] Add optimized slide_hash for Power

Considerable time is spent on deflate.c:slide_hash() during
deflate. This commit introduces a new slide_hash function that
uses VSX vector instructions to slide 8 hash elements at a time,
instead of just one as the standard code does.

The choice between the optimized and default versions is made only
on the first call to the function, enabling a fallback to standard
behavior if the host processor does not support VSX instructions,
so the same binary can be used for multiple Power processor
versions.

Author: Matheus Castanho <msc@linux.ibm.com>
---
 CMakeLists.txt                      |  3 +-
 Makefile.in                         |  8 ++++
 configure                           |  4 +-
 contrib/power/power.h               |  3 ++
 contrib/power/slide_hash_power8.c   | 63 +++++++++++++++++++++++++++++
 contrib/power/slide_hash_resolver.c | 15 +++++++
 deflate.c                           | 12 ++++++
 7 files changed, 105 insertions(+), 3 deletions(-)
 create mode 100644 contrib/power/slide_hash_power8.c
 create mode 100644 contrib/power/slide_hash_resolver.c

Index: zlib-1.2.13/CMakeLists.txt
===================================================================
--- zlib-1.2.13.orig/CMakeLists.txt
+++ zlib-1.2.13/CMakeLists.txt
@@ -174,7 +174,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
                 add_definitions(-DZ_POWER8)
                 set(ZLIB_POWER8
                   contrib/power/adler32_power8.c
-                  contrib/power/crc32_z_power8.c)
+                  contrib/power/crc32_z_power8.c
+                  contrib/power/slide_hash_power8.c)
 
                 set_source_files_properties(
                     ${ZLIB_POWER8}
Index: zlib-1.2.13/Makefile.in
===================================================================
--- zlib-1.2.13.orig/Makefile.in
+++ zlib-1.2.13/Makefile.in
@@ -185,6 +185,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32-
 deflate.o: $(SRCDIR)deflate.c
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
 
+slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c
+	$(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c
+
 infback.o: $(SRCDIR)infback.c
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c
 
@@ -252,6 +255,11 @@ deflate.lo: $(SRCDIR)deflate.c
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
 	-@mv objs/deflate.o $@
 
+slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c
+	-@mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c
+	-@mv objs/slide_hash_power8.o $@
+
 infback.lo: $(SRCDIR)infback.c
 	-@mkdir objs 2>/dev/null || test -d objs
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c
Index: zlib-1.2.13/configure
===================================================================
--- zlib-1.2.13.orig/configure
+++ zlib-1.2.13/configure
@@ -898,8 +898,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
 
   if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then
     POWER8="-DZ_POWER8"
-    PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo"
-    OBJC="${OBJC} adler32_power8.o crc32_z_power8.o"
+    PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo slide_hash_power8.lo"
+    OBJC="${OBJC} adler32_power8.o crc32_z_power8.o slide_hash_power8.o"
     echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log
   else
     echo "Checking for -mcpu=power8 support... No." | tee -a configure.log
Index: zlib-1.2.13/contrib/power/power.h
===================================================================
--- zlib-1.2.13.orig/contrib/power/power.h
+++ zlib-1.2.13/contrib/power/power.h
@@ -4,7 +4,10 @@
  */
 #include "../../zconf.h"
 #include "../../zutil.h"
+#include "../../deflate.h"
 
 uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len);
 
 unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
+
+void _slide_hash_power8(deflate_state *s);
Index: zlib-1.2.13/contrib/power/slide_hash_power8.c
===================================================================
--- /dev/null
+++ zlib-1.2.13/contrib/power/slide_hash_power8.c
@@ -0,0 +1,63 @@
+ /* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+  * For conditions of distribution and use, see copyright notice in zlib.h
+  */
+
+#include <altivec.h>
+#include "../../deflate.h"
+
+local inline void slide_hash_power8_loop OF((deflate_state *s,
+      unsigned n_elems, Posf *table_end)) __attribute__((always_inline));
+
+local void slide_hash_power8_loop(
+    deflate_state *s,
+    unsigned n_elems,
+    Posf *table_end)
+{
+    vector unsigned short vw, vm, *vp;
+    unsigned chunks;
+
+    /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
+     * so instead of processing each of the n_elems in the hash table
+     * individually, we can do it in chunks of 8 with vector instructions.
+     *
+     * This function is only called from _slide_hash_power8(), and both calls
+     * pass n_elems as a power of 2 higher than 2^7, as defined by
+     * deflateInit2_(), so n_elems will always be a multiple of 8. */
+    chunks = n_elems >> 3;
+    Assert(n_elems % 8 == 0, "Weird hash table size!");
+
+    /* This type casting is safe since s->w_size is always <= 64KB
+     * as defined by deflateInit2_() and Posf == unsigned short.
+     * vec_splats() broadcasts the scalar to all 8 lanes directly. */
+    vw = vec_splats((Posf) s->w_size);
+
+    vp = (vector unsigned short *) table_end;
+
+    do {
+        /* Processing 8 elements at a time */
+        vp--;
+        vm = *vp;
+
+        /* This is equivalent to: m >= w_size ? m - w_size : 0
+         * Since we are using a saturated unsigned subtraction, any
+         * values that are <= w_size will be set to 0, while the others
+         * will be reduced by w_size. */
+        *vp = vec_subs(vm,vw);
+    } while (--chunks);
+}
+
+/* Slide both hash tables of s down by w_size using the vector loop:
+ * head[] always, and prev[] unless zlib is built with FASTEST. Each
+ * table is handed over as its length plus a pointer one past its end. */
+void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s)
+{
+    unsigned head_len = s->hash_size;
+
+    slide_hash_power8_loop(s, head_len, &s->head[head_len]);
+#ifndef FASTEST
+    {
+        unsigned prev_len = s->w_size;
+        slide_hash_power8_loop(s, prev_len, &s->prev[prev_len]);
+    }
+#endif
+}
Index: zlib-1.2.13/contrib/power/slide_hash_resolver.c
===================================================================
--- /dev/null
+++ zlib-1.2.13/contrib/power/slide_hash_resolver.c
@@ -0,0 +1,15 @@
+/* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../gcc/zifunc.h"
+#include "power.h"
+
+Z_IFUNC(slide_hash) { /* resolver: chooses the slide_hash implementation */
+#ifdef Z_POWER8
+    if (__builtin_cpu_supports("arch_2_07"))
+        return _slide_hash_power8;
+#endif /* Z_POWER8 */
+    /* Otherwise fall back to the default C version renamed in deflate.c. */
+    return slide_hash_default;
+}
Index: zlib-1.2.13/deflate.c
===================================================================
--- zlib-1.2.13.orig/deflate.c
+++ zlib-1.2.13/deflate.c
@@ -204,6 +204,13 @@ local const config configuration_table[1
                  (unsigned)(s->hash_size - 1)*sizeof(*s->head)); \
     } while (0)
 
+#ifdef Z_POWER_OPT
+/* Rename function so resolver can use its symbol. The default version will be
+ * returned by the resolver if the host has no support for an optimized version.
+ */
+#define slide_hash slide_hash_default
+#endif /* Z_POWER_OPT */
+
 /* ===========================================================================
  * Slide the hash table when sliding the window down (could be avoided with 32
  * bit values at the expense of memory usage). We slide even when level == 0 to
@@ -235,6 +242,11 @@ local void slide_hash(s)
 #endif
 }
 
+#ifdef Z_POWER_OPT
+#undef slide_hash
+#include "contrib/power/slide_hash_resolver.c"
+#endif /* Z_POWER_OPT */
+
 /* ========================================================================= */
 int ZEXPORT deflateInit_(strm, level, version, stream_size)
     z_streamp strm;
Sync from SUSE:ALP:Source:Standard:1.0 zlib revision 36f39b6d9d335ce320e489faa36e0710 2023-07-31 13:47:19 +02:00
			`From 4a8d89ae49aa17d1634a2816c8d159f533a07eae Mon Sep 17 00:00:00 2001`
			`From: Matheus Castanho <msc@linux.ibm.com>`
			`Date: Wed, 27 Nov 2019 10:18:10 -0300`
			`Subject: [PATCH] Add optimized slide_hash for Power`

			`Considerable time is spent on deflate.c:slide_hash() during`
			`deflate. This commit introduces a new slide_hash function that`
			`uses VSX vector instructions to slide 8 hash elements at a time,`
			`instead of just one as the standard code does.`

			`The choice between the optimized and default versions is made only`
			`on the first call to the function, enabling a fallback to standard`
			`behavior if the host processor does not support VSX instructions,`
			`so the same binary can be used for multiple Power processor`
			`versions.`

			`Author: Matheus Castanho <msc@linux.ibm.com>`
			`---`
			`CMakeLists.txt \| 3 +-`
			`Makefile.in \| 8 ++++`
			`configure \| 4 +-`
			`contrib/power/power.h \| 3 ++`
			`contrib/power/slide_hash_power8.c \| 63 +++++++++++++++++++++++++++++`
			`contrib/power/slide_hash_resolver.c \| 15 +++++++`
			`deflate.c \| 12 ++++++`
			`7 files changed, 105 insertions(+), 3 deletions(-)`
			`create mode 100644 contrib/power/slide_hash_power8.c`
			`create mode 100644 contrib/power/slide_hash_resolver.c`

			`Index: zlib-1.2.13/CMakeLists.txt`
			`===================================================================`
			`--- zlib-1.2.13.orig/CMakeLists.txt`
			`+++ zlib-1.2.13/CMakeLists.txt`
			`@@ -174,7 +174,8 @@ if(CMAKE_COMPILER_IS_GNUCC)`
			`add_definitions(-DZ_POWER8)`
			`set(ZLIB_POWER8`
			`contrib/power/adler32_power8.c`
			`- contrib/power/crc32_z_power8.c)`
			`+ contrib/power/crc32_z_power8.c`
			`+ contrib/power/slide_hash_power8.c)`

			`set_source_files_properties(`
			`${ZLIB_POWER8}`
			`Index: zlib-1.2.13/Makefile.in`
			`===================================================================`
			`--- zlib-1.2.13.orig/Makefile.in`
			`+++ zlib-1.2.13/Makefile.in`
			`@@ -185,6 +185,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32-`
			`deflate.o: $(SRCDIR)deflate.c`
			`$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c`

			`+slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c`
			`+ $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c`
			`+`
			`infback.o: $(SRCDIR)infback.c`
			`$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c`

			`@@ -252,6 +255,11 @@ deflate.lo: $(SRCDIR)deflate.c`
			`$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c`
			`-@mv objs/deflate.o $@`

			`+slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c`
			`+ -@mkdir objs 2>/dev/null \|\| test -d objs`
			`+ $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c`
			`+ -@mv objs/slide_hash_power8.o $@`
			`+`
			`infback.lo: $(SRCDIR)infback.c`
			`-@mkdir objs 2>/dev/null \|\| test -d objs`
			`$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c`
			`Index: zlib-1.2.13/configure`
			`===================================================================`
			`--- zlib-1.2.13.orig/configure`
			`+++ zlib-1.2.13/configure`
			`@@ -898,8 +898,8 @@ if tryboth $CC -c $CFLAGS $test.c; then`

			`if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then`
			`POWER8="-DZ_POWER8"`
			`- PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo"`
			`- OBJC="${OBJC} adler32_power8.o crc32_z_power8.o"`
			`+ PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo slide_hash_power8.lo"`
			`+ OBJC="${OBJC} adler32_power8.o crc32_z_power8.o slide_hash_power8.o"`
			`echo "Checking for -mcpu=power8 support... Yes." \| tee -a configure.log`
			`else`
			`echo "Checking for -mcpu=power8 support... No." \| tee -a configure.log`
			`Index: zlib-1.2.13/contrib/power/power.h`
			`===================================================================`
			`--- zlib-1.2.13.orig/contrib/power/power.h`
			`+++ zlib-1.2.13/contrib/power/power.h`
			`@@ -4,7 +4,10 @@`
			`*/`
			`#include "../../zconf.h"`
			`#include "../../zutil.h"`
			`+#include "../../deflate.h"`

			`uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len);`

			`unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);`
			`+`
			`+void _slide_hash_power8(deflate_state *s);`
			`Index: zlib-1.2.13/contrib/power/slide_hash_power8.c`
			`===================================================================`
			`--- /dev/null`
			`+++ zlib-1.2.13/contrib/power/slide_hash_power8.c`
			`@@ -0,0 +1,63 @@`
			`+ /* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM`
			`+ * For conditions of distribution and use, see copyright notice in zlib.h`
			`+ */`
			`+`
			`+#include <altivec.h>`
			`+#include "../../deflate.h"`
			`+`
			`+local inline void slide_hash_power8_loop OF((deflate_state *s,`
			`+ unsigned n_elems, Posf *table_end)) __attribute__((always_inline));`
			`+`
			`+local void slide_hash_power8_loop(`
			`+ deflate_state *s,`
			`+ unsigned n_elems,`
			`+ Posf *table_end)`
			`+{`
			`+ vector unsigned short vw, vm, *vp;`
			`+ unsigned chunks;`
			`+`
			`+ /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,`
			`+ * so instead of processing each of the n_elems in the hash table`
			`+ * individually, we can do it in chunks of 8 with vector instructions.`
			`+ *`
			`+ * This function is only called from slide_hash_power8(), and both calls`
			`+ * pass n_elems as a power of 2 higher than 2^7, as defined by`
			`+ * deflateInit2_(), so n_elems will always be a multiple of 8. */`
			`+ chunks = n_elems >> 3;`
			`+ Assert(n_elems % 8 == 0, "Weird hash table size!");`
			`+`
			`+ /* This type casting is safe since s->w_size is always <= 64KB`
			`+ * as defined by deflateInit2_() and Posf == unsigned short */`
			`+ vw[0] = (Posf) s->w_size;`
			`+ vw = vec_splat(vw,0);`
			`+`
			`+ vp = (vector unsigned short *) table_end;`
			`+`
			`+ do {`
			`+ /* Processing 8 elements at a time */`
			`+ vp--;`
			`+ vm = *vp;`
			`+`
			`+ /* This is equivalent to: m >= w_size ? m - w_size : 0`
			`+ * Since we are using a saturated unsigned subtraction, any`
			`+ * values that are <= w_size will be set to 0, while the others`
			`+ * will be reduced by w_size. */`
			`+ *vp = vec_subs(vm,vw);`
			`+ } while (--chunks);`
			`+};`
			`+`
			`+void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s)`
			`+{`
			`+ unsigned n;`
			`+ Posf *p;`
			`+`
			`+ n = s->hash_size;`
			`+ p = &s->head[n];`
			`+ slide_hash_power8_loop(s,n,p);`
			`+`
			`+#ifndef FASTEST`
			`+ n = s->w_size;`
			`+ p = &s->prev[n];`
			`+ slide_hash_power8_loop(s,n,p);`
			`+#endif`
			`+}`
			`Index: zlib-1.2.13/contrib/power/slide_hash_resolver.c`
			`===================================================================`
			`--- /dev/null`
			`+++ zlib-1.2.13/contrib/power/slide_hash_resolver.c`
			`@@ -0,0 +1,15 @@`
			`+/* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM`
			`+ * For conditions of distribution and use, see copyright notice in zlib.h`
			`+ */`
			`+`
			`+#include "../gcc/zifunc.h"`
			`+#include "power.h"`
			`+`
			`+Z_IFUNC(slide_hash) {`
			`+#ifdef Z_POWER8`
			`+ if (__builtin_cpu_supports("arch_2_07"))`
			`+ return _slide_hash_power8;`
			`+#endif`
			`+`
			`+ return slide_hash_default;`
			`+}`
			`Index: zlib-1.2.13/deflate.c`
			`===================================================================`
			`--- zlib-1.2.13.orig/deflate.c`
			`+++ zlib-1.2.13/deflate.c`
			`@@ -204,6 +204,13 @@ local const config configuration_table[1`
			`(unsigned)(s->hash_size - 1)*sizeof(*s->head)); \`
			`} while (0)`

			`+#ifdef Z_POWER_OPT`
			`+/* Rename function so resolver can use its symbol. The default version will be`
			`+ * returned by the resolver if the host has no support for an optimized version.`
			`+ */`
			`+#define slide_hash slide_hash_default`
			`+#endif /* Z_POWER_OPT */`
			`+`
			`/* ===========================================================================`
			`* Slide the hash table when sliding the window down (could be avoided with 32`
			`* bit values at the expense of memory usage). We slide even when level == 0 to`
			`@@ -235,6 +242,11 @@ local void slide_hash(s)`
			`#endif`
			`}`

			`+#ifdef Z_POWER_OPT`
			`+#undef slide_hash`
			`+#include "contrib/power/slide_hash_resolver.c"`
			`+#endif /* Z_POWER_OPT */`
			`+`
			`/* ========================================================================= */`
			`int ZEXPORT deflateInit_(strm, level, version, stream_size)`
			`z_streamp strm;`