343 lines
11 KiB
Diff
343 lines
11 KiB
Diff
From 772f4bd0f880c4c193ab7da78728f38821572a02 Mon Sep 17 00:00:00 2001
|
|
From: Rogerio Alves <rcardoso@linux.ibm.com>
|
|
Date: Mon, 9 Dec 2019 14:40:53 -0300
|
|
Subject: [PATCH] Adler32 vector optimization for Power.
|
|
|
|
This commit implements a Power (POWER8+) vector optimization for Adler32
|
|
checksum using VSX (vector) instructions. The VSX adler32 checksum is up
|
|
to 10x faster than the adler32 baseline code.
|
|
|
|
Author: Rogerio Alves <rcardoso@linux.ibm.com>
|
|
---
|
|
CMakeLists.txt | 1 +
|
|
Makefile.in | 8 ++
|
|
adler32.c | 11 ++
|
|
configure | 4 +-
|
|
contrib/power/adler32_power8.c | 196 +++++++++++++++++++++++++++++++
|
|
contrib/power/adler32_resolver.c | 15 +++
|
|
contrib/power/power.h | 4 +-
|
|
7 files changed, 236 insertions(+), 3 deletions(-)
|
|
create mode 100644 contrib/power/adler32_power8.c
|
|
create mode 100644 contrib/power/adler32_resolver.c
|
|
|
|
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
|
index 581e1fa6d..c6296ee68 100644
|
|
--- a/CMakeLists.txt
|
|
+++ b/CMakeLists.txt
|
|
@@ -185,6 +185,7 @@ if(CMAKE_COMPILER_IS_GNUCC)
|
|
if(POWER8)
|
|
add_definitions(-DZ_POWER8)
|
|
set(ZLIB_POWER8
|
|
+ contrib/power/adler32_power8.c
|
|
contrib/power/crc32_z_power8.c)
|
|
|
|
set_source_files_properties(
|
|
diff --git a/Makefile.in b/Makefile.in
|
|
index 16943044e..a0ffac860 100644
|
|
--- a/Makefile.in
|
|
+++ b/Makefile.in
|
|
@@ -165,6 +165,9 @@ minigzip64.o: $(SRCDIR)test/minigzip.c $(SRCDIR)zlib.h zconf.h
|
|
adler32.o: $(SRCDIR)adler32.c
|
|
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)adler32.c
|
|
|
|
+adler32_power8.o: $(SRCDIR)contrib/power/adler32_power8.c
|
|
+ $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/adler32_power8.c
|
|
+
|
|
crc32.o: $(SRCDIR)crc32.c
|
|
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
|
|
|
|
@@ -216,6 +219,11 @@ adler32.lo: $(SRCDIR)adler32.c
|
|
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/adler32.o $(SRCDIR)adler32.c
|
|
-@mv objs/adler32.o $@
|
|
|
|
+adler32_power8.lo: $(SRCDIR)contrib/power/adler32_power8.c
|
|
+ -@mkdir objs 2>/dev/null || test -d objs
|
|
+ $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/adler32_power8.o $(SRCDIR)contrib/power/adler32_power8.c
|
|
+ -@mv objs/adler32_power8.o $@
|
|
+
|
|
crc32.lo: $(SRCDIR)crc32.c
|
|
-@mkdir objs 2>/dev/null || test -d objs
|
|
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
|
|
diff --git a/adler32.c b/adler32.c
|
|
index d0be4380a..4bde0fa18 100644
|
|
--- a/adler32.c
|
|
+++ b/adler32.c
|
|
@@ -131,6 +131,12 @@ uLong ZEXPORT adler32_z(adler, buf, len)
|
|
}
|
|
|
|
/* ========================================================================= */
|
|
+
|
|
+#ifdef Z_POWER_OPT
|
|
+/* Rename the default function to avoid naming conflicts */
|
|
+#define adler32 adler32_default
|
|
+#endif /* Z_POWER_OPT */
|
|
+
|
|
uLong ZEXPORT adler32(adler, buf, len)
|
|
uLong adler;
|
|
const Bytef *buf;
|
|
@@ -139,6 +145,11 @@ uLong ZEXPORT adler32(adler, buf, len)
|
|
return adler32_z(adler, buf, len);
|
|
}
|
|
|
|
+#ifdef Z_POWER_OPT
|
|
+#undef adler32
|
|
+#include "contrib/power/adler32_resolver.c"
|
|
+#endif /* Z_POWER_OPT */
|
|
+
|
|
/* ========================================================================= */
|
|
local uLong adler32_combine_(adler1, adler2, len2)
|
|
uLong adler1;
|
|
diff --git a/configure b/configure
|
|
index 914d9f4aa..810a7404d 100755
|
|
--- a/configure
|
|
+++ b/configure
|
|
@@ -879,8 +879,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
|
|
|
|
if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then
|
|
POWER8="-DZ_POWER8"
|
|
- PIC_OBJC="${PIC_OBJC} crc32_z_power8.lo"
|
|
- OBJC="${OBJC} crc32_z_power8.o"
|
|
+ PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo"
|
|
+ OBJC="${OBJC} adler32_power8.o crc32_z_power8.o"
|
|
echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log
|
|
else
|
|
echo "Checking for -mcpu=power8 support... No." | tee -a configure.log
|
|
diff --git a/contrib/power/adler32_power8.c b/contrib/power/adler32_power8.c
|
|
new file mode 100644
|
|
index 000000000..473c39457
|
|
--- /dev/null
|
|
+++ b/contrib/power/adler32_power8.c
|
|
@@ -0,0 +1,196 @@
|
|
+/*
|
|
+ * Adler32 for POWER 8+ using VSX instructions.
|
|
+ *
|
|
+ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
|
|
+ * instructions.
|
|
+ *
|
|
+ * If adler32 processes 1 byte at a time, then before the first iteration s1
|
|
+ * is s1_0 (_n means iteration n), the low 16 bits of the incoming adler
|
|
+ * value (1 for a fresh checksum). So s1_1 = s1_0 + c[1] after the first
|
|
+ * byte, s1_2 = s1_1 + c[2] after the second, and so on. Hence, for
|
|
+ * iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 after
|
|
+ * iteration N.
|
|
+ *
|
|
+ * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_0 + N*c[1] +
|
|
+ * (N-1)*c[2] + ... + c[N]
|
|
+ *
|
|
+ * In a more general way:
|
|
+ *
|
|
+ * s1_N = s1_0 + sum(i=1 to N)c[i]
|
|
+ * s2_N = s2_0 + N*s1_0 + sum(i=1 to N)(N-i+1)*c[i]
|
|
+ *
|
|
+ * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
|
|
+ * can process N bytes at a time we can do this at once.
|
|
+ *
|
|
+ * Since VSX vector registers hold 16 bytes (128 bits), we can process
|
|
+ * 16 bytes at a time; using N = 16 we have:
|
|
+ *
|
|
+ * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
|
|
+ * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
|
|
+ *
|
|
+ * After the first iteration we calculate the adler32 checksum for 16 bytes.
|
|
+ *
|
|
+ * For more background about adler32 please check the RFC:
|
|
+ * https://www.ietf.org/rfc/rfc1950.txt
|
|
+ *
|
|
+ * Copyright (C) 2019 Rogerio Alves <rcardoso@linux.ibm.com>, IBM
|
|
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
|
+ *
|
|
+ */
|
|
+
|
|
+#include "../../zutil.h"
|
|
+#include <altivec.h>
|
|
+
|
|
+/* Largest prime smaller than 65536. */
|
|
+#define BASE 65521U
|
|
+#define NMAX 5552
|
|
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1. */
|
|
+
|
|
+#define DO1(s1,s2,buf,i) {(s1) += buf[(i)]; (s2) += (s1);}
|
|
+#define DO2(s1,s2,buf,i) {DO1(s1,s2,buf,i); DO1(s1,s2,buf,i+1);}
|
|
+#define DO4(s1,s2,buf,i) {DO2(s1,s2,buf,i); DO2(s1,s2,buf,i+2);}
|
|
+#define DO8(s1,s2,buf,i) {DO4(s1,s2,buf,i); DO4(s1,s2,buf,i+4);}
|
|
+#define DO16(s1,s2,buf) {DO8(s1,s2,buf,0); DO8(s1,s2,buf,8);}
|
|
+
|
|
+/* Sum the four unsigned int lanes of __a (modulo 2^32, not saturating). */
|
|
+static inline vector unsigned int vec_sumsu (vector unsigned int __a,
|
|
+                                             vector unsigned int __b) /* __b: incoming value is ignored, used as scratch. */
|
|
+{
|
|
+    __b = vec_sld(__a, __a, 8);   /* Rotate __a by two lanes. */
|
|
+    __b = vec_add(__b, __a);      /* Lane i = a[i] + a[i^2]. */
|
|
+    __a = vec_sld(__b, __b, 4);   /* Rotate the partial sums by one lane. */
|
|
+    __a = vec_add(__a, __b);      /* Every lane now holds the full sum. */
|
|
+
|
|
+    return __a;
|
|
+}
|
|
+
|
|
+uLong ZLIB_INTERNAL _adler32_power8 (uLong adler, const Bytef* buf, uInt len)
|
|
+{   /* Returns adler updated with len bytes from buf; vector code is used only for len >= 64. */
|
|
+    /* A NULL buffer requests the initial Adler-32 value, which is 1. */
|
|
+    if (buf == NULL)
|
|
+        return 1;
|
|
+
|
|
+    unsigned int s1 = adler & 0xffff;          /* Low half: running byte sum. */
|
|
+    unsigned int s2 = (adler >> 16) & 0xffff;  /* High half: running sum of s1. */
|
|
+
|
|
+    /* in case user likes doing a byte at a time, keep it fast */
|
|
+    if (len == 1) {
|
|
+        s1 += buf[0];
|
|
+        if (s1 >= BASE)
|
|
+            s1 -= BASE;
|
|
+        s2 += s1;
|
|
+        if (s2 >= BASE)
|
|
+            s2 -= BASE;
|
|
+        return (s2 << 16) | s1;
|
|
+    }
|
|
+
|
|
+    /* Keep it fast for short length buffers (also covers len == 0). */
|
|
+    if (len < 16) {
|
|
+        while (len--) {
|
|
+            s1 += *buf++;
|
|
+            s2 += s1;
|
|
+        }
|
|
+        if (s1 >= BASE)
|
|
+            s1 -= BASE;
|
|
+        s2 %= BASE;
|
|
+        return (s2 << 16) | s1;
|
|
+    }
|
|
+
|
|
+    /* This is faster than VSX code for len < 64. */
|
|
+    if (len < 64) {
|
|
+        while (len >= 16) {
|
|
+            len -= 16;
|
|
+            DO16(s1,s2,buf);
|
|
+            buf += 16;
|
|
+        }
|
|
+    } else {
|
|
+        /* Use POWER VSX instructions for len >= 64. */
|
|
+        const vector unsigned int v_zeros = { 0 };
|
|
+        const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
|
|
+                                            6, 5, 4, 3, 2, 1};
|
|
+        const vector unsigned char vsh = vec_splat_u8(4);
|
|
+        const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
|
|
+        vector unsigned int vs1 = { 0 };
|
|
+        vector unsigned int vs2 = { 0 };
|
|
+        vector unsigned int vs1_save = { 0 };
|
|
+        vector unsigned int vsum1, vsum2;
|
|
+        vector unsigned char vbuf;
|
|
+        int n;
|
|
+
|
|
+        /* Seed lane 0 with the scalar sums (no 16-byte load from a 4-byte scalar); other lanes stay zero. */
|
|
+        vs1[0] = s1;
|
|
+        vs2[0] = s2;
|
|
+
|
|
+        /* Do length bigger than NMAX in blocks of NMAX size. */
|
|
+        while (len >= NMAX) {
|
|
+            len -= NMAX;
|
|
+            n = NMAX / 16;
|
|
+            do {
|
|
+                vbuf = vec_xl(0, (unsigned char *) buf);
|
|
+                vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
|
|
+                /* sum(i=1 to 16) buf[i]*(16-i+1). */
|
|
+                vsum2 = vec_msum(vbuf, v_mul, v_zeros);
|
|
+                /* Save vs1 as it was before this 16-byte chunk. */
|
|
+                vs1_save = vec_add(vs1_save, vs1);
|
|
+                /* Accumulate the sums. */
|
|
+                vs1 = vec_add(vsum1, vs1);
|
|
+                vs2 = vec_add(vsum2, vs2);
|
|
+
|
|
+                buf += 16;
|
|
+            } while (--n);
|
|
+            /* Once each block of NMAX size: fold the lanes together. */
|
|
+            vs1 = vec_sumsu(vs1, vsum1);
|
|
+            vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
|
|
+            vs2 = vec_add(vs1_save, vs2);
|
|
+            vs2 = vec_sumsu(vs2, vsum2);
|
|
+
|
|
+            /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
|
|
+            vs1[0] = vs1[0] % BASE;
|
|
+            /* vs2[0] = s2_i + 16*s1_save +
|
|
+               sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
|
|
+            vs2[0] = vs2[0] % BASE;
|
|
+
|
|
+            vs1 = vec_and(vs1, vmask); /* Keep only lane 0 for the next block. */
|
|
+            vs2 = vec_and(vs2, vmask);
|
|
+            vs1_save = v_zeros;
|
|
+        }
|
|
+
|
|
+        /* len is now less than NMAX, so a single modulo at the end is enough. */
|
|
+        if (len >= 16) {
|
|
+            while (len >= 16) {
|
|
+                len -= 16;
|
|
+
|
|
+                vbuf = vec_xl(0, (unsigned char *) buf);
|
|
+
|
|
+                vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
|
|
+                /* sum(i=1 to 16) buf[i]*(16-i+1). */
|
|
+                vsum2 = vec_msum(vbuf, v_mul, v_zeros);
|
|
+                /* Save vs1 as it was before this 16-byte chunk. */
|
|
+                vs1_save = vec_add(vs1_save, vs1);
|
|
+                /* Accumulate the sums. */
|
|
+                vs1 = vec_add(vsum1, vs1);
|
|
+                vs2 = vec_add(vsum2, vs2);
|
|
+
|
|
+                buf += 16;
|
|
+            }
|
|
+            /* Since the size will be always less than NMAX we do this once. */
|
|
+            vs1 = vec_sumsu(vs1, vsum1);
|
|
+            vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
|
|
+            vs2 = vec_add(vs1_save, vs2);
|
|
+            vs2 = vec_sumsu(vs2, vsum2);
|
|
+        }
|
|
+        /* Copy result back to s1, s2 (mod 65521). */
|
|
+        s1 = vs1[0] % BASE;
|
|
+        s2 = vs2[0] % BASE;
|
|
+    }
|
|
+
|
|
+    /* Process tail (len < 16). */
|
|
+    while (len--) {
|
|
+        s1 += *buf++;
|
|
+        s2 += s1;
|
|
+    }
|
|
+    s1 %= BASE;
|
|
+    s2 %= BASE;
|
|
+
|
|
+    return (s2 << 16) | s1;
|
|
+}
|
|
diff --git a/contrib/power/adler32_resolver.c b/contrib/power/adler32_resolver.c
|
|
new file mode 100644
|
|
index 000000000..07a1a2cb2
|
|
--- /dev/null
|
|
+++ b/contrib/power/adler32_resolver.c
|
|
@@ -0,0 +1,15 @@
|
|
+/* Copyright (C) 2019 Rogerio Alves <rcardoso@linux.ibm.com>, IBM
|
|
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
|
+ */
|
|
+
|
|
+#include "../gcc/zifunc.h"
|
|
+#include "power.h"
|
|
+
|
|
+Z_IFUNC(adler32) { /* Chooses which adler32 implementation to return; Z_IFUNC comes from ../gcc/zifunc.h (presumably a GNU ifunc resolver wrapper - confirm there). */
|
|
+#ifdef Z_POWER8
|
|
+ if (__builtin_cpu_supports("arch_2_07")) /* Presumably ISA 2.07, i.e. POWER8 or later - confirm against the GCC manual. */
|
|
+ return _adler32_power8; /* Vector implementation declared in power.h. */
|
|
+#endif
|
|
+
|
|
+ return adler32_default; /* Fallback: adler32.c renames its own adler32 to this under Z_POWER_OPT. */
|
|
+}
|
|
diff --git a/contrib/power/power.h b/contrib/power/power.h
|
|
index 79123aa90..f57c76167 100644
|
|
--- a/contrib/power/power.h
|
|
+++ b/contrib/power/power.h
|
|
@@ -2,7 +2,9 @@
|
|
* 2019 Rogerio Alves <rogerio.alves@ibm.com>, IBM
|
|
* For conditions of distribution and use, see copyright notice in zlib.h
|
|
*/
|
|
-
|
|
#include "../../zconf.h"
|
|
+#include "../../zutil.h"
|
|
+
|
|
+uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len);
|
|
|
|
unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
|