Accepting request 1119078 from home:dspinella:branches:devel:libraries:c_c++
- Update to 1.3:
  * Building using K&R (pre-ANSI) function definitions is no longer
    supported.
  * Fixed a bug in deflateBound() for level 0 and memLevel 9.
  * Fixed a bug when gzungetc() is used immediately after gzopen().
  * Fixed a bug when using gzflush() with a very small buffer.
  * Fixed a crash when gzsetparams() is attempted for a transparent write.
  * Fixed test/example.c to work with FORCE_STORED.
  * Fixed minizip to allow it to open an empty zip file.
  * Fixed reading disk number start on zip64 files in minizip.
  * Fixed a logic error in minizip argument processing.
- Added patches:
  * zlib-1.3-IBM-Z-hw-accelerated-deflate-s390x.patch
- Refreshed patches:
  * zlib-1.2.12-add-optimized-slide_hash-for-power.patch
  * zlib-1.2.12-add-vectorized-longest_match-for-power.patch
  * zlib-1.2.12-adler32-vector-optimizations-for-power.patch
  * zlib-1.2.13-optimized-s390.patch
  * zlib-format.patch
  * zlib-no-version-check.patch
- Removed patches:
  * bsc1210593.patch
  * zlib-1.2.13-fix-bug-deflateBound.patch
  * zlib-1.2.12-s390-vectorize-crc32.patch
  * zlib-1.2.13-IBM-Z-hw-accelerated-deflate-s390x.patch
  * zlib-1.2.12-add-optimized-slide_hash-for-power.patch
  * zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch
  * zlib-1.2.12-add-vectorized-longest_match-for-power.patch
  * zlib-1.2.12-adler32-vector-optimizations-for-power.patch
- Fix CVE-2023-45853: integer overflow and resultant heap-based buffer
  overflow in zipOpenNewFileInZip4_64 (bsc#1216378)

OBS-URL: https://build.opensuse.org/request/show/1119078
OBS-URL: https://build.opensuse.org/package/show/devel:libraries:c_c++/zlib?expand=0&rev=95
parent e6f111ebe2
commit 22d97a578b

CVE-2023-45853.patch (new file, 38 lines)
@@ -0,0 +1,38 @@
From 431e66398552effd82d5c0ea982a521821782ebd Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@chromium.org>
Date: Fri, 18 Aug 2023 11:05:33 +0200
Subject: [PATCH] minizip: Check length of comment, filename, and extra field,
 in zipOpenNewFileInZip4_64

These are stored in 16-bit fields in the zip file format. Passing longer
values would generate an invalid file.

Passing very long values could also cause the computation of
zi->ci.size_centralheader to overflow, which would cause heap buffer
overflow on subsequent writes to zi->ci.central_header.
---
 contrib/minizip/zip.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/contrib/minizip/zip.c b/contrib/minizip/zip.c
index 3d3d4cadd..0446109b2 100644
--- a/contrib/minizip/zip.c
+++ b/contrib/minizip/zip.c
@@ -1043,6 +1043,17 @@ extern int ZEXPORT zipOpenNewFileInZip4_64(zipFile file, const char* filename, c
         return ZIP_PARAMERROR;
 #endif
 
+    // The filename and comment length must fit in 16 bits.
+    if ((filename!=NULL) && (strlen(filename)>0xffff))
+        return ZIP_PARAMERROR;
+    if ((comment!=NULL) && (strlen(comment)>0xffff))
+        return ZIP_PARAMERROR;
+    // The extra field length must fit in 16 bits. If the member also requires
+    // a Zip64 extra block, that will also need to fit within that 16-bit
+    // length, but that will be checked for later.
+    if ((size_extrafield_local>0xffff) || (size_extrafield_global>0xffff))
+        return ZIP_PARAMERROR;
+
     zi = (zip64_internal*)file;
 
     if (zi->in_opened_file_inzip == 1)
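For context, a minimal caller sketch that exercises the new checks. It is not part of the commit; it only assumes the standard minizip entry points (zipOpen, zipOpenNewFileInZip, zipClose), which funnel into zipOpenNewFileInZip4_64. With the patch applied, an over-long comment is rejected with ZIP_PARAMERROR instead of producing an invalid archive or overflowing the size_centralheader computation.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "zip.h"                      /* contrib/minizip */

int main(void) {
    zipFile zf = zipOpen("poc.zip", APPEND_STATUS_CREATE);
    if (zf == NULL) return 1;

    /* A comment longer than 0xffff bytes cannot be stored in the 16-bit
     * comment-length field of the central directory entry. */
    size_t big = 0x20000;
    char *comment = malloc(big + 1);
    if (comment == NULL) return 1;
    memset(comment, 'A', big);
    comment[big] = '\0';

    int err = zipOpenNewFileInZip(zf, "entry.txt", NULL, NULL, 0, NULL, 0,
                                  comment, Z_DEFLATED, Z_DEFAULT_COMPRESSION);
    printf("zipOpenNewFileInZip: %d\n", err);   /* ZIP_PARAMERROR after the fix */

    if (err == ZIP_OK) zipCloseFileInZip(zf);
    zipClose(zf, NULL);
    free(comment);
    return 0;
}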
@@ -1,13 +0,0 @@
Index: zlib-1.2.11/contrib/s390/dfltcc_deflate.h
===================================================================
--- zlib-1.2.11.orig/contrib/s390/dfltcc_deflate.h
+++ zlib-1.2.11/contrib/s390/dfltcc_deflate.h
@@ -45,7 +45,7 @@ int ZLIB_INTERNAL dfltcc_deflate_get_dic
 #define DEFLATE_DONE dfltcc_deflate_done
 #define DEFLATE_BOUND_ADJUST_COMPLEN(strm, complen, source_len) \
     do { \
-        if (dfltcc_can_deflate((strm))) \
+        if (deflateStateCheck((strm)) || dfltcc_can_deflate((strm))) \
             (complen) = DEFLATE_BOUND_COMPLEN(source_len); \
     } while (0)
 #define DEFLATE_NEED_CONSERVATIVE_BOUND(strm) (dfltcc_can_deflate((strm)))
@ -1,217 +0,0 @@
|
||||
From 4a8d89ae49aa17d1634a2816c8d159f533a07eae Mon Sep 17 00:00:00 2001
|
||||
From: Matheus Castanho <msc@linux.ibm.com>
|
||||
Date: Wed, 27 Nov 2019 10:18:10 -0300
|
||||
Subject: [PATCH] Add optimized slide_hash for Power
|
||||
|
||||
Considerable time is spent on deflate.c:slide_hash() during
|
||||
deflate. This commit introduces a new slide_hash function that
|
||||
uses VSX vector instructions to slide 8 hash elements at a time,
|
||||
instead of just one as the standard code does.
|
||||
|
||||
The choice between the optimized and default versions is made only
|
||||
on the first call to the function, enabling a fallback to standard
|
||||
behavior if the host processor does not support VSX instructions,
|
||||
so the same binary can be used for multiple Power processor
|
||||
versions.
|
||||
|
||||
Author: Matheus Castanho <msc@linux.ibm.com>
|
||||
---
|
||||
CMakeLists.txt | 3 +-
|
||||
Makefile.in | 8 ++++
|
||||
configure | 4 +-
|
||||
contrib/power/power.h | 3 ++
|
||||
contrib/power/slide_hash_power8.c | 63 +++++++++++++++++++++++++++++
|
||||
contrib/power/slide_hash_resolver.c | 15 +++++++
|
||||
deflate.c | 12 ++++++
|
||||
7 files changed, 105 insertions(+), 3 deletions(-)
|
||||
create mode 100644 contrib/power/slide_hash_power8.c
|
||||
create mode 100644 contrib/power/slide_hash_resolver.c
|
||||
|
||||
Index: zlib-1.2.13/CMakeLists.txt
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/CMakeLists.txt
|
||||
+++ zlib-1.2.13/CMakeLists.txt
|
||||
@@ -174,7 +174,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
|
||||
add_definitions(-DZ_POWER8)
|
||||
set(ZLIB_POWER8
|
||||
contrib/power/adler32_power8.c
|
||||
- contrib/power/crc32_z_power8.c)
|
||||
+ contrib/power/crc32_z_power8.c
|
||||
+ contrib/power/slide_hash_power8.c)
|
||||
|
||||
set_source_files_properties(
|
||||
${ZLIB_POWER8}
|
||||
Index: zlib-1.2.13/Makefile.in
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/Makefile.in
|
||||
+++ zlib-1.2.13/Makefile.in
|
||||
@@ -185,6 +185,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32-
|
||||
deflate.o: $(SRCDIR)deflate.c
|
||||
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
|
||||
|
||||
+slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c
|
||||
+ $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c
|
||||
+
|
||||
infback.o: $(SRCDIR)infback.c
|
||||
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c
|
||||
|
||||
@@ -252,6 +255,11 @@ deflate.lo: $(SRCDIR)deflate.c
|
||||
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
|
||||
-@mv objs/deflate.o $@
|
||||
|
||||
+slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c
|
||||
+ -@mkdir objs 2>/dev/null || test -d objs
|
||||
+ $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c
|
||||
+ -@mv objs/slide_hash_power8.o $@
|
||||
+
|
||||
infback.lo: $(SRCDIR)infback.c
|
||||
-@mkdir objs 2>/dev/null || test -d objs
|
||||
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c
|
||||
Index: zlib-1.2.13/configure
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/configure
|
||||
+++ zlib-1.2.13/configure
|
||||
@@ -898,8 +898,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
|
||||
|
||||
if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then
|
||||
POWER8="-DZ_POWER8"
|
||||
- PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo"
|
||||
- OBJC="${OBJC} adler32_power8.o crc32_z_power8.o"
|
||||
+ PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo slide_hash_power8.lo"
|
||||
+ OBJC="${OBJC} adler32_power8.o crc32_z_power8.o slide_hash_power8.o"
|
||||
echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log
|
||||
else
|
||||
echo "Checking for -mcpu=power8 support... No." | tee -a configure.log
|
||||
Index: zlib-1.2.13/contrib/power/power.h
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/contrib/power/power.h
|
||||
+++ zlib-1.2.13/contrib/power/power.h
|
||||
@@ -4,7 +4,10 @@
|
||||
*/
|
||||
#include "../../zconf.h"
|
||||
#include "../../zutil.h"
|
||||
+#include "../../deflate.h"
|
||||
|
||||
uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len);
|
||||
|
||||
unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
|
||||
+
|
||||
+void _slide_hash_power8(deflate_state *s);
|
||||
Index: zlib-1.2.13/contrib/power/slide_hash_power8.c
|
||||
===================================================================
|
||||
--- /dev/null
|
||||
+++ zlib-1.2.13/contrib/power/slide_hash_power8.c
|
||||
@@ -0,0 +1,63 @@
|
||||
+ /* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||||
+ */
|
||||
+
|
||||
+#include <altivec.h>
|
||||
+#include "../../deflate.h"
|
||||
+
|
||||
+local inline void slide_hash_power8_loop OF((deflate_state *s,
|
||||
+ unsigned n_elems, Posf *table_end)) __attribute__((always_inline));
|
||||
+
|
||||
+local void slide_hash_power8_loop(
|
||||
+ deflate_state *s,
|
||||
+ unsigned n_elems,
|
||||
+ Posf *table_end)
|
||||
+{
|
||||
+ vector unsigned short vw, vm, *vp;
|
||||
+ unsigned chunks;
|
||||
+
|
||||
+ /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
|
||||
+ * so instead of processing each of the n_elems in the hash table
|
||||
+ * individually, we can do it in chunks of 8 with vector instructions.
|
||||
+ *
|
||||
+ * This function is only called from slide_hash_power8(), and both calls
|
||||
+ * pass n_elems as a power of 2 higher than 2^7, as defined by
|
||||
+ * deflateInit2_(), so n_elems will always be a multiple of 8. */
|
||||
+ chunks = n_elems >> 3;
|
||||
+ Assert(n_elems % 8 == 0, "Weird hash table size!");
|
||||
+
|
||||
+ /* This type casting is safe since s->w_size is always <= 64KB
|
||||
+ * as defined by deflateInit2_() and Posf == unsigned short */
|
||||
+ vw[0] = (Posf) s->w_size;
|
||||
+ vw = vec_splat(vw,0);
|
||||
+
|
||||
+ vp = (vector unsigned short *) table_end;
|
||||
+
|
||||
+ do {
|
||||
+ /* Processing 8 elements at a time */
|
||||
+ vp--;
|
||||
+ vm = *vp;
|
||||
+
|
||||
+ /* This is equivalent to: m >= w_size ? m - w_size : 0
|
||||
+ * Since we are using a saturated unsigned subtraction, any
|
||||
+ * values that are > w_size will be set to 0, while the others
|
||||
+ * will be subtracted by w_size. */
|
||||
+ *vp = vec_subs(vm,vw);
|
||||
+ } while (--chunks);
|
||||
+};
|
||||
+
|
||||
+void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s)
|
||||
+{
|
||||
+ unsigned n;
|
||||
+ Posf *p;
|
||||
+
|
||||
+ n = s->hash_size;
|
||||
+ p = &s->head[n];
|
||||
+ slide_hash_power8_loop(s,n,p);
|
||||
+
|
||||
+#ifndef FASTEST
|
||||
+ n = s->w_size;
|
||||
+ p = &s->prev[n];
|
||||
+ slide_hash_power8_loop(s,n,p);
|
||||
+#endif
|
||||
+}
|
||||
Index: zlib-1.2.13/contrib/power/slide_hash_resolver.c
|
||||
===================================================================
|
||||
--- /dev/null
|
||||
+++ zlib-1.2.13/contrib/power/slide_hash_resolver.c
|
||||
@@ -0,0 +1,15 @@
|
||||
+/* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||||
+ */
|
||||
+
|
||||
+#include "../gcc/zifunc.h"
|
||||
+#include "power.h"
|
||||
+
|
||||
+Z_IFUNC(slide_hash) {
|
||||
+#ifdef Z_POWER8
|
||||
+ if (__builtin_cpu_supports("arch_2_07"))
|
||||
+ return _slide_hash_power8;
|
||||
+#endif
|
||||
+
|
||||
+ return slide_hash_default;
|
||||
+}
|
||||
Index: zlib-1.2.13/deflate.c
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/deflate.c
|
||||
+++ zlib-1.2.13/deflate.c
|
||||
@@ -204,6 +204,13 @@ local const config configuration_table[1
|
||||
(unsigned)(s->hash_size - 1)*sizeof(*s->head)); \
|
||||
} while (0)
|
||||
|
||||
+#ifdef Z_POWER_OPT
|
||||
+/* Rename function so resolver can use its symbol. The default version will be
|
||||
+ * returned by the resolver if the host has no support for an optimized version.
|
||||
+ */
|
||||
+#define slide_hash slide_hash_default
|
||||
+#endif /* Z_POWER_OPT */
|
||||
+
|
||||
/* ===========================================================================
|
||||
* Slide the hash table when sliding the window down (could be avoided with 32
|
||||
* bit values at the expense of memory usage). We slide even when level == 0 to
|
||||
@@ -235,6 +242,11 @@ local void slide_hash(s)
|
||||
#endif
|
||||
}
|
||||
|
||||
+#ifdef Z_POWER_OPT
|
||||
+#undef slide_hash
|
||||
+#include "contrib/power/slide_hash_resolver.c"
|
||||
+#endif /* Z_POWER_OPT */
|
||||
+
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflateInit_(strm, level, version, stream_size)
|
||||
z_streamp strm;
|
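For comparison with the removed Power slide_hash patch above: the generic slide_hash() that it replaces walks the tables one entry at a time, applying the same saturated subtraction that the VSX version performs on eight entries per iteration. A paraphrased sketch of that scalar loop, using the types from deflate.h:

local void slide_hash_scalar(deflate_state *s)
{
    unsigned n, m;
    Posf *p;
    uInt wsize = s->w_size;

    n = s->hash_size;
    p = &s->head[n];
    do {
        m = *--p;
        *p = (Pos)(m >= wsize ? m - wsize : 0);   /* scalar form of vec_subs(m, wsize) */
    } while (--n);
#ifndef FASTEST
    n = wsize;
    p = &s->prev[n];
    do {
        m = *--p;
        *p = (Pos)(m >= wsize ? m - wsize : 0);
    } while (--n);
#endif
}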
@ -1,338 +0,0 @@
|
||||
From aecdff0646c7e188b48f6db285d8d63a74f246c1 Mon Sep 17 00:00:00 2001
|
||||
From: Matheus Castanho <msc@linux.ibm.com>
|
||||
Date: Tue, 29 Oct 2019 18:04:11 -0300
|
||||
Subject: [PATCH] Add vectorized longest_match for Power
|
||||
|
||||
This commit introduces an optimized version of the longest_match
|
||||
function for Power processors. It uses VSX instructions to match
|
||||
16 bytes at a time on each comparison, instead of one by one.
|
||||
|
||||
Author: Matheus Castanho <msc@linux.ibm.com>
|
||||
---
|
||||
CMakeLists.txt | 3 +-
|
||||
Makefile.in | 8 +
|
||||
configure | 4 +-
|
||||
contrib/power/longest_match_power9.c | 194 +++++++++++++++++++++++++
|
||||
contrib/power/longest_match_resolver.c | 15 ++
|
||||
contrib/power/power.h | 2 +
|
||||
deflate.c | 13 ++
|
||||
7 files changed, 236 insertions(+), 3 deletions(-)
|
||||
create mode 100644 contrib/power/longest_match_power9.c
|
||||
create mode 100644 contrib/power/longest_match_resolver.c
|
||||
|
||||
Index: zlib-1.2.13/CMakeLists.txt
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/CMakeLists.txt
|
||||
+++ zlib-1.2.13/CMakeLists.txt
|
||||
@@ -187,7 +187,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
|
||||
|
||||
if(POWER9)
|
||||
add_definitions(-DZ_POWER9)
|
||||
- set(ZLIB_POWER9 )
|
||||
+ set(ZLIB_POWER9
|
||||
+ contrib/power/longest_match_power9.c)
|
||||
|
||||
set_source_files_properties(
|
||||
${ZLIB_POWER9}
|
||||
Index: zlib-1.2.13/Makefile.in
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/Makefile.in
|
||||
+++ zlib-1.2.13/Makefile.in
|
||||
@@ -185,6 +185,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32-
|
||||
deflate.o: $(SRCDIR)deflate.c
|
||||
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
|
||||
|
||||
+longest_match_power9.o: $(SRCDIR)contrib/power/longest_match_power9.c
|
||||
+ $(CC) $(CFLAGS) -mcpu=power9 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/longest_match_power9.c
|
||||
+
|
||||
slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c
|
||||
$(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c
|
||||
|
||||
@@ -255,6 +258,11 @@ deflate.lo: $(SRCDIR)deflate.c
|
||||
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
|
||||
-@mv objs/deflate.o $@
|
||||
|
||||
+longest_match_power9.lo: $(SRCDIR)contrib/power/longest_match_power9.c
|
||||
+ -@mkdir objs 2>/dev/null || test -d objs
|
||||
+ $(CC) $(SFLAGS) -mcpu=power9 $(ZINC) -DPIC -c -o objs/longest_match_power9.o $(SRCDIR)contrib/power/longest_match_power9.c
|
||||
+ -@mv objs/longest_match_power9.o $@
|
||||
+
|
||||
slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c
|
||||
-@mkdir objs 2>/dev/null || test -d objs
|
||||
$(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c
|
||||
Index: zlib-1.2.13/configure
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/configure
|
||||
+++ zlib-1.2.13/configure
|
||||
@@ -907,8 +907,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
|
||||
|
||||
if tryboth $CC -c $CFLAGS -mcpu=power9 $test.c; then
|
||||
POWER9="-DZ_POWER9"
|
||||
- PIC_OBJC="${PIC_OBJC}"
|
||||
- OBJC="${OBJC}"
|
||||
+ PIC_OBJC="$PIC_OBJC longest_match_power9.lo"
|
||||
+ OBJC="$OBJC longest_match_power9.o"
|
||||
echo "Checking for -mcpu=power9 support... Yes." | tee -a configure.log
|
||||
else
|
||||
echo "Checking for -mcpu=power9 support... No." | tee -a configure.log
|
||||
Index: zlib-1.2.13/contrib/power/longest_match_power9.c
|
||||
===================================================================
|
||||
--- /dev/null
|
||||
+++ zlib-1.2.13/contrib/power/longest_match_power9.c
|
||||
@@ -0,0 +1,194 @@
|
||||
+/* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||||
+ */
|
||||
+
|
||||
+#include <altivec.h>
|
||||
+#include "../../deflate.h"
|
||||
+
|
||||
+local inline int vec_match OF((Bytef* scan, Bytef* match))
|
||||
+ __attribute__((always_inline));
|
||||
+
|
||||
+local inline int vec_match(Bytef* scan, Bytef* match)
|
||||
+{
|
||||
+ vector unsigned char vscan, vmatch, vc;
|
||||
+ int len;
|
||||
+
|
||||
+ vscan = *((vector unsigned char *) scan);
|
||||
+ vmatch = *((vector unsigned char *) match);
|
||||
+
|
||||
+ /* Compare 16 bytes at a time.
|
||||
+ * Each byte of vc will be either all ones or all zeroes,
|
||||
+ * depending on the result of the comparison
|
||||
+ */
|
||||
+ vc = (vector unsigned char) vec_cmpne(vscan,vmatch);
|
||||
+
|
||||
+ /* Since the index of matching bytes will contain only zeroes
|
||||
+ * on vc (since we used cmpne), counting the number of consecutive
|
||||
+ * bytes where LSB == 0 is the same as counting the length of the match.
|
||||
+ *
|
||||
+ * There was an issue in the way the vec_cnttz_lsbb builtin was implemented
|
||||
+ * that got fixed on GCC 12, but now we have to use different builtins
|
||||
+ * depending on the compiler version. To avoid that, let's use inline asm to
|
||||
+ * generate the exact instruction we need.
|
||||
+ */
|
||||
+ #ifdef __LITTLE_ENDIAN__
|
||||
+ asm volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc));
|
||||
+ #else
|
||||
+ asm volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc));
|
||||
+ #endif
|
||||
+
|
||||
+ return len;
|
||||
+}
|
||||
+
|
||||
+uInt ZLIB_INTERNAL _longest_match_power9(deflate_state *s, IPos cur_match)
|
||||
+{
|
||||
+ unsigned chain_length = s->max_chain_length;/* max hash chain length */
|
||||
+ register Bytef *scan = s->window + s->strstart; /* current string */
|
||||
+ register Bytef *match; /* matched string */
|
||||
+ register int len; /* length of current match */
|
||||
+ int best_len = (int)s->prev_length; /* best match length so far */
|
||||
+ int nice_match = s->nice_match; /* stop if match long enough */
|
||||
+ int mbytes; /* matched bytes inside loop */
|
||||
+ IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
|
||||
+ s->strstart - (IPos)MAX_DIST(s) : 0;
|
||||
+ /* Stop when cur_match becomes <= limit. To simplify the code,
|
||||
+ * we prevent matches with the string of window index 0.
|
||||
+ */
|
||||
+ Posf *prev = s->prev;
|
||||
+ uInt wmask = s->w_mask;
|
||||
+
|
||||
+#if (MAX_MATCH == 258)
|
||||
+ /* Compare the last two bytes at once. */
|
||||
+ register Bytef *strend2 = s->window + s->strstart + MAX_MATCH - 2;
|
||||
+ register ush scan_end = *(ushf*)(scan+best_len-1);
|
||||
+#else
|
||||
+ register Bytef *strend = s->window + s->strstart + MAX_MATCH;
|
||||
+ register Byte scan_end1 = scan[best_len-1];
|
||||
+ register Byte scan_end = scan[best_len];
|
||||
+#endif
|
||||
+
|
||||
+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
|
||||
+ * It is easy to get rid of this optimization if necessary.
|
||||
+ */
|
||||
+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
|
||||
+
|
||||
+ /* Do not waste too much time if we already have a good match: */
|
||||
+ if (s->prev_length >= s->good_match) {
|
||||
+ chain_length >>= 2;
|
||||
+ }
|
||||
+ /* Do not look for matches beyond the end of the input. This is necessary
|
||||
+ * to make deflate deterministic.
|
||||
+ */
|
||||
+ if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead;
|
||||
+
|
||||
+ Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
|
||||
+
|
||||
+ do {
|
||||
+ Assert(cur_match < s->strstart, "no future");
|
||||
+ match = s->window + cur_match;
|
||||
+
|
||||
+ /* Skip to next match if the match length cannot increase
|
||||
+ * or if the match length is less than 2. Note that the checks below
|
||||
+ * for insufficient lookahead only occur occasionally for performance
|
||||
+ * reasons. Therefore uninitialized memory will be accessed, and
|
||||
+ * conditional jumps will be made that depend on those values.
|
||||
+ * However the length of the match is limited to the lookahead, so
|
||||
+ * the output of deflate is not affected by the uninitialized values.
|
||||
+ */
|
||||
+
|
||||
+/* MAX_MATCH - 2 should be a multiple of 16 for this optimization to work. */
|
||||
+#if (MAX_MATCH == 258)
|
||||
+
|
||||
+ /* Compare ending (2 bytes) and beginning of potential match.
|
||||
+ *
|
||||
+ * On Power processors, loading a 16-byte vector takes only 1 extra
|
||||
+ * cycle compared to a regular byte load. So instead of comparing the
|
||||
+ * first two bytes and then the rest later if they match, we can compare
|
||||
+ * the first 16 at once, and when we have a match longer than 2, we will
|
||||
+ * already have the result of comparing the first 16 bytes saved in mbytes.
|
||||
+ */
|
||||
+ if (*(ushf*)(match+best_len-1) != scan_end ||
|
||||
+ (mbytes = vec_match(scan,match)) < 3) continue;
|
||||
+
|
||||
+ scan += mbytes;
|
||||
+ match += mbytes;
|
||||
+
|
||||
+ /* In case when we may have a match longer than 16, we perform further
|
||||
+ * comparisons in chunks of 16 and keep going while all bytes match.
|
||||
+ */
|
||||
+ while(mbytes == 16) {
|
||||
+ mbytes = vec_match(scan,match);
|
||||
+ scan += mbytes;
|
||||
+ match += mbytes;
|
||||
+
|
||||
+ /* We also have to limit the maximum match based on MAX_MATCH.
|
||||
+ * Since we are comparing 16 bytes at a time and MAX_MATCH == 258 (to
|
||||
+ * comply with default implementation), we should stop comparing when
|
||||
+ * we have matched 256 bytes, which happens when scan == strend2.
|
||||
+ * In this ("rare") case, we have to check the remaining 2 bytes
|
||||
+ * individually using common load and compare operations.
|
||||
+ */
|
||||
+ if(scan >= strend2) {
|
||||
+ if(*scan == *match) {
|
||||
+ if(*++scan == *++match)
|
||||
+ scan++;
|
||||
+ }
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
|
||||
+
|
||||
+ len = (MAX_MATCH - 2) - (int)(strend2 - scan);
|
||||
+ scan = strend2 - (MAX_MATCH - 2);
|
||||
+
|
||||
+#else /* MAX_MATCH == 258 */
|
||||
+
|
||||
+ if (match[best_len] != scan_end ||
|
||||
+ match[best_len-1] != scan_end1 ||
|
||||
+ *match != *scan ||
|
||||
+ *++match != scan[1]) continue;
|
||||
+
|
||||
+ /* The check at best_len-1 can be removed because it will be made
|
||||
+ * again later. (This heuristic is not always a win.)
|
||||
+ * It is not necessary to compare scan[2] and match[2] since they
|
||||
+ * are always equal when the other bytes match, given that
|
||||
+ * the hash keys are equal and that HASH_BITS >= 8.
|
||||
+ */
|
||||
+ scan += 2, match++;
|
||||
+ Assert(*scan == *match, "match[2]?");
|
||||
+
|
||||
+ /* We check for insufficient lookahead only every 8th comparison;
|
||||
+ * the 256th check will be made at strstart+258.
|
||||
+ */
|
||||
+ do {
|
||||
+ } while (*++scan == *++match && *++scan == *++match &&
|
||||
+ *++scan == *++match && *++scan == *++match &&
|
||||
+ *++scan == *++match && *++scan == *++match &&
|
||||
+ *++scan == *++match && *++scan == *++match &&
|
||||
+ scan < strend);
|
||||
+
|
||||
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
|
||||
+
|
||||
+ len = MAX_MATCH - (int)(strend - scan);
|
||||
+ scan = strend - MAX_MATCH;
|
||||
+
|
||||
+#endif /* MAX_MATCH == 258 */
|
||||
+
|
||||
+ if (len > best_len) {
|
||||
+ s->match_start = cur_match;
|
||||
+ best_len = len;
|
||||
+ if (len >= nice_match) break;
|
||||
+#if (MAX_MATCH == 258)
|
||||
+ scan_end = *(ushf*)(scan+best_len-1);
|
||||
+#else
|
||||
+ scan_end1 = scan[best_len-1];
|
||||
+ scan_end = scan[best_len];
|
||||
+#endif
|
||||
+ }
|
||||
+ } while ((cur_match = prev[cur_match & wmask]) > limit
|
||||
+ && --chain_length != 0);
|
||||
+
|
||||
+ if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
|
||||
+ return s->lookahead;
|
||||
+}
|
||||
Index: zlib-1.2.13/contrib/power/longest_match_resolver.c
|
||||
===================================================================
|
||||
--- /dev/null
|
||||
+++ zlib-1.2.13/contrib/power/longest_match_resolver.c
|
||||
@@ -0,0 +1,15 @@
|
||||
+/* Copyright (C) 2019 Matheus Castanho <msc@linux.ibm.com>, IBM
|
||||
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||||
+ */
|
||||
+
|
||||
+#include "../gcc/zifunc.h"
|
||||
+#include "power.h"
|
||||
+
|
||||
+Z_IFUNC(longest_match) {
|
||||
+#ifdef Z_POWER9
|
||||
+ if (__builtin_cpu_supports("arch_3_00"))
|
||||
+ return _longest_match_power9;
|
||||
+#endif
|
||||
+
|
||||
+ return longest_match_default;
|
||||
+}
|
||||
Index: zlib-1.2.13/contrib/power/power.h
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/contrib/power/power.h
|
||||
+++ zlib-1.2.13/contrib/power/power.h
|
||||
@@ -10,4 +10,6 @@ uLong _adler32_power8(uLong adler, const
|
||||
|
||||
unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
|
||||
|
||||
+uInt _longest_match_power9(deflate_state *s, IPos cur_match);
|
||||
+
|
||||
void _slide_hash_power8(deflate_state *s);
|
||||
Index: zlib-1.2.13/deflate.c
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/deflate.c
|
||||
+++ zlib-1.2.13/deflate.c
|
||||
@@ -1313,6 +1313,14 @@ local void lm_init(s)
|
||||
* string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
|
||||
* OUT assertion: the match length is not greater than s->lookahead.
|
||||
*/
|
||||
+
|
||||
+#ifdef Z_POWER_OPT
|
||||
+/* Rename function so resolver can use its symbol. The default version will be
|
||||
+ * returned by the resolver if the host has no support for an optimized version.
|
||||
+ */
|
||||
+#define longest_match longest_match_default
|
||||
+#endif /* Z_POWER_OPT */
|
||||
+
|
||||
local uInt longest_match(s, pcur_match)
|
||||
deflate_state *s;
|
||||
IPos pcur_match; /* current match */
|
||||
@@ -1460,6 +1468,11 @@ local uInt longest_match(s, pcur_match)
|
||||
return s->lookahead;
|
||||
}
|
||||
|
||||
+#ifdef Z_POWER_OPT
|
||||
+#undef longest_match
|
||||
+#include "contrib/power/longest_match_resolver.c"
|
||||
+#endif /* Z_POWER_OPT */
|
||||
+
|
||||
#else /* FASTEST */
|
||||
|
||||
/* ---------------------------------------------------------------------------
|
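The idea behind the removed longest_match patch above, stripped of the VSX intrinsics: compare 16 bytes per step and stop at the first mismatch, falling back to byte-at-a-time only for the tail. A portable sketch for illustration only; the patch itself uses vec_cmpne plus the vctzlsbb/vclzlsbb count instructions.

#include <string.h>

static int match_len16(const unsigned char *scan, const unsigned char *match,
                       int max_len)
{
    int len = 0;

    /* whole 16-byte chunks while they keep matching */
    while (len + 16 <= max_len && memcmp(scan + len, match + len, 16) == 0)
        len += 16;

    /* first mismatching byte, or the tail shorter than 16 bytes */
    while (len < max_len && scan[len] == match[len])
        len++;

    return len;
}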
@ -1,342 +0,0 @@
|
||||
From 772f4bd0f880c4c193ab7da78728f38821572a02 Mon Sep 17 00:00:00 2001
|
||||
From: Rogerio Alves <rcardoso@linux.ibm.com>
|
||||
Date: Mon, 9 Dec 2019 14:40:53 -0300
|
||||
Subject: [PATCH] Adler32 vector optimization for Power.
|
||||
|
||||
This commit implements a Power (POWER8+) vector optimization for Adler32
|
||||
checksum using VSX (vector) instructions. The VSX adler32 checksum is up
|
||||
to 10x faster than the adler32 baseline code.
|
||||
|
||||
Author: Rogerio Alves <rcardoso@linux.ibm.com>
|
||||
---
|
||||
CMakeLists.txt | 1 +
|
||||
Makefile.in | 8 ++
|
||||
adler32.c | 11 ++
|
||||
configure | 4 +-
|
||||
contrib/power/adler32_power8.c | 196 +++++++++++++++++++++++++++++++
|
||||
contrib/power/adler32_resolver.c | 15 +++
|
||||
contrib/power/power.h | 4 +-
|
||||
7 files changed, 236 insertions(+), 3 deletions(-)
|
||||
create mode 100644 contrib/power/adler32_power8.c
|
||||
create mode 100644 contrib/power/adler32_resolver.c
|
||||
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index 581e1fa6d..c6296ee68 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -185,6 +185,7 @@ if(CMAKE_COMPILER_IS_GNUCC)
|
||||
if(POWER8)
|
||||
add_definitions(-DZ_POWER8)
|
||||
set(ZLIB_POWER8
|
||||
+ contrib/power/adler32_power8.c
|
||||
contrib/power/crc32_z_power8.c)
|
||||
|
||||
set_source_files_properties(
|
||||
diff --git a/Makefile.in b/Makefile.in
|
||||
index 16943044e..a0ffac860 100644
|
||||
--- a/Makefile.in
|
||||
+++ b/Makefile.in
|
||||
@@ -165,6 +165,9 @@ minigzip64.o: $(SRCDIR)test/minigzip.c $(SRCDIR)zlib.h zconf.h
|
||||
adler32.o: $(SRCDIR)adler32.c
|
||||
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)adler32.c
|
||||
|
||||
+adler32_power8.o: $(SRCDIR)contrib/power/adler32_power8.c
|
||||
+ $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/adler32_power8.c
|
||||
+
|
||||
crc32.o: $(SRCDIR)crc32.c
|
||||
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
|
||||
|
||||
@@ -216,6 +219,11 @@ adler32.lo: $(SRCDIR)adler32.c
|
||||
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/adler32.o $(SRCDIR)adler32.c
|
||||
-@mv objs/adler32.o $@
|
||||
|
||||
+adler32_power8.lo: $(SRCDIR)contrib/power/adler32_power8.c
|
||||
+ -@mkdir objs 2>/dev/null || test -d objs
|
||||
+ $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/adler32_power8.o $(SRCDIR)contrib/power/adler32_power8.c
|
||||
+ -@mv objs/adler32_power8.o $@
|
||||
+
|
||||
crc32.lo: $(SRCDIR)crc32.c
|
||||
-@mkdir objs 2>/dev/null || test -d objs
|
||||
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
|
||||
diff --git a/adler32.c b/adler32.c
|
||||
index d0be4380a..4bde0fa18 100644
|
||||
--- a/adler32.c
|
||||
+++ b/adler32.c
|
||||
@@ -131,6 +131,12 @@ uLong ZEXPORT adler32_z(adler, buf, len)
|
||||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
+
|
||||
+#ifdef Z_POWER_OPT
|
||||
+/* Rename the default function to avoid naming conflicts */
|
||||
+#define adler32 adler32_default
|
||||
+#endif /* Z_POWER_OPT */
|
||||
+
|
||||
uLong ZEXPORT adler32(adler, buf, len)
|
||||
uLong adler;
|
||||
const Bytef *buf;
|
||||
@@ -139,6 +145,11 @@ uLong ZEXPORT adler32(adler, buf, len)
|
||||
return adler32_z(adler, buf, len);
|
||||
}
|
||||
|
||||
+#ifdef Z_POWER_OPT
|
||||
+#undef adler32
|
||||
+#include "contrib/power/adler32_resolver.c"
|
||||
+#endif /* Z_POWER_OPT */
|
||||
+
|
||||
/* ========================================================================= */
|
||||
local uLong adler32_combine_(adler1, adler2, len2)
|
||||
uLong adler1;
|
||||
diff --git a/configure b/configure
|
||||
index 914d9f4aa..810a7404d 100755
|
||||
--- a/configure
|
||||
+++ b/configure
|
||||
@@ -879,8 +879,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
|
||||
|
||||
if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then
|
||||
POWER8="-DZ_POWER8"
|
||||
- PIC_OBJC="${PIC_OBJC} crc32_z_power8.lo"
|
||||
- OBJC="${OBJC} crc32_z_power8.o"
|
||||
+ PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo"
|
||||
+ OBJC="${OBJC} adler32_power8.o crc32_z_power8.o"
|
||||
echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log
|
||||
else
|
||||
echo "Checking for -mcpu=power8 support... No." | tee -a configure.log
|
||||
diff --git a/contrib/power/adler32_power8.c b/contrib/power/adler32_power8.c
|
||||
new file mode 100644
|
||||
index 000000000..473c39457
|
||||
--- /dev/null
|
||||
+++ b/contrib/power/adler32_power8.c
|
||||
@@ -0,0 +1,196 @@
|
||||
+/*
|
||||
+ * Adler32 for POWER 8+ using VSX instructions.
|
||||
+ *
|
||||
+ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
|
||||
+ * instructions.
|
||||
+ *
|
||||
+ * If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means
|
||||
+ * iteration n) is the initial value of adler - at start _0 is 1 unless
|
||||
+ * adler initial value is different than 1. So s1_1 = s1_0 + c[0] after
|
||||
+ * the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on.
|
||||
+ * Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on
|
||||
+ * after iteration N.
|
||||
+ *
|
||||
+ * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] +
|
||||
+ * N-1*c[1] + ... + c[N]
|
||||
+ *
|
||||
+ * In a more general way:
|
||||
+ *
|
||||
+ * s1_N = s1_0 + sum(i=1 to N)c[i]
|
||||
+ * s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i]
|
||||
+ *
|
||||
+ * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
|
||||
+ * can process N-bit at time we can do this at once.
|
||||
+ *
|
||||
+ * Since VSX can support 16-bit vector instructions, we can process
|
||||
+ * 16-bit at time using N = 16 we have:
|
||||
+ *
|
||||
+ * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
|
||||
+ * s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i]
|
||||
+ *
|
||||
+ * After the first iteration we calculate the adler32 checksum for 16 bytes.
|
||||
+ *
|
||||
+ * For more background about adler32 please check the RFC:
|
||||
+ * https://www.ietf.org/rfc/rfc1950.txt
|
||||
+ *
|
||||
+ * Copyright (C) 2019 Rogerio Alves <rcardoso@linux.ibm.com>, IBM
|
||||
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||||
+ *
|
||||
+ */
|
||||
+
|
||||
+#include "../../zutil.h"
|
||||
+#include <altivec.h>
|
||||
+
|
||||
+/* Largest prime smaller than 65536. */
|
||||
+#define BASE 65521U
|
||||
+#define NMAX 5552
|
||||
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1. */
|
||||
+
|
||||
+#define DO1(s1,s2,buf,i) {(s1) += buf[(i)]; (s2) += (s1);}
|
||||
+#define DO2(s1,s2,buf,i) {DO1(s1,s2,buf,i); DO1(s1,s2,buf,i+1);}
|
||||
+#define DO4(s1,s2,buf,i) {DO2(s1,s2,buf,i); DO2(s1,s2,buf,i+2);}
|
||||
+#define DO8(s1,s2,buf,i) {DO4(s1,s2,buf,i); DO4(s1,s2,buf,i+4);}
|
||||
+#define DO16(s1,s2,buf) {DO8(s1,s2,buf,0); DO8(s1,s2,buf,8);}
|
||||
+
|
||||
+/* Vector across sum unsigned int (saturate). */
|
||||
+inline vector unsigned int vec_sumsu (vector unsigned int __a,
|
||||
+ vector unsigned int __b)
|
||||
+{
|
||||
+ __b = vec_sld(__a, __a, 8);
|
||||
+ __b = vec_add(__b, __a);
|
||||
+ __a = vec_sld(__b, __b, 4);
|
||||
+ __a = vec_add(__a, __b);
|
||||
+
|
||||
+ return __a;
|
||||
+}
|
||||
+
|
||||
+uLong ZLIB_INTERNAL _adler32_power8 (uLong adler, const Bytef* buf, uInt len)
|
||||
+{
|
||||
+ /* If buffer is empty or len=0 we need to return adler initial value. */
|
||||
+ if (buf == NULL)
|
||||
+ return 1;
|
||||
+
|
||||
+ unsigned int s1 = adler & 0xffff;
|
||||
+ unsigned int s2 = (adler >> 16) & 0xffff;
|
||||
+
|
||||
+ /* in case user likes doing a byte at a time, keep it fast */
|
||||
+ if (len == 1) {
|
||||
+ s1 += buf[0];
|
||||
+ if (s1 >= BASE)
|
||||
+ s1 -= BASE;
|
||||
+ s2 += s1;
|
||||
+ if (s2 >= BASE)
|
||||
+ s2 -= BASE;
|
||||
+ return (s2 << 16) | s1;
|
||||
+ }
|
||||
+
|
||||
+ /* Keep it fast for short length buffers. */
|
||||
+ if (len < 16) {
|
||||
+ while (len--) {
|
||||
+ s1 += *buf++;
|
||||
+ s2 += s1;
|
||||
+ }
|
||||
+ if (s1 >= BASE)
|
||||
+ s1 -= BASE;
|
||||
+ s2 %= BASE;
|
||||
+ return (s2 << 16) | s1;
|
||||
+ }
|
||||
+
|
||||
+ /* This is faster than VSX code for len < 64. */
|
||||
+ if (len < 64) {
|
||||
+ while (len >= 16) {
|
||||
+ len -= 16;
|
||||
+ DO16(s1,s2,buf);
|
||||
+ buf += 16;
|
||||
+ }
|
||||
+ } else {
|
||||
+ /* Use POWER VSX instructions for len >= 64. */
|
||||
+ const vector unsigned int v_zeros = { 0 };
|
||||
+ const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
|
||||
+ 6, 5, 4, 3, 2, 1};
|
||||
+ const vector unsigned char vsh = vec_splat_u8(4);
|
||||
+ const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
|
||||
+ vector unsigned int vs1 = vec_xl(0, &s1);
|
||||
+ vector unsigned int vs2 = vec_xl(0, &s2);
|
||||
+ vector unsigned int vs1_save = { 0 };
|
||||
+ vector unsigned int vsum1, vsum2;
|
||||
+ vector unsigned char vbuf;
|
||||
+ int n;
|
||||
+
|
||||
+ /* Zeros the undefined values of vectors vs1, vs2. */
|
||||
+ vs1 = vec_and(vs1, vmask);
|
||||
+ vs2 = vec_and(vs2, vmask);
|
||||
+
|
||||
+ /* Do length bigger than NMAX in blocks of NMAX size. */
|
||||
+ while (len >= NMAX) {
|
||||
+ len -= NMAX;
|
||||
+ n = NMAX / 16;
|
||||
+ do {
|
||||
+ vbuf = vec_xl(0, (unsigned char *) buf);
|
||||
+ vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
|
||||
+ /* sum(i=1 to 16) buf[i]*(16-i+1). */
|
||||
+ vsum2 = vec_msum(vbuf, v_mul, v_zeros);
|
||||
+ /* Save vs1. */
|
||||
+ vs1_save = vec_add(vs1_save, vs1);
|
||||
+ /* Accumulate the sums. */
|
||||
+ vs1 = vec_add(vsum1, vs1);
|
||||
+ vs2 = vec_add(vsum2, vs2);
|
||||
+
|
||||
+ buf += 16;
|
||||
+ } while (--n);
|
||||
+ /* Once each block of NMAX size. */
|
||||
+ vs1 = vec_sumsu(vs1, vsum1);
|
||||
+ vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
|
||||
+ vs2 = vec_add(vs1_save, vs2);
|
||||
+ vs2 = vec_sumsu(vs2, vsum2);
|
||||
+
|
||||
+ /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */
|
||||
+ vs1[0] = vs1[0] % BASE;
|
||||
+ /* vs2[0] = s2_i + 16*s1_save +
|
||||
+ sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
|
||||
+ vs2[0] = vs2[0] % BASE;
|
||||
+
|
||||
+ vs1 = vec_and(vs1, vmask);
|
||||
+ vs2 = vec_and(vs2, vmask);
|
||||
+ vs1_save = v_zeros;
|
||||
+ }
|
||||
+
|
||||
+ /* len is less than NMAX one modulo is needed. */
|
||||
+ if (len >= 16) {
|
||||
+ while (len >= 16) {
|
||||
+ len -= 16;
|
||||
+
|
||||
+ vbuf = vec_xl(0, (unsigned char *) buf);
|
||||
+
|
||||
+ vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */
|
||||
+ /* sum(i=1 to 16) buf[i]*(16-i+1). */
|
||||
+ vsum2 = vec_msum(vbuf, v_mul, v_zeros);
|
||||
+ /* Save vs1. */
|
||||
+ vs1_save = vec_add(vs1_save, vs1);
|
||||
+ /* Accumulate the sums. */
|
||||
+ vs1 = vec_add(vsum1, vs1);
|
||||
+ vs2 = vec_add(vsum2, vs2);
|
||||
+
|
||||
+ buf += 16;
|
||||
+ }
|
||||
+ /* Since the size will be always less than NMAX we do this once. */
|
||||
+ vs1 = vec_sumsu(vs1, vsum1);
|
||||
+ vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */
|
||||
+ vs2 = vec_add(vs1_save, vs2);
|
||||
+ vs2 = vec_sumsu(vs2, vsum2);
|
||||
+ }
|
||||
+ /* Copy result back to s1, s2 (mod 65521). */
|
||||
+ s1 = vs1[0] % BASE;
|
||||
+ s2 = vs2[0] % BASE;
|
||||
+ }
|
||||
+
|
||||
+ /* Process tail (len < 16). */
|
||||
+ while (len--) {
|
||||
+ s1 += *buf++;
|
||||
+ s2 += s1;
|
||||
+ }
|
||||
+ s1 %= BASE;
|
||||
+ s2 %= BASE;
|
||||
+
|
||||
+ return (s2 << 16) | s1;
|
||||
+}
|
||||
diff --git a/contrib/power/adler32_resolver.c b/contrib/power/adler32_resolver.c
|
||||
new file mode 100644
|
||||
index 000000000..07a1a2cb2
|
||||
--- /dev/null
|
||||
+++ b/contrib/power/adler32_resolver.c
|
||||
@@ -0,0 +1,15 @@
|
||||
+/* Copyright (C) 2019 Rogerio Alves <rcardoso@linux.ibm.com>, IBM
|
||||
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||||
+ */
|
||||
+
|
||||
+#include "../gcc/zifunc.h"
|
||||
+#include "power.h"
|
||||
+
|
||||
+Z_IFUNC(adler32) {
|
||||
+#ifdef Z_POWER8
|
||||
+ if (__builtin_cpu_supports("arch_2_07"))
|
||||
+ return _adler32_power8;
|
||||
+#endif
|
||||
+
|
||||
+ return adler32_default;
|
||||
+}
|
||||
diff --git a/contrib/power/power.h b/contrib/power/power.h
|
||||
index 79123aa90..f57c76167 100644
|
||||
--- a/contrib/power/power.h
|
||||
+++ b/contrib/power/power.h
|
||||
@@ -2,7 +2,9 @@
|
||||
* 2019 Rogerio Alves <rogerio.alves@ibm.com>, IBM
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
-
|
||||
#include "../../zconf.h"
|
||||
+#include "../../zutil.h"
|
||||
+
|
||||
+uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len);
|
||||
|
||||
unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
|
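The block comment in the removed adler32 patch above derives the 16-byte update s1_16 = s1_0 + sum(c[i]) and s2_16 = s2_0 + 16*s1_0 + sum((16-i+1)*c[i]). A small self-contained check of that identity against the byte-at-a-time definition (a sketch only; the modulo reduction is omitted because the values stay small here):

#include <stdio.h>

int main(void) {
    unsigned char c[16] = {1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 7, 11, 250, 42};
    unsigned s1 = 1, s2 = 0;                /* adler32 initial state */

    /* byte-at-a-time definition */
    unsigned s1_ref = s1, s2_ref = s2;
    for (int i = 0; i < 16; i++) {
        s1_ref += c[i];
        s2_ref += s1_ref;
    }

    /* block form used by the vector code: one update for all 16 bytes */
    unsigned sum = 0, wsum = 0;
    for (int i = 0; i < 16; i++) {
        sum  += c[i];                       /* sum(i=1..16) c[i] */
        wsum += (16 - i) * c[i];            /* weights 16..1, i.e. (16-i+1) with 1-based i */
    }
    unsigned s1_blk = s1 + sum;
    unsigned s2_blk = s2 + 16 * s1 + wsum;

    printf("%u %u  vs  %u %u\n", s1_ref, s2_ref, s1_blk, s2_blk);   /* both pairs match */
    return 0;
}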
@@ -1,34 +0,0 @@
From 11b722e4ae91b611f605221587ec8e0829c27949 Mon Sep 17 00:00:00 2001
From: Matheus Castanho <msc@linux.ibm.com>
Date: Tue, 23 Jun 2020 10:26:19 -0300
Subject: [PATCH] Fix invalid memory access on ppc and ppc64

---
 contrib/power/adler32_power8.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/contrib/power/adler32_power8.c b/contrib/power/adler32_power8.c
index 473c39457..fdd086453 100644
--- a/contrib/power/adler32_power8.c
+++ b/contrib/power/adler32_power8.c
@@ -110,16 +110,15 @@ uLong ZLIB_INTERNAL _adler32_power8 (uLong adler, const Bytef* buf, uInt len)
                                            6, 5, 4, 3, 2, 1};
         const vector unsigned char vsh = vec_splat_u8(4);
         const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
-        vector unsigned int vs1 = vec_xl(0, &s1);
-        vector unsigned int vs2 = vec_xl(0, &s2);
+        vector unsigned int vs1 = { 0 };
+        vector unsigned int vs2 = { 0 };
         vector unsigned int vs1_save = { 0 };
         vector unsigned int vsum1, vsum2;
         vector unsigned char vbuf;
         int n;
 
-        /* Zeros the undefined values of vectors vs1, vs2. */
-        vs1 = vec_and(vs1, vmask);
-        vs2 = vec_and(vs2, vmask);
+        vs1[0] = s1;
+        vs2[0] = s2;
 
         /* Do length bigger than NMAX in blocks of NMAX size. */
         while (len >= NMAX) {
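The change above replaces vec_xl() loads taken from the addresses of two 4-byte scalars: vec_xl() always loads a full 16-byte quadword, so it read 12 bytes past each variable. The fix zero-initializes the vector and writes the scalar into lane 0. A minimal sketch of the two patterns, assuming <altivec.h> and the GNU vector-subscript syntax that the patch itself relies on:

#include <altivec.h>

vector unsigned int load_scalar_bad(const unsigned int *s) {
    return vec_xl(0, s);            /* 16-byte load from a 4-byte object: out of bounds */
}

vector unsigned int load_scalar_ok(unsigned int s) {
    vector unsigned int v = { 0 };  /* whole vector well defined */
    v[0] = s;                       /* scalar occupies lane 0 only */
    return v;
}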
@ -1,423 +0,0 @@
|
||||
From 957bc67cfb4e01403c01fe6243850383183a7c19 Mon Sep 17 00:00:00 2001
|
||||
From: Ilya Leoshkevich <iii@linux.ibm.com>
|
||||
Date: Thu, 19 Mar 2020 11:52:03 +0100
|
||||
Subject: [PATCH] s390x: vectorize crc32
|
||||
|
||||
Use vector extensions when compiling for s390x and binutils knows
|
||||
about them. At runtime, check whether kernel supports vector
|
||||
extensions (it has to be not just the CPU, but also the kernel) and
|
||||
choose between the regular and the vectorized implementations.
|
||||
---
|
||||
Makefile.in | 9 ++
|
||||
configure | 28 +++++
|
||||
contrib/gcc/zifunc.h | 21 +++-
|
||||
contrib/s390/crc32-vx.c | 195 ++++++++++++++++++++++++++++++++
|
||||
contrib/s390/crc32_z_resolver.c | 41 +++++++
|
||||
crc32.c | 11 +-
|
||||
6 files changed, 301 insertions(+), 4 deletions(-)
|
||||
create mode 100644 contrib/s390/crc32-vx.c
|
||||
create mode 100644 contrib/s390/crc32_z_resolver.c
|
||||
|
||||
Index: zlib-1.2.13/Makefile.in
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/Makefile.in
|
||||
+++ zlib-1.2.13/Makefile.in
|
||||
@@ -25,6 +25,7 @@ LDFLAGS=
|
||||
TEST_LDFLAGS=$(LDFLAGS) -L. libz.a
|
||||
LDSHARED=$(CC)
|
||||
CPP=$(CC) -E
|
||||
+VGFMAFLAG=
|
||||
|
||||
STATICLIB=libz.a
|
||||
SHAREDLIB=libz.so
|
||||
@@ -175,6 +176,9 @@ crc32.o: $(SRCDIR)crc32.c
|
||||
crc32_z_power8.o: $(SRCDIR)contrib/power/crc32_z_power8.c
|
||||
$(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/crc32_z_power8.c
|
||||
|
||||
+crc32-vx.o: $(SRCDIR)contrib/s390/crc32-vx.c
|
||||
+ $(CC) $(CFLAGS) $(VGFMAFLAG) $(ZINC) -c -o $@ $(SRCDIR)contrib/s390/crc32-vx.c
|
||||
+
|
||||
deflate.o: $(SRCDIR)deflate.c
|
||||
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
|
||||
|
||||
@@ -225,6 +229,11 @@ crc32.lo: $(SRCDIR)crc32.c
|
||||
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
|
||||
-@mv objs/crc32.o $@
|
||||
|
||||
+crc32-vx.lo: $(SRCDIR)contrib/s390/crc32-vx.c
|
||||
+ -@mkdir objs 2>/dev/null || test -d objs
|
||||
+ $(CC) $(SFLAGS) $(VGFMAFLAG) $(ZINC) -DPIC -c -o objs/crc32-vx.o $(SRCDIR)contrib/s390/crc32-vx.c
|
||||
+ -@mv objs/crc32-vx.o $@
|
||||
+
|
||||
crc32_z_power8.lo: $(SRCDIR)contrib/power/crc32_z_power8.c
|
||||
-@mkdir objs 2>/dev/null || test -d objs
|
||||
$(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/crc32_z_power8.o $(SRCDIR)contrib/power/crc32_z_power8.c
|
||||
Index: zlib-1.2.13/configure
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/configure
|
||||
+++ zlib-1.2.13/configure
|
||||
@@ -921,6 +921,32 @@ else
|
||||
echo "Checking for Power optimizations support... No." | tee -a configure.log
|
||||
fi
|
||||
|
||||
+# check if we are compiling for s390 and binutils support vector extensions
|
||||
+VGFMAFLAG=-march=z13
|
||||
+cat > $test.c <<EOF
|
||||
+#ifndef __s390__
|
||||
+#error
|
||||
+#endif
|
||||
+EOF
|
||||
+if try $CC -c $CFLAGS $VGFMAFLAG $test.c; then
|
||||
+ CFLAGS="$CFLAGS -DHAVE_S390X_VX"
|
||||
+ SFLAGS="$SFLAGS -DHAVE_S390X_VX"
|
||||
+ OBJC="$OBJC crc32-vx.o"
|
||||
+ PIC_OBJC="$PIC_OBJC crc32-vx.lo"
|
||||
+ echo "Checking for s390 vector extensions... Yes." | tee -a configure.log
|
||||
+
|
||||
+ for flag in -mzarch -fzvector; do
|
||||
+ if try $CC -c $CFLAGS $VGFMAFLAG $flag $test.c; then
|
||||
+ VGFMAFLAG="$VGFMAFLAG $flag"
|
||||
+ echo "Checking for $flag... Yes." | tee -a configure.log
|
||||
+ else
|
||||
+ echo "Checking for $flag... No." | tee -a configure.log
|
||||
+ fi
|
||||
+ done
|
||||
+else
|
||||
+ echo "Checking for s390 vector extensions... No." | tee -a configure.log
|
||||
+fi
|
||||
+
|
||||
# show the results in the log
|
||||
echo >> configure.log
|
||||
echo ALL = $ALL >> configure.log
|
||||
@@ -952,6 +978,7 @@ echo mandir = $mandir >> configure.log
|
||||
echo prefix = $prefix >> configure.log
|
||||
echo sharedlibdir = $sharedlibdir >> configure.log
|
||||
echo uname = $uname >> configure.log
|
||||
+echo VGFMAFLAG = $VGFMAFLAG >> configure.log
|
||||
|
||||
# udpate Makefile with the configure results
|
||||
sed < ${SRCDIR}Makefile.in "
|
||||
@@ -961,6 +988,7 @@ sed < ${SRCDIR}Makefile.in "
|
||||
/^LDFLAGS *=/s#=.*#=$LDFLAGS#
|
||||
/^LDSHARED *=/s#=.*#=$LDSHARED#
|
||||
/^CPP *=/s#=.*#=$CPP#
|
||||
+/^VGFMAFLAG *=/s#=.*#=$VGFMAFLAG#
|
||||
/^STATICLIB *=/s#=.*#=$STATICLIB#
|
||||
/^SHAREDLIB *=/s#=.*#=$SHAREDLIB#
|
||||
/^SHAREDLIBV *=/s#=.*#=$SHAREDLIBV#
|
||||
Index: zlib-1.2.13/contrib/gcc/zifunc.h
|
||||
===================================================================
|
||||
--- zlib-1.2.13.orig/contrib/gcc/zifunc.h
|
||||
+++ zlib-1.2.13/contrib/gcc/zifunc.h
|
||||
@@ -8,9 +8,28 @@
|
||||
|
||||
/* Helpers for arch optimizations */
|
||||
|
||||
+#if defined(__clang__)
|
||||
+#if __has_feature(coverage_sanitizer)
|
||||
+#define Z_IFUNC_NO_SANCOV __attribute__((no_sanitize("coverage")))
|
||||
+#else /* __has_feature(coverage_sanitizer) */
|
||||
+#define Z_IFUNC_NO_SANCOV
|
||||
+#endif /* __has_feature(coverage_sanitizer) */
|
||||
+#else /* __clang__ */
|
||||
+#define Z_IFUNC_NO_SANCOV
|
||||
+#endif /* __clang__ */
|
||||
+
|
||||
+#ifdef __s390__
|
||||
+#define Z_IFUNC_PARAMS unsigned long hwcap
|
||||
+#define Z_IFUNC_ATTRS Z_IFUNC_NO_SANCOV
|
||||
+#else /* __s390__ */
|
||||
+#define Z_IFUNC_PARAMS void
|
||||
+#define Z_IFUNC_ATTRS
|
||||
+#endif /* __s390__ */
|
||||
+
|
||||
#define Z_IFUNC(fname) \
|
||||
typeof(fname) fname __attribute__ ((ifunc (#fname "_resolver"))); \
|
||||
- local typeof(fname) *fname##_resolver(void)
|
||||
+ Z_IFUNC_ATTRS \
|
||||
+ local typeof(fname) *fname##_resolver(Z_IFUNC_PARAMS)
|
||||
/* This is a helper macro to declare a resolver for an indirect function
|
||||
* (ifunc). Let's say you have function
|
||||
*
|
||||
Index: zlib-1.2.13/contrib/s390/crc32-vx.c
|
||||
===================================================================
|
||||
--- /dev/null
|
||||
+++ zlib-1.2.13/contrib/s390/crc32-vx.c
|
||||
@@ -0,0 +1,195 @@
|
||||
+/*
|
||||
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
|
||||
+ *
|
||||
+ * Use the z/Architecture Vector Extension Facility to accelerate the
|
||||
+ * computing of bitreflected CRC-32 checksums.
|
||||
+ *
|
||||
+ * This CRC-32 implementation algorithm is bitreflected and processes
|
||||
+ * the least-significant bit first (Little-Endian).
|
||||
+ *
|
||||
+ * This code was originally written by Hendrik Brueckner
|
||||
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
|
||||
+ * relicensed under the zlib license.
|
||||
+ */
|
||||
+
|
||||
+#include "../../zutil.h"
|
||||
+
|
||||
+#include <stdint.h>
|
||||
+#include <vecintrin.h>
|
||||
+
|
||||
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
|
||||
+typedef unsigned int uv4si __attribute__((vector_size(16)));
|
||||
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
|
||||
+
|
||||
+uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {
|
||||
+ /*
|
||||
+ * The CRC-32 constant block contains reduction constants to fold and
|
||||
+ * process particular chunks of the input data stream in parallel.
|
||||
+ *
|
||||
+ * For the CRC-32 variants, the constants are precomputed according to
|
||||
+ * these definitions:
|
||||
+ *
|
||||
+ * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
|
||||
+ * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
|
||||
+ * R3 = [(x128+32 mod P'(x) << 32)]' << 1
|
||||
+ * R4 = [(x128-32 mod P'(x) << 32)]' << 1
|
||||
+ * R5 = [(x64 mod P'(x) << 32)]' << 1
|
||||
+ * R6 = [(x32 mod P'(x) << 32)]' << 1
|
||||
+ *
|
||||
+ * The bitreflected Barret reduction constant, u', is defined as
|
||||
+ * the bit reversal of floor(x**64 / P(x)).
|
||||
+ *
|
||||
+ * where P(x) is the polynomial in the normal domain and the P'(x) is the
|
||||
+ * polynomial in the reversed (bitreflected) domain.
|
||||
+ *
|
||||
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
|
||||
+ *
|
||||
+ * P(x) = 0x04C11DB7
|
||||
+ * P'(x) = 0xEDB88320
|
||||
+ */
|
||||
+ const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
|
||||
+ const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
|
||||
+ const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
|
||||
+ const uv2di r5 = {0, 0x163CD6124}; /* R5 */
|
||||
+ const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
|
||||
+ const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
|
||||
+
|
||||
+ /*
|
||||
+ * Load the initial CRC value.
|
||||
+ *
|
||||
+ * The CRC value is loaded into the rightmost word of the
|
||||
+ * vector register and is later XORed with the LSB portion
|
||||
+ * of the loaded input data.
|
||||
+ */
|
||||
+ uv2di v0 = {0, 0};
|
||||
+ v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
|
||||
+
|
||||
+ /* Load a 64-byte data chunk and XOR with CRC */
|
||||
+ uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
|
||||
+ uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
|
||||
+ uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
|
||||
+ uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
|
||||
+
|
||||
+ v1 ^= v0;
|
||||
+ buf += 64;
|
||||
+ len -= 64;
|
||||
+
|
||||
+ while (len >= 64) {
|
||||
+ /* Load the next 64-byte data chunk */
|
||||
+ uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
|
||||
+ uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
|
||||
+ uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
|
||||
+ uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
|
||||
+
|
||||
+ /*
|
||||
+ * Perform a GF(2) multiplication of the doublewords in V1 with
|
||||
+ * the R1 and R2 reduction constants in V0. The intermediate result
|
||||
+ * is then folded (accumulated) with the next data chunk in PART1 and
|
||||
+ * stored in V1. Repeat this step for the register contents
|
||||
+ * in V2, V3, and V4 respectively.
|
||||
+ */
|
||||
+ v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
|
||||
+ v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
|
||||
+ v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
|
||||
+ v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
|
||||
+
|
||||
+ buf += 64;
|
||||
+ len -= 64;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
|
||||
+ * and R4 and accumulating the next 128-bit chunk until a single 128-bit
|
||||
+ * value remains.
|
||||
+ */
|
||||
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
|
||||
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
|
||||
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
|
||||
+
|
||||
+ while (len >= 16) {
|
||||
+ /* Load next data chunk */
|
||||
+ v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
|
||||
+
|
||||
+ /* Fold next data chunk */
|
||||
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
|
||||
+
|
||||
+ buf += 16;
|
||||
+ len -= 16;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Set up a vector register for byte shifts. The shift value must
|
||||
+ * be loaded in bits 1-4 in byte element 7 of a vector register.
|
||||
+ * Shift by 8 bytes: 0x40
|
||||
+ * Shift by 4 bytes: 0x20
|
||||
+ */
|
||||
+ uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
+ v9 = vec_insert((unsigned char)0x40, v9, 7);
|
||||
+
|
||||
+ /*
|
||||
+ * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
|
||||
+ * to move R4 into the rightmost doubleword and set the leftmost
|
||||
+ * doubleword to 0x1.
|
||||
+ */
|
||||
+ v0 = vec_srb(r4r3, (uv2di)v9);
|
||||
+ v0[0] = 1;
|
||||
+
|
||||
+ /*
|
||||
+ * Compute GF(2) product of V1 and V0. The rightmost doubleword
|
||||
+ * of V1 is multiplied with R4. The leftmost doubleword of V1 is
|
||||
+ * multiplied by 0x1 and is then XORed with rightmost product.
|
||||
+ * Implicitly, the intermediate leftmost product becomes padded
|
||||
+ */
|
||||
+ v1 = (uv2di)vec_gfmsum_128(v0, v1);
|
||||
+
|
||||
+ /*
|
||||
+ * Now do the final 32-bit fold by multiplying the rightmost word
|
||||
+ * in V1 with R5 and XOR the result with the remaining bits in V1.
|
||||
+ *
|
||||
+ * To achieve this by a single VGFMAG, right shift V1 by a word
|
||||
+ * and store the result in V2 which is then accumulated. Use the
|
||||
+ * vector unpack instruction to load the rightmost half of the
|
||||
+ * doubleword into the rightmost doubleword element of V1; the other
|
||||
+ * half is loaded in the leftmost doubleword.
|
||||
+ * The vector register with CONST_R5 contains the R5 constant in the
|
||||
+ * rightmost doubleword and the leftmost doubleword is zero to ignore
|
||||
+ * the leftmost product of V1.
|
||||
+ */
|
||||
+ v9 = vec_insert((unsigned char)0x20, v9, 7);
|
||||
+ v2 = vec_srb(v1, (uv2di)v9);
|
||||
+ v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
|
||||
+ v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
|
||||
+
|
||||
+ /*
|
||||
+ * Apply a Barret reduction to compute the final 32-bit CRC value.
|
||||
+ *
|
||||
+ * The input values to the Barret reduction are the degree-63 polynomial
|
||||
+ * in V1 (R(x)), degree-32 generator polynomial, and the reduction
|
||||
+ * constant u. The Barret reduction result is the CRC value of R(x) mod
|
||||
+ * P(x).
|
||||
+ *
|
||||
+ * The Barret reduction algorithm is defined as:
|
||||
+ *
|
||||
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
|
||||
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
|
||||
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
|
||||
+ *
|
||||
+ * Note: The leftmost doubleword of vector register containing
|
||||
+ * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
|
||||
+ * is zero and does not contribute to the final result.
|
||||
+ */
|
||||
+
|
||||
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
|
||||
+ v2 = vec_unpackl((uv4si)v1);
|
||||
+ v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
|
||||
+
|
||||
+ /*
|
||||
+ * Compute the GF(2) product of the CRC polynomial with T1(x) in
|
||||
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
|
||||
+ * The final result is stored in word element 2 of V2.
|
||||
+ */
|
||||
+ v2 = vec_unpackl((uv4si)v2);
|
||||
+ v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
|
||||
+
|
||||
+ return ((uv4si)v2)[2];
|
||||
+}
|
||||
Index: zlib-1.2.13/contrib/s390/crc32_z_resolver.c
===================================================================
--- /dev/null
+++ zlib-1.2.13/contrib/s390/crc32_z_resolver.c
@@ -0,0 +1,41 @@
+#include <sys/auxv.h>
+#include "../gcc/zifunc.h"
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+unsigned int crc32_le_vgfm_16(unsigned int crc, const unsigned char FAR *buf, z_size_t len);
+
+local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
+{
+ uintptr_t prealign, aligned, remaining;
+
+ if (buf == Z_NULL) return 0UL;
+
+ if (len < VX_MIN_LEN + VX_ALIGN_MASK)
+ return crc32_z_default(crc, buf, len);
+
+ if ((uintptr_t)buf & VX_ALIGN_MASK) {
+ prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
+ len -= prealign;
+ crc = crc32_z_default(crc, buf, prealign);
+ buf += prealign;
+ }
+ aligned = len & ~VX_ALIGN_MASK;
+ remaining = len & VX_ALIGN_MASK;
+
+ crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
+
+ if (remaining)
+ crc = crc32_z_default(crc, buf + aligned, remaining);
+
+ return crc;
+}
+
+Z_IFUNC(crc32_z)
+{
+ if (hwcap & HWCAP_S390_VX)
+ return s390_crc32_vx;
+ return crc32_z_default;
+}
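
The prealign/aligned/remaining split in s390_crc32_vx above is ordinary mask arithmetic: the unaligned head and the sub-16-byte tail go through the table-driven crc32_z_default, and only the 16-byte-aligned middle reaches crc32_le_vgfm_16. A small standalone sketch with a made-up buffer address shows the numbers:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define VX_ALIGNMENT  16L
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)

int main(void)
{
    /* Pretend the caller passed a buffer at this (unaligned) address. */
    uintptr_t buf = 0x1005;
    size_t len = 100;
    uintptr_t prealign = 0, aligned, remaining;

    if (buf & VX_ALIGN_MASK) {
        prealign = VX_ALIGNMENT - (buf & VX_ALIGN_MASK); /* 16 - 5 = 11 */
        len -= prealign;                                 /* 100 - 11 = 89 */
        buf += prealign;                                 /* now 16-byte aligned */
    }
    aligned = len & ~VX_ALIGN_MASK;                      /* 89 & ~15 = 80 */
    remaining = len & VX_ALIGN_MASK;                     /* 89 & 15  =  9 */

    printf("prealign=%zu aligned=%zu remaining=%zu\n",
           (size_t)prealign, (size_t)aligned, (size_t)remaining);
    return 0;
}
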
Index: zlib-1.2.13/crc32.c
===================================================================
--- zlib-1.2.13.orig/crc32.c
+++ zlib-1.2.13/crc32.c
@@ -745,12 +745,12 @@ local z_word_t crc_word_big(data)
#endif

/* ========================================================================= */
-#ifdef Z_POWER_OPT
+#if defined(Z_POWER_OPT) || defined(HAVE_S390X_VX)
/* Rename function so resolver can use its symbol. The default version will be
* returned by the resolver if the host has no support for an optimized version.
*/
#define crc32_z crc32_z_default
-#endif /* Z_POWER_OPT */
+#endif /* defined(Z_POWER_OPT) || defined(HAVE_S390X_VX) */

unsigned long ZEXPORT crc32_z(crc, buf, len)
unsigned long crc;
@@ -1073,10 +1073,15 @@ unsigned long ZEXPORT crc32_z(crc, buf,
return crc ^ 0xffffffff;
}

-#ifdef Z_POWER_OPT
+#if defined(Z_POWER_OPT) || defined(HAVE_S390X_VX)
#undef crc32_z
+#ifdef Z_POWER_OPT
#include "contrib/power/crc32_z_resolver.c"
#endif /* Z_POWER_OPT */
+#ifdef HAVE_S390X_VX
+#include "contrib/s390/crc32_z_resolver.c"
+#endif /* HAVE_S390X_VX */
+#endif /* defined(Z_POWER_OPT) || defined(HAVE_S390X_VX) */

#endif
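
The #define above keeps the portable implementation compiling under the name crc32_z_default, while the included resolver publishes crc32_z as a GNU indirect function that the dynamic linker binds at load time. Z_IFUNC presumably expands to such a resolver in contrib/gcc/zifunc.h; as a rough, self-contained sketch of the same dispatch pattern, assuming a GCC/ELF toolchain with IFUNC support and using only getauxval() (the function names here are made up, not zlib's):

#include <stddef.h>
#include <sys/auxv.h>

typedef unsigned long (*crc_fn)(unsigned long, const unsigned char *, size_t);

static unsigned long crc32_portable(unsigned long crc, const unsigned char *buf,
                                    size_t len)
{
    (void)buf; (void)len;          /* stand-in for the table-driven code */
    return crc;
}

static unsigned long crc32_vector(unsigned long crc, const unsigned char *buf,
                                  size_t len)
{
    (void)buf; (void)len;          /* stand-in for the VX implementation */
    return crc;
}

/* Runs once, very early at load time; the dynamic linker binds every later
 * call of crc32_fast() to whichever pointer this returns. */
static crc_fn resolve_crc32_fast(void)
{
#ifdef HWCAP_S390_VX
    if (getauxval(AT_HWCAP) & HWCAP_S390_VX)
        return crc32_vector;
#endif
    return crc32_portable;
}

unsigned long crc32_fast(unsigned long crc, const unsigned char *buf, size_t len)
    __attribute__((ifunc("resolve_crc32_fast")));
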
@@ -1,27 +0,0 @@
From e554695638228b846d49657f31eeff0ca4680e8a Mon Sep 17 00:00:00 2001
From: Mark Adler <madler@alumni.caltech.edu>
Date: Thu, 15 Dec 2022 09:07:13 -0800
Subject: [PATCH] Fix bug in deflateBound() for level 0 and memLevel 9.

memLevel 9 would cause deflateBound() to assume the use of fixed
blocks, even if the compression level was 0, which forces stored
blocks. That could result in a bound less than the size of the
compressed data. Now level 0 always uses the stored blocks bound.
---
deflate.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deflate.c b/deflate.c
index cd538b8ac..4a512e1f9 100644
--- a/deflate.c
+++ b/deflate.c
@@ -752,7 +752,8 @@ uLong ZEXPORT deflateBound(strm, sourceLen)

/* if not default parameters, return one of the conservative bounds */
if (s->w_bits != 15 || s->hash_bits != 8 + 7)
- return (s->w_bits <= s->hash_bits ? fixedlen : storelen) + wraplen;
+ return (s->w_bits <= s->hash_bits && s->level ? fixedlen : storelen) +
+ wraplen;

/* default settings: return tight bound for that case -- ~0.03% overhead
plus a small constant */
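
The dropped patch is part of upstream 1.3, and the scenario it fixes can be exercised with the public zlib API. A minimal sketch (an illustrative test program, not part of the package): level 0 forces stored blocks, while memLevel 9 used to steer deflateBound() toward the fixed-block estimate; per the commit message above, that bound could come out smaller than the stored output, so with the 1.3 fix the final assertion should hold.

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
    static const unsigned char src[] = "hello";
    unsigned char out[64];
    z_stream strm;
    int ret;

    memset(&strm, 0, sizeof(strm));
    /* level 0 (stored blocks), windowBits 15, memLevel 9 */
    ret = deflateInit2(&strm, 0, Z_DEFLATED, 15, 9, Z_DEFAULT_STRATEGY);
    assert(ret == Z_OK);

    uLong bound = deflateBound(&strm, (uLong)(sizeof(src) - 1));

    strm.next_in = (unsigned char *)src;
    strm.avail_in = (uInt)(sizeof(src) - 1);
    strm.next_out = out;
    strm.avail_out = (uInt)sizeof(out);
    ret = deflate(&strm, Z_FINISH);
    assert(ret == Z_STREAM_END);

    printf("bound=%lu actual=%lu\n", (unsigned long)bound,
           (unsigned long)strm.total_out);
    assert(strm.total_out <= bound);   /* holds with the deflateBound() fix */

    deflateEnd(&strm);
    return 0;
}
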
@@ -2,16 +2,12 @@ Index: deflate.c
===================================================================
--- deflate.c.orig
+++ deflate.c
@@ -1233,15 +1233,16 @@ local void lm_init (s)
@@ -1233,12 +1233,13 @@ local void lm_init (s)
* string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
* OUT assertion: the match length is not greater than s->lookahead.
*/
-local uInt longest_match(s, cur_match)
+local uInt longest_match(s, pcur_match)
deflate_state *s;
- IPos cur_match; /* current match */
+ IPos pcur_match; /* current match */
{
-local uInt longest_match(deflate_state *s, IPos cur_match) {
+local uInt longest_match(deflate_state *s, IPos pcur_match) {
+ ptrdiff_t cur_match = pcur_match; /* extend to pointer width */
unsigned chain_length = s->max_chain_length;/* max hash chain length */
register Bytef *scan = s->window + s->strstart; /* current string */
BIN
zlib-1.2.13.tar.gz
(Stored with Git LFS)
BIN
zlib-1.2.13.tar.gz
(Stored with Git LFS)
Binary file not shown.
@@ -1,7 +0,0 @@
-----BEGIN PGP SIGNATURE-----
Comment: GPGTools - http://gpgtools.org

iF0EABECAB0WIQRe1GpnIdNlWHeR4qp4P82OWLyvugUCY0h42QAKCRB4P82OWLyv
upvZAKCF7EgWGaMEfO78WnkA8hivLlBMlACgyI7Vm2A5BI2jI+h23yqrKjgQC5s=
=umRA
-----END PGP SIGNATURE-----
File diff suppressed because it is too large
3
zlib-1.3.tar.gz
Normal file
3
zlib-1.3.tar.gz
Normal file
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ff0ba4c292013dbc27530b3a81e1f9a813cd39de01ca5e0f8bf355702efa593e
size 1495873
7
zlib-1.3.tar.gz.asc
Normal file
7
zlib-1.3.tar.gz.asc
Normal file
@@ -0,0 +1,7 @@
-----BEGIN PGP SIGNATURE-----
Comment: GPGTools - http://gpgtools.org

iF0EABECAB0WIQRe1GpnIdNlWHeR4qp4P82OWLyvugUCZN8+EgAKCRB4P82OWLyv
usBmAKC6ixPJLSVYgQivrqK4KBw4gTGFGwCgxJ9SfDFGqI3uqjyR99/13L7vn3o=
=TwN5
-----END PGP SIGNATURE-----
@@ -9,8 +9,8 @@ Index: zlib.h
is returned, and the error state is set to Z_STREAM_ERROR.
*/

-ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...));
+ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...))
-ZEXTERN int ZEXPORTVA gzprintf(gzFile file, const char *format, ...);
+ZEXTERN int ZEXPORTVA gzprintf(gzFile file, const char *format, ...)
+#ifdef __GNUC__
+ __attribute__((__format__(__printf__,2,3)))
+#endif
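
The refreshed hunk keeps adding a printf-style format attribute to the gzprintf prototype, so compilers that understand it can type-check the variadic arguments at compile time. A small standalone sketch of the same idea (my_gzprintf and its gz_handle type are made up and only demonstrate the warning behaviour):

#include <stdarg.h>
#include <stdio.h>

typedef struct dummy_gz *gz_handle;               /* stand-in for gzFile */

int my_gzprintf(gz_handle file, const char *format, ...)
    __attribute__((__format__(__printf__, 2, 3)));

int my_gzprintf(gz_handle file, const char *format, ...)
{
    va_list ap;
    int n;
    (void)file;                                    /* toy: write to stdout */
    va_start(ap, format);
    n = vprintf(format, ap);
    va_end(ap);
    return n;
}

int main(void)
{
    my_gzprintf(0, "%d bytes\n", 42);              /* fine */
    /* my_gzprintf(0, "%d bytes\n", "42");            would now warn:
     *   format '%d' expects 'int', but argument has type 'char *' */
    return 0;
}
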
@@ -1,9 +1,9 @@
Index: zlib-1.2.12/infback.c
Index: zlib-1.3/infback.c
===================================================================
--- zlib-1.2.12.orig/infback.c
+++ zlib-1.2.12/infback.c
@@ -34,9 +34,6 @@ int stream_size;
{
--- zlib-1.3.orig/infback.c
+++ zlib-1.3/infback.c
@@ -27,9 +27,6 @@ int ZEXPORT inflateBackInit_(z_streamp s
int stream_size) {
struct inflate_state FAR *state;

- if (version == Z_NULL || version[0] != ZLIB_VERSION[0] ||
@@ -12,11 +12,11 @@ Index: zlib-1.2.12/infback.c
if (strm == Z_NULL || window == Z_NULL ||
windowBits < 8 || windowBits > 15)
return Z_STREAM_ERROR;
Index: zlib-1.2.12/inflate.c
Index: zlib-1.3/inflate.c
===================================================================
--- zlib-1.2.12.orig/inflate.c
+++ zlib-1.2.12/inflate.c
@@ -202,9 +202,6 @@ int stream_size;
--- zlib-1.3.orig/inflate.c
+++ zlib-1.3/inflate.c
@@ -180,9 +180,6 @@ int ZEXPORT inflateInit2_(z_streamp strm
int ret;
struct inflate_state FAR *state;

@@ -26,12 +26,12 @@ Index: zlib-1.2.12/inflate.c
if (strm == Z_NULL) return Z_STREAM_ERROR;
strm->msg = Z_NULL; /* in case we return an error */
if (strm->zalloc == (alloc_func)0) {
Index: zlib-1.2.12/deflate.c
Index: zlib-1.3/deflate.c
===================================================================
--- zlib-1.2.12.orig/deflate.c
+++ zlib-1.2.12/deflate.c
@@ -253,12 +253,7 @@ int ZEXPORT deflateInit2_(strm, level, m
{
--- zlib-1.3.orig/deflate.c
+++ zlib-1.3/deflate.c
@@ -384,12 +384,7 @@ int ZEXPORT deflateInit2_(z_streamp strm
const char *version, int stream_size) {
deflate_state *s;
int wrap = 1;
- static const char my_version[] = ZLIB_VERSION;
35
zlib.changes
35
zlib.changes
@@ -1,3 +1,38 @@
-------------------------------------------------------------------
Thu Oct 19 16:00:31 UTC 2023 - Danilo Spinella <oss@danyspin97.org>

- Update to 1.3:
* Building using K&R (pre-ANSI) function definitions is no longer supported.
* Fixed a bug in deflateBound() for level 0 and memLevel 9.
* Fixed a bug when gzungetc() is used immediately after gzopen().
* Fixed a bug when using gzflush() with a very small buffer.
* Fixed a crash when gzsetparams() is attempted for a transparent write.
* Fixed test/example.c to work with FORCE_STORED.
* Fixed minizip to allow it to open an empty zip file.
* Fixed reading disk number start on zip64 files in minizip.
* Fixed a logic error in minizip argument processing.
- Added patches:
* zlib-1.3-IBM-Z-hw-accelerated-deflate-s390x.patch
- Refreshed patches:
* zlib-1.2.12-add-optimized-slide_hash-for-power.patch
* zlib-1.2.12-add-vectorized-longest_match-for-power.patch
* zlib-1.2.12-adler32-vector-optimizations-for-power.patch
* zlib-1.2.13-optimized-s390.patch
* zlib-format.patch
* zlib-no-version-check.patch
- Removed patches:
* bsc1210593.patch
* zlib-1.2.13-fix-bug-deflateBound.patch
* zlib-1.2.12-s390-vectorize-crc32.patch
* zlib-1.2.13-IBM-Z-hw-accelerated-deflate-s390x.patch
* zlib-1.2.12-add-optimized-slide_hash-for-power.patch
* zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch
* zlib-1.2.12-add-vectorized-longest_match-for-power.patch
* zlib-1.2.12-adler32-vector-optimizations-for-power.patch
- Fix CVE-2023-45853, integer overflow and resultant heap-based buffer
overflow in zipOpenNewFileInZip4_6, bsc#1216378
* CVE-2023-45853.patch

-------------------------------------------------------------------
Fri May 5 09:56:31 UTC 2023 - Danilo Spinella <danilo.spinella@suse.com>

28
zlib.spec
28
zlib.spec
@@ -17,7 +17,7 @@


Name: zlib
Version: 1.2.13
Version: 1.3
Release: 0
Summary: Library implementing the DEFLATE compression algorithm
License: Zlib
@@ -37,25 +37,17 @@ Patch2: 0001-Do-not-try-to-store-negative-values-in-unsigned-int.patch
Patch3: zlib-no-version-check.patch
#PATCH-FIX-SUSE https://github.com/madler/zlib/pull/229
Patch4: minizip-dont-install-crypt-header.patch
# PATCH-FIX-UPSTREAM https://github.com/madler/zlib/commit/e554695638228b846d49657f31eeff0ca4680e8a
Patch5: zlib-1.2.13-fix-bug-deflateBound.patch
#PATCH-FIX-SUSE https://github.com/madler/zlib/pull/410
Patch6: zlib-1.2.13-IBM-Z-hw-accelerated-deflate-s390x.patch
Patch6: zlib-1.3-IBM-Z-hw-accelerated-deflate-s390x.patch
# Patches taken from https://github.com/iii-i/zlib/releases/tag/crc32vx-v3
Patch7: zlib-1.2.5-minizip-fixuncrypt.patch
Patch8: zlib-1.2.13-optimized-s390.patch
# https://github.com/iii-i/zlib/commit/171d0ff3c9ed40da0ac14085ab16b766b1162069
Patch10: zlib-1.2.11-covscan-issues.patch
Patch11: zlib-1.2.11-covscan-issues-rhel9.patch
Patch14: zlib-1.2.12-s390-vectorize-crc32.patch
# The following patches are taken from https://github.com/mscastanho/zlib/commits/power-optimizations-1.2.12
Patch15: zlib-1.2.12-adler32-vector-optimizations-for-power.patch
Patch16: zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch
Patch17: zlib-1.2.12-add-optimized-slide_hash-for-power.patch
Patch18: zlib-1.2.12-add-vectorized-longest_match-for-power.patch
# PATCH-FIX-UPSTREAM danilo.spinella@suse.com bsc#1210593 bsc#1211005
# Fix deflateBound() before deflateInit()
Patch19: bsc1210593.patch
# PATCh-FIX-SECURITY CVE-2023-45853.patch bsc#1216378 CVE-2023-45853 danilo.spinella@suse.com
# integer overflow and resultant heap-based buffer overflow in zipOpenNewFileInZip4_6
Patch12: CVE-2023-45853.patch
BuildRequires: autoconf
BuildRequires: automake
BuildRequires: libtool
@@ -147,18 +139,12 @@ It should exit 0
%patch2 -p1
%patch3 -p1
%patch4 -p1
%patch5 -p1
%patch6 -p1
%patch7 -p1
%patch8
%patch10 -p1
%patch11 -p1
%patch14 -p1
%patch15 -p1
%patch16 -p1
%patch17 -p1
%patch18 -p1
%patch19 -p1
%patch12 -p1
cp %{SOURCE4} .

%build
@@ -221,7 +207,7 @@ find %{buildroot} -type f -name "*.la" -delete -print

%files -n libz1
%license LICENSE
%{_libdir}/libz.so.1.2.*
%{_libdir}/libz.so.1.3
%{_libdir}/libz.so.1

%files devel