From f6c5a01bebdaa4fdfdb34c5f2a83d2fa26ec67c0195b2ed5a3d9225ea73863e5 Mon Sep 17 00:00:00 2001 From: Dirk Mueller Date: Tue, 11 Oct 2022 12:40:29 +0000 Subject: [PATCH] Accepting request 1009574 from home:dspinella:branches:devel:libraries:c_c++ - Add Power8 optimizations: * zlib-1.2.12-add-optimized-slide_hash-for-power.patch * zlib-1.2.12-add-vectorized-longest_match-for-power.patch * zlib-1.2.12-adler32-vector-optimizations-for-power.patch * zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch - Update zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch OBS-URL: https://build.opensuse.org/request/show/1009574 OBS-URL: https://build.opensuse.org/package/show/devel:libraries:c_c++/zlib?expand=0&rev=82 --- ...2-IBM-Z-hw-accelerated-deflate-s390x.patch | 307 ++++++++++------ ...2-add-optimized-slide_hash-for-power.patch | 219 +++++++++++ ...d-vectorized-longest_match-for-power.patch | 338 +++++++++++++++++ ...ler32-vector-optimizations-for-power.patch | 342 ++++++++++++++++++ ...valid-memory-access-on-ppc-and-ppc64.patch | 34 ++ zlib.changes | 10 + zlib.spec | 16 +- 7 files changed, 1157 insertions(+), 109 deletions(-) create mode 100644 zlib-1.2.12-add-optimized-slide_hash-for-power.patch create mode 100644 zlib-1.2.12-add-vectorized-longest_match-for-power.patch create mode 100644 zlib-1.2.12-adler32-vector-optimizations-for-power.patch create mode 100644 zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch diff --git a/zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch b/zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch index 7376951..3fa0c37 100644 --- a/zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch +++ b/zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch @@ -1,8 +1,99 @@ -From e1d7e6dc9698968a8536b00ebd9e9b4e429b4306 Mon Sep 17 00:00:00 2001 +From 171d0ff3c9ed40da0ac14085ab16b766b1162069 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich -Date: Wed, 27 Apr 2022 14:37:24 +0200 -Subject: [PATCH] 
zlib-1.2.12-IBM-Z-hw-accelrated-deflate-s390x.patch +Date: Wed, 18 Jul 2018 13:14:07 +0200 +Subject: [PATCH] Add support for IBM Z hardware-accelerated deflate +IBM Z mainframes starting from version z15 provide DFLTCC instruction, +which implements deflate algorithm in hardware with estimated +compression and decompression performance orders of magnitude faster +than the current zlib and ratio comparable with that of level 1. + +This patch adds DFLTCC support to zlib. In order to enable it, the +following build commands should be used: + + $ ./configure --dfltcc + $ make + +When built like this, zlib would compress in hardware on level 1, and in +software on all other levels. Decompression will always happen in +hardware. In order to enable DFLTCC compression for levels 1-6 (i.e. to +make it used by default) one could either configure with +--dfltcc-level-mask=0x7e or set the environment variable +DFLTCC_LEVEL_MASK to 0x7e at run time. + +Two DFLTCC compression calls produce the same results only when they +both are made on machines of the same generation, and when the +respective buffers have the same offset relative to the start of the +page. Therefore care should be taken when using hardware compression +when reproducible results are desired. One such use case - reproducible +software builds - is handled explicitly: when SOURCE_DATE_EPOCH +environment variable is set, the hardware compression is disabled. + +DFLTCC does not support every single zlib feature, in particular: + + * inflate(Z_BLOCK) and inflate(Z_TREES) + * inflateMark() + * inflatePrime() + * inflateSyncPoint() + +When used, these functions will either switch to software, or, in case +this is not possible, gracefully fail. + +This patch tries to add DFLTCC support in the least intrusive way. +All SystemZ-specific code is placed into a separate file, but +unfortunately there is still a noticeable amount of changes in the +main zlib code. Below is the summary of these changes. 
+ +DFLTCC takes as arguments a parameter block, an input buffer, an output +buffer and a window. Since DFLTCC requires parameter block to be +doubleword-aligned, and it's reasonable to allocate it alongside +deflate and inflate states, ZALLOC_STATE, ZFREE_STATE and ZCOPY_STATE +macros were introduced in order to encapsulate the allocation details. +The same is true for window, for which ZALLOC_WINDOW and +TRY_FREE_WINDOW macros were introduced. + +While for inflate software and hardware window formats match, this is +not the case for deflate. Therefore, deflateSetDictionary and +deflateGetDictionary need special handling, which is triggered using the +new DEFLATE_SET_DICTIONARY_HOOK and DEFLATE_GET_DICTIONARY_HOOK macros. + +deflateResetKeep() and inflateResetKeep() now update the DFLTCC +parameter block, which is allocated alongside zlib state, using +the new DEFLATE_RESET_KEEP_HOOK and INFLATE_RESET_KEEP_HOOK macros. + +The new DEFLATE_PARAMS_HOOK switches between hardware and software +deflate implementations when deflateParams() arguments demand this. + +The new INFLATE_PRIME_HOOK, INFLATE_MARK_HOOK and +INFLATE_SYNC_POINT_HOOK macros make the respective unsupported calls +gracefully fail. + +The algorithm implemented in hardware has different compression ratio +than the one implemented in software. In order for deflateBound() to +return the correct results for the hardware implementation, the new +DEFLATE_BOUND_ADJUST_COMPLEN and DEFLATE_NEED_CONSERVATIVE_BOUND macros +were introduced. + +Actual compression and decompression are handled by the new DEFLATE_HOOK +and INFLATE_TYPEDO_HOOK macros. Since inflation with DFLTCC manages the +window on its own, calling updatewindow() is suppressed using the new +INFLATE_NEED_UPDATEWINDOW() macro. + +In addition to compression, DFLTCC computes CRC-32 and Adler-32 +checksums, therefore, whenever it's used, software checksumming needs to +be suppressed using the new DEFLATE_NEED_CHECKSUM and +INFLATE_NEED_CHECKSUM macros. 
+ +DFLTCC will refuse to write an End-of-block Symbol if there is no input +data, thus in some cases it is necessary to do this manually. In order +to achieve this, send_bits, bi_reverse, bi_windup and flush_pending +were promoted from local to ZLIB_INTERNAL. Furthermore, since block and +stream termination must be handled in software as well, block_state enum +was moved to deflate.h. + +Since the first call to dfltcc_inflate already needs the window, and it +might be not allocated yet, inflate_ensure_window was factored out of +updatewindow and made ZLIB_INTERNAL. --- Makefile.in | 8 + compress.c | 14 +- @@ -27,10 +118,10 @@ Subject: [PATCH] zlib-1.2.12-IBM-Z-hw-accelrated-deflate-s390x.patch create mode 100644 contrib/s390/dfltcc.h create mode 100644 contrib/s390/dfltcc_deflate.h -Index: zlib-1.2.12/Makefile.in -=================================================================== ---- zlib-1.2.12.orig/Makefile.in -+++ zlib-1.2.12/Makefile.in +diff --git a/Makefile.in b/Makefile.in +index fd28bbfbf..66e3a8057 100644 +--- a/Makefile.in ++++ b/Makefile.in @@ -143,6 +143,14 @@ match.lo: match.S mv _match.o match.lo rm -f _match.s @@ -46,10 +137,10 @@ Index: zlib-1.2.12/Makefile.in example.o: $(SRCDIR)test/example.c $(SRCDIR)zlib.h zconf.h $(CC) $(CFLAGS) $(ZINCOUT) -c -o $@ $(SRCDIR)test/example.c -Index: zlib-1.2.12/compress.c -=================================================================== ---- zlib-1.2.12.orig/compress.c -+++ zlib-1.2.12/compress.c +diff --git a/compress.c b/compress.c +index e2db404ab..78fc6568f 100644 +--- a/compress.c ++++ b/compress.c @@ -5,9 +5,15 @@ /* @(#) $Id$ */ @@ -67,7 +158,7 @@ Index: zlib-1.2.12/compress.c /* =========================================================================== Compresses the source buffer into the destination buffer. The level parameter has the same meaning as in deflateInit. 
sourceLen is the byte -@@ -81,6 +87,12 @@ int ZEXPORT compress (dest, destLen, sou +@@ -81,6 +87,12 @@ int ZEXPORT compress (dest, destLen, source, sourceLen) uLong ZEXPORT compressBound (sourceLen) uLong sourceLen; { @@ -80,10 +171,10 @@ Index: zlib-1.2.12/compress.c return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) + (sourceLen >> 25) + 13; } -Index: zlib-1.2.12/configure -=================================================================== ---- zlib-1.2.12.orig/configure -+++ zlib-1.2.12/configure +diff --git a/configure b/configure +index fbaf25357..02e325e22 100755 +--- a/configure ++++ b/configure @@ -115,6 +115,7 @@ case "$1" in echo ' configure [--const] [--zprefix] [--prefix=PREFIX] [--eprefix=EXPREFIX]' | tee -a configure.log echo ' [--static] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log @@ -109,7 +200,7 @@ Index: zlib-1.2.12/configure *) echo "unknown option: $1" | tee -a configure.log echo "$0 --help for help" | tee -a configure.log -@@ -833,6 +844,19 @@ EOF +@@ -836,6 +847,19 @@ EOF fi fi @@ -129,11 +220,11 @@ Index: zlib-1.2.12/configure # show the results in the log echo >> configure.log echo ALL = $ALL >> configure.log -Index: zlib-1.2.12/contrib/README.contrib -=================================================================== ---- zlib-1.2.12.orig/contrib/README.contrib -+++ zlib-1.2.12/contrib/README.contrib -@@ -46,6 +46,10 @@ puff/ by Mark Adler Small, low memory usage inflate. Also serves to provide an unambiguous description of the deflate format. 
@@ -144,10 +235,11 @@ Index: zlib-1.2.12/contrib/README.contrib testzlib/ by Gilles Vollant Example of the use of zlib -Index: zlib-1.2.12/contrib/s390/README.txt -=================================================================== +diff --git a/contrib/s390/README.txt b/contrib/s390/README.txt +new file mode 100644 +index 000000000..48be008bd --- /dev/null -+++ zlib-1.2.12/contrib/s390/README.txt ++++ b/contrib/s390/README.txt @@ -0,0 +1,17 @@ +IBM Z mainframes starting from version z15 provide DFLTCC instruction, +which implements deflate algorithm in hardware with estimated @@ -166,10 +258,11 @@ Index: zlib-1.2.12/contrib/s390/README.txt +make it used by default) one could either configure with +--dfltcc-level-mask=0x7e or set the environment variable +DFLTCC_LEVEL_MASK to 0x7e at run time. -Index: zlib-1.2.12/contrib/s390/dfltcc.c -=================================================================== +diff --git a/contrib/s390/dfltcc.c b/contrib/s390/dfltcc.c +new file mode 100644 +index 000000000..cd959290d --- /dev/null -+++ zlib-1.2.12/contrib/s390/dfltcc.c ++++ b/contrib/s390/dfltcc.c @@ -0,0 +1,996 @@ +/* dfltcc.c - SystemZ DEFLATE CONVERSION CALL support. */ + @@ -797,7 +890,7 @@ Index: zlib-1.2.12/contrib/s390/dfltcc.c + state->bits = param->sbb; + state->whave = param->hl; + state->wnext = (param->ho + param->hl) & ((1 << HB_BITS) - 1); -+ state->check = state->flags ? ZSWAP32(param->cv) : param->cv; ++ strm->adler = state->check = state->flags ? 
ZSWAP32(param->cv) : param->cv; + if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) { + /* Report an error if stream is corrupted */ + state->mode = BAD; @@ -1167,10 +1260,11 @@ Index: zlib-1.2.12/contrib/s390/dfltcc.c + *dict_length = param->hl; + return Z_OK; +} -Index: zlib-1.2.12/contrib/s390/dfltcc.h -=================================================================== +diff --git a/contrib/s390/dfltcc.h b/contrib/s390/dfltcc.h +new file mode 100644 +index 000000000..da26612ca --- /dev/null -+++ zlib-1.2.12/contrib/s390/dfltcc.h ++++ b/contrib/s390/dfltcc.h @@ -0,0 +1,81 @@ +#ifndef DFLTCC_H +#define DFLTCC_H @@ -1253,10 +1347,11 @@ Index: zlib-1.2.12/contrib/s390/dfltcc.h + } while (0) + +#endif -Index: zlib-1.2.12/contrib/s390/dfltcc_deflate.h -=================================================================== +diff --git a/contrib/s390/dfltcc_deflate.h b/contrib/s390/dfltcc_deflate.h +new file mode 100644 +index 000000000..46acfc550 --- /dev/null -+++ zlib-1.2.12/contrib/s390/dfltcc_deflate.h ++++ b/contrib/s390/dfltcc_deflate.h @@ -0,0 +1,55 @@ +#ifndef DFLTCC_DEFLATE_H +#define DFLTCC_DEFLATE_H @@ -1313,10 +1408,10 @@ Index: zlib-1.2.12/contrib/s390/dfltcc_deflate.h +#define DEFLATE_NEED_CHECKSUM(strm) (!dfltcc_can_deflate((strm))) + +#endif -Index: zlib-1.2.12/deflate.c -=================================================================== ---- zlib-1.2.12.orig/deflate.c -+++ zlib-1.2.12/deflate.c +diff --git a/deflate.c b/deflate.c +index 7f421e4da..a56c1783c 100644 +--- a/deflate.c ++++ b/deflate.c @@ -61,15 +61,30 @@ const char deflate_copyright[] = */ @@ -1355,7 +1450,7 @@ Index: zlib-1.2.12/deflate.c typedef block_state (*compress_func) OF((deflate_state *s, int flush)); /* Compression function. Returns the block state after the call. 
*/ -@@ -85,7 +100,6 @@ local block_state deflate_rle OF((def +@@ -85,7 +100,6 @@ local block_state deflate_rle OF((deflate_state *s, int flush)); local block_state deflate_huff OF((deflate_state *s, int flush)); local void lm_init OF((deflate_state *s)); local void putShortMSB OF((deflate_state *s, uInt b)); @@ -1363,7 +1458,7 @@ Index: zlib-1.2.12/deflate.c local unsigned read_buf OF((z_streamp strm, Bytef *buf, unsigned size)); #ifdef ASMV # pragma message("Assembler code may have bugs -- use at your own risk") -@@ -294,7 +308,7 @@ int ZEXPORT deflateInit2_(strm, level, m +@@ -299,7 +313,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, return Z_STREAM_ERROR; } if (windowBits == 8) windowBits = 9; /* until 256-byte window bug fixed */ @@ -1372,7 +1467,7 @@ Index: zlib-1.2.12/deflate.c if (s == Z_NULL) return Z_MEM_ERROR; strm->state = (struct internal_state FAR *)s; s->strm = strm; -@@ -311,7 +325,7 @@ int ZEXPORT deflateInit2_(strm, level, m +@@ -316,7 +330,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, s->hash_mask = s->hash_size - 1; s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH); @@ -1381,7 +1476,7 @@ Index: zlib-1.2.12/deflate.c s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos)); s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos)); -@@ -429,6 +443,7 @@ int ZEXPORT deflateSetDictionary (strm, +@@ -434,6 +448,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength) /* when using zlib wrappers, compute Adler-32 for provided dictionary */ if (wrap == 1) strm->adler = adler32(strm->adler, dictionary, dictLength); @@ -1389,7 +1484,7 @@ Index: zlib-1.2.12/deflate.c s->wrap = 0; /* avoid computing Adler-32 in read_buf */ /* if dictionary would fill window, just replace the history */ -@@ -487,6 +502,7 @@ int ZEXPORT deflateGetDictionary (strm, +@@ -492,6 +507,7 @@ int ZEXPORT deflateGetDictionary (strm, dictionary, dictLength) if (deflateStateCheck(strm)) return 
Z_STREAM_ERROR; @@ -1397,7 +1492,7 @@ Index: zlib-1.2.12/deflate.c s = strm->state; len = s->strstart + s->lookahead; if (len > s->w_size) -@@ -533,6 +549,8 @@ int ZEXPORT deflateResetKeep (strm) +@@ -538,6 +554,8 @@ int ZEXPORT deflateResetKeep (strm) _tr_init(s); @@ -1406,7 +1501,7 @@ Index: zlib-1.2.12/deflate.c return Z_OK; } -@@ -608,6 +626,7 @@ int ZEXPORT deflateParams(strm, level, s +@@ -613,6 +631,7 @@ int ZEXPORT deflateParams(strm, level, strategy) { deflate_state *s; compress_func func; @@ -1414,7 +1509,7 @@ Index: zlib-1.2.12/deflate.c if (deflateStateCheck(strm)) return Z_STREAM_ERROR; s = strm->state; -@@ -620,15 +639,18 @@ int ZEXPORT deflateParams(strm, level, s +@@ -625,15 +644,18 @@ int ZEXPORT deflateParams(strm, level, strategy) if (level < 0 || level > 9 || strategy < 0 || strategy > Z_FIXED) { return Z_STREAM_ERROR; } @@ -1437,7 +1532,7 @@ Index: zlib-1.2.12/deflate.c return Z_BUF_ERROR; } if (s->level != level) { -@@ -695,6 +717,7 @@ uLong ZEXPORT deflateBound(strm, sourceL +@@ -700,6 +722,7 @@ uLong ZEXPORT deflateBound(strm, sourceLen) /* conservative upper bound for compressed data */ complen = sourceLen + ((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 5; @@ -1445,7 +1540,7 @@ Index: zlib-1.2.12/deflate.c /* if can't get parameters, return conservative bound plus zlib wrapper */ if (deflateStateCheck(strm)) -@@ -736,7 +759,8 @@ uLong ZEXPORT deflateBound(strm, sourceL +@@ -741,7 +764,8 @@ uLong ZEXPORT deflateBound(strm, sourceLen) } /* if not default parameters, return conservative bound */ @@ -1455,7 +1550,7 @@ Index: zlib-1.2.12/deflate.c return complen + wraplen; /* default settings: return tight bound for that case */ -@@ -763,7 +787,7 @@ local void putShortMSB (s, b) +@@ -768,7 +792,7 @@ local void putShortMSB (s, b) * applications may wish to modify it to avoid allocating a large * strm->next_out buffer and copying into it. (See also read_buf()). 
*/ @@ -1464,7 +1559,7 @@ Index: zlib-1.2.12/deflate.c z_streamp strm; { unsigned len; -@@ -1035,7 +1059,8 @@ int ZEXPORT deflate (strm, flush) +@@ -1040,7 +1064,8 @@ int ZEXPORT deflate (strm, flush) (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) { block_state bstate; @@ -1474,7 +1569,7 @@ Index: zlib-1.2.12/deflate.c s->strategy == Z_HUFFMAN_ONLY ? deflate_huff(s, flush) : s->strategy == Z_RLE ? deflate_rle(s, flush) : (*(configuration_table[s->level].func))(s, flush); -@@ -1082,7 +1107,6 @@ int ZEXPORT deflate (strm, flush) +@@ -1087,7 +1112,6 @@ int ZEXPORT deflate (strm, flush) } if (flush != Z_FINISH) return Z_OK; @@ -1482,7 +1577,7 @@ Index: zlib-1.2.12/deflate.c /* Write the trailer */ #ifdef GZIP -@@ -1098,7 +1122,7 @@ int ZEXPORT deflate (strm, flush) +@@ -1103,7 +1127,7 @@ int ZEXPORT deflate (strm, flush) } else #endif @@ -1491,7 +1586,7 @@ Index: zlib-1.2.12/deflate.c putShortMSB(s, (uInt)(strm->adler >> 16)); putShortMSB(s, (uInt)(strm->adler & 0xffff)); } -@@ -1107,7 +1131,11 @@ int ZEXPORT deflate (strm, flush) +@@ -1112,7 +1136,11 @@ int ZEXPORT deflate (strm, flush) * to flush the rest. */ if (s->wrap > 0) s->wrap = -s->wrap; /* write the trailer only once! */ @@ -1504,7 +1599,7 @@ Index: zlib-1.2.12/deflate.c } /* ========================================================================= */ -@@ -1124,9 +1152,9 @@ int ZEXPORT deflateEnd (strm) +@@ -1129,9 +1157,9 @@ int ZEXPORT deflateEnd (strm) TRY_FREE(strm, strm->state->pending_buf); TRY_FREE(strm, strm->state->head); TRY_FREE(strm, strm->state->prev); @@ -1516,7 +1611,7 @@ Index: zlib-1.2.12/deflate.c strm->state = Z_NULL; return status == BUSY_STATE ? 
Z_DATA_ERROR : Z_OK; -@@ -1156,13 +1184,13 @@ int ZEXPORT deflateCopy (dest, source) +@@ -1161,13 +1189,13 @@ int ZEXPORT deflateCopy (dest, source) zmemcpy((voidpf)dest, (voidpf)source, sizeof(z_stream)); @@ -1533,7 +1628,7 @@ Index: zlib-1.2.12/deflate.c ds->prev = (Posf *) ZALLOC(dest, ds->w_size, sizeof(Pos)); ds->head = (Posf *) ZALLOC(dest, ds->hash_size, sizeof(Pos)); ds->pending_buf = (uchf *) ZALLOC(dest, ds->lit_bufsize, 4); -@@ -1209,7 +1237,8 @@ local unsigned read_buf(strm, buf, size) +@@ -1214,7 +1242,8 @@ local unsigned read_buf(strm, buf, size) strm->avail_in -= len; zmemcpy(buf, strm->next_in, len); @@ -1543,11 +1638,11 @@ Index: zlib-1.2.12/deflate.c strm->adler = adler32(strm->adler, buf, len); } #ifdef GZIP -Index: zlib-1.2.12/deflate.h -=================================================================== ---- zlib-1.2.12.orig/deflate.h -+++ zlib-1.2.12/deflate.h -@@ -299,6 +299,7 @@ void ZLIB_INTERNAL _tr_flush_bits OF((de +diff --git a/deflate.h b/deflate.h +index 1a06cd5f2..f92750ca6 100644 +--- a/deflate.h ++++ b/deflate.h +@@ -299,6 +299,7 @@ void ZLIB_INTERNAL _tr_flush_bits OF((deflate_state *s)); void ZLIB_INTERNAL _tr_align OF((deflate_state *s)); void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len, int last)); @@ -1555,7 +1650,7 @@ Index: zlib-1.2.12/deflate.h #define d_code(dist) \ ((dist) < 256 ? 
_dist_code[dist] : _dist_code[256+((dist)>>7)]) -@@ -343,4 +344,15 @@ void ZLIB_INTERNAL _tr_stored_block OF(( +@@ -343,4 +344,15 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf, flush = _tr_tally(s, distance, length) #endif @@ -1571,10 +1666,10 @@ Index: zlib-1.2.12/deflate.h +void ZLIB_INTERNAL flush_pending OF((z_streamp strm)); + #endif /* DEFLATE_H */ -Index: zlib-1.2.12/gzguts.h -=================================================================== ---- zlib-1.2.12.orig/gzguts.h -+++ zlib-1.2.12/gzguts.h +diff --git a/gzguts.h b/gzguts.h +index 57faf3716..581f2b631 100644 +--- a/gzguts.h ++++ b/gzguts.h @@ -153,7 +153,11 @@ /* default i/o buffer size -- double this for output when reading (this and @@ -1587,10 +1682,10 @@ Index: zlib-1.2.12/gzguts.h /* gzip modes, also provide a little integrity check on the passed structure */ #define GZ_NONE 0 -Index: zlib-1.2.12/inflate.c -=================================================================== ---- zlib-1.2.12.orig/inflate.c -+++ zlib-1.2.12/inflate.c +diff --git a/inflate.c b/inflate.c +index 2a3c4fe98..ca0f8c9a4 100644 +--- a/inflate.c ++++ b/inflate.c @@ -85,6 +85,24 @@ #include "inflate.h" #include "inffast.h" @@ -1633,7 +1728,7 @@ Index: zlib-1.2.12/inflate.c state->window = Z_NULL; } -@@ -219,7 +238,7 @@ int stream_size; +@@ -222,7 +241,7 @@ int stream_size; strm->zfree = zcfree; #endif state = (struct inflate_state FAR *) @@ -1642,7 +1737,7 @@ Index: zlib-1.2.12/inflate.c if (state == Z_NULL) return Z_MEM_ERROR; Tracev((stderr, "inflate: allocated\n")); strm->state = (struct internal_state FAR *)state; -@@ -228,7 +247,7 @@ int stream_size; +@@ -231,7 +250,7 @@ int stream_size; state->mode = HEAD; /* to pass state test in inflateReset2() */ ret = inflateReset2(strm, windowBits); if (ret != Z_OK) { @@ -1651,7 +1746,7 @@ Index: zlib-1.2.12/inflate.c strm->state = Z_NULL; } return ret; -@@ -250,6 +269,7 @@ int value; +@@ -253,6 +272,7 @@ int value; struct inflate_state FAR *state; if 
(inflateStateCheck(strm)) return Z_STREAM_ERROR; @@ -1659,7 +1754,7 @@ Index: zlib-1.2.12/inflate.c state = (struct inflate_state FAR *)strm->state; if (bits < 0) { state->hold = 0; -@@ -377,6 +397,27 @@ void makefixed() +@@ -380,6 +400,27 @@ void makefixed() } #endif /* MAKEFIXED */ @@ -1687,7 +1782,7 @@ Index: zlib-1.2.12/inflate.c /* Update the window with the last wsize (normally 32K) bytes written before returning. If window does not exist yet, create it. This is only called -@@ -401,20 +442,7 @@ unsigned copy; +@@ -404,20 +445,7 @@ unsigned copy; state = (struct inflate_state FAR *)strm->state; @@ -1709,7 +1804,7 @@ Index: zlib-1.2.12/inflate.c /* copy state->wsize or less output bytes into the circular window */ if (copy >= state->wsize) { -@@ -857,6 +885,7 @@ int flush; +@@ -861,6 +889,7 @@ int flush; if (flush == Z_BLOCK || flush == Z_TREES) goto inf_leave; /* fallthrough */ case TYPEDO: @@ -1717,7 +1812,7 @@ Index: zlib-1.2.12/inflate.c if (state->last) { BYTEBITS(); state->mode = CHECK; -@@ -1218,7 +1247,7 @@ int flush; +@@ -1222,7 +1251,7 @@ int flush; out -= left; strm->total_out += out; state->total += out; @@ -1726,7 +1821,7 @@ Index: zlib-1.2.12/inflate.c strm->adler = state->check = UPDATE_CHECK(state->check, put - out, out); out = left; -@@ -1273,8 +1302,9 @@ int flush; +@@ -1277,8 +1306,9 @@ int flush; */ inf_leave: RESTORE(); @@ -1738,7 +1833,7 @@ Index: zlib-1.2.12/inflate.c if (updatewindow(strm, strm->next_out, out - strm->avail_out)) { state->mode = MEM; return Z_MEM_ERROR; -@@ -1284,7 +1314,7 @@ int flush; +@@ -1288,7 +1318,7 @@ int flush; strm->total_in += in; strm->total_out += out; state->total += out; @@ -1747,7 +1842,7 @@ Index: zlib-1.2.12/inflate.c strm->adler = state->check = UPDATE_CHECK(state->check, strm->next_out - out, out); strm->data_type = (int)state->bits + (state->last ? 
64 : 0) + -@@ -1302,8 +1332,8 @@ z_streamp strm; +@@ -1306,8 +1336,8 @@ z_streamp strm; if (inflateStateCheck(strm)) return Z_STREAM_ERROR; state = (struct inflate_state FAR *)strm->state; @@ -1758,7 +1853,7 @@ Index: zlib-1.2.12/inflate.c strm->state = Z_NULL; Tracev((stderr, "inflate: end\n")); return Z_OK; -@@ -1482,6 +1512,7 @@ z_streamp strm; +@@ -1486,6 +1516,7 @@ z_streamp strm; struct inflate_state FAR *state; if (inflateStateCheck(strm)) return Z_STREAM_ERROR; @@ -1766,7 +1861,7 @@ Index: zlib-1.2.12/inflate.c state = (struct inflate_state FAR *)strm->state; return state->mode == STORED && state->bits == 0; } -@@ -1502,21 +1533,22 @@ z_streamp source; +@@ -1506,21 +1537,22 @@ z_streamp source; /* allocate space */ copy = (struct inflate_state FAR *) @@ -1793,7 +1888,7 @@ Index: zlib-1.2.12/inflate.c copy->strm = dest; if (state->lencode >= state->codes && state->lencode <= state->codes + ENOUGH - 1) { -@@ -1573,6 +1605,7 @@ z_streamp strm; +@@ -1577,6 +1609,7 @@ z_streamp strm; if (inflateStateCheck(strm)) return -(1L << 16); @@ -1801,20 +1896,20 @@ Index: zlib-1.2.12/inflate.c state = (struct inflate_state FAR *)strm->state; return (long)(((unsigned long)((long)state->back)) << 16) + (state->mode == COPY ? 
state->length : -Index: zlib-1.2.12/inflate.h -=================================================================== ---- zlib-1.2.12.orig/inflate.h -+++ zlib-1.2.12/inflate.h +diff --git a/inflate.h b/inflate.h +index f127b6b1f..519ed3535 100644 +--- a/inflate.h ++++ b/inflate.h @@ -124,3 +124,5 @@ struct inflate_state { int back; /* bits back of last unprocessed length/lit */ unsigned was; /* initial length of match */ }; + +int ZLIB_INTERNAL inflate_ensure_window OF((struct inflate_state *state)); -Index: zlib-1.2.12/test/infcover.c -=================================================================== ---- zlib-1.2.12.orig/test/infcover.c -+++ zlib-1.2.12/test/infcover.c +diff --git a/test/infcover.c b/test/infcover.c +index 2be01646c..a208219dc 100644 +--- a/test/infcover.c ++++ b/test/infcover.c @@ -373,7 +373,7 @@ local void cover_support(void) mem_setup(&strm); strm.avail_in = 0; @@ -1833,10 +1928,10 @@ Index: zlib-1.2.12/test/infcover.c { static unsigned int next = 0; static unsigned char dat[] = {0x63, 0, 2, 0}; -Index: zlib-1.2.12/test/minigzip.c -=================================================================== ---- zlib-1.2.12.orig/test/minigzip.c -+++ zlib-1.2.12/test/minigzip.c +diff --git a/test/minigzip.c b/test/minigzip.c +index e22fb08c0..4b5f4efed 100644 +--- a/test/minigzip.c ++++ b/test/minigzip.c @@ -132,7 +132,11 @@ static void pwinerror (s) #endif #define SUFFIX_LEN (sizeof(GZ_SUFFIX)-1) @@ -1849,11 +1944,11 @@ Index: zlib-1.2.12/test/minigzip.c #define MAX_NAME_LEN 1024 #ifdef MAXSEG_64K -Index: zlib-1.2.12/trees.c -=================================================================== ---- zlib-1.2.12.orig/trees.c -+++ zlib-1.2.12/trees.c -@@ -149,8 +149,6 @@ local void send_all_trees OF((deflate_st +diff --git a/trees.c b/trees.c +index 72b521fb0..534f29c98 100644 +--- a/trees.c ++++ b/trees.c +@@ -149,8 +149,6 @@ local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes, local void compress_block OF((deflate_state *s, const 
ct_data *ltree, const ct_data *dtree)); local int detect_data_type OF((deflate_state *s)); @@ -1894,11 +1989,11 @@ Index: zlib-1.2.12/trees.c deflate_state *s; { if (s->bi_valid > 8) { -Index: zlib-1.2.12/zutil.h -=================================================================== ---- zlib-1.2.12.orig/zutil.h -+++ zlib-1.2.12/zutil.h -@@ -87,6 +87,8 @@ extern z_const char * const z_errmsg[10] +diff --git a/zutil.h b/zutil.h +index d9a20ae1b..bc83f59d0 100644 +--- a/zutil.h ++++ b/zutil.h +@@ -87,6 +87,8 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ #define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ diff --git a/zlib-1.2.12-add-optimized-slide_hash-for-power.patch b/zlib-1.2.12-add-optimized-slide_hash-for-power.patch new file mode 100644 index 0000000..8d1740c --- /dev/null +++ b/zlib-1.2.12-add-optimized-slide_hash-for-power.patch @@ -0,0 +1,219 @@ +From 4a8d89ae49aa17d1634a2816c8d159f533a07eae Mon Sep 17 00:00:00 2001 +From: Matheus Castanho +Date: Wed, 27 Nov 2019 10:18:10 -0300 +Subject: [PATCH] Add optimized slide_hash for Power + +Considerable time is spent on deflate.c:slide_hash() during +deflate. This commit introduces a new slide_hash function that +uses VSX vector instructions to slide 8 hash elements at a time, +instead of just one as the standard code does. + +The choice between the optimized and default versions is made only +on the first call to the function, enabling a fallback to standard +behavior if the host processor does not support VSX instructions, +so the same binary can be used for multiple Power processor +versions. 
+ +Author: Matheus Castanho +--- + CMakeLists.txt | 3 +- + Makefile.in | 8 ++++ + configure | 4 +- + contrib/power/power.h | 3 ++ + contrib/power/slide_hash_power8.c | 63 +++++++++++++++++++++++++++++ + contrib/power/slide_hash_resolver.c | 15 +++++++ + deflate.c | 12 ++++++ + 7 files changed, 105 insertions(+), 3 deletions(-) + create mode 100644 contrib/power/slide_hash_power8.c + create mode 100644 contrib/power/slide_hash_resolver.c + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 44de486f6..8208c626b 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -186,7 +186,8 @@ if(CMAKE_COMPILER_IS_GNUCC) + add_definitions(-DZ_POWER8) + set(ZLIB_POWER8 + contrib/power/adler32_power8.c +- contrib/power/crc32_z_power8.c) ++ contrib/power/crc32_z_power8.c ++ contrib/power/slide_hash_power8.c) + + set_source_files_properties( + ${ZLIB_POWER8} +diff --git a/Makefile.in b/Makefile.in +index 9ef9fa9b5..f71c6eae0 100644 +--- a/Makefile.in ++++ b/Makefile.in +@@ -183,6 +183,9 @@ crc32_z_power8.o: $(SRCDIR)contrib/power/crc32_z_power8.c + deflate.o: $(SRCDIR)deflate.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c + ++slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c ++ $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c ++ + infback.o: $(SRCDIR)infback.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c + +@@ -245,6 +248,11 @@ deflate.lo: $(SRCDIR)deflate.c + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c + -@mv objs/deflate.o $@ + ++slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c ++ -@mkdir objs 2>/dev/null || test -d objs ++ $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c ++ -@mv objs/slide_hash_power8.o $@ ++ + infback.lo: $(SRCDIR)infback.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c +diff --git a/configure b/configure +index 
810a7404d..d0dacf9c2 100755 +--- a/configure ++++ b/configure +@@ -879,8 +879,8 @@ if tryboth $CC -c $CFLAGS $test.c; then + + if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then + POWER8="-DZ_POWER8" +- PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo" +- OBJC="${OBJC} adler32_power8.o crc32_z_power8.o" ++ PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo slide_hash_power8.lo" ++ OBJC="${OBJC} adler32_power8.o crc32_z_power8.o slide_hash_power8.o" + echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log + else + echo "Checking for -mcpu=power8 support... No." | tee -a configure.log +diff --git a/contrib/power/power.h b/contrib/power/power.h +index f57c76167..28c8f78ca 100644 +--- a/contrib/power/power.h ++++ b/contrib/power/power.h +@@ -4,7 +4,10 @@ + */ + #include "../../zconf.h" + #include "../../zutil.h" ++#include "../../deflate.h" + + uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len); + + unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t); ++ ++void _slide_hash_power8(deflate_state *s); +diff --git a/contrib/power/slide_hash_power8.c b/contrib/power/slide_hash_power8.c +new file mode 100644 +index 000000000..c5a0eb5a6 +--- /dev/null ++++ b/contrib/power/slide_hash_power8.c +@@ -0,0 +1,63 @@ ++ /* Copyright (C) 2019 Matheus Castanho , IBM ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#include ++#include "../../deflate.h" ++ ++local inline void slide_hash_power8_loop OF((deflate_state *s, ++ unsigned n_elems, Posf *table_end)) __attribute__((always_inline)); ++ ++local void slide_hash_power8_loop( ++ deflate_state *s, ++ unsigned n_elems, ++ Posf *table_end) ++{ ++ vector unsigned short vw, vm, *vp; ++ unsigned chunks; ++ ++ /* Each vector register (chunk) corresponds to 128 bits == 8 Posf, ++ * so instead of processing each of the n_elems in the hash table ++ * individually, we can do it in chunks of 8 with vector instructions. 
++ * ++ * This function is only called from slide_hash_power8(), and both calls ++ * pass n_elems as a power of 2 higher than 2^7, as defined by ++ * deflateInit2_(), so n_elems will always be a multiple of 8. */ ++ chunks = n_elems >> 3; ++ Assert(n_elems % 8 == 0, "Weird hash table size!"); ++ ++ /* This type casting is safe since s->w_size is always <= 64KB ++ * as defined by deflateInit2_() and Posf == unsigned short */ ++ vw[0] = (Posf) s->w_size; ++ vw = vec_splat(vw,0); ++ ++ vp = (vector unsigned short *) table_end; ++ ++ do { ++ /* Processing 8 elements at a time */ ++ vp--; ++ vm = *vp; ++ ++ /* This is equivalent to: m >= w_size ? m - w_size : 0 ++ * Since we are using a saturated unsigned subtraction, any ++ * values that are > w_size will be set to 0, while the others ++ * will be subtracted by w_size. */ ++ *vp = vec_subs(vm,vw); ++ } while (--chunks); ++}; ++ ++void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s) ++{ ++ unsigned n; ++ Posf *p; ++ ++ n = s->hash_size; ++ p = &s->head[n]; ++ slide_hash_power8_loop(s,n,p); ++ ++#ifndef FASTEST ++ n = s->w_size; ++ p = &s->prev[n]; ++ slide_hash_power8_loop(s,n,p); ++#endif ++} +diff --git a/contrib/power/slide_hash_resolver.c b/contrib/power/slide_hash_resolver.c +new file mode 100644 +index 000000000..54fa1eb21 +--- /dev/null ++++ b/contrib/power/slide_hash_resolver.c +@@ -0,0 +1,15 @@ ++/* Copyright (C) 2019 Matheus Castanho , IBM ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#include "../gcc/zifunc.h" ++#include "power.h" ++ ++Z_IFUNC(slide_hash) { ++#ifdef Z_POWER8 ++ if (__builtin_cpu_supports("arch_2_07")) ++ return _slide_hash_power8; ++#endif ++ ++ return slide_hash_default; ++} +diff --git a/deflate.c b/deflate.c +index 799fb93cc..b2db576dc 100644 +--- a/deflate.c ++++ b/deflate.c +@@ -196,6 +196,13 @@ local const config configuration_table[10] = { + (unsigned)(s->hash_size-1)*sizeof(*s->head)); \ + } while (0) + ++#ifdef Z_POWER_OPT ++/* Rename 
function so resolver can use its symbol. The default version will be ++ * returned by the resolver if the host has no support for an optimized version. ++ */ ++#define slide_hash slide_hash_default ++#endif /* Z_POWER_OPT */ ++ + /* =========================================================================== + * Slide the hash table when sliding the window down (could be avoided with 32 + * bit values at the expense of memory usage). We slide even when level == 0 to +@@ -227,6 +234,11 @@ local void slide_hash(s) + #endif + } + ++#ifdef Z_POWER_OPT ++#undef slide_hash ++#include "contrib/power/slide_hash_resolver.c" ++#endif /* Z_POWER_OPT */ ++ + /* ========================================================================= */ + int ZEXPORT deflateInit_(strm, level, version, stream_size) + z_streamp strm; diff --git a/zlib-1.2.12-add-vectorized-longest_match-for-power.patch b/zlib-1.2.12-add-vectorized-longest_match-for-power.patch new file mode 100644 index 0000000..62281ff --- /dev/null +++ b/zlib-1.2.12-add-vectorized-longest_match-for-power.patch @@ -0,0 +1,338 @@ +From aecdff0646c7e188b48f6db285d8d63a74f246c1 Mon Sep 17 00:00:00 2001 +From: Matheus Castanho +Date: Tue, 29 Oct 2019 18:04:11 -0300 +Subject: [PATCH] Add vectorized longest_match for Power + +This commit introduces an optimized version of the longest_match +function for Power processors. It uses VSX instructions to match +16 bytes at a time on each comparison, instead of one by one. 
+ +Author: Matheus Castanho +--- + CMakeLists.txt | 3 +- + Makefile.in | 8 + + configure | 4 +- + contrib/power/longest_match_power9.c | 194 +++++++++++++++++++++++++ + contrib/power/longest_match_resolver.c | 15 ++ + contrib/power/power.h | 2 + + deflate.c | 13 ++ + 7 files changed, 236 insertions(+), 3 deletions(-) + create mode 100644 contrib/power/longest_match_power9.c + create mode 100644 contrib/power/longest_match_resolver.c + +Index: zlib-1.2.12/CMakeLists.txt +=================================================================== +--- zlib-1.2.12.orig/CMakeLists.txt ++++ zlib-1.2.12/CMakeLists.txt +@@ -199,7 +199,8 @@ if(CMAKE_COMPILER_IS_GNUCC) + + if(POWER9) + add_definitions(-DZ_POWER9) +- set(ZLIB_POWER9 ) ++ set(ZLIB_POWER9 ++ contrib/power/longest_match_power9.c) + + set_source_files_properties( + ${ZLIB_POWER9} +Index: zlib-1.2.12/Makefile.in +=================================================================== +--- zlib-1.2.12.orig/Makefile.in ++++ zlib-1.2.12/Makefile.in +@@ -189,6 +189,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32- + deflate.o: $(SRCDIR)deflate.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c + ++longest_match_power9.o: $(SRCDIR)contrib/power/longest_match_power9.c ++ $(CC) $(CFLAGS) -mcpu=power9 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/longest_match_power9.c ++ + slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c + $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c + +@@ -259,6 +262,11 @@ deflate.lo: $(SRCDIR)deflate.c + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c + -@mv objs/deflate.o $@ + ++longest_match_power9.lo: $(SRCDIR)contrib/power/longest_match_power9.c ++ -@mkdir objs 2>/dev/null || test -d objs ++ $(CC) $(SFLAGS) -mcpu=power9 $(ZINC) -DPIC -c -o objs/longest_match_power9.o $(SRCDIR)contrib/power/longest_match_power9.c ++ -@mv objs/longest_match_power9.o $@ ++ + slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c + -@mkdir objs 
2>/dev/null || test -d objs + $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c +Index: zlib-1.2.12/configure +=================================================================== +--- zlib-1.2.12.orig/configure ++++ zlib-1.2.12/configure +@@ -915,8 +915,8 @@ if tryboth $CC -c $CFLAGS $test.c; then + + if tryboth $CC -c $CFLAGS -mcpu=power9 $test.c; then + POWER9="-DZ_POWER9" +- PIC_OBJC="${PIC_OBJC}" +- OBJC="${OBJC}" ++ PIC_OBJC="$PIC_OBJC longest_match_power9.lo" ++ OBJC="$OBJC longest_match_power9.o" + echo "Checking for -mcpu=power9 support... Yes." | tee -a configure.log + else + echo "Checking for -mcpu=power9 support... No." | tee -a configure.log +Index: zlib-1.2.12/contrib/power/longest_match_power9.c +=================================================================== +--- /dev/null ++++ zlib-1.2.12/contrib/power/longest_match_power9.c +@@ -0,0 +1,194 @@ ++/* Copyright (C) 2019 Matheus Castanho , IBM ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#include ++#include "../../deflate.h" ++ ++local inline int vec_match OF((Bytef* scan, Bytef* match)) ++ __attribute__((always_inline)); ++ ++local inline int vec_match(Bytef* scan, Bytef* match) ++{ ++ vector unsigned char vscan, vmatch, vc; ++ int len; ++ ++ vscan = *((vector unsigned char *) scan); ++ vmatch = *((vector unsigned char *) match); ++ ++ /* Compare 16 bytes at a time. ++ * Each byte of vc will be either all ones or all zeroes, ++ * depending on the result of the comparison ++ */ ++ vc = (vector unsigned char) vec_cmpne(vscan,vmatch); ++ ++ /* Since the index of matching bytes will contain only zeroes ++ * on vc (since we used cmpne), counting the number of consecutive ++ * bytes where LSB == 0 is the same as counting the length of the match. 
++ * ++ * There was an issue in the way the vec_cnttz_lsbb builtin was implemented ++ * that got fixed on GCC 12, but now we have to use different builtins ++ * depending on the compiler version. To avoid that, let's use inline asm to ++ * generate the exact instruction we need. ++ */ ++ #ifdef __LITTLE_ENDIAN__ ++ asm volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc)); ++ #else ++ asm volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc)); ++ #endif ++ ++ return len; ++} ++ ++uInt ZLIB_INTERNAL _longest_match_power9(deflate_state *s, IPos cur_match) ++{ ++ unsigned chain_length = s->max_chain_length;/* max hash chain length */ ++ register Bytef *scan = s->window + s->strstart; /* current string */ ++ register Bytef *match; /* matched string */ ++ register int len; /* length of current match */ ++ int best_len = (int)s->prev_length; /* best match length so far */ ++ int nice_match = s->nice_match; /* stop if match long enough */ ++ int mbytes; /* matched bytes inside loop */ ++ IPos limit = s->strstart > (IPos)MAX_DIST(s) ? ++ s->strstart - (IPos)MAX_DIST(s) : 0; ++ /* Stop when cur_match becomes <= limit. To simplify the code, ++ * we prevent matches with the string of window index 0. ++ */ ++ Posf *prev = s->prev; ++ uInt wmask = s->w_mask; ++ ++#if (MAX_MATCH == 258) ++ /* Compare the last two bytes at once. */ ++ register Bytef *strend2 = s->window + s->strstart + MAX_MATCH - 2; ++ register ush scan_end = *(ushf*)(scan+best_len-1); ++#else ++ register Bytef *strend = s->window + s->strstart + MAX_MATCH; ++ register Byte scan_end1 = scan[best_len-1]; ++ register Byte scan_end = scan[best_len]; ++#endif ++ ++ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. ++ * It is easy to get rid of this optimization if necessary. 
++ */ ++ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); ++ ++ /* Do not waste too much time if we already have a good match: */ ++ if (s->prev_length >= s->good_match) { ++ chain_length >>= 2; ++ } ++ /* Do not look for matches beyond the end of the input. This is necessary ++ * to make deflate deterministic. ++ */ ++ if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead; ++ ++ Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); ++ ++ do { ++ Assert(cur_match < s->strstart, "no future"); ++ match = s->window + cur_match; ++ ++ /* Skip to next match if the match length cannot increase ++ * or if the match length is less than 2. Note that the checks below ++ * for insufficient lookahead only occur occasionally for performance ++ * reasons. Therefore uninitialized memory will be accessed, and ++ * conditional jumps will be made that depend on those values. ++ * However the length of the match is limited to the lookahead, so ++ * the output of deflate is not affected by the uninitialized values. ++ */ ++ ++/* MAX_MATCH - 2 should be a multiple of 16 for this optimization to work. */ ++#if (MAX_MATCH == 258) ++ ++ /* Compare ending (2 bytes) and beginning of potential match. ++ * ++ * On Power processors, loading a 16-byte vector takes only 1 extra ++ * cycle compared to a regular byte load. So instead of comparing the ++ * first two bytes and then the rest later if they match, we can compare ++ * the first 16 at once, and when we have a match longer than 2, we will ++ * already have the result of comparing the first 16 bytes saved in mbytes. ++ */ ++ if (*(ushf*)(match+best_len-1) != scan_end || ++ (mbytes = vec_match(scan,match)) < 3) continue; ++ ++ scan += mbytes; ++ match += mbytes; ++ ++ /* In case when we may have a match longer than 16, we perform further ++ * comparisons in chunks of 16 and keep going while all bytes match. 
++ */ ++ while(mbytes == 16) { ++ mbytes = vec_match(scan,match); ++ scan += mbytes; ++ match += mbytes; ++ ++ /* We also have to limit the maximum match based on MAX_MATCH. ++ * Since we are comparing 16 bytes at a time and MAX_MATCH == 258 (to ++ * comply with default implementation), we should stop comparing when ++ * we have matched 256 bytes, which happens when scan == strend2. ++ * In this ("rare") case, we have to check the remaining 2 bytes ++ * individually using common load and compare operations. ++ */ ++ if(scan >= strend2) { ++ if(*scan == *match) { ++ if(*++scan == *++match) ++ scan++; ++ } ++ break; ++ } ++ } ++ ++ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); ++ ++ len = (MAX_MATCH - 2) - (int)(strend2 - scan); ++ scan = strend2 - (MAX_MATCH - 2); ++ ++#else /* MAX_MATCH == 258 */ ++ ++ if (match[best_len] != scan_end || ++ match[best_len-1] != scan_end1 || ++ *match != *scan || ++ *++match != scan[1]) continue; ++ ++ /* The check at best_len-1 can be removed because it will be made ++ * again later. (This heuristic is not always a win.) ++ * It is not necessary to compare scan[2] and match[2] since they ++ * are always equal when the other bytes match, given that ++ * the hash keys are equal and that HASH_BITS >= 8. ++ */ ++ scan += 2, match++; ++ Assert(*scan == *match, "match[2]?"); ++ ++ /* We check for insufficient lookahead only every 8th comparison; ++ * the 256th check will be made at strstart+258. 
++ */ ++ do { ++ } while (*++scan == *++match && *++scan == *++match && ++ *++scan == *++match && *++scan == *++match && ++ *++scan == *++match && *++scan == *++match && ++ *++scan == *++match && *++scan == *++match && ++ scan < strend); ++ ++ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); ++ ++ len = MAX_MATCH - (int)(strend - scan); ++ scan = strend - MAX_MATCH; ++ ++#endif /* MAX_MATCH == 258 */ ++ ++ if (len > best_len) { ++ s->match_start = cur_match; ++ best_len = len; ++ if (len >= nice_match) break; ++#if (MAX_MATCH == 258) ++ scan_end = *(ushf*)(scan+best_len-1); ++#else ++ scan_end1 = scan[best_len-1]; ++ scan_end = scan[best_len]; ++#endif ++ } ++ } while ((cur_match = prev[cur_match & wmask]) > limit ++ && --chain_length != 0); ++ ++ if ((uInt)best_len <= s->lookahead) return (uInt)best_len; ++ return s->lookahead; ++} +Index: zlib-1.2.12/contrib/power/longest_match_resolver.c +=================================================================== +--- /dev/null ++++ zlib-1.2.12/contrib/power/longest_match_resolver.c +@@ -0,0 +1,15 @@ ++/* Copyright (C) 2019 Matheus Castanho , IBM ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#include "../gcc/zifunc.h" ++#include "power.h" ++ ++Z_IFUNC(longest_match) { ++#ifdef Z_POWER9 ++ if (__builtin_cpu_supports("arch_3_00")) ++ return _longest_match_power9; ++#endif ++ ++ return longest_match_default; ++} +Index: zlib-1.2.12/contrib/power/power.h +=================================================================== +--- zlib-1.2.12.orig/contrib/power/power.h ++++ zlib-1.2.12/contrib/power/power.h +@@ -10,4 +10,6 @@ uLong _adler32_power8(uLong adler, const + + unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t); + ++uInt _longest_match_power9(deflate_state *s, IPos cur_match); ++ + void _slide_hash_power8(deflate_state *s); +Index: zlib-1.2.12/deflate.c +=================================================================== +--- 
zlib-1.2.12.orig/deflate.c ++++ zlib-1.2.12/deflate.c +@@ -1309,6 +1309,14 @@ local void lm_init (s) + /* For 80x86 and 680x0, an optimized version will be provided in match.asm or + * match.S. The code will be functionally equivalent. + */ ++ ++#ifdef Z_POWER_OPT ++/* Rename function so resolver can use its symbol. The default version will be ++ * returned by the resolver if the host has no support for an optimized version. ++ */ ++#define longest_match longest_match_default ++#endif /* Z_POWER_OPT */ ++ + local uInt longest_match(s, pcur_match) + deflate_state *s; + IPos pcur_match; /* current match */ +@@ -1454,6 +1462,11 @@ local uInt longest_match(s, pcur_match) + } + #endif /* ASMV */ + ++#ifdef Z_POWER_OPT ++#undef longest_match ++#include "contrib/power/longest_match_resolver.c" ++#endif /* Z_POWER_OPT */ ++ + #else /* FASTEST */ + + /* --------------------------------------------------------------------------- diff --git a/zlib-1.2.12-adler32-vector-optimizations-for-power.patch b/zlib-1.2.12-adler32-vector-optimizations-for-power.patch new file mode 100644 index 0000000..e5dfb38 --- /dev/null +++ b/zlib-1.2.12-adler32-vector-optimizations-for-power.patch @@ -0,0 +1,342 @@ +From 772f4bd0f880c4c193ab7da78728f38821572a02 Mon Sep 17 00:00:00 2001 +From: Rogerio Alves +Date: Mon, 9 Dec 2019 14:40:53 -0300 +Subject: [PATCH] Adler32 vector optimization for Power. + +This commit implements a Power (POWER8+) vector optimization for Adler32 +checksum using VSX (vector) instructions. The VSX adler32 checksum is up +to 10x faster than the adler32 baseline code.
+ +Author: Rogerio Alves +--- + CMakeLists.txt | 1 + + Makefile.in | 8 ++ + adler32.c | 11 ++ + configure | 4 +- + contrib/power/adler32_power8.c | 196 +++++++++++++++++++++++++++++++ + contrib/power/adler32_resolver.c | 15 +++ + contrib/power/power.h | 4 +- + 7 files changed, 236 insertions(+), 3 deletions(-) + create mode 100644 contrib/power/adler32_power8.c + create mode 100644 contrib/power/adler32_resolver.c + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 581e1fa6d..c6296ee68 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -185,6 +185,7 @@ if(CMAKE_COMPILER_IS_GNUCC) + if(POWER8) + add_definitions(-DZ_POWER8) + set(ZLIB_POWER8 ++ contrib/power/adler32_power8.c + contrib/power/crc32_z_power8.c) + + set_source_files_properties( +diff --git a/Makefile.in b/Makefile.in +index 16943044e..a0ffac860 100644 +--- a/Makefile.in ++++ b/Makefile.in +@@ -165,6 +165,9 @@ minigzip64.o: $(SRCDIR)test/minigzip.c $(SRCDIR)zlib.h zconf.h + adler32.o: $(SRCDIR)adler32.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)adler32.c + ++adler32_power8.o: $(SRCDIR)contrib/power/adler32_power8.c ++ $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/adler32_power8.c ++ + crc32.o: $(SRCDIR)crc32.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c + +@@ -216,6 +219,11 @@ adler32.lo: $(SRCDIR)adler32.c + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/adler32.o $(SRCDIR)adler32.c + -@mv objs/adler32.o $@ + ++adler32_power8.lo: $(SRCDIR)contrib/power/adler32_power8.c ++ -@mkdir objs 2>/dev/null || test -d objs ++ $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/adler32_power8.o $(SRCDIR)contrib/power/adler32_power8.c ++ -@mv objs/adler32_power8.o $@ ++ + crc32.lo: $(SRCDIR)crc32.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c +diff --git a/adler32.c b/adler32.c +index d0be4380a..4bde0fa18 100644 +--- a/adler32.c ++++ b/adler32.c +@@ -131,6 +131,12 @@ uLong ZEXPORT adler32_z(adler, buf, len) + } + + 
/* ========================================================================= */ ++ ++#ifdef Z_POWER_OPT ++/* Rename the default function to avoid naming conflicts */ ++#define adler32 adler32_default ++#endif /* Z_POWER_OPT */ ++ + uLong ZEXPORT adler32(adler, buf, len) + uLong adler; + const Bytef *buf; +@@ -139,6 +145,11 @@ uLong ZEXPORT adler32(adler, buf, len) + return adler32_z(adler, buf, len); + } + ++#ifdef Z_POWER_OPT ++#undef adler32 ++#include "contrib/power/adler32_resolver.c" ++#endif /* Z_POWER_OPT */ ++ + /* ========================================================================= */ + local uLong adler32_combine_(adler1, adler2, len2) + uLong adler1; +diff --git a/configure b/configure +index 914d9f4aa..810a7404d 100755 +--- a/configure ++++ b/configure +@@ -879,8 +879,8 @@ if tryboth $CC -c $CFLAGS $test.c; then + + if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then + POWER8="-DZ_POWER8" +- PIC_OBJC="${PIC_OBJC} crc32_z_power8.lo" +- OBJC="${OBJC} crc32_z_power8.o" ++ PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo" ++ OBJC="${OBJC} adler32_power8.o crc32_z_power8.o" + echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log + else + echo "Checking for -mcpu=power8 support... No." | tee -a configure.log +diff --git a/contrib/power/adler32_power8.c b/contrib/power/adler32_power8.c +new file mode 100644 +index 000000000..473c39457 +--- /dev/null ++++ b/contrib/power/adler32_power8.c +@@ -0,0 +1,196 @@ ++/* ++ * Adler32 for POWER 8+ using VSX instructions. ++ * ++ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector) ++ * instructions. ++ * ++ * If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means ++ * iteration n) is the initial value of adler - at start _0 is 1 unless ++ * adler initial value is different than 1. So s1_1 = s1_0 + c[0] after ++ * the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on. 
++ * Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 ++ * after iteration N. ++ * ++ * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_0 + N*c[1] + ++ * (N-1)*c[2] + ... + c[N] ++ * ++ * In a more general way: ++ * ++ * s1_N = s1_0 + sum(i=1 to N)c[i] ++ * s2_N = s2_0 + N*s1_0 + sum (i=1 to N)(N-i+1)*c[i] ++ * ++ * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we ++ * can process N bytes at a time we can do this at once. ++ * ++ * Since VSX supports 16-byte vector instructions, we can process ++ * 16 bytes at a time; using N = 16 we have: ++ * ++ * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i] ++ * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i] ++ * ++ * After the first iteration we calculate the adler32 checksum for 16 bytes. ++ * ++ * For more background about adler32 please check the RFC: ++ * https://www.ietf.org/rfc/rfc1950.txt ++ * ++ * Copyright (C) 2019 Rogerio Alves , IBM ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ * ++ */ ++ ++#include "../../zutil.h" ++#include ++ ++/* Largest prime smaller than 65536. */ ++#define BASE 65521U ++#define NMAX 5552 ++/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1. */ ++ ++#define DO1(s1,s2,buf,i) {(s1) += buf[(i)]; (s2) += (s1);} ++#define DO2(s1,s2,buf,i) {DO1(s1,s2,buf,i); DO1(s1,s2,buf,i+1);} ++#define DO4(s1,s2,buf,i) {DO2(s1,s2,buf,i); DO2(s1,s2,buf,i+2);} ++#define DO8(s1,s2,buf,i) {DO4(s1,s2,buf,i); DO4(s1,s2,buf,i+4);} ++#define DO16(s1,s2,buf) {DO8(s1,s2,buf,0); DO8(s1,s2,buf,8);} ++ ++/* Vector across sum unsigned int (saturate).
*/ ++inline vector unsigned int vec_sumsu (vector unsigned int __a, ++ vector unsigned int __b) ++{ ++ __b = vec_sld(__a, __a, 8); ++ __b = vec_add(__b, __a); ++ __a = vec_sld(__b, __b, 4); ++ __a = vec_add(__a, __b); ++ ++ return __a; ++} ++ ++uLong ZLIB_INTERNAL _adler32_power8 (uLong adler, const Bytef* buf, uInt len) ++{ ++ /* If buffer is empty or len=0 we need to return adler initial value. */ ++ if (buf == NULL) ++ return 1; ++ ++ unsigned int s1 = adler & 0xffff; ++ unsigned int s2 = (adler >> 16) & 0xffff; ++ ++ /* in case user likes doing a byte at a time, keep it fast */ ++ if (len == 1) { ++ s1 += buf[0]; ++ if (s1 >= BASE) ++ s1 -= BASE; ++ s2 += s1; ++ if (s2 >= BASE) ++ s2 -= BASE; ++ return (s2 << 16) | s1; ++ } ++ ++ /* Keep it fast for short length buffers. */ ++ if (len < 16) { ++ while (len--) { ++ s1 += *buf++; ++ s2 += s1; ++ } ++ if (s1 >= BASE) ++ s1 -= BASE; ++ s2 %= BASE; ++ return (s2 << 16) | s1; ++ } ++ ++ /* This is faster than VSX code for len < 64. */ ++ if (len < 64) { ++ while (len >= 16) { ++ len -= 16; ++ DO16(s1,s2,buf); ++ buf += 16; ++ } ++ } else { ++ /* Use POWER VSX instructions for len >= 64. */ ++ const vector unsigned int v_zeros = { 0 }; ++ const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, ++ 6, 5, 4, 3, 2, 1}; ++ const vector unsigned char vsh = vec_splat_u8(4); ++ const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0}; ++ vector unsigned int vs1 = vec_xl(0, &s1); ++ vector unsigned int vs2 = vec_xl(0, &s2); ++ vector unsigned int vs1_save = { 0 }; ++ vector unsigned int vsum1, vsum2; ++ vector unsigned char vbuf; ++ int n; ++ ++ /* Zeros the undefined values of vectors vs1, vs2. */ ++ vs1 = vec_and(vs1, vmask); ++ vs2 = vec_and(vs2, vmask); ++ ++ /* Do length bigger than NMAX in blocks of NMAX size. */ ++ while (len >= NMAX) { ++ len -= NMAX; ++ n = NMAX / 16; ++ do { ++ vbuf = vec_xl(0, (unsigned char *) buf); ++ vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. 
*/ ++ /* sum(i=1 to 16) buf[i]*(16-i+1). */ ++ vsum2 = vec_msum(vbuf, v_mul, v_zeros); ++ /* Save vs1. */ ++ vs1_save = vec_add(vs1_save, vs1); ++ /* Accumulate the sums. */ ++ vs1 = vec_add(vsum1, vs1); ++ vs2 = vec_add(vsum2, vs2); ++ ++ buf += 16; ++ } while (--n); ++ /* Once each block of NMAX size. */ ++ vs1 = vec_sumsu(vs1, vsum1); ++ vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */ ++ vs2 = vec_add(vs1_save, vs2); ++ vs2 = vec_sumsu(vs2, vsum2); ++ ++ /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */ ++ vs1[0] = vs1[0] % BASE; ++ /* vs2[0] = s2_i + 16*s1_save + ++ sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */ ++ vs2[0] = vs2[0] % BASE; ++ ++ vs1 = vec_and(vs1, vmask); ++ vs2 = vec_and(vs2, vmask); ++ vs1_save = v_zeros; ++ } ++ ++ /* len is less than NMAX one modulo is needed. */ ++ if (len >= 16) { ++ while (len >= 16) { ++ len -= 16; ++ ++ vbuf = vec_xl(0, (unsigned char *) buf); ++ ++ vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */ ++ /* sum(i=1 to 16) buf[i]*(16-i+1). */ ++ vsum2 = vec_msum(vbuf, v_mul, v_zeros); ++ /* Save vs1. */ ++ vs1_save = vec_add(vs1_save, vs1); ++ /* Accumulate the sums. */ ++ vs1 = vec_add(vsum1, vs1); ++ vs2 = vec_add(vsum2, vs2); ++ ++ buf += 16; ++ } ++ /* Since the size will be always less than NMAX we do this once. */ ++ vs1 = vec_sumsu(vs1, vsum1); ++ vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */ ++ vs2 = vec_add(vs1_save, vs2); ++ vs2 = vec_sumsu(vs2, vsum2); ++ } ++ /* Copy result back to s1, s2 (mod 65521). */ ++ s1 = vs1[0] % BASE; ++ s2 = vs2[0] % BASE; ++ } ++ ++ /* Process tail (len < 16). 
*/ ++ while (len--) { ++ s1 += *buf++; ++ s2 += s1; ++ } ++ s1 %= BASE; ++ s2 %= BASE; ++ ++ return (s2 << 16) | s1; ++} +diff --git a/contrib/power/adler32_resolver.c b/contrib/power/adler32_resolver.c +new file mode 100644 +index 000000000..07a1a2cb2 +--- /dev/null ++++ b/contrib/power/adler32_resolver.c +@@ -0,0 +1,15 @@ ++/* Copyright (C) 2019 Rogerio Alves , IBM ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#include "../gcc/zifunc.h" ++#include "power.h" ++ ++Z_IFUNC(adler32) { ++#ifdef Z_POWER8 ++ if (__builtin_cpu_supports("arch_2_07")) ++ return _adler32_power8; ++#endif ++ ++ return adler32_default; ++} +diff --git a/contrib/power/power.h b/contrib/power/power.h +index 79123aa90..f57c76167 100644 +--- a/contrib/power/power.h ++++ b/contrib/power/power.h +@@ -2,7 +2,9 @@ + * 2019 Rogerio Alves , IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ +- + #include "../../zconf.h" ++#include "../../zutil.h" ++ ++uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len); + + unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t); diff --git a/zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch b/zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch new file mode 100644 index 0000000..d105a20 --- /dev/null +++ b/zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch @@ -0,0 +1,34 @@ +From 11b722e4ae91b611f605221587ec8e0829c27949 Mon Sep 17 00:00:00 2001 +From: Matheus Castanho +Date: Tue, 23 Jun 2020 10:26:19 -0300 +Subject: [PATCH] Fix invalid memory access on ppc and ppc64 + +--- + contrib/power/adler32_power8.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/contrib/power/adler32_power8.c b/contrib/power/adler32_power8.c +index 473c39457..fdd086453 100644 +--- a/contrib/power/adler32_power8.c ++++ b/contrib/power/adler32_power8.c +@@ -110,16 +110,15 @@ uLong ZLIB_INTERNAL _adler32_power8 (uLong adler, const Bytef* 
buf, uInt len) + 6, 5, 4, 3, 2, 1}; + const vector unsigned char vsh = vec_splat_u8(4); + const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0}; +- vector unsigned int vs1 = vec_xl(0, &s1); +- vector unsigned int vs2 = vec_xl(0, &s2); ++ vector unsigned int vs1 = { 0 }; ++ vector unsigned int vs2 = { 0 }; + vector unsigned int vs1_save = { 0 }; + vector unsigned int vsum1, vsum2; + vector unsigned char vbuf; + int n; + +- /* Zeros the undefined values of vectors vs1, vs2. */ +- vs1 = vec_and(vs1, vmask); +- vs2 = vec_and(vs2, vmask); ++ vs1[0] = s1; ++ vs2[0] = s2; + + /* Do length bigger than NMAX in blocks of NMAX size. */ + while (len >= NMAX) { diff --git a/zlib.changes b/zlib.changes index 316d2bb..6c9f265 100644 --- a/zlib.changes +++ b/zlib.changes @@ -1,3 +1,13 @@ +------------------------------------------------------------------- +Mon Oct 10 10:08:02 UTC 2022 - Danilo Spinella + +- Add Power8 optimizations: + * zlib-1.2.12-add-optimized-slide_hash-for-power.patch + * zlib-1.2.12-add-vectorized-longest_match-for-power.patch + * zlib-1.2.12-adler32-vector-optimizations-for-power.patch + * zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch +- Update zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch + ------------------------------------------------------------------- Tue Aug 23 16:22:59 UTC 2022 - Danilo Spinella diff --git a/zlib.spec b/zlib.spec index f74e6ad..c6385e3 100644 --- a/zlib.spec +++ b/zlib.spec @@ -44,12 +44,18 @@ Patch6: minizip-dont-install-crypt-header.patch # The following patches are taken from https://github.com/iii-i/zlib/commits/crc32vx-v3 Patch7: zlib-1.2.5-minizip-fixuncrypt.patch Patch8: zlib-1.2.11-optimized-s390.patch +# https://github.com/iii-i/zlib/commit/171d0ff3c9ed40da0ac14085ab16b766b1162069 Patch9: zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch Patch10: zlib-1.2.11-covscan-issues.patch Patch11: zlib-1.2.11-covscan-issues-rhel9.patch Patch12: zlib-1.2.12-optimized-crc32-power8.patch Patch13: 
zlib-1.2.12-fix-configure.patch Patch14: zlib-1.2.12-s390-vectorize-crc32.patch +# The following patches are taken from https://github.com/mscastanho/zlib/commits/power-optimizations-1.2.12 +Patch15: zlib-1.2.12-adler32-vector-optimizations-for-power.patch +Patch16: zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch +Patch17: zlib-1.2.12-add-optimized-slide_hash-for-power.patch +Patch18: zlib-1.2.12-add-vectorized-longest_match-for-power.patch BuildRequires: autoconf BuildRequires: automake BuildRequires: libtool @@ -148,6 +154,10 @@ It should exit 0 %patch12 -p1 %patch13 -p1 %patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 +%patch18 -p1 cp %{SOURCE4} . %build @@ -167,10 +177,10 @@ CC="cc" ./configure \ # Profiling flags breaks tests, as of 1.2.12 # In particular, gzseek does not work as intended #%if %{do_profiling} -# #make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_generate}" +# make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_generate}" # make check %{?_smp_mflags} -# #make %{?_smp_mflags} clean -# #make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_feedback}" +# make %{?_smp_mflags} clean +# make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_feedback}" #%else make %{?_smp_mflags} #%endif