e8b526b295
- add longest_match performance patch (fate#314093)
  * suggested by IBM, sent upstream
- rename the main library package to libz1 according to the Shared Library Policy
- profiling build can be enabled via "build --with profiling"
- use the human-readable package description from zlib.net
- add rpmlintrc

OBS-URL: https://build.opensuse.org/request/show/138143
OBS-URL: https://build.opensuse.org/package/show/devel:libraries:c_c++/zlib?expand=0&rev=4

# http://mail.madler.net/pipermail/zlib-devel_madler.net/2012-June/002907.html
# http://mail.madler.net/pipermail/zlib-devel_madler.net/2012-August/002977.html

From: Andreas Krebbel <krebbel@linux.vnet.ibm.com>
Subject: RFC: Improve longest_match performance

The code currently generated for longest_match looks far from optimal
due to a bunch of pointless zero/sign extend instructions.

By just promoting a few data types in the function I was able to get
rid of all but two. The new hot loop is almost half the size of the
original version, providing quite a performance win for S/390:

Measured on a zEnterprise z196, zlib compiled with the upstream GCC
4.8 branch:

32 bit                    old       new     speedup
  256MB randomdata:     11.65s    10.92s     6.68%
  100MB manpages:        4.23s     4.14s     2.17%
  217MB PDF:            10.54s     9.44s    11.65%

32 bit, UNALIGNED_OK      old       new     speedup
  256MB randomdata:     10.90s    10.54s     3.41%
  100MB manpages:        3.94s     3.87s     1.81%
  217MB PDF:             8.77s     8.64s     1.50%

64 bit                    old       new     speedup
  256MB randomdata:     11.90s    11.43s     4.11%
  100MB manpages:        4.51s     4.44s     1.58%
  217MB PDF:            10.11s     9.89s     2.22%

64 bit, UNALIGNED_OK      old       new     speedup
  256MB randomdata:     11.51s    11.15s     3.23%
  100MB manpages:        4.33s     3.99s     8.52%
  217MB PDF:             9.81s     9.02s     8.76%

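To see where the extensions come from, consider a minimal sketch (not
part of the patch; the function names are invented for illustration).
On a 64-bit target an int index has to be sign-extended to pointer
width before it can enter the address arithmetic, so the compiler
emits an extension wherever the index is used in the hot loop, while
a ptrdiff_t index is already pointer-width:

    #include <stddef.h>

    /* index narrower than a pointer: the compiler must sign-extend
     * i to 64 bits before it can compute p + i */
    unsigned char load_narrow(const unsigned char *p, int i)
    {
        return p[i];
    }

    /* index already pointer-width: no extension instruction needed */
    unsigned char load_wide(const unsigned char *p, ptrdiff_t i)
    {
        return p[i];
    }

The promotions in the patch below apply the same idea to cur_match,
best_len and the scan_end temporaries.
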
I also did some measurements on x86 and Power:

For Power (64 bit, UNALIGNED_OK) an additional zero extend appears.
However, the impact is not measurable: there are minor wins and minor
regressions, and the overall result is flat.

For Core2 32 bit the patch is a clear winner, with up to 9% on the
PDF test. On 64 bit the code optimized for Core2 also gets a bit
smaller, but unfortunately it causes some regressions which I cannot
explain.

For mainframe customers the performance of zlib is very important, so
I would be very happy to see the patch integrated into upstream zlib.
Given that the patch might cause minor regressions on other targets,
would it be possible to enable it per architecture?

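One possible shape for that (a sketch only, not something this patch
implements; match_idx_t is an invented name) would be to pick the
index type per target at compile time:

    #include <stddef.h>

    /* Use a pointer-width index only where it is known to win;
     * other targets keep the historical narrow unsigned type. */
    #if defined(__s390__) || defined(__s390x__)
    typedef ptrdiff_t match_idx_t;   /* avoids the zero/sign extends */
    #else
    typedef unsigned match_idx_t;    /* same width as IPos */
    #endif

deflate.c could then declare cur_match and best_len as match_idx_t,
giving S/390 the wide types without disturbing other targets.
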
See below for the patch and some code snippets from my tests.

Bye,

-Andreas-

---
 deflate.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

Index: zlib-1.2.7/deflate.c
===================================================================
--- zlib-1.2.7.orig/deflate.c  2012-02-13 01:15:47.000000000 +0100
+++ zlib-1.2.7/deflate.c  2012-09-27 13:39:57.942762946 +0200
@@ -1143,15 +1143,16 @@
 /* For 80x86 and 680x0, an optimized version will be provided in match.asm or
  * match.S. The code will be functionally equivalent.
  */
-local uInt longest_match(s, cur_match)
+local uInt longest_match(s, pcur_match)
     deflate_state *s;
-    IPos cur_match;                             /* current match */
+    IPos pcur_match;                            /* current match */
 {
+    ptrdiff_t cur_match = pcur_match;           /* extend to pointer width */
     unsigned chain_length = s->max_chain_length;/* max hash chain length */
     register Bytef *scan = s->window + s->strstart; /* current string */
     register Bytef *match;                      /* matched string */
     register int len;                           /* length of current match */
-    int best_len = s->prev_length;              /* best match length so far */
+    ptrdiff_t best_len = s->prev_length;        /* best match length so far */
     int nice_match = s->nice_match;             /* stop if match long enough */
     IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
         s->strstart - (IPos)MAX_DIST(s) : NIL;
@@ -1166,12 +1167,12 @@
      * Try with and without -DUNALIGNED_OK to check.
      */
     register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
-    register ush scan_start = *(ushf*)scan;
-    register ush scan_end   = *(ushf*)(scan+best_len-1);
+    register uInt scan_start = *(ushf*)scan;
+    register uInt scan_end   = *(ushf*)(scan+best_len-1);
 #else
     register Bytef *strend = s->window + s->strstart + MAX_MATCH;
-    register Byte scan_end1  = scan[best_len-1];
-    register Byte scan_end   = scan[best_len];
+    register uInt scan_end1  = scan[best_len-1];
+    register uInt scan_end   = scan[best_len];
 #endif

     /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
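
The test snippets referred to above are not included in this patch
file. For reference, a minimal timing harness in the same spirit
(hypothetical, not from the mail; it uses only the documented
compressBound()/compress2() API and measures CPU time with clock())
might look like:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <zlib.h>

    int main(void)
    {
        const uLong n = 256UL * 1024 * 1024;  /* 256MB of input */
        uLongf outlen = compressBound(n);
        unsigned char *in = malloc(n);
        unsigned char *out = malloc(outlen);
        uLong i;
        clock_t t0;

        if (in == NULL || out == NULL)
            return 1;
        for (i = 0; i < n; i++)               /* cheap pseudo-random fill */
            in[i] = (unsigned char)((i * 2654435761UL) >> 24);

        t0 = clock();
        if (compress2(out, &outlen, in, n, 6) != Z_OK)
            return 1;
        printf("%.2fs, %lu -> %lu bytes\n",
               (double)(clock() - t0) / CLOCKS_PER_SEC,
               (unsigned long)n, (unsigned long)outlen);

        free(in);
        free(out);
        return 0;
    }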