106 lines
4.0 KiB
Diff

# http://mail.madler.net/pipermail/zlib-devel_madler.net/2012-June/002907.html
# http://mail.madler.net/pipermail/zlib-devel_madler.net/2012-August/002977.html
From: Andreas Krebbel <krebbel at linux.vnet.ibm.com>
Subject: RFC: Improve longest_match performance
|
The code currently generated for longest_match looks far from optimal
|
||
|
due to a bunch of pointless zero/sign extend instructions.
|
||
|
|
||
|
By just promoting a few data types in the function I was able to get
|
||
|
rid of all but two. The new hotloop is almost half the size of the
|
||
|
original version providing quite a performance win for S/390:
|
||
|
Measured on a zEnterprise z196,
zlib compiled with the upstream GCC 4.8 branch:

32 bit                        old      new
256MB randomdata:           11.65s   10.92s    6.68%
100MB manpages:              4.23s    4.14s    2.17%
217MB PDF:                  10.54s    9.44s   11.65%

unaligned ok
256MB randomdata:           10.90s   10.54s    3.41%
100MB manpages:              3.94s    3.87s    1.81%
217MB PDF:                   8.77s    8.64s    1.50%

64 bit                        old      new
256MB randomdata:           11.90s   11.43s    4.11%
100MB manpages:              4.51s    4.44s    1.58%
217MB PDF:                  10.11s    9.89s    2.22%

unaligned ok
256MB randomdata:           11.51s   11.15s    3.23%
100MB manpages:              4.33s    3.99s    8.52%
217MB PDF:                   9.81s    9.02s    8.76%
I also did some measurements on x86 and Power:

For Power (64 bit, unaligned_ok) an additional zero extend
appears. However, the impact is not measurable. There are minor wins
and minor regressions; the overall result is flat.

For Core2 32 bit the patch is a clear winner with up to 9% for the PDF
test. Also on 64 bit the code optimized for Core2 gets a bit smaller,
but unfortunately causes some regressions which I cannot explain.

For mainframe customers the performance of zlib is very important, so I
would be very happy to see the patch integrated into upstream
zlib. Given that the patch might cause minor regressions on other
targets, would it be possible to enable it in an arch-dependent way?

See below for the patch and some code snippets from my tests.

Bye,

-Andreas-
---
 deflate.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

Index: zlib-1.2.7/deflate.c
===================================================================
--- zlib-1.2.7.orig/deflate.c	2012-02-13 01:15:47.000000000 +0100
+++ zlib-1.2.7/deflate.c	2012-09-27 13:39:57.942762946 +0200
@@ -1143,15 +1143,16 @@
 /* For 80x86 and 680x0, an optimized version will be provided in match.asm or
  * match.S. The code will be functionally equivalent.
  */
-local uInt longest_match(s, cur_match)
+local uInt longest_match(s, pcur_match)
     deflate_state *s;
-    IPos cur_match;                             /* current match */
+    IPos pcur_match;                            /* current match */
 {
+    ptrdiff_t cur_match = pcur_match;           /* extend to pointer width */
     unsigned chain_length = s->max_chain_length;/* max hash chain length */
     register Bytef *scan = s->window + s->strstart; /* current string */
     register Bytef *match;                      /* matched string */
     register int len;                           /* length of current match */
-    int best_len = s->prev_length;              /* best match length so far */
+    ptrdiff_t best_len = s->prev_length;        /* best match length so far */
     int nice_match = s->nice_match;             /* stop if match long enough */
     IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
         s->strstart - (IPos)MAX_DIST(s) : NIL;
@@ -1166,12 +1167,12 @@
      * Try with and without -DUNALIGNED_OK to check.
      */
     register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
-    register ush scan_start = *(ushf*)scan;
-    register ush scan_end   = *(ushf*)(scan+best_len-1);
+    register uInt scan_start = *(ushf*)scan;
+    register uInt scan_end   = *(ushf*)(scan+best_len-1);
 #else
     register Bytef *strend = s->window + s->strstart + MAX_MATCH;
-    register Byte scan_end1  = scan[best_len-1];
-    register Byte scan_end   = scan[best_len];
+    register uInt scan_end1  = scan[best_len-1];
+    register uInt scan_end   = scan[best_len];
 #endif

    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.