zlib/zlib-1.2.7-improve-longest_match-performance.patch
Michal Vyskocil e8b526b295 Accepting request 138143 from home:mvyskocil:branches:devel:libraries:c_c++
- add longest_match performance patch (fate#314093)
  * suggested by IBM, sent upstream
- rename the main library package to libz1 according to the Shared
  Library Policy
- profiling build can be enabled via build --with profiling
- use the human-readable package description from zlib.net
- add rpmlintrc

OBS-URL: https://build.opensuse.org/request/show/138143
OBS-URL: https://build.opensuse.org/package/show/devel:libraries:c_c++/zlib?expand=0&rev=4
2012-10-15 07:45:19 +00:00

# http://mail.madler.net/pipermail/zlib-devel_madler.net/2012-June/002907.html
# http://mail.madler.net/pipermail/zlib-devel_madler.net/2012-August/002977.html
From: Andreas Krebbel <krebbel@linux.vnet.ibm.com>
Subject: RFC: Improve longest_match performance
The code currently generated for longest_match looks far from optimal
due to a bunch of pointless zero/sign extend instructions. By just
promoting a few data types in the function I was able to get rid of
all but two. The new hot loop is almost half the size of the original
version, providing quite a performance win for S/390:
Measured on a zEnterprise z196
zlib compiled with upstream GCC 4.8 branch
32 bit                      old       new      gain
256MB randomdata:        11.65s    10.92s     6.68%
100MB manpages:           4.23s     4.14s     2.17%
217MB PDF:               10.54s     9.44s    11.65%

32 bit, unaligned ok        old       new      gain
256MB randomdata:        10.90s    10.54s     3.41%
100MB manpages:           3.94s     3.87s     1.81%
217MB PDF:                8.77s     8.64s     1.50%

64 bit                      old       new      gain
256MB randomdata:        11.90s    11.43s     4.11%
100MB manpages:           4.51s     4.44s     1.58%
217MB PDF:               10.11s     9.89s     2.22%

64 bit, unaligned ok        old       new      gain
256MB randomdata:        11.51s    11.15s     3.23%
100MB manpages:           4.33s     3.99s     8.52%
217MB PDF:                9.81s     9.02s     8.76%
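
Measurements of this kind can be driven by a small harness along the
following lines. This is only a sketch, not the test code actually
used here; the repeat count and compression level are assumptions.

    /* Hypothetical timing harness: compresses one input file
     * repeatedly with compress2() and reports CPU seconds. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <zlib.h>

    int main(int argc, char **argv)
    {
        FILE *f;
        unsigned char *in, *out;
        long insize;
        uLong outsize;
        clock_t start;
        int i;

        if (argc != 2) { fprintf(stderr, "usage: %s FILE\n", argv[0]); return 1; }
        f = fopen(argv[1], "rb");
        if (f == NULL) { perror(argv[1]); return 1; }
        fseek(f, 0, SEEK_END);
        insize = ftell(f);
        rewind(f);
        in = malloc((size_t)insize);
        out = malloc(compressBound((uLong)insize));
        if (in == NULL || out == NULL) return 1;
        if (fread(in, 1, (size_t)insize, f) != (size_t)insize) return 1;
        fclose(f);

        start = clock();
        for (i = 0; i < 10; i++) {                  /* repeat to smooth noise */
            outsize = compressBound((uLong)insize); /* compress2 updates it */
            if (compress2(out, &outsize, in, (uLong)insize,
                          Z_DEFAULT_COMPRESSION) != Z_OK)
                return 1;
        }
        printf("%.2fs\n", (double)(clock() - start) / CLOCKS_PER_SEC);
        free(in); free(out);
        return 0;
    }
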
I also did some measurements on x86 and Power:
For Power (64 bit, unaligned_ok) an additional zero extend
appears. However, the impact is not measurable. There are minor wins
and minor regressions. The overall result is flat.
For Core2 in 32-bit mode the patch is a clear winner, with up to 9% on
the PDF test. On 64 bit the code optimized for Core2 also gets a bit
smaller, but unfortunately it causes some regressions which I cannot
explain.
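
To make the underlying effect concrete outside of deflate.c, here is a
minimal standalone sketch (illustrative only, not code from the
patch): with a narrow index type the compiler must widen the index to
pointer width before every array access, which is exactly the
extension the promotions below eliminate.

    #include <stddef.h>

    typedef unsigned short ush;   /* narrow type, as used by zlib */

    /* Narrow index: on a 64-bit target 'i' is zero-extended to
     * pointer width on every iteration before buf[i] is formed. */
    unsigned sum_narrow(const unsigned char *buf, ush n)
    {
        unsigned s = 0;
        ush i;
        for (i = 0; i < n; i++)
            s += buf[i];
        return s;
    }

    /* Promoted index: ptrdiff_t already has pointer width, so no
     * per-iteration extension is needed. */
    unsigned sum_wide(const unsigned char *buf, ptrdiff_t n)
    {
        unsigned s = 0;
        ptrdiff_t i;
        for (i = 0; i < n; i++)
            s += buf[i];
        return s;
    }
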
For mainframe customers the performance of zlib is very important, so
I would be very happy to see the patch integrated into upstream
zlib. Given that the patch might cause minor regressions on other
targets, would it be possible to enable it per architecture?
See below for the patch and some code snippets from my tests.
Bye,
-Andreas-
---
deflate.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
Index: zlib-1.2.7/deflate.c
===================================================================
--- zlib-1.2.7.orig/deflate.c 2012-02-13 01:15:47.000000000 +0100
+++ zlib-1.2.7/deflate.c 2012-09-27 13:39:57.942762946 +0200
@@ -1143,15 +1143,16 @@
 /* For 80x86 and 680x0, an optimized version will be provided in match.asm or
  * match.S. The code will be functionally equivalent.
  */
-local uInt longest_match(s, cur_match)
+local uInt longest_match(s, pcur_match)
     deflate_state *s;
-    IPos cur_match;                             /* current match */
+    IPos pcur_match;                            /* current match */
 {
+    ptrdiff_t cur_match = pcur_match;           /* extend to pointer width */
     unsigned chain_length = s->max_chain_length;/* max hash chain length */
     register Bytef *scan = s->window + s->strstart; /* current string */
     register Bytef *match;                      /* matched string */
     register int len;                           /* length of current match */
-    int best_len = s->prev_length;              /* best match length so far */
+    ptrdiff_t best_len = s->prev_length;        /* best match length so far */
     int nice_match = s->nice_match;             /* stop if match long enough */
     IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
         s->strstart - (IPos)MAX_DIST(s) : NIL;
@@ -1166,12 +1167,12 @@
     /* Compare two bytes at a time. Note: this is not always beneficial.
      * Try with and without -DUNALIGNED_OK to check.
      */
     register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
-    register ush scan_start = *(ushf*)scan;
-    register ush scan_end   = *(ushf*)(scan+best_len-1);
+    register uInt scan_start = *(ushf*)scan;
+    register uInt scan_end   = *(ushf*)(scan+best_len-1);
 #else
     register Bytef *strend = s->window + s->strstart + MAX_MATCH;
-    register Byte scan_end1  = scan[best_len-1];
-    register Byte scan_end   = scan[best_len];
+    register uInt scan_end1  = scan[best_len-1];
+    register uInt scan_end   = scan[best_len];
 #endif
     /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.