# http://mail.madler.net/pipermail/zlib-devel_madler.net/2012-June/002907.html
# http://mail.madler.net/pipermail/zlib-devel_madler.net/2012-August/002977.html

From: Andreas Krebbel <krebbel at linux.vnet.ibm.com>
Subject: RFC: Improve longest_match performance

The code currently generated for longest_match looks far from optimal
due to a bunch of pointless zero/sign extend instructions. By just
promoting a few data types in the function I was able to get rid of
all but two. The new hot loop is almost half the size of the original
version, providing quite a performance win for S/390 (measurements
below).
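To see why the narrow types hurt, consider a small standalone sketch
(my own illustration, not part of the patch): on a 64-bit target an
array index narrower than a pointer usually has to be zero- or
sign-extended before it can take part in address arithmetic, whereas
a ptrdiff_t index already has pointer width.

#include <stddef.h>

/* Hypothetical example, not from zlib: the same byte load written
 * with a 32-bit index and with a pointer-width index. */
unsigned char load_narrow(unsigned char *base, unsigned int idx)
{
    /* idx typically gets zero-extended to pointer width before the
     * address computation -- one extra instruction every time idx
     * changes inside a hot loop. */
    return base[idx];
}

unsigned char load_wide(unsigned char *base, ptrdiff_t idx)
{
    /* idx is already pointer width; the address can be formed
     * directly. */
    return base[idx];
}

In longest_match the affected expressions are things like
s->window + cur_match and scan + best_len - 1, all on the hot path of
the hash-chain walk, which is where the promoted types in the patch
below come from.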
Measured on a zEnterprise z196, zlib compiled with the upstream GCC
4.8 branch:

32 bit
                      old       new
256MB randomdata:   11.65s    10.92s    6.68%
100MB manpages:      4.23s     4.14s    2.17%
217MB PDF:          10.54s     9.44s   11.65%

32 bit, unaligned ok
256MB randomdata:   10.90s    10.54s    3.41%
100MB manpages:      3.94s     3.87s    1.81%
217MB PDF:           8.77s     8.64s    1.50%

64 bit
                      old       new
256MB randomdata:   11.90s    11.43s    4.11%
100MB manpages:      4.51s     4.44s    1.58%
217MB PDF:          10.11s     9.89s    2.22%

64 bit, unaligned ok
256MB randomdata:   11.51s    11.15s    3.23%
100MB manpages:      4.33s     3.99s    8.52%
217MB PDF:           9.81s     9.02s    8.76%

I also did some measurements on x86 and Power:

For Power (64 bit, unaligned_ok) an additional zero extend appears,
but its impact is not measurable: there are minor wins and minor
regressions, and the overall result is flat.

For Core2 in 32-bit mode the patch is a clear winner, with up to 9%
on the PDF test. In 64-bit mode the code optimized for Core2 also
gets a bit smaller, but unfortunately shows some regressions which I
cannot explain.

For mainframe customers the performance of zlib is very important, so
I would be very happy to see the patch integrated into upstream zlib.
Given that the patch might cause minor regressions on other targets,
would it be possible to enable it per architecture?

See below for the patch and some code snippets from my tests.

Bye,

-Andreas-

---
 deflate.c |   15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

Index: zlib-1.2.7/deflate.c
===================================================================
--- zlib-1.2.7.orig/deflate.c	2012-02-13 01:15:47.000000000 +0100
+++ zlib-1.2.7/deflate.c	2012-09-27 13:39:57.942762946 +0200
@@ -1143,15 +1143,16 @@
 /* For 80x86 and 680x0, an optimized version will be provided in match.asm or
  * match.S. The code will be functionally equivalent.
  */
-local uInt longest_match(s, cur_match)
+local uInt longest_match(s, pcur_match)
     deflate_state *s;
-    IPos cur_match;                             /* current match */
+    IPos pcur_match;                            /* current match */
 {
+    ptrdiff_t cur_match = pcur_match;           /* extend to pointer width */
     unsigned chain_length = s->max_chain_length;/* max hash chain length */
     register Bytef *scan = s->window + s->strstart; /* current string */
     register Bytef *match;                      /* matched string */
     register int len;                           /* length of current match */
-    int best_len = s->prev_length;              /* best match length so far */
+    ptrdiff_t best_len = s->prev_length;        /* best match length so far */
     int nice_match = s->nice_match;             /* stop if match long enough */
     IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
         s->strstart - (IPos)MAX_DIST(s) : NIL;
@@ -1166,12 +1167,12 @@
      * Try with and without -DUNALIGNED_OK to check.
      */
     register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
-    register ush scan_start = *(ushf*)scan;
-    register ush scan_end   = *(ushf*)(scan+best_len-1);
+    register uInt scan_start = *(ushf*)scan;
+    register uInt scan_end   = *(ushf*)(scan+best_len-1);
 #else
     register Bytef *strend = s->window + s->strstart + MAX_MATCH;
-    register Byte scan_end1  = scan[best_len-1];
-    register Byte scan_end   = scan[best_len];
+    register uInt scan_end1  = scan[best_len-1];
+    register uInt scan_end   = scan[best_len];
 #endif

     /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
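For reference, a timing run like the ones above can be reproduced
with a few lines against the public zlib API. The following harness
is my own sketch, not one of the test snippets referenced in the
mail; the buffer size and compression level are placeholders.

/* Minimal timing sketch: time zlib's compress2() over an in-memory
 * buffer of pseudo-random data.  Build with:  cc -O2 bench.c -lz
 * (the file name is a placeholder). */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <zlib.h>

int main(void)
{
    uLong srclen = 64uL * 1024 * 1024;     /* 64MB of random data */
    uLong dstlen = compressBound(srclen);
    Bytef *src = malloc(srclen);
    Bytef *dst = malloc(dstlen);
    uLong i;

    if (src == NULL || dst == NULL)
        return 1;
    srand(42);
    for (i = 0; i < srclen; i++)
        src[i] = (Bytef)(rand() & 0xff);

    {
        uLongf outlen = dstlen;
        clock_t t0 = clock();
        if (compress2(dst, &outlen, src, srclen,
                      Z_DEFAULT_COMPRESSION) != Z_OK)
            return 1;
        printf("%lu -> %lu bytes, %.2fs CPU\n", srclen, outlen,
               (double)(clock() - t0) / CLOCKS_PER_SEC);
    }
    free(src);
    free(dst);
    return 0;
}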