From 12e1d1e53d9f35b60f8ae2cb5534a77147d0c67b99bbadd15fc923cb4f71c4f3 Mon Sep 17 00:00:00 2001
From: Sascha Peilicke <null@suse.de>
Date: Sun, 3 Apr 2011 10:13:53 +0000
Subject: [PATCH] Accepting request 65792 from Base:System

Accepted submit request 65792 from user coolo

OBS-URL: https://build.opensuse.org/request/show/65792
OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/zlib?expand=0&rev=27
---
 01-prepare.patch     |  423 +++++++++++++++
 02-ppc_altivec.patch |  307 +++++++++++
 03-arm.patch         |  400 +++++++++++++++
 04-x86.patch         | 1165 ++++++++++++++++++++++++++++++++++++++++++
 zlib.changes         |   11 +
 zlib.spec            |    9 +-
 6 files changed, 2314 insertions(+), 1 deletion(-)
 create mode 100644 01-prepare.patch
 create mode 100644 02-ppc_altivec.patch
 create mode 100644 03-arm.patch
 create mode 100644 04-x86.patch

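The four patches below only swap the internals of adler32() for architecture-specific
code paths (generic pseudo-SIMD, PPC AltiVec, ARM NEON/ARMv6, x86 SSE2/SSSE3); the
public zlib entry point and its semantics are unchanged. For reference, a minimal
caller sketch against the standard zlib API (illustration only, not part of the
patches):

    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    int main(void)
    {
        const char *data = "hello, world";
        uLong adler = adler32(0L, Z_NULL, 0);   /* initial value, always 1 */

        adler = adler32(adler, (const Bytef *)data, (uInt)strlen(data));
        printf("adler32 = 0x%08lx\n", adler);
        return 0;
    }
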
diff --git a/01-prepare.patch b/01-prepare.patch
new file mode 100644
index 0000000..f04d874
--- /dev/null
+++ b/01-prepare.patch
@@ -0,0 +1,423 @@
+=== modified file 'Makefile.in'
+--- Makefile.in	2011-03-14 01:01:37 +0000
++++ Makefile.in	2011-03-14 02:19:21 +0000
+@@ -236,7 +236,8 @@
+ 
+ # DO NOT DELETE THIS LINE -- make depend depends on it.
+ 
+-adler32.o zutil.o: zutil.h zlib.h zconf.h
++adler32.o: adler32.c zutil.h zlib.h zconf.h
++zutil.o: zutil.h zlib.h zconf.h
+ gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
+ compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
+ crc32.o: zutil.h zlib.h zconf.h crc32.h
+@@ -246,7 +247,8 @@
+ inftrees.o: zutil.h zlib.h zconf.h inftrees.h
+ trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
+ 
+-adler32.lo zutil.lo: zutil.h zlib.h zconf.h
++adler32.lo: adler32.c zutil.h zlib.h zconf.h
++zutil.lo: zutil.h zlib.h zconf.h
+ gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
+ compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
+ crc32.lo: zutil.h zlib.h zconf.h crc32.h
+
+=== modified file 'adler32.c'
+--- adler32.c	2011-03-14 01:01:37 +0000
++++ adler32.c	2011-03-30 13:38:42 +0000
+@@ -9,6 +9,35 @@
+ 
+ #define local static
+ 
++#define GCC_VERSION_GE(x) ((__GNUC__-0) * 100 + __GNUC_MINOR__-0 >= x)
++
++#if GCC_VERSION_GE(301)
++/* sometimes leaks out of old kernel headers */
++#  undef noinline
++#  define noinline __attribute__((__noinline__))
++#else
++#  ifndef noinline
++#    define noinline
++#  endif
++#endif
++
++#if GCC_VERSION_GE(301)
++# define GCC_ATTR_UNUSED_PARAM __attribute__((__unused__))
++#else
++# define GCC_ATTR_UNUSED_PARAM
++#endif
++
++#if GCC_VERSION_GE(296)
++#  define likely(x)   __builtin_expect(!!(x), 1)
++#  define unlikely(x) __builtin_expect(!!(x), 0)
++#else
++#  define likely(x)   (x)
++#  define unlikely(x) (x)
++#endif
++
++#define ROUND_TO(x , n) ((x) & ~((n) - 1L))
++#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
++
+ local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
+ 
+ #define BASE 65521UL    /* largest prime smaller than 65536 */
+@@ -21,9 +50,20 @@
+ #define DO8(buf,i)  DO4(buf,i); DO4(buf,i+4);
+ #define DO16(buf)   DO8(buf,0); DO8(buf,8);
+ 
++#if defined(__alpha__)
++/* even if gcc can generate a mul by inverse, the code is really
++ * ugly (find global const pool pointer, load constant, a mul, lots
++ * of shifts/add/sub), up to 14 instructions. The replacement code
++ * only needs >= 5 instructions
++ */
++#  define NO_DIVIDE
++#endif
++
+ /* use NO_DIVIDE if your processor does not do division in hardware */
+ #ifdef NO_DIVIDE
+-#  define MOD(a) \
++/* use NO_SHIFT if your processor does shift > 1 by loop */
++#  ifdef NO_SHIFT
++#    define reduce_full(a) \
+     do { \
+         if (a >= (BASE << 16)) a -= (BASE << 16); \
+         if (a >= (BASE << 15)) a -= (BASE << 15); \
+@@ -43,21 +83,237 @@
+         if (a >= (BASE << 1)) a -= (BASE << 1); \
+         if (a >= BASE) a -= BASE; \
+     } while (0)
+-#  define MOD4(a) \
++#    define reduce_x(a) \
+     do { \
++        if (MIN_WORK >= (1 << 6) && a >= (BASE << 6)) a -= (BASE << 6); \
++        if (MIN_WORK >= (1 << 5) && a >= (BASE << 5)) a -= (BASE << 5); \
+         if (a >= (BASE << 4)) a -= (BASE << 4); \
+         if (a >= (BASE << 3)) a -= (BASE << 3); \
+         if (a >= (BASE << 2)) a -= (BASE << 2); \
+         if (a >= (BASE << 1)) a -= (BASE << 1); \
+         if (a >= BASE) a -= BASE; \
+     } while (0)
++#    define reduce(a) reduce_full(a)
++#  else
++#    define reduce_full(a) \
++    do { \
++        unsigned long b = a & 0x0000ffff; \
++        a >>= 16; \
++        b -= a; \
++        a <<= 4; \
++        a += b; \
++    } while(a >= BASE)
++#    define reduce_x(a) \
++    do { \
++        unsigned long b = a & 0x0000ffff; \
++        a >>= 16; \
++        b -= a; \
++        a <<= 4; \
++        a += b; \
++        a = a >= BASE ? a - BASE : a; \
++    } while(0)
++#    define reduce(a) \
++    do { \
++        unsigned long b = a & 0x0000ffff; \
++        a >>= 16; \
++        b -= a; \
++        a <<= 4; \
++        a += b; \
++    } while(0)
++#  endif
+ #else
+-#  define MOD(a) a %= BASE
+-#  define MOD4(a) a %= BASE
+-#endif
+-
+-/* ========================================================================= */
+-uLong ZEXPORT adler32(adler, buf, len)
++#  define reduce_full(a) a %= BASE
++#  define reduce_x(a) a %= BASE
++#  define reduce(a) a %= BASE
++#endif
++
++local int host_is_bigendian()
++{
++    local const union {
++        uInt d;
++        unsigned char endian[sizeof(uInt)];
++    } x = {1};
++    return x.endian[0] == 0;
++}
++
++#ifndef MIN_WORK
++#  define MIN_WORK 16
++#endif
++
++/* ========================================================================= */
++local noinline uLong adler32_1(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len GCC_ATTR_UNUSED_PARAM;
++{
++    unsigned long sum2;
++
++    /* split Adler-32 into component sums */
++    sum2 = (adler >> 16) & 0xffff;
++    adler &= 0xffff;
++
++    adler += buf[0];
++    if (adler >= BASE)
++        adler -= BASE;
++    sum2 += adler;
++    if (sum2 >= BASE)
++        sum2 -= BASE;
++    return adler | (sum2 << 16);
++}
++
++/* ========================================================================= */
++local noinline uLong adler32_common(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned long sum2;
++
++    /* split Adler-32 into component sums */
++    sum2 = (adler >> 16) & 0xffff;
++    adler &= 0xffff;
++
++    while (len--) {
++        adler += *buf++;
++        sum2 += adler;
++    }
++    if (adler >= BASE)
++        adler -= BASE;
++    reduce_x(sum2);             /* only added so many BASE's */
++    return adler | (sum2 << 16);
++}
++
++#ifndef HAVE_ADLER32_VEC
++#  if (defined(__LP64__) || ((SIZE_MAX-0) >> 31) >= 2) && !defined(NO_ADLER32_VEC)
++
++/* On 64 bit archs we can do pseudo-SIMD with a nice win.
++ * This is esp. important for old Alphas, which do not have byte
++ * access.
++ * This needs some registers, but x86_64 is fine (>= 9 needed for
++ * the main loop). If your 64 bit arch is more limited, throw it away...
++ */
++#    ifndef UINT64_C
++#      if defined(_MSC_VER) || defined(__BORLANDC__)
++#        define UINT64_C(c)    (c ## ui64)
++#      else
++#        define UINT64_C(c)    (c ## ULL)
++#      endif
++#    endif
++
++#    undef VNMAX
++#    define VNMAX (2*NMAX+((9*NMAX)/10))
++
++/* ========================================================================= */
++local noinline uLong adler32_vec(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned int s1, s2;
++    unsigned int k;
++
++    /* split Adler-32 into component sums */
++    s1 = adler & 0xffff;
++    s2 = (adler >> 16) & 0xffff;
++
++    /* align input data */
++    k    = ALIGN_DIFF(buf, sizeof(size_t));
++    len -= k;
++    if (k) do {
++        s1 += *buf++;
++        s2 += s1;
++    } while(--k);
++
++    k = len > VNMAX ? VNMAX : len;
++    len -= k;
++    if (likely(k >= 2 * sizeof(size_t))) do
++    {
++        unsigned int vs1, vs2;
++        unsigned int vs1s;
++
++        /* add s1 to s2 for rounds to come */
++        s2 += s1 * ROUND_TO(k, sizeof(size_t));
++        vs1s = vs1 = vs2 = 0;
++        do {
++            size_t vs1l = 0, vs1h = 0, vs1l_s = 0, vs1h_s = 0;
++            unsigned int a, b, c, d, e, f, g, h;
++            unsigned int j;
++
++            j = k > 23 * sizeof(size_t) ? 23 : k/sizeof(size_t);
++            k -= j * sizeof(size_t);
++            /* add s1 to s1 round sum for rounds to come */
++            vs1s += j * vs1;
++            do {
++                size_t in8 = *(const size_t *)buf;
++                buf += sizeof(size_t);
++                /* add this s1 to s1 round sum */
++                vs1l_s += vs1l;
++                vs1h_s += vs1h;
++                /* add up input data to s1 */
++                vs1l +=  in8 & UINT64_C(0x00ff00ff00ff00ff);
++                vs1h += (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8;
++            } while(--j);
++
++            /* split s1 */
++            if(host_is_bigendian()) {
++                a = (vs1h >> 48) & 0x0000ffff;
++                b = (vs1l >> 48) & 0x0000ffff;
++                c = (vs1h >> 32) & 0x0000ffff;
++                d = (vs1l >> 32) & 0x0000ffff;
++                e = (vs1h >> 16) & 0x0000ffff;
++                f = (vs1l >> 16) & 0x0000ffff;
++                g = (vs1h      ) & 0x0000ffff;
++                h = (vs1l      ) & 0x0000ffff;
++            } else {
++                a = (vs1l      ) & 0x0000ffff;
++                b = (vs1h      ) & 0x0000ffff;
++                c = (vs1l >> 16) & 0x0000ffff;
++                d = (vs1h >> 16) & 0x0000ffff;
++                e = (vs1l >> 32) & 0x0000ffff;
++                f = (vs1h >> 32) & 0x0000ffff;
++                g = (vs1l >> 48) & 0x0000ffff;
++                h = (vs1h >> 48) & 0x0000ffff;
++            }
++
++            /* add s1 & s2 horiz. */
++            vs2 += 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + 1*h;
++            vs1 += a + b + c + d + e + f + g + h;
++
++            /* split and add up s1 round sum */
++            vs1l_s = ((vs1l_s      ) & UINT64_C(0x0000ffff0000ffff)) +
++                     ((vs1l_s >> 16) & UINT64_C(0x0000ffff0000ffff));
++            vs1h_s = ((vs1h_s      ) & UINT64_C(0x0000ffff0000ffff)) +
++                     ((vs1h_s >> 16) & UINT64_C(0x0000ffff0000ffff));
++            vs1l_s += vs1h_s;
++            vs1s += ((vs1l_s      ) & UINT64_C(0x00000000ffffffff)) +
++                    ((vs1l_s >> 32) & UINT64_C(0x00000000ffffffff));
++        } while (k >= sizeof(size_t));
++        reduce(vs1s);
++        s2 += vs1s * 8 + vs2;
++        reduce(s2);
++        s1 += vs1;
++        reduce(s1);
++        len += k;
++        k = len > VNMAX ? VNMAX : len;
++        len -= k;
++    } while (k >= sizeof(size_t));
++
++    /* handle trailer */
++    if (k) do {
++        s1 += *buf++;
++        s2 += s1;
++    } while (--k);
++    reduce(s1);
++    reduce(s2);
++
++    /* return recombined sums */
++    return (s2 << 16) | s1;
++}
++
++#  else
++
++/* ========================================================================= */
++local noinline uLong adler32_vec(adler, buf, len)
+     uLong adler;
+     const Bytef *buf;
+     uInt len;
+@@ -69,33 +325,6 @@
+     sum2 = (adler >> 16) & 0xffff;
+     adler &= 0xffff;
+ 
+-    /* in case user likes doing a byte at a time, keep it fast */
+-    if (len == 1) {
+-        adler += buf[0];
+-        if (adler >= BASE)
+-            adler -= BASE;
+-        sum2 += adler;
+-        if (sum2 >= BASE)
+-            sum2 -= BASE;
+-        return adler | (sum2 << 16);
+-    }
+-
+-    /* initial Adler-32 value (deferred check for len == 1 speed) */
+-    if (buf == Z_NULL)
+-        return 1L;
+-
+-    /* in case short lengths are provided, keep it somewhat fast */
+-    if (len < 16) {
+-        while (len--) {
+-            adler += *buf++;
+-            sum2 += adler;
+-        }
+-        if (adler >= BASE)
+-            adler -= BASE;
+-        MOD4(sum2);             /* only added so many BASE's */
+-        return adler | (sum2 << 16);
+-    }
+-
+     /* do length NMAX blocks -- requires just one modulo operation */
+     while (len >= NMAX) {
+         len -= NMAX;
+@@ -104,8 +333,8 @@
+             DO16(buf);          /* 16 sums unrolled */
+             buf += 16;
+         } while (--n);
+-        MOD(adler);
+-        MOD(sum2);
++        reduce_full(adler);
++        reduce_full(sum2);
+     }
+ 
+     /* do remaining bytes (less than NMAX, still just one modulo) */
+@@ -119,13 +348,36 @@
+             adler += *buf++;
+             sum2 += adler;
+         }
+-        MOD(adler);
+-        MOD(sum2);
++        reduce_full(adler);
++        reduce_full(sum2);
+     }
+ 
+     /* return recombined sums */
+     return adler | (sum2 << 16);
+ }
++#  endif
++#endif
++
++/* ========================================================================= */
++uLong ZEXPORT adler32(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    /* in case user likes doing a byte at a time, keep it fast */
++    if (len == 1)
++        return adler32_1(adler, buf, len); /* should create a fast tailcall */
++
++    /* initial Adler-32 value (deferred check for len == 1 speed) */
++    if (buf == Z_NULL)
++        return 1L;
++
++    /* in case short lengths are provided, keep it somewhat fast */
++    if (len < MIN_WORK)
++        return adler32_common(adler, buf, len);
++
++    return adler32_vec(adler, buf, len);
++}
+ 
+ /* ========================================================================= */
+ local uLong adler32_combine_(adler1, adler2, len2)
+@@ -141,7 +393,7 @@
+     rem = (unsigned)(len2 % BASE);
+     sum1 = adler1 & 0xffff;
+     sum2 = rem * sum1;
+-    MOD(sum2);
++    reduce_full(sum2);
+     sum1 += (adler2 & 0xffff) + BASE - 1;
+     sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
+     if (sum1 >= BASE) sum1 -= BASE;
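
The reduce()/reduce_full() macros introduced above replace MOD()/MOD4() with a
division-free fold that uses the fact that 2^16 is congruent to 15 mod BASE (65521):
writing a = 65536*hi + lo gives a congruent to 15*hi + lo (mod BASE). A standalone
sketch of that identity, spelled out as a function (illustration only, not code from
the patch; fold_once is a made-up name):

    #include <assert.h>
    #include <stdio.h>

    #define BASE 65521UL    /* largest prime smaller than 65536 */

    /* One folding step: a = 65536*hi + lo  ->  15*hi + lo,
     * congruent to a mod BASE because 65536 % 65521 == 15. */
    static unsigned long fold_once(unsigned long a)
    {
        return (a >> 16) * 15 + (a & 0xffff);
    }

    int main(void)
    {
        unsigned long a = 123456789UL;
        unsigned long r = a;

        while (r >= 0x10000UL)  /* strictly decreases while the high half is non-zero */
            r = fold_once(r);
        if (r >= BASE)          /* now r < 65536, so at most one subtraction is left */
            r -= BASE;

        assert(r == a % BASE);
        printf("%lu mod %lu = %lu\n", a, BASE, r);
        return 0;
    }
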
diff --git a/02-ppc_altivec.patch b/02-ppc_altivec.patch
new file mode 100644
index 0000000..5aabbc3
--- /dev/null
+++ b/02-ppc_altivec.patch
@@ -0,0 +1,307 @@
+=== modified file 'Makefile.in'
+--- Makefile.in	2011-03-14 02:19:21 +0000
++++ Makefile.in	2011-03-14 03:06:03 +0000
+@@ -236,7 +236,7 @@
+ 
+ # DO NOT DELETE THIS LINE -- make depend depends on it.
+ 
+-adler32.o: adler32.c zutil.h zlib.h zconf.h
++adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
+ zutil.o: zutil.h zlib.h zconf.h
+ gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
+ compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
+@@ -247,7 +247,7 @@
+ inftrees.o: zutil.h zlib.h zconf.h inftrees.h
+ trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
+ 
+-adler32.lo: adler32.c zutil.h zlib.h zconf.h
++adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
+ zutil.lo: zutil.h zlib.h zconf.h
+ gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
+ compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
+
+=== modified file 'adler32.c'
+--- adler32.c	2011-03-30 13:38:42 +0000
++++ adler32.c	2011-03-30 13:38:46 +0000
+@@ -36,7 +36,10 @@
+ #endif
+ 
+ #define ROUND_TO(x , n) ((x) & ~((n) - 1L))
++#define DIV_ROUNDUP(a, b) (((a) + (b) - 1) / (b))
+ #define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
++#define ALIGN_DOWN(x, n) (((intptr_t)(x)) & ~((intptr_t)(n) - 1L))
++#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L))
+ 
+ local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
+ 
+@@ -136,6 +139,12 @@
+     return x.endian[0] == 0;
+ }
+ 
++#ifndef NO_ADLER32_VEC
++#  if defined(__powerpc__) || defined(__powerpc64__)
++#    include "adler32_ppc.c"
++#  endif
++#endif
++
+ #ifndef MIN_WORK
+ #  define MIN_WORK 16
+ #endif
+
+=== added file 'adler32_ppc.c'
+--- adler32_ppc.c	1970-01-01 00:00:00 +0000
++++ adler32_ppc.c	2011-03-30 11:12:04 +0000
+@@ -0,0 +1,253 @@
++/*
++ * adler32.c -- compute the Adler-32 checksum of a data stream
++ *   ppc implementation
++ * Copyright (C) 1995-2007 Mark Adler
++ * Copyright (C) 2009-2011 Jan Seiffert
++ * For conditions of distribution and use, see copyright notice in zlib.h
++ */
++
++/* @(#) $Id$ */
++
++/*
++ * We use the AltiVec PIM vector stuff, but still, this is only
++ * tested with GCC, and probably uses some GCC specifics (like GCC
++ * understanding vector types so you can simply write a += b)
++ */
++#if defined(__ALTIVEC__) && defined(__GNUC__)
++# define HAVE_ADLER32_VEC
++/* it needs some bytes till the vec version gets up to speed... */
++# define MIN_WORK 64
++# include <altivec.h>
++
++/*
++ * Depending on length, this can be slower (short length < 64 bytes),
++ * much faster (our beloved 128kb 22.2s generic to 3.4s vec, but cache
++ * is important...), to a little faster (very long length, 1.6MB, 47.6s
++ * to 36s), which is probably only capped by memory bandwidth.
++ * (The orig. 128k case was slower in AltiVec, because AltiVec loads
++ * are always uncached and trigger no HW prefetching, because that is
++ * what you often need with mass data manipulation (not poisoning your
++ * cache, movntq), instead you have to do it for yourself (data stream
++ * touch). With 128k it could be cleanly seen: no prefetch, half as slow
++ * as generic, but comment out the memory load -> 3s. With proper prefetch
++ * we are at 3.4s. So AltiVec can execute these "expensive" FMA quite
++ * fast (even without fancy unrolling), only the data does not arrive
++ * fast enough. In cases where the working set does not fit into cache
++ * it simply cannot be delivered fast enough over the FSB/Mem).
++ * Still we have to prefetch, or we are slow as hell.
++ */
++
++# define SOVUC (sizeof(vector unsigned char))
++
++/* can probably be more, since we do not have the x86 psadbw 64 bit sum */
++# define VNMAX (6*NMAX)
++
++/* ========================================================================= */
++local inline vector unsigned char vec_identl(level)
++    unsigned int level;
++{
++    return vec_lvsl(level, (const unsigned char *)0);
++}
++
++/* ========================================================================= */
++local inline vector unsigned char vec_ident_rev(void)
++{
++    return vec_xor(vec_identl(0), vec_splat_u8(15));
++}
++
++/* ========================================================================= */
++/* multiply two 32 bit ints, return the low 32 bit */
++local inline vector unsigned int vec_mullw(vector unsigned int a, vector unsigned int b)
++{
++    vector unsigned int v16   = vec_splat_u32(-16);
++    vector unsigned int v0_32 = vec_splat_u32(0);
++    vector unsigned int swap, low, high;
++
++    swap = vec_rl(b, v16);
++    low  = vec_mulo((vector unsigned short)a, (vector unsigned short)b);
++    high = vec_msum((vector unsigned short)a, (vector unsigned short)swap, v0_32);
++    high = vec_sl(high, v16);
++    return vec_add(low, high);
++}
++
++/* ========================================================================= */
++local inline vector unsigned int vector_reduce(vector unsigned int x)
++{
++    vector unsigned int y;
++    vector unsigned int vsh;
++
++    vsh = vec_splat_u32(1);
++    vsh = vec_sl(vsh, vec_splat_u32(4));
++
++    y = vec_sl(x, vsh);
++    y = vec_sr(y, vsh);
++    x = vec_sr(x, vsh);
++    y = vec_sub(y, x);
++    x = vec_sl(x, vec_splat_u32(4));
++    x = vec_add(x, y);
++    return x;
++}
++
++/* ========================================================================= */
++local noinline uLong adler32_vec(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned int s1, s2;
++
++    s1 = adler & 0xffff;
++    s2 = (adler >> 16) & 0xffff;
++
++    if (likely(len >= 2*SOVUC)) {
++        vector unsigned int v0_32 = vec_splat_u32(0);
++        vector unsigned int   vsh = vec_splat_u32(4);
++        vector unsigned char   v1 = vec_splat_u8(1);
++        vector unsigned char vord;
++        vector unsigned char   v0 = vec_splat_u8(0);
++        vector unsigned int vs1, vs2;
++        vector unsigned char in16, vord_a, v1_a, vperm;
++        unsigned int f, n;
++        unsigned int k, block_num;
++
++        /*
++         * if I understand the AltiVec PEM right, little
++         * endian impl. should have the data reversed on
++         * load, so the big endian vorder works.
++         */
++        vord = vec_ident_rev() + v1;
++        block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
++        f  = 512;
++        f |= block_num >= 256 ? 0 : block_num << 16;
++        vec_dst(buf, f, 2);
++        /*
++         * Add stuff to achieve alignment
++         */
++        /* swizzle masks in place */
++        vperm  = vec_lvsl(0, buf);
++        vord_a = vec_perm(vord, v0, vperm);
++        v1_a   = vec_perm(v1, v0, vperm);
++        vperm  = vec_lvsr(0, buf);
++        vord_a = vec_perm(v0, vord_a, vperm);
++        v1_a   = vec_perm(v0, v1_a, vperm);
++
++        /* align hard down */
++        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUC);
++        n = SOVUC - f;
++        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUC);
++
++        /* add n times s1 to s2 for start round */
++        s2 += s1 * n;
++
++        /* set sums 0 */
++        vs1 = v0_32;
++        vs2 = v0_32;
++
++        k = len < VNMAX ? (unsigned)len : VNMAX;
++        len -= k;
++
++        /* insert scalar start somewhere */
++        vs1 = vec_lde(0, &s1);
++        vs2 = vec_lde(0, &s2);
++
++        /* get input data */
++        in16 = vec_ldl(0, buf);
++
++        /* mask out excess data, add 4 byte horizontal and add to old dword */
++        vs1 = vec_msum(in16, v1_a, vs1);
++
++        /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
++        vs2 = vec_msum(in16, vord_a, vs2);
++
++        buf += SOVUC;
++        k -= n;
++
++        if (likely(k >= SOVUC)) do {
++            vector unsigned int vs1_r = v0_32;
++            f  = 512;
++            f |= block_num >= 256 ? 0 : block_num << 16;
++            vec_dst(buf, f, 2);
++
++            do {
++                /* get input data */
++                in16 = vec_ldl(0, buf);
++
++                /* add vs1 for this round */
++                vs1_r += vs1;
++
++                /* add 4 byte horizontal and add to old dword */
++                vs1 = vec_sum4s(in16, vs1);
++                /* apply order, add 4 byte horizontal and add to old dword */
++                vs2 = vec_msum(in16, vord, vs2);
++
++                buf += SOVUC;
++                k -= SOVUC;
++            } while (k >= SOVUC);
++            /* reduce vs1 round sum before multiplying by 16 */
++            vs1_r = vector_reduce(vs1_r);
++            /* add all vs1 for 16 times */
++            vs2 += vec_sl(vs1_r, vsh);
++            /* reduce the vectors to something in the range of BASE */
++            vs2 = vector_reduce(vs2);
++            vs1 = vector_reduce(vs1);
++            len += k;
++            k = len < VNMAX ? (unsigned)len : VNMAX;
++            block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
++            len -= k;
++        } while (likely(k >= SOVUC));
++
++        if (likely(k)) {
++            vector unsigned int vk;
++            /*
++             * handle trailer
++             */
++            f = SOVUC - k;
++            /* swizzle masks in place */
++            vperm  = vec_identl(f);
++            vord_a = vec_perm(vord, v0, vperm);
++            v1_a   = vec_perm(v1, v0, vperm);
++
++            /* add k times vs1 for this trailer */
++            vk = (vector unsigned int)vec_lvsl(0, (unsigned *)(intptr_t)k);
++            vk = (vector unsigned)vec_mergeh(v0, (vector unsigned char)vk);
++            vk = (vector unsigned)vec_mergeh((vector unsigned short)v0, (vector unsigned short)vk);
++            vk = vec_splat(vk, 0);
++            vs2 += vec_mullw(vs1, vk);
++
++            /* get input data */
++            in16 = vec_ldl(0, buf);
++
++            /* mask out excess data, add 4 byte horizontal and add to old dword */
++            vs1 = vec_msum(in16, v1_a, vs1);
++            /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
++            vs2 = vec_msum(in16, vord_a, vs2);
++
++            buf += k;
++            k -= k;
++        }
++
++        vec_dss(2);
++
++        /* add horizontal */
++        /* stuff should be reduced, so no problem with signed saturation */
++        vs1 = (vector unsigned)vec_sums((vector int)vs1, (vector int)v0_32);
++        vs2 = (vector unsigned)vec_sums((vector int)vs2, (vector int)v0_32);
++        /* shake and roll */
++        vs1 = vec_splat(vs1, 3);
++        vs2 = vec_splat(vs2, 3);
++        vec_ste(vs1, 0, &s1);
++        vec_ste(vs2, 0, &s2);
++        /* after horizontal add, reduce again in scalar code */
++    }
++
++    if (unlikely(len)) do {
++        s1 += *buf++;
++        s2 += s1;
++    } while (--len);
++    reduce(s1);
++    reduce(s2);
++
++    return (s2 << 16) | s1;
++}
++
++#endif
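
The order vector vord = {16,...,1} and the separate vs1 round sum in adler32_ppc.c
implement the block form of the Adler-32 update: appending n bytes adds their plain
sum to s1 and adds n*s1_old plus a weighted byte sum (weights n down to 1) to s2.
A scalar sketch of that identity, checked against the byte-at-a-time definition
(illustration only; adler_block is a made-up helper, not code from the patch):

    #include <assert.h>
    #include <stddef.h>

    #define BASE 65521UL

    /* Block form of the Adler-32 update over n bytes b[0..n-1]:
     *   s1' = s1 + sum(b[i])
     *   s2' = s2 + n*s1 + sum((n - i) * b[i])
     * which is what the vector loops compute with an order/weight vector. */
    static void adler_block(unsigned long *s1, unsigned long *s2,
                            const unsigned char *b, size_t n)
    {
        unsigned long bsum = 0, wsum = 0;
        size_t i;

        for (i = 0; i < n; i++) {
            bsum += b[i];
            wsum += (unsigned long)(n - i) * b[i];
        }
        *s2 = (*s2 + (unsigned long)n * *s1 + wsum) % BASE;
        *s1 = (*s1 + bsum) % BASE;
    }

    int main(void)
    {
        const unsigned char data[] = "the quick brown fox jumps over the lazy dog";
        size_t len = sizeof(data) - 1;
        unsigned long s1 = 1, s2 = 0;   /* adler32(0L, Z_NULL, 0) == 1 */
        unsigned long r1 = 1, r2 = 0;
        size_t i;

        for (i = 0; i < len; i++) {     /* reference: one byte at a time */
            r1 = (r1 + data[i]) % BASE;
            r2 = (r2 + r1) % BASE;
        }
        for (i = 0; i + 16 <= len; i += 16)   /* 16 bytes at a time, like the vector loops */
            adler_block(&s1, &s2, data + i, 16);
        adler_block(&s1, &s2, data + i, len - i);

        assert(s1 == r1 && s2 == r2);
        return 0;
    }
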
diff --git a/03-arm.patch b/03-arm.patch
new file mode 100644
index 0000000..6b9878b
--- /dev/null
+++ b/03-arm.patch
@@ -0,0 +1,400 @@
+=== modified file 'Makefile.in'
+--- Makefile.in	2011-03-14 03:06:03 +0000
++++ Makefile.in	2011-03-14 14:39:24 +0000
+@@ -236,7 +236,7 @@
+ 
+ # DO NOT DELETE THIS LINE -- make depend depends on it.
+ 
+-adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
++adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
+ zutil.o: zutil.h zlib.h zconf.h
+ gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
+ compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
+@@ -247,7 +247,7 @@
+ inftrees.o: zutil.h zlib.h zconf.h inftrees.h
+ trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
+ 
+-adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
++adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
+ zutil.lo: zutil.h zlib.h zconf.h
+ gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
+ compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
+
+=== modified file 'adler32.c'
+--- adler32.c	2011-03-30 13:38:46 +0000
++++ adler32.c	2011-03-30 13:38:46 +0000
+@@ -140,7 +140,9 @@
+ }
+ 
+ #ifndef NO_ADLER32_VEC
+-#  if defined(__powerpc__) || defined(__powerpc64__)
++#  if defined(__arm__)
++#    include "adler32_arm.c"
++#  elif defined(__powerpc__) || defined(__powerpc64__)
+ #    include "adler32_ppc.c"
+ #  endif
+ #endif
+
+=== added file 'adler32_arm.c'
+--- adler32_arm.c	1970-01-01 00:00:00 +0000
++++ adler32_arm.c	2011-03-30 11:18:49 +0000
+@@ -0,0 +1,359 @@
++/*
++ * adler32.c -- compute the Adler-32 checksum of a data stream
++ *   arm implementation
++ * Copyright (C) 1995-2007 Mark Adler
++ * Copyright (C) 2009-2011 Jan Seiffert
++ * For conditions of distribution and use, see copyright notice in zlib.h
++ */
++
++/* @(#) $Id$ */
++
++#if defined(__ARM_NEON__)
++// TODO: need byte order define
++/*
++ * Big endian NEON qwords are kind of broken.
++ * They are big endian within the dwords, but WRONG
++ * (really??) way round between lo and hi.
++ * Creating some kind of PDP11 middle endian.
++ *
++ * This is madness and unsupportable. For this reason
++ * GCC wants to disable qword endian specific patterns.
++ * We would need a preprocessor define telling us which
++ * endian we have in order to disable this code.
++ */
++#  include <arm_neon.h>
++
++#  define SOVUCQ sizeof(uint8x16_t)
++#  define SOVUC sizeof(uint8x8_t)
++/* since we do not have the 64bit psadbw sum, we could probably do a little more */
++#  define VNMAX (6*NMAX)
++#  define HAVE_ADLER32_VEC
++#  define MIN_WORK 32
++
++/* ========================================================================= */
++local inline uint8x16_t neon_simple_alignq(uint8x16_t a, uint8x16_t b, unsigned amount)
++{
++    switch(amount % SOVUCQ)
++    {
++    case  0: return a;
++    case  1: return vextq_u8(a, b,  1);
++    case  2: return vextq_u8(a, b,  2);
++    case  3: return vextq_u8(a, b,  3);
++    case  4: return vextq_u8(a, b,  4);
++    case  5: return vextq_u8(a, b,  5);
++    case  6: return vextq_u8(a, b,  6);
++    case  7: return vextq_u8(a, b,  7);
++    case  8: return vextq_u8(a, b,  8);
++    case  9: return vextq_u8(a, b,  9);
++    case 10: return vextq_u8(a, b, 10);
++    case 11: return vextq_u8(a, b, 11);
++    case 12: return vextq_u8(a, b, 12);
++    case 13: return vextq_u8(a, b, 13);
++    case 14: return vextq_u8(a, b, 14);
++    case 15: return vextq_u8(a, b, 15);
++    }
++    return b;
++}
++
++/* ========================================================================= */
++local inline uint32x4_t vector_reduce(uint32x4_t x)
++{
++    uint32x4_t y;
++
++    y = vshlq_n_u32(x, 16);
++    x = vshrq_n_u32(x, 16);
++    y = vshrq_n_u32(y, 16);
++    y = vsubq_u32(y, x);
++    x = vaddq_u32(y, vshlq_n_u32(x, 4));
++    return x;
++}
++
++/* ========================================================================= */
++local noinline uLong adler32_vec(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    uint32x4_t v0_32 = (uint32x4_t){0,0,0,0};
++    uint8x16_t    v0 = (uint8x16_t)v0_32;
++    uint8x16_t vord, vord_a;
++    uint32x4_t vs1, vs2;
++    uint32x2_t v_tsum;
++    uint8x16_t in16;
++    uint32_t s1, s2;
++    unsigned k;
++
++    s1 = adler & 0xffff;
++    s2 = (adler >> 16) & 0xffff;
++
++// TODO: big endian mask is probably wrong
++    if (host_is_bigendian())
++        vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
++    else
++        vord = (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
++
++    if (likely(len >= 2*SOVUCQ)) {
++        unsigned f, n;
++
++        /*
++         * Add stuff to achieve alignment
++         */
++        /* align hard down */
++        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUCQ);
++        n = SOVUCQ - f;
++        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUCQ);
++
++        /* add n times s1 to s2 for start round */
++        s2 += s1 * n;
++
++        /* set sums 0 */
++        vs1 = v0_32;
++        vs2 = v0_32;
++        /*
++         * the accumulation of s1 for every round grows very fast
++         * (quadratic?), even if we accumulate in 4 dwords; more
++         * rounds mean nonlinear growth.
++         * We already split it out of s2, normally it would be in
++         * s2 times 16... and grow even faster.
++         * Thanks to this split and vector reduction, we can stay
++         * longer in the loops. But we have to prepare for the worst
++         * (all 0xff), so we only do 6 times the work.
++         * (we could probably stay a little longer since we have 4 sums,
++         * not 2 like on x86).
++         */
++        k = len < VNMAX ? (unsigned)len : VNMAX;
++        len -= k;
++        /* insert scalar start somewhere */
++        vs1 = vsetq_lane_u32(s1, vs1, 0);
++        vs2 = vsetq_lane_u32(s2, vs2, 0);
++
++        /* get input data */
++        in16 = *(const uint8x16_t *)buf;
++        /* mask out excess data */
++        if(host_is_bigendian()) {
++            in16 = neon_simple_alignq(v0, in16, n);
++            vord_a = neon_simple_alignq(v0, vord, n);
++        } else {
++            in16 = neon_simple_alignq(in16, v0, f);
++            vord_a = neon_simple_alignq(vord, v0, f);
++        }
++
++        /* pairwise add bytes and long, pairwise add word long acc */
++        vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
++        /* apply order, add words, pairwise add word long acc */
++        vs2 = vpadalq_u16(vs2,
++                vmlal_u8(
++                    vmull_u8(vget_low_u8(in16), vget_low_u8(vord_a)),
++                    vget_high_u8(in16), vget_high_u8(vord_a)
++                    )
++                );
++
++        buf += SOVUCQ;
++        k -= n;
++
++        if (likely(k >= SOVUCQ)) do {
++            uint32x4_t vs1_r = v0_32;
++            do {
++                /* add vs1 for this round */
++                vs1_r = vaddq_u32(vs1_r, vs1);
++
++                /* get input data */
++                in16 = *(const uint8x16_t *)buf;
++
++// TODO: make work in inner loop more tight
++                /*
++                 * decompose partial sums, so we do fewer instructions and
++                 * build loops around it to do acc and so on only from time
++                 * to time.
++                 * This is hard with NEON, because the instructions are nice:
++                 * we have the stuff in widening and with acc (practically
++                 * for free...)
++                 */
++                /* pairwise add bytes and long, pairwise add word long acc */
++                vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
++                /* apply order, add words, pairwise add word long acc */
++                vs2 = vpadalq_u16(vs2,
++                        vmlal_u8(
++                            vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
++                            vget_high_u8(in16), vget_high_u8(vord)
++                            )
++                        );
++
++                buf += SOVUCQ;
++                k -= SOVUCQ;
++            } while (k >= SOVUCQ);
++            /* reduce vs1 round sum before multiplying by 16 */
++            vs1_r = vector_reduce(vs1_r);
++            /* add vs1 for this round (16 times) */
++            /* they have shift right and accumulate, where is shift left and acc?? */
++            vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_r, 4));
++            /* reduce both vectors to something within 16 bit */
++            vs2 = vector_reduce(vs2);
++            vs1 = vector_reduce(vs1);
++            len += k;
++            k = len < VNMAX ? (unsigned) len : VNMAX;
++            len -= k;
++        } while (likely(k >= SOVUC));
++
++        if (likely(k)) {
++            /*
++             * handle trailer
++             */
++            f = SOVUCQ - k;
++            /* add k times vs1 for this trailer */
++            vs2 = vmlaq_u32(vs2, vs1, vdupq_n_u32(k));
++
++            /* get input data */
++            in16 = *(const uint8x16_t *)buf;
++            /* masks out bad data */
++            if(host_is_bigendian())
++                in16 = neon_simple_alignq(in16, v0, f);
++            else
++                in16 = neon_simple_alignq(v0, in16, k);
++
++            /* pairwise add bytes and long, pairwise add word long acc */
++            vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
++            /* apply order, add words, pairwise add word long acc */
++            vs2 = vpadalq_u16(vs2,
++                    vmlal_u8(
++                        vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
++                        vget_high_u8(in16), vget_high_u8(vord)
++                        )
++                    );
++
++            buf += k;
++            k -= k;
++        }
++
++        /* add horizontal */
++        v_tsum = vpadd_u32(vget_high_u32(vs1), vget_low_u32(vs1));
++        v_tsum = vpadd_u32(v_tsum, v_tsum);
++        s1 = vget_lane_u32(v_tsum, 0);
++        v_tsum = vpadd_u32(vget_high_u32(vs2), vget_low_u32(vs2));
++        v_tsum = vpadd_u32(v_tsum, v_tsum);
++        s2 = vget_lane_u32(v_tsum, 0);
++    }
++
++    if (unlikely(len)) do {
++        s1 += *buf++;
++        s2 += s1;
++    } while (--len);
++    reduce_x(s1);
++    reduce_x(s2);
++
++    return (s2 << 16) | s1;
++}
++
++/* inline asm, so only on GCC (or compatible) && ARM v6 or better */
++#elif defined(__GNUC__) && ( \
++        defined(__ARM_ARCH_6__)  || defined(__ARM_ARCH_6J__)  || \
++        defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
++        defined(__ARM_ARCH_7A__) \
++      )
++#  define SOU32 (sizeof(unsigned int))
++#  define HAVE_ADLER32_VEC
++#  define MIN_WORK 16
++
++/* ========================================================================= */
++local noinline uLong adler32_vec(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned int s1, s2;
++    unsigned int k;
++
++    s1 = adler & 0xffff;
++    s2 = (adler >> 16) & 0xffff;
++
++    k    = ALIGN_DIFF(buf, SOU32);
++    len -= k;
++    if (k) do {
++        s1 += *buf++;
++        s2 += s1;
++    } while (--k);
++
++    if (likely(len >= 4 * SOU32)) {
++        unsigned int vs1 = s1, vs2 = s2;
++        unsigned int order_lo, order_hi;
++
++// TODO: byte order?
++        if(host_is_bigendian()) {
++            order_lo = 0x00030001;
++            order_hi = 0x00040002;
++        } else {
++            order_lo = 0x00020004;
++            order_hi = 0x00010003;
++        }
++// TODO: we could go over NMAX, since we have split the vs2 sum
++        /* something around (NMAX+(NMAX/3)+302) */
++        k = len < NMAX ? len : NMAX;
++        len -= k;
++
++        do {
++            unsigned int vs1_r = 0;
++            do {
++                unsigned int t21, t22, in;
++
++                /* get input data */
++                in = *(const unsigned int *)buf;
++
++                /* add vs1 for this round */
++                vs1_r += vs1;
++
++                /* add horizontal and acc */
++                asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1));
++                /* widen bytes to words, apply order, add and acc */
++                asm ("uxtb16 %0, %1" : "=r" (t21) : "r" (in));
++                asm ("uxtb16 %0, %1, ror #8" : "=r" (t22) : "r" (in));
++// TODO: instruction result latency
++                /*
++                 * The same problem as the classic serial sum:
++                 * Chip makers sell us 1-cycle instructions, but that is not the
++                 * whole story. Nearly all 1-cycle chips are pipelined, so
++                 * you can get one result per cycle, but only if _they_ (plural)
++                 * are independent.
++                 * If you depend on the result of a preceding instruction,
++                 * in the worst case you hit the instruction latency, which is worst
++                 * case >= pipeline length. On the other hand there are result fast paths.
++                 * This could all be a wash with the classic sum (4 * 2 instructions,
++                 * + dependence), since smlad is:
++                 * - 2 cycle issue
++                 * - needs the acc in pipeline step E1, instead of E2
++                 * But the Cortex has a fastpath for acc.
++                 * I don't know.
++                 * We can not even unroll, we would need 4 order vars, return ENOREGISTER.
++                 */
++                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t21) , "r" (order_lo), "r" (vs2));
++                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t22) , "r" (order_hi), "r" (vs2));
++
++                buf += SOU32;
++                k -= SOU32;
++            } while (k >= SOU32);
++            /* reduce vs1 round sum before multiplying by 4 */
++            reduce(vs1_r);
++            /* add vs1 for this round (4 times) */
++            vs2 += vs1_r * 4;
++            /* reduce both sums to something within 16 bit */
++            reduce(vs2);
++            reduce(vs1);
++            len += k;
++            k = len < NMAX ? len : NMAX;
++            len -= k;
++        } while (likely(k >= 4 * SOU32));
++        len += k;
++        s1 = vs1;
++        s2 = vs2;
++    }
++
++    if (unlikely(len)) do {
++        s1 += *buf++;
++        s2 += s1;
++    } while (--len);
++    /* at this point s1 & s2 should not be too big */
++    reduce_x(s1);
++    reduce_x(s2);
++
++    return (s2 << 16) | s1;
++}
++#endif
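
Both ARM paths above cap their inner loops with a multiple of zlib's NMAX. NMAX
(5552) is the largest n for which worst-case input (all 0xff bytes on top of sums
just below BASE) keeps s2 within 32 bits before the next reduction; splitting the
round sum out of s2 is what buys the 6x headroom (VNMAX) mentioned in the comments.
A quick check of that bound (illustration only, not code from the patch):

    #include <stdint.h>
    #include <stdio.h>

    #define BASE 65521UL

    /* Worst-case s2 after n bytes of 0xff starting from s1, s2 < BASE:
     * 255*n*(n+1)/2 + (n+1)*(BASE-1), the bound zlib uses to define NMAX. */
    static uint64_t worst_s2(uint64_t n)
    {
        return 255 * n * (n + 1) / 2 + (n + 1) * (BASE - 1);
    }

    int main(void)
    {
        uint64_t n = 0;

        while (worst_s2(n + 1) <= 0xffffffffULL)
            n++;
        printf("NMAX = %llu\n", (unsigned long long)n);   /* prints 5552 */
        return 0;
    }
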
diff --git a/04-x86.patch b/04-x86.patch
new file mode 100644
index 0000000..f2d489e
--- /dev/null
+++ b/04-x86.patch
@@ -0,0 +1,1165 @@
+=== modified file 'Makefile.in'
+--- Makefile.in	2011-03-14 14:39:24 +0000
++++ Makefile.in	2011-03-14 16:46:30 +0000
+@@ -236,7 +236,7 @@
+ 
+ # DO NOT DELETE THIS LINE -- make depend depends on it.
+ 
+-adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
++adler32.o: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h
+ zutil.o: zutil.h zlib.h zconf.h
+ gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
+ compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
+@@ -247,7 +247,7 @@
+ inftrees.o: zutil.h zlib.h zconf.h inftrees.h
+ trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
+ 
+-adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
++adler32.lo: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h
+ zutil.lo: zutil.h zlib.h zconf.h
+ gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
+ compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
+
+=== modified file 'adler32.c'
+--- adler32.c	2011-03-30 13:38:46 +0000
++++ adler32.c	2011-03-30 13:38:46 +0000
+@@ -144,6 +144,8 @@
+ #    include "adler32_arm.c"
+ #  elif defined(__powerpc__) || defined(__powerpc64__)
+ #    include "adler32_ppc.c"
++#  elif defined(__i386__) || defined(__x86_64__)
++#    include "adler32_x86.c"
+ #  endif
+ #endif
+ 
+
+=== added file 'adler32_x86.c'
+--- adler32_x86.c	1970-01-01 00:00:00 +0000
++++ adler32_x86.c	2011-03-15 23:15:36 +0000
+@@ -0,0 +1,1125 @@
++/*
++ * adler32.c -- compute the Adler-32 checksum of a data stream
++ *   x86 implementation
++ * Copyright (C) 1995-2007 Mark Adler
++ * Copyright (C) 2009-2011 Jan Seiffert
++ * For conditions of distribution and use, see copyright notice in zlib.h
++ */
++
++/* @(#) $Id$ */
++
++#if GCC_VERSION_GE(207)
++#  define GCC_ATTR_CONSTRUCTOR __attribute__((__constructor__))
++#else
++#  define VEC_NO_GO
++#endif
++
++#if GCC_VERSION_GE(203)
++#  define GCC_ATTR_ALIGNED(x) __attribute__((__aligned__(x)))
++#else
++#  define VEC_NO_GO
++#endif
++
++/* inline asm, so only on GCC (or compatible) */
++#if defined(__GNUC__) && !defined(VEC_NO_GO)
++#  define HAVE_ADLER32_VEC
++#  define MIN_WORK 64
++
++#  ifdef __x86_64__
++#    define PICREG "%%rbx"
++#  else
++#    define PICREG "%%ebx"
++#  endif
++
++/* ========================================================================= */
++local const struct { short d[24]; } vord GCC_ATTR_ALIGNED(16) = {
++    {1,1,1,1,1,1,1,1,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}
++};
++
++/* ========================================================================= */
++local const struct { char d[16]; } vord_b GCC_ATTR_ALIGNED(16) = {
++    {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}
++};
++
++/* ========================================================================= */
++local noinline const Bytef *adler32_jumped(buf, s1, s2, k)
++    const Bytef *buf;
++    unsigned int *s1;
++    unsigned int *s2;
++    unsigned int k;
++{
++    unsigned int t;
++    unsigned n = k % 16;
++    buf += n;
++    k = (k / 16) + 1;
++
++    __asm__ __volatile__ (
++#  ifdef __x86_64__
++#    define CLOB "&"
++            "lea	1f(%%rip), %q4\n\t"
++            "lea	(%q4,%q5,8), %q4\n\t"
++            "jmp	*%q4\n\t"
++#  else
++#    ifndef __PIC__
++#      define CLOB
++            "lea	1f(,%5,8), %4\n\t"
++#    else
++#      define CLOB
++            "lea	1f-3f(,%5,8), %4\n\t"
++            "call	9f\n"
++            "3:\n\t"
++#    endif
++            "jmp	*%4\n\t"
++#    ifdef __PIC__
++            ".p2align 1\n"
++            "9:\n\t"
++            "addl	(%%esp), %4\n\t"
++            "ret\n\t"
++#    endif
++#  endif
++            ".p2align 1\n"
++            "2:\n\t"
++#  ifdef __i386
++            ".byte 0x3e\n\t"
++#  endif
++            "add	$0x10, %2\n\t"
++            ".p2align 1\n"
++            "1:\n\t"
++            /* 128 */
++            "movzbl	-16(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /* 120 */
++            "movzbl	-15(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /* 112 */
++            "movzbl	-14(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /* 104 */
++            "movzbl	-13(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  96 */
++            "movzbl	-12(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  88 */
++            "movzbl	-11(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  80 */
++            "movzbl	-10(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  72 */
++            "movzbl	 -9(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  64 */
++            "movzbl	 -8(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  56 */
++            "movzbl	 -7(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  48 */
++            "movzbl	 -6(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  40 */
++            "movzbl	 -5(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  32 */
++            "movzbl	 -4(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  24 */
++            "movzbl	 -3(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*  16 */
++            "movzbl	 -2(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*   8 */
++            "movzbl	 -1(%2), %4\n\t"	/* 4 */
++            "add	%4, %0\n\t"	/* 2 */
++            "add	%0, %1\n\t"	/* 2 */
++            /*   0 */
++            "dec	%3\n\t"
++            "jnz	2b"
++        : /* %0 */ "=R" (*s1),
++          /* %1 */ "=R" (*s2),
++          /* %2 */ "=abdSD" (buf),
++          /* %3 */ "=c" (k),
++          /* %4 */ "="CLOB"R" (t)
++        : /* %5 */ "r" (16 - n),
++          /*    */ "0" (*s1),
++          /*    */ "1" (*s2),
++          /*    */ "2" (buf),
++          /*    */ "3" (k)
++        : "cc", "memory"
++    );
++
++    return buf;
++}
++
++#if 0
++        /*
++         * Will XOP processors have SSSE3/AVX??
++         * And what is the unaligned load performance?
++         */
++        "prefetchnta	0x70(%0)\n\t"
++        "lddqu	(%0), %%xmm0\n\t"
++        "vpaddd	%%xmm3, %%xmm5, %%xmm5\n\t"
++        "sub	$16, %3\n\t"
++        "add	$16, %0\n\t"
++        "cmp	$15, %3\n\t"
++        "vphaddubd	%%xmm0, %%xmm1\n\t" /* A */
++        "vpmaddubsw %%xmm4, %%xmm0, %%xmm0\n\t"/* AVX! */ /* 1 */
++        "vphadduwd	%%xmm0, %%xmm0\n\t" /* 2 */
++        "vpaddd	%%xmm1, %%xmm3, %%xmm3\n\t" /* B: A+B => hadd+acc or vpmadcubd w. mul = 1 */
++        "vpaddd	%%xmm0, %%xmm2, %%xmm2\n\t" /* 3: 1+2+3 => vpmadcubd w. mul = 16,15,14... */
++        "jg	1b\n\t"
++        xop_reduce
++        xop_reduce
++        xop_reduce
++        setup
++        "jg	1b\n\t"
++        "vphaddudq	%%xmm2, %%xmm0\n\t"
++        "vphaddudq	%%xmm3, %%xmm1\n\t"
++        "pshufd	$0xE6, %%xmm0, %%xmm2\n\t"
++        "pshufd	$0xE6, %%xmm1, %%xmm3\n\t"
++        "paddd	%%xmm0, %%xmm2\n\t"
++        "paddd	%%xmm1, %%xmm3\n\t"
++        "movd	%%xmm2, %2\n\t"
++        "movd	%%xmm3, %1\n\t"
++#endif
++
++/* ========================================================================= */
++local uLong adler32_SSSE3(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned int s1 = adler & 0xffff;
++    unsigned int s2 = (adler >> 16) & 0xffff;
++    unsigned int k;
++
++    k    = ALIGN_DIFF(buf, 16);
++    len -= k;
++    if (k)
++        buf = adler32_jumped(buf, &s1, &s2, k);
++
++    __asm__ __volatile__ (
++            "mov	%6, %3\n\t"		/* get max. byte count VNMAX till v1_round_sum overflows */
++            "cmp	%3, %4\n\t"
++            "cmovb	%4, %3\n\t"		/* k = len >= VNMAX ? k : len */
++            "sub	%3, %4\n\t"		/* len -= k */
++            "cmp	$16, %3\n\t"
++            "jb	88f\n\t"			/* if(k < 16) goto OUT */
++#ifdef __ELF__
++            ".subsection 2\n\t"
++#else
++            "jmp	77f\n\t"
++#endif
++            ".p2align 2\n"
++            /*
++             * reduction function to bring a vector sum within the range of BASE
++             * This does no full reduction! When the sum is large, a number > BASE
++             * is the result. To do a full reduction call multiple times.
++             */
++            "sse2_reduce:\n\t"
++            "movdqa	%%xmm0, %%xmm1\n\t"	/* y = x */
++            "pslld	$16, %%xmm1\n\t"	/* y <<= 16 */
++            "psrld	$16, %%xmm0\n\t"	/* x >>= 16 */
++            "psrld	$16, %%xmm1\n\t"	/* y >>= 16 */
++            "psubd	%%xmm0, %%xmm1\n\t"	/* y -= x */
++            "pslld	$4, %%xmm0\n\t"		/* x <<= 4 */
++            "paddd	%%xmm1, %%xmm0\n\t"	/* x += y */
++            "ret\n\t"
++#ifdef __ELF__
++            ".previous\n\t"
++#else
++            "77:\n\t"
++#endif
++            "movdqa	%5, %%xmm5\n\t"		/* get vord_b */
++            "prefetchnta	0x70(%0)\n\t"
++            "movd %2, %%xmm2\n\t"		/* init vector sum vs2 with s2 */
++            "movd %1, %%xmm3\n\t"		/* init vector sum vs1 with s1 */
++            "pxor	%%xmm4, %%xmm4\n"	/* zero */
++            "3:\n\t"
++            "pxor	%%xmm7, %%xmm7\n\t"	/* zero vs1_round_sum */
++            ".p2align 3,,3\n\t"
++            ".p2align 2\n"
++            "2:\n\t"
++            "mov	$128, %1\n\t"		/* inner_k = 128 bytes till vs2_i overflows */
++            "cmp	%1, %3\n\t"
++            "cmovb	%3, %1\n\t"		/* inner_k = k >= inner_k ? inner_k : k */
++            "and	$-16, %1\n\t"		/* inner_k = ROUND_TO(inner_k, 16) */
++            "sub	%1, %3\n\t"		/* k -= inner_k */
++            "shr	$4, %1\n\t"		/* inner_k /= 16 */
++            "pxor	%%xmm6, %%xmm6\n\t"	/* zero vs2_i */
++            ".p2align 4,,7\n"
++            ".p2align 3\n"
++            "1:\n\t"
++            "movdqa	(%0), %%xmm0\n\t"	/* fetch input data */
++            "prefetchnta	0x70(%0)\n\t"
++            "paddd	%%xmm3, %%xmm7\n\t"	/* vs1_round_sum += vs1 */
++            "add	$16, %0\n\t"		/* advance input data pointer */
++            "dec	%1\n\t"			/* decrement inner_k */
++            "movdqa	%%xmm0, %%xmm1\n\t"	/* make a copy of the input data */
++#  if (HAVE_BINUTILS-0) >= 217
++            "pmaddubsw %%xmm5, %%xmm0\n\t"	/* multiply all input bytes by vord_b bytes, add adjacent results to words */
++#  else
++            ".byte 0x66, 0x0f, 0x38, 0x04, 0xc5\n\t" /* pmaddubsw %%xmm5, %%xmm0 */
++#  endif
++            "psadbw	%%xmm4, %%xmm1\n\t"	/* subtract zero from every byte, add 8 bytes to a sum */
++            "paddw	%%xmm0, %%xmm6\n\t"	/* vs2_i += in * vorder_b */
++            "paddd	%%xmm1, %%xmm3\n\t"	/* vs1 += psadbw */
++            "jnz	1b\n\t"			/* repeat if inner_k != 0 */
++            "movdqa	%%xmm6, %%xmm0\n\t"	/* copy vs2_i */
++            "punpckhwd	%%xmm4, %%xmm0\n\t"	/* zero extent vs2_i upper words to dwords */
++            "punpcklwd	%%xmm4, %%xmm6\n\t"	/* zero extent vs2_i lower words to dwords */
++            "paddd	%%xmm0, %%xmm2\n\t"	/* vs2 += vs2_i.upper */
++            "paddd	%%xmm6, %%xmm2\n\t"	/* vs2 += vs2_i.lower */
++            "cmp	$15, %3\n\t"
++            "jg	2b\n\t"				/* if(k > 15) repeat */
++            "movdqa	%%xmm7, %%xmm0\n\t"	/* move vs1_round_sum */
++            "call	sse2_reduce\n\t"	/* reduce vs1_round_sum */
++            "pslld	$4, %%xmm0\n\t"		/* vs1_round_sum *= 16 */
++            "paddd	%%xmm2, %%xmm0\n\t"	/* vs2 += vs1_round_sum */
++            "call	sse2_reduce\n\t"	/* reduce again */
++            "movdqa	%%xmm0, %%xmm2\n\t"	/* move vs2 back in place */
++            "movdqa	%%xmm3, %%xmm0\n\t"	/* move vs1 */
++            "call	sse2_reduce\n\t"	/* reduce */
++            "movdqa	%%xmm0, %%xmm3\n\t"	/* move vs1 back in place */
++            "add	%3, %4\n\t"		/* len += k */
++            "mov	%6, %3\n\t"		/* get max. byte count VNMAX till v1_round_sum overflows */
++            "cmp	%3, %4\n\t"
++            "cmovb	%4, %3\n\t"		/* k = len >= VNMAX ? k : len */
++            "sub	%3, %4\n\t"		/* len -= k */
++            "cmp	$15, %3\n\t"
++            "jg	3b\n\t"				/* if(k > 15) repeat */
++            "pshufd	$0xEE, %%xmm3, %%xmm1\n\t"	/* collect vs1 & vs2 in lowest vector member */
++            "pshufd	$0xEE, %%xmm2, %%xmm0\n\t"
++            "paddd	%%xmm3, %%xmm1\n\t"
++            "paddd	%%xmm2, %%xmm0\n\t"
++            "pshufd	$0xE5, %%xmm0, %%xmm2\n\t"
++            "paddd	%%xmm0, %%xmm2\n\t"
++            "movd	%%xmm1, %1\n\t"		/* mov vs1 to s1 */
++            "movd	%%xmm2, %2\n"		/* mov vs2 to s2 */
++            "88:"
++        : /* %0 */ "=r" (buf),
++          /* %1 */ "=r" (s1),
++          /* %2 */ "=r" (s2),
++          /* %3 */ "=r" (k),
++          /* %4 */ "=r" (len)
++        : /* %5 */ "m" (vord_b),
++          /*
++           * somewhere between 5 & 6, the psadbw 64 bit sums ruin the party;
++           * spreading the sums with palignr only brings it to 7 (?),
++           * while introducing an op into the main loop (2800 ms -> 3200 ms)
++           */
++          /* %6 */ "i" (5*NMAX),
++          /*    */ "0" (buf),
++          /*    */ "1" (s1),
++          /*    */ "2" (s2),
++          /*    */ "4" (len)
++        : "cc", "memory"
++#  ifdef __SSE__
++          , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
++#  endif
++    );
++
++    if (unlikely(k))
++        buf = adler32_jumped(buf, &s1, &s2, k);
++    reduce(s1);
++    reduce(s2);
++    return (s2 << 16) | s1;
++}
++
++/* ========================================================================= */
++local uLong adler32_SSE2(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned int s1 = adler & 0xffff;
++    unsigned int s2 = (adler >> 16) & 0xffff;
++    unsigned int k;
++
++    k    = ALIGN_DIFF(buf, 16);
++    len -= k;
++    if (k)
++        buf = adler32_jumped(buf, &s1, &s2, k);
++
++    __asm__ __volatile__ (
++            "mov	%6, %3\n\t"
++            "cmp	%3, %4\n\t"
++            "cmovb	%4, %3\n\t"
++            "sub	%3, %4\n\t"
++            "cmp	$16, %3\n\t"
++            "jb	88f\n\t"
++            "prefetchnta	0x70(%0)\n\t"
++            "movd	%1, %%xmm4\n\t"
++            "movd	%2, %%xmm3\n\t"
++            "pxor	%%xmm2, %%xmm2\n\t"
++            "pxor	%%xmm5, %%xmm5\n\t"
++            ".p2align 2\n"
++            "3:\n\t"
++            "pxor	%%xmm6, %%xmm6\n\t"
++            "pxor	%%xmm7, %%xmm7\n\t"
++            "mov	$2048, %1\n\t"		/* get byte count till vs2_{l|h}_word overflows */
++            "cmp	%1, %3\n\t"
++            "cmovb	%3, %1\n"
++            "and	$-16, %1\n\t"
++            "sub	%1, %3\n\t"
++            "shr	$4, %1\n\t"
++            ".p2align 4,,7\n"
++            ".p2align 3\n"
++            "1:\n\t"
++            "prefetchnta	0x70(%0)\n\t"
++            "movdqa	(%0), %%xmm0\n\t"	/* fetch input data */
++            "paddd	%%xmm4, %%xmm5\n\t"	/* vs1_round_sum += vs1 */
++            "add	$16, %0\n\t"
++            "dec	%1\n\t"
++            "movdqa	%%xmm0, %%xmm1\n\t"	/* copy input data */
++            "psadbw	%%xmm2, %%xmm0\n\t"	/* add all bytes horiz. */
++            "paddd	%%xmm0, %%xmm4\n\t"	/* add that to vs1 */
++            "movdqa	%%xmm1, %%xmm0\n\t"	/* copy input data */
++            "punpckhbw	%%xmm2, %%xmm1\n\t"	/* zero extent input upper bytes to words */
++            "punpcklbw	%%xmm2, %%xmm0\n\t"	/* zero extent input lower bytes to words */
++            "paddw	%%xmm1, %%xmm7\n\t"	/* vs2_h_words += in_high_words */
++            "paddw	%%xmm0, %%xmm6\n\t"	/* vs2_l_words += in_low_words */
++            "jnz	1b\n\t"
++            "cmp	$15, %3\n\t"
++            "pmaddwd	32+%5, %%xmm7\n\t"	/* multiply vs2_h_words with order, add adjacent results */
++            "pmaddwd	16+%5, %%xmm6\n\t"	/* multiply vs2_l_words with order, add adjacent results */
++            "paddd	%%xmm7, %%xmm3\n\t"	/* add to vs2 */
++            "paddd	%%xmm6, %%xmm3\n\t"	/* add to vs2 */
++            "jg	3b\n\t"
++            "movdqa	%%xmm5, %%xmm0\n\t"
++            "pxor	%%xmm5, %%xmm5\n\t"
++            "call	sse2_reduce\n\t"
++            "pslld	$4, %%xmm0\n\t"
++            "paddd	%%xmm3, %%xmm0\n\t"
++            "call	sse2_reduce\n\t"
++            "movdqa	%%xmm0, %%xmm3\n\t"
++            "movdqa	%%xmm4, %%xmm0\n\t"
++            "call	sse2_reduce\n\t"
++            "movdqa	%%xmm0, %%xmm4\n\t"
++            "add	%3, %4\n\t"
++            "mov	%6, %3\n\t"
++            "cmp	%3, %4\n\t"
++            "cmovb	%4, %3\n"
++            "sub	%3, %4\n\t"
++            "cmp	$15, %3\n\t"
++            "jg	3b\n\t"
++            "pshufd	$0xEE, %%xmm4, %%xmm1\n\t"
++            "pshufd	$0xEE, %%xmm3, %%xmm0\n\t"
++            "paddd	%%xmm4, %%xmm1\n\t"
++            "paddd	%%xmm3, %%xmm0\n\t"
++            "pshufd	$0xE5, %%xmm0, %%xmm3\n\t"
++            "paddd	%%xmm0, %%xmm3\n\t"
++            "movd	%%xmm1, %1\n\t"
++            "movd	%%xmm3, %2\n"
++            "88:\n\t"
++        : /* %0 */ "=r" (buf),
++          /* %1 */ "=r" (s1),
++          /* %2 */ "=r" (s2),
++          /* %3 */ "=r" (k),
++          /* %4 */ "=r" (len)
++        : /* %5 */ "m" (vord),
++          /* %6 */ "i" (5*NMAX),
++          /*    */ "0" (buf),
++          /*    */ "1" (s1),
++          /*    */ "2" (s2),
++          /*    */ "3" (k),
++          /*    */ "4" (len)
++        : "cc", "memory"
++#  ifdef __SSE__
++          , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
++#  endif
++    );
++
++    if (unlikely(k))
++        buf = adler32_jumped(buf, &s1, &s2, k);
++    reduce(s1);
++    reduce(s2);
++    return (s2 << 16) | s1;
++}
++
++#  if 0
++/* ========================================================================= */
++/*
++ * The SSE2 version above is faster on my CPUs (Athlon64, Core2,
++ * P4 Xeon, K10 Sempron), but has instruction stalls that only an
++ * Out-Of-Order-Execution CPU can hide.
++ * So this version _may_ be better on an in-order CPU like the Atom.
++ */
++local noinline uLong adler32_SSE2_no_oooe(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned int s1 = adler & 0xffff;
++    unsigned int s2 = (adler >> 16) & 0xffff;
++    unsigned int k;
++
++    k    = ALIGN_DIFF(buf, 16);
++    len -= k;
++    if (k)
++        buf = adler32_jumped(buf, &s1, &s2, k);
++
++    __asm__ __volatile__ (
++            "mov	%6, %3\n\t"
++            "cmp	%3, %4\n\t"
++            "cmovb	%4, %3\n\t"
++            "sub	%3, %4\n\t"
++            "cmp	$16, %3\n\t"
++            "jb	88f\n\t"
++            "movdqa	16+%5, %%xmm6\n\t"
++            "movdqa	32+%5, %%xmm5\n\t"
++            "prefetchnta	16(%0)\n\t"
++            "pxor	%%xmm7, %%xmm7\n\t"
++            "movd	%1, %%xmm4\n\t"
++            "movd	%2, %%xmm3\n\t"
++            ".p2align 3,,3\n\t"
++            ".p2align 2\n"
++            "1:\n\t"
++            "prefetchnta	32(%0)\n\t"
++            "movdqa	(%0), %%xmm1\n\t"
++            "sub	$16, %3\n\t"
++            "movdqa	%%xmm4, %%xmm2\n\t"
++            "add	$16, %0\n\t"
++            "movdqa	%%xmm1, %%xmm0\n\t"
++            "cmp	$15, %3\n\t"
++            "pslld	$4, %%xmm2\n\t"
++            "paddd	%%xmm3, %%xmm2\n\t"
++            "psadbw	%%xmm7, %%xmm0\n\t"
++            "paddd	%%xmm0, %%xmm4\n\t"
++            "movdqa	%%xmm1, %%xmm0\n\t"
++            "punpckhbw	%%xmm7, %%xmm1\n\t"
++            "punpcklbw	%%xmm7, %%xmm0\n\t"
++            "movdqa	%%xmm1, %%xmm3\n\t"
++            "pmaddwd	%%xmm6, %%xmm0\n\t"
++            "paddd	%%xmm2, %%xmm0\n\t"
++            "pmaddwd	%%xmm5, %%xmm3\n\t"
++            "paddd	%%xmm0, %%xmm3\n\t"
++            "jg	1b\n\t"
++            "movdqa	%%xmm3, %%xmm0\n\t"
++            "call	sse2_reduce\n\t"
++            "call	sse2_reduce\n\t"
++            "movdqa	%%xmm0, %%xmm3\n\t"
++            "movdqa	%%xmm4, %%xmm0\n\t"
++            "call	sse2_reduce\n\t"
++            "movdqa	%%xmm0, %%xmm4\n\t"
++            "add	%3, %4\n\t"
++            "mov	%6, %3\n\t"
++            "cmp	%3, %4\n\t"
++            "cmovb	%4, %3\n\t"
++            "sub	%3, %4\n\t"
++            "cmp	$15, %3\n\t"
++            "jg	1b\n\t"
++            "pshufd	$0xEE, %%xmm3, %%xmm0\n\t"
++            "pshufd	$0xEE, %%xmm4, %%xmm1\n\t"
++            "paddd	%%xmm3, %%xmm0\n\t"
++            "pshufd	$0xE5, %%xmm0, %%xmm2\n\t"
++            "paddd	%%xmm4, %%xmm1\n\t"
++            "movd	%%xmm1, %1\n\t"
++            "paddd	%%xmm0, %%xmm2\n\t"
++            "movd	%%xmm2, %2\n"
++            "88:"
++        : /* %0 */ "=r" (buf),
++          /* %1 */ "=r" (s1),
++          /* %2 */ "=r" (s2),
++          /* %3 */ "=r" (k),
++          /* %4 */ "=r" (len)
++        : /* %5 */ "m" (vord),
++          /* %6 */ "i" (NMAX + NMAX/3),
++          /*    */ "0" (buf),
++          /*    */ "1" (s1),
++          /*    */ "2" (s2),
++          /*    */ "4" (len)
++        : "cc", "memory"
++#  ifdef __SSE__
++          , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
++#  endif
++    );
++
++    if (unlikely(k))
++        buf = adler32_jumped(buf, &s1, &s2, k);
++    reduce(s1);
++    reduce(s2);
++    return (s2 << 16) | s1;
++}
++#  endif
++
++#  ifndef __x86_64__
++/* ========================================================================= */
++/*
++ * SSE version to help VIA-C3_2, P2 & P3
++ */
++local uLong adler32_SSE(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned int s1 = adler & 0xffff;
++    unsigned int s2 = (adler >> 16) & 0xffff;
++    unsigned int k;
++
++    k    = ALIGN_DIFF(buf, 8);
++    len -= k;
++    if (k)
++        buf = adler32_jumped(buf, &s1, &s2, k);
++
++    __asm__ __volatile__ (
++            "mov	%6, %3\n\t"
++            "cmp	%3, %4\n\t"
++            "cmovb	%4, %3\n"
++            "sub	%3, %4\n\t"
++            "cmp	$8, %3\n\t"
++            "jb	88f\n\t"
++            "movd	%1, %%mm4\n\t"
++            "movd	%2, %%mm3\n\t"
++            "pxor	%%mm2, %%mm2\n\t"
++            "pxor	%%mm5, %%mm5\n\t"
++#    ifdef __ELF__
++            ".subsection 2\n\t"
++#    else
++            "jmp	77f\n\t"
++#    endif
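++            /* mmx_reduce: partial reduction mod BASE per dword lane; since 2^16 == 15 (mod 65521), x becomes (x & 0xffff) + 15 * (x >> 16) */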
++            ".p2align 2\n"
++            "mmx_reduce:\n\t"
++            "movq	%%mm0, %%mm1\n\t"
++            "pslld	$16, %%mm1\n\t"
++            "psrld	$16, %%mm0\n\t"
++            "psrld	$16, %%mm1\n\t"
++            "psubd	%%mm0, %%mm1\n\t"
++            "pslld	$4, %%mm0\n\t"
++            "paddd	%%mm1, %%mm0\n\t"
++            "ret\n\t"
++#    ifdef __ELF__
++            ".previous\n\t"
++#    else
++            "77:\n\t"
++#    endif
++            ".p2align	2\n"
++            "3:\n\t"
++            "pxor	%%mm6, %%mm6\n\t"
++            "pxor	%%mm7, %%mm7\n\t"
++            "mov	$1024, %1\n\t"
++            "cmp	%1, %3\n\t"
++            "cmovb	%3, %1\n"
++            "and	$-8, %1\n\t"
++            "sub	%1, %3\n\t"
++            "shr	$3, %1\n\t"
++            ".p2align 4,,7\n"
++            ".p2align 3\n"
++            "1:\n\t"
++            "movq	(%0), %%mm0\n\t"
++            "paddd	%%mm4, %%mm5\n\t"
++            "add	$8, %0\n\t"
++            "dec	%1\n\t"
++            "movq	%%mm0, %%mm1\n\t"
++            "psadbw	%%mm2, %%mm0\n\t"
++            "paddd	%%mm0, %%mm4\n\t"
++            "movq	%%mm1, %%mm0\n\t"
++            "punpckhbw	%%mm2, %%mm1\n\t"
++            "punpcklbw	%%mm2, %%mm0\n\t"
++            "paddw	%%mm1, %%mm7\n\t"
++            "paddw	%%mm0, %%mm6\n\t"
++            "jnz	1b\n\t"
++            "cmp	$7, %3\n\t"
++            "pmaddwd	40+%5, %%mm7\n\t"
++            "pmaddwd	32+%5, %%mm6\n\t"
++            "paddd	%%mm7, %%mm3\n\t"
++            "paddd	%%mm6, %%mm3\n\t"
++            "jg	3b\n\t"
++            "movq	%%mm5, %%mm0\n\t"
++            "pxor	%%mm5, %%mm5\n\t"
++            "call	mmx_reduce\n\t"
++            "pslld	$3, %%mm0\n\t"
++            "paddd	%%mm3, %%mm0\n\t"
++            "call	mmx_reduce\n\t"
++            "movq	%%mm0, %%mm3\n\t"
++            "movq	%%mm4, %%mm0\n\t"
++            "call	mmx_reduce\n\t"
++            "movq	%%mm0, %%mm4\n\t"
++            "add	%3, %4\n\t"
++            "mov	%6, %3\n\t"
++            "cmp	%3, %4\n\t"
++            "cmovb	%4, %3\n"
++            "sub	%3, %4\n\t"
++            "cmp	$7, %3\n\t"
++            "jg	3b\n\t"
++            "movd	%%mm4, %1\n\t"
++            "psrlq	$32, %%mm4\n\t"
++            "movd	%%mm3, %2\n\t"
++            "psrlq	$32, %%mm3\n\t"
++            "movd	%%mm4, %4\n\t"
++            "add	%4, %1\n\t"
++            "movd	%%mm3, %4\n\t"
++            "add	%4, %2\n"
++            "emms\n\t"
++            "88:\n\t"
++        : /* %0 */ "=r" (buf),
++          /* %1 */ "=r" (s1),
++          /* %2 */ "=r" (s2),
++          /* %3 */ "=r" (k),
++          /* %4 */ "=r" (len)
++        : /* %5 */ "m" (vord),
++          /* %6 */ "i" ((5*NMAX)/2),
++          /*    */ "0" (buf),
++          /*    */ "1" (s1),
++          /*    */ "2" (s2),
++          /*    */ "3" (k),
++          /*    */ "4" (len)
++        : "cc", "memory"
++#    ifdef __MMX__
++          , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
++#    endif
++    );
++
++    if (unlikely(k))
++        buf = adler32_jumped(buf, &s1, &s2, k);
++    reduce(s1);
++    reduce(s2);
++    return (s2 << 16) | s1;
++}
++
++/* ========================================================================= */
++/*
++ * Processors which only have MMX will probably not like this
++ * code: they are so old that they are not Out-Of-Order
++ * (except maybe AMD K6, Cyrix, WinChip/VIA).
++ * I did my best to put at least one instruction between a result and its use.
++ */
++local uLong adler32_MMX(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned int s1 = adler & 0xffff;
++    unsigned int s2 = (adler >> 16) & 0xffff;
++    unsigned int k;
++
++    k    = ALIGN_DIFF(buf, 8);
++    len -= k;
++    if (k)
++        buf = adler32_jumped(buf, &s1, &s2, k);
++
++    __asm__ __volatile__ (
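++            /* branches instead of cmov: MMX-only CPUs (K6, Cyrix, WinChip) do not necessarily implement cmov */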
++            "mov	%6, %3\n\t"
++            "cmp	%3, %4\n\t"
++            "jae	11f\n\t"
++            "mov	%4, %3\n"
++            "11:\n\t"
++            "sub	%3, %4\n\t"
++            "cmp	$8, %3\n\t"
++            "jb	88f\n\t"
++            "sub	$8, %%esp\n\t"
++            "movd	%1, %%mm4\n\t"
++            "movd	%2, %%mm2\n\t"
++            "movq	%5, %%mm3\n"
++            "33:\n\t"
++            "movq	%%mm2, %%mm0\n\t"
++            "pxor	%%mm2, %%mm2\n\t"
++            "pxor	%%mm5, %%mm5\n\t"
++            ".p2align 2\n"
++            "3:\n\t"
++            "movq	%%mm0, (%%esp)\n\t"
++            "pxor	%%mm6, %%mm6\n\t"
++            "pxor	%%mm7, %%mm7\n\t"
++            "mov	$1024, %1\n\t"
++            "cmp	%1, %3\n\t"
++            "jae	44f\n\t"
++            "mov	%3, %1\n"
++            "44:\n\t"
++            "and	$-8, %1\n\t"
++            "sub	%1, %3\n\t"
++            "shr	$3, %1\n\t"
++            ".p2align 4,,7\n"
++            ".p2align 3\n"
++            "1:\n\t"
++            "movq	(%0), %%mm0\n\t"
++            "paddd	%%mm4, %%mm5\n\t"
++            "add	$8, %0\n\t"
++            "dec	%1\n\t"
++            "movq	%%mm0, %%mm1\n\t"
++            "punpcklbw	%%mm2, %%mm0\n\t"
++            "punpckhbw	%%mm2, %%mm1\n\t"
++            "paddw	%%mm0, %%mm6\n\t"
++            "paddw	%%mm1, %%mm0\n\t"
++            "paddw	%%mm1, %%mm7\n\t"
++            "pmaddwd	%%mm3, %%mm0\n\t"
++            "paddd	%%mm0, %%mm4\n\t"
++            "jnz	1b\n\t"
++            "movq	(%%esp), %%mm0\n\t"
++            "cmp	$7, %3\n\t"
++            "pmaddwd	32+%5, %%mm6\n\t"
++            "pmaddwd	40+%5, %%mm7\n\t"
++            "paddd	%%mm6, %%mm0\n\t"
++            "paddd	%%mm7, %%mm0\n\t"
++            "jg	3b\n\t"
++            "movq	%%mm0, %%mm2\n\t"
++            "movq	%%mm5, %%mm0\n\t"
++            "call	mmx_reduce\n\t"
++            "pslld	$3, %%mm0\n\t"
++            "paddd	%%mm2, %%mm0\n\t"
++            "call	mmx_reduce\n\t"
++            "movq	%%mm0, %%mm2\n\t"
++            "movq	%%mm4, %%mm0\n\t"
++            "call	mmx_reduce\n\t"
++            "movq	%%mm0, %%mm4\n\t"
++            "add	%3, %4\n\t"
++            "mov	%6, %3\n\t"
++            "cmp	%3, %4\n\t"
++            "jae	22f\n\t"
++            "mov	%4, %3\n"
++            "22:\n\t"
++            "sub	%3, %4\n\t"
++            "cmp	$7, %3\n\t"
++            "jg	33b\n\t"
++            "add	$8, %%esp\n\t"
++            "movd	%%mm4, %1\n\t"
++            "psrlq	$32, %%mm4\n\t"
++            "movd	%%mm2, %2\n\t"
++            "psrlq	$32, %%mm2\n\t"
++            "movd	%%mm4, %4\n\t"
++            "add	%4, %1\n\t"
++            "movd	%%mm2, %4\n\t"
++            "add	%4, %2\n"
++            "emms\n\t"
++            "88:\n\t"
++        : /* %0 */ "=r" (buf),
++          /* %1 */ "=r" (s1),
++          /* %2 */ "=r" (s2),
++          /* %3 */ "=r" (k),
++          /* %4 */ "=r" (len)
++        : /* %5 */ "m" (vord),
++          /* %6 */ "i" (4*NMAX),
++          /*    */ "0" (buf),
++          /*    */ "1" (s1),
++          /*    */ "2" (s2),
++          /*    */ "3" (k),
++          /*    */ "4" (len)
++        : "cc", "memory"
++#    ifdef __MMX__
++          , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
++#    endif
++    );
++
++    if (unlikely(k))
++        buf = adler32_jumped(buf, &s1, &s2, k);
++    reduce(s1);
++    reduce(s2);
++    return (s2 << 16) | s1;
++}
++#  endif
++
++/* ========================================================================= */
++local uLong adler32_x86(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    unsigned int s1 = adler & 0xffff;
++    unsigned int s2 = (adler >> 16) & 0xffff;
++    unsigned int n;
++
++    while (likely(len)) {
++#  ifndef __x86_64__
++#    define LOOP_COUNT 4
++#  else
++#    define LOOP_COUNT 8
++#  endif
++        unsigned int k;
++        n = len < NMAX ? len : NMAX;
++        len -= n;
++        k = n / LOOP_COUNT;
++        n %= LOOP_COUNT;
++
++        if (likely(k)) do {
++            /*
++             * Modern compilers can do "wonders".
++             * If only they would not "trick themselves" sometimes.
++             * This was unrolled 16 times not because someone
++             * anticipated an autovectorizing compiler, but for the
++             * classical "avoid loop overhead".
++             *
++             * But things get tricky once the compiler starts to think:
++             * "hey, let's disambiguate one sum step from the other",
++             * the classical prevent-pipeline-stalls thing.
++             *
++             * Suddenly we have 16 temporary sums, which unfortunately
++             * blows the limited x86 register set...
++             *
++             * Loop unrolling is also a little bad for the I-cache.
++             *
++             * So tune this down for x86.
++             * Instead we try to keep it in the register set: 4 sums fit
++             * into the i386 register set with no frame pointer.
++             * x86_64 is a little more generous, but we still can not
++             * take 16, so take 8 sums.
++             */
++            s1 += buf[0]; s2 += s1;
++            s1 += buf[1]; s2 += s1;
++            s1 += buf[2]; s2 += s1;
++            s1 += buf[3]; s2 += s1;
++#  ifdef __x86_64__
++            s1 += buf[4]; s2 += s1;
++            s1 += buf[5]; s2 += s1;
++            s1 += buf[6]; s2 += s1;
++            s1 += buf[7]; s2 += s1;
++#  endif
++            buf  += LOOP_COUNT;
++        } while(likely(--k));
++        if (n) do {
++            s1 += *buf++;
++            s2 += s1;
++        } while (--n);
++        reduce_full(s1);
++        reduce_full(s2);
++    }
++    return (s2 << 16) | s1;
++}
++
++/* ========================================================================= */
++/*
++ * Knit it all together with a runtime switch
++ */
++
++/* Flags */
++#  define CFF_DEFAULT (1 << 0)
++/* Processor features */
++#  define CFEATURE_CMOV  (15 +   0)
++#  define CFEATURE_MMX   (23 +   0)
++#  define CFEATURE_SSE   (25 +   0)
++#  define CFEATURE_SSE2  (26 +   0)
++#  define CFEATURE_SSSE3 ( 9 +  32)
++
++#  define CFB(x) (1 << ((x)%32))
++
++#  define FEATURE_WORDS 2
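++
++/*
++ * Example: CFEATURE_SSE2 (bit 26 of CPUID leaf 1 EDX) lands in features[0]
++ * as CFB(CFEATURE_SSE2) == 1 << 26, while CFEATURE_SSSE3 (bit 9 of leaf 1
++ * ECX) lands in features[1] as 1 << 9.
++ */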
++
++/* data structure: an implementation candidate and the CPU feature bits it needs */
++struct test_cpu_feature
++{
++    void (*func)(void);
++    int flags;
++    unsigned int features[FEATURE_WORDS];
++};
++
++/* ========================================================================= */
++/*
++ * Decision table
++ */
++local const struct test_cpu_feature tfeat_adler32_vec[] =
++{
++    /* func                             flags   features       */
++    {(void (*)(void))adler32_SSSE3,         0, {CFB(CFEATURE_CMOV), CFB(CFEATURE_SSSE3)}},
++    {(void (*)(void))adler32_SSE2,          0, {CFB(CFEATURE_SSE2)|CFB(CFEATURE_CMOV), 0}},
++#  ifndef __x86_64__
++    {(void (*)(void))adler32_SSE,           0, {CFB(CFEATURE_SSE)|CFB(CFEATURE_CMOV), 0}},
++    {(void (*)(void))adler32_MMX,           0, {CFB(CFEATURE_MMX), 0}},
++#  endif
++    {(void (*)(void))adler32_x86, CFF_DEFAULT, { 0, 0}},
++};
++
++/* ========================================================================= */
++/* Prototypes */
++local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l);
++local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len);
++
++/* ========================================================================= */
++/*
++ * Runtime Function pointer
++ */
++local uLong (*adler32_vec_ptr)(uLong adler, const Bytef *buf, uInt len) = adler32_vec_runtimesw;
++
++/* ========================================================================= */
++/*
++ * Constructor to init the pointer early
++ */
++local GCC_ATTR_CONSTRUCTOR void adler32_vec_select(void)
++{
++    adler32_vec_ptr = test_cpu_feature(tfeat_adler32_vec, sizeof (tfeat_adler32_vec)/sizeof (tfeat_adler32_vec[0]));
++}
++
++/* ========================================================================= */
++/*
++ * Jump function
++ */
++local noinline uLong adler32_vec(adler, buf, len)
++    uLong adler;
++    const Bytef *buf;
++    uInt len;
++{
++    return adler32_vec_ptr(adler, buf, len);
++}
++
++/* ========================================================================= */
++/*
++ * The runtime switcher is a little racy; it should normally not run at all if the constructor works.
++ */
++local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len)
++{
++    adler32_vec_select();
++    return adler32_vec(adler, buf, len);
++}
++
++
++/* ========================================================================= */
++/* Internal data types */
++struct cpuid_regs
++{
++    unsigned long eax, ebx, ecx, edx;
++};
++
++local struct
++{
++    unsigned int max_basic;
++    unsigned int features[FEATURE_WORDS];
++    int init_done;
++} our_cpu;
++
++/* ========================================================================= */
++local inline unsigned long read_flags(void)
++{
++    unsigned long f;
++    __asm__ __volatile__ (
++            "pushf\n\t"
++            "pop %0\n\t"
++        : "=r" (f)
++    );
++    return f;
++}
++
++/* ========================================================================= */
++local inline void write_flags(unsigned long f)
++{
++    __asm__ __volatile__ (
++            "push %0\n\t"
++            "popf\n\t"
++        : : "ri" (f) : "cc"
++    );
++}
++
++/* ========================================================================= */
++local inline void cpuid(struct cpuid_regs *regs, unsigned long func)
++{
++    /* save ebx around cpuid call, PIC code needs it */
++    __asm__ __volatile__ (
++            "xchg	%1, " PICREG "\n\t"
++            "cpuid\n\t"
++            "xchg	%1, " PICREG "\n"
++        : /* %0 */ "=a" (regs->eax),
++          /* %1 */ "=r" (regs->ebx),
++          /* %2 */ "=c" (regs->ecx),
++          /* %3 */ "=d" (regs->edx)
++        : /* %4 */ "0" (func),
++          /* %5 */ "2" (regs->ecx)
++        : "cc"
++    );
++}
++
++/* ========================================================================= */
++local inline void cpuids(struct cpuid_regs *regs, unsigned long func)
++{
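++    /* some cpuid leaves take a sub-leaf index in ecx, so pass a defined value */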
++    regs->ecx = 0;
++    cpuid(regs, func);
++}
++
++/* ========================================================================= */
++local inline int toggle_eflags_test(const unsigned long mask)
++{
++    unsigned long f;
++    int result;
++
++    f = read_flags();
++    write_flags(f ^ mask);
++    result = !!((f ^ read_flags()) & mask);
++    /*
++     * restore the old flags; the i486 test toggles the alignment
++     * check (AC) bit, and leaving it set would confuse the x86 software world.
++     */
++    write_flags(f);
++    return result;
++}
++
++/* ========================================================================= */
++local inline int is_486(void)
++{
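++    /* EFLAGS bit 18 is the AC (alignment check) flag, which only a 486 or later can toggle */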
++    return toggle_eflags_test(1 << 18);
++}
++
++/* ========================================================================= */
++local inline int has_cpuid(void)
++{
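++    /* EFLAGS bit 21 is the ID flag; being able to toggle it means the CPUID instruction is available */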
++    return toggle_eflags_test(1 << 21);
++}
++
++/* ========================================================================= */
++local void identify_cpu(void)
++{
++    struct cpuid_regs a;
++
++    if (our_cpu.init_done)
++        return;
++
++    our_cpu.init_done = -1;
++    /* force a write out to memory */
++    __asm__ __volatile__ ("" : : "m" (our_cpu.init_done));
++
++    if (!is_486())
++        return;
++
++    if (!has_cpuid())
++        return;
++
++    /* get the maximum basic leaf number */
++    cpuids(&a, 0x00000000);
++    our_cpu.max_basic = (unsigned int)a.eax;
++    /* we could get the vendor string from ebx, edx, ecx */
++
++    /* get the first basic leaf, if it is available */
++    if (our_cpu.max_basic >= 0x00000001)
++        cpuids(&a, 0x00000001);
++    else
++        a.eax = a.ebx = a.ecx = a.edx = 0;
++
++    /* we could extract family, model, stepping from eax */
++
++    /* there is the first set of features */
++    our_cpu.features[0] = a.edx;
++    our_cpu.features[1] = a.ecx;
++
++    /* we could also test the extended feature leaves, but that is not needed for now */
++}
++
++/* ========================================================================= */
++local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l)
++{
++    unsigned int i, j, f;
++    identify_cpu();
++
++    for (i = 0; i < l; i++) {
++        if (t[i].flags & CFF_DEFAULT)
++            return t[i].func;
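++        /* (have & want) ^ want is nonzero exactly when a required feature bit is missing */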
++        for (f = 0, j = 0; j < FEATURE_WORDS; j++)
++            f |= (our_cpu.features[j] & t[i].features[j]) ^ t[i].features[j];
++        if (f)
++            continue;
++        return t[i].func;
++    }
++    return NULL; /* die! */
++}
++
++#endif
+
diff --git a/zlib.changes b/zlib.changes
index e0aa3dd..ba28ccc 100644
--- a/zlib.changes
+++ b/zlib.changes
@@ -1,3 +1,14 @@
+-------------------------------------------------------------------
+Wed Mar 30 19:47:30 UTC 2011 - crrodriguez@opensuse.org
+
+- Update SSE2/MMX patches to version 2. 
+
+-------------------------------------------------------------------
+Tue Mar 15 22:38:32 UTC 2011 - crrodriguez@opensuse.org
+
+- Add highly experimental patches to use SSE2/SSSE3/MMX in zlib;
+  this makes the library up to 6 times faster.
+
 -------------------------------------------------------------------
 Sun Jan  9 14:33:08 CET 2011 - meissner@suse.de
 
diff --git a/zlib.spec b/zlib.spec
index 614d198..3e2f0ac 100644
--- a/zlib.spec
+++ b/zlib.spec
@@ -40,6 +40,10 @@ Patch0:         zlib-1.2.2-format.patch
 Patch1:         zlib-lfs.patch
 # PATCH-FIX-JENGELH-PARALLEL-MAKE zlib-parallel.patch meissner@novell.com -- shared library links with libz.a
 Patch2:         zlib-parallel.patch
+Patch3:         01-prepare.patch
+Patch4:         02-ppc_altivec.patch
+Patch5:         03-arm.patch
+Patch6:         04-x86.patch
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 BuildRequires:  pkgconfig
 
@@ -84,7 +88,10 @@ libraries.
 %patch0
 %patch1
 %patch2 -p1
-
+%patch3
+%patch4
+%patch5
+%patch6
 %build
 # Marcus: breaks example64 in 32bit builds.
 %define do_profiling 0