zlib/01-prepare.patch (forked from pool/zlib)

=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 01:01:37 +0000
+++ Makefile.in 2011-03-14 02:19:21 +0000
@@ -236,7 +236,8 @@
# DO NOT DELETE THIS LINE -- make depend depends on it.
-adler32.o zutil.o: zutil.h zlib.h zconf.h
+adler32.o: adler32.c zutil.h zlib.h zconf.h
+zutil.o: zutil.h zlib.h zconf.h
gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
crc32.o: zutil.h zlib.h zconf.h crc32.h
@@ -246,7 +247,8 @@
inftrees.o: zutil.h zlib.h zconf.h inftrees.h
trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
-adler32.lo zutil.lo: zutil.h zlib.h zconf.h
+adler32.lo: adler32.c zutil.h zlib.h zconf.h
+zutil.lo: zutil.h zlib.h zconf.h
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
crc32.lo: zutil.h zlib.h zconf.h crc32.h
=== modified file 'adler32.c'
--- adler32.c 2011-03-14 01:01:37 +0000
+++ adler32.c 2011-03-30 13:38:42 +0000
@@ -9,6 +9,35 @@
#define local static
+#define GCC_VERSION_GE(x) ((__GNUC__-0) * 100 + __GNUC_MINOR__-0 >= x)
+
+#if GCC_VERSION_GE(301)
+/* sometimes leaks out of old kernel headers */
+# undef noinline
+# define noinline __attribute__((__noinline__))
+#else
+# ifndef noinline
+# define noinline
+# endif
+#endif
+
+#if GCC_VERSION_GE(301)
+# define GCC_ATTR_UNUSED_PARAM __attribute__((__unused__))
+#else
+# define GCC_ATTR_UNUSED_PARAM
+#endif
+
+#if GCC_VERSION_GE(296)
+# define likely(x) __builtin_expect(!!(x), 1)
+# define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+# define likely(x) (x)
+# define unlikely(x) (x)
+#endif
+
+#define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+
local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
#define BASE 65521UL /* largest prime smaller than 65536 */
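
For reference, a minimal standalone sketch (not part of the patch) of what
the two new helper macros compute; the example values are made up:

    #include <stdint.h>
    #include <stdio.h>

    #define ROUND_TO(x , n) ((x) & ~((n) - 1L))
    #define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))

    int main(void)
    {
        /* ROUND_TO rounds x down to a multiple of n (n a power of two) */
        printf("%lu\n", (unsigned long)ROUND_TO(21UL, 8));  /* prints 16 */
        /* ALIGN_DIFF: bytes from an address to the next n-byte
         * boundary, 0 if it is already aligned */
        printf("%ld\n", (long)ALIGN_DIFF(0x1003L, 8));      /* prints 5 */
        printf("%ld\n", (long)ALIGN_DIFF(0x1000L, 8));      /* prints 0 */
        return 0;
    }
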
@@ -21,9 +50,20 @@
#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
#define DO16(buf) DO8(buf,0); DO8(buf,8);
+#if defined(__alpha__)
+/* even if gcc can generate a mul by inverse, the code is really
+ * ugly (find the global const pool pointer, load the constant, a mul,
+ * lots of shifts/adds/subs), up to 14 instructions. The replacement
+ * code needs only about 5 instructions
+ */
+# define NO_DIVIDE
+#endif
+
/* use NO_DIVIDE if your processor does not do division in hardware */
#ifdef NO_DIVIDE
-# define MOD(a) \
+/* use NO_SHIFT if your processor implements shifts > 1 as a loop */
+# ifdef NO_SHIFT
+# define reduce_full(a) \
do { \
if (a >= (BASE << 16)) a -= (BASE << 16); \
if (a >= (BASE << 15)) a -= (BASE << 15); \
@@ -43,21 +83,237 @@
if (a >= (BASE << 1)) a -= (BASE << 1); \
if (a >= BASE) a -= BASE; \
} while (0)
-# define MOD4(a) \
+# define reduce_x(a) \
do { \
+ if (MIN_WORK >= (1 << 6) && a >= (BASE << 6)) a -= (BASE << 6); \
+ if (MIN_WORK >= (1 << 5) && a >= (BASE << 5)) a -= (BASE << 5); \
if (a >= (BASE << 4)) a -= (BASE << 4); \
if (a >= (BASE << 3)) a -= (BASE << 3); \
if (a >= (BASE << 2)) a -= (BASE << 2); \
if (a >= (BASE << 1)) a -= (BASE << 1); \
if (a >= BASE) a -= BASE; \
} while (0)
+# define reduce(a) reduce_full(a)
+# else
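+/* 2^16 mod BASE == 15, so the 16 bit halves of a can be folded:
+ * a mod BASE == ((a & 0xffff) + 15 * (a >> 16)) mod BASE,
+ * computed below as (a >> 16) * 16 + ((a & 0xffff) - (a >> 16)) */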
+# define reduce_full(a) \
+ do { \
+ while (a >> 16) { \
+ unsigned long b = a & 0x0000ffff; \
+ a >>= 16; \
+ b -= a; \
+ a <<= 4; \
+ a += b; \
+ } \
+ if (a >= BASE) a -= BASE; \
+ } while(0)
+# define reduce_x(a) \
+ do { \
+ unsigned long b = a & 0x0000ffff; \
+ a >>= 16; \
+ b -= a; \
+ a <<= 4; \
+ a += b; \
+ a = a >= BASE ? a - BASE : a; \
+ } while(0)
+# define reduce(a) \
+ do { \
+ unsigned long b = a & 0x0000ffff; \
+ a >>= 16; \
+ b -= a; \
+ a <<= 4; \
+ a += b; \
+ } while(0)
+# endif
#else
-# define MOD(a) a %= BASE
-# define MOD4(a) a %= BASE
-#endif
-
-/* ========================================================================= */
-uLong ZEXPORT adler32(adler, buf, len)
+# define reduce_full(a) a %= BASE
+# define reduce_x(a) a %= BASE
+# define reduce(a) a %= BASE
+#endif
+
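+/* run-time endianness check: look at the low byte of a native 1,
+ * it is 0 only on a big-endian host */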
+local int host_is_bigendian()
+{
+ local const union {
+ uInt d;
+ unsigned char endian[sizeof(uInt)];
+ } x = {1};
+ return x.endian[0] == 0;
+}
+
+#ifndef MIN_WORK
+# define MIN_WORK 16
+#endif
+
+/* ========================================================================= */
+local noinline uLong adler32_1(adler, buf, len)
+ uLong adler;
+ const Bytef *buf;
+ uInt len GCC_ATTR_UNUSED_PARAM;
+{
+ unsigned long sum2;
+
+ /* split Adler-32 into component sums */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ adler += buf[0];
+ if (adler >= BASE)
+ adler -= BASE;
+ sum2 += adler;
+ if (sum2 >= BASE)
+ sum2 -= BASE;
+ return adler | (sum2 << 16);
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_common(adler, buf, len)
+ uLong adler;
+ const Bytef *buf;
+ uInt len;
+{
+ unsigned long sum2;
+
+ /* split Adler-32 into component sums */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ while (len--) {
+ adler += *buf++;
+ sum2 += adler;
+ }
+ if (adler >= BASE)
+ adler -= BASE;
+ reduce_x(sum2); /* only added so many BASE's */
+ return adler | (sum2 << 16);
+}
+
+#ifndef HAVE_ADLER32_VEC
+# if (defined(__LP64__) || ((SIZE_MAX-0) >> 31) >= 2) && !defined(NO_ADLER32_VEC)
+
+/* On 64 bit archs we can do pseudo-SIMD with a nice win.
+ * This is esp. important for old Alphas, which do not have byte
+ * access.
+ * This needs some registers, but x86_64 is fine (>= 9 required for
+ * the main loop). If your 64 bit arch is more limited, throw it
+ * away...
+ */
+# ifndef UINT64_C
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+# define UINT64_C(c) (c ## ui64)
+# else
+# define UINT64_C(c) (c ## ULL)
+# endif
+# endif
+
+# undef VNMAX
+# define VNMAX (2*NMAX+((9*NMAX)/10))
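+/* NMAX bounds the scalar loop; the vector loop accumulates into split
+ * 16 bit lanes and 32 bit round sums, which (presumably the reason for
+ * this value) still fit in 32 bits for blocks of roughly 2.9 * NMAX */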
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+ uLong adler;
+ const Bytef *buf;
+ uInt len;
+{
+ unsigned int s1, s2;
+ unsigned int k;
+
+ /* split Adler-32 into component sums */
+ s1 = adler & 0xffff;
+ s2 = (adler >> 16) & 0xffff;
+
+ /* align input data */
+ k = ALIGN_DIFF(buf, sizeof(size_t));
+ len -= k;
+ if (k) do {
+ s1 += *buf++;
+ s2 += s1;
+ } while(--k);
+
+ k = len > VNMAX ? VNMAX : len;
+ len -= k;
+ if (likely(k >= 2 * sizeof(size_t))) do
+ {
+ unsigned int vs1, vs2;
+ unsigned int vs1s;
+
+ /* add s1 to s2 for rounds to come */
+ s2 += s1 * ROUND_TO(k, sizeof(size_t));
+ vs1s = vs1 = vs2 = 0;
+ do {
+ size_t vs1l = 0, vs1h = 0, vs1l_s = 0, vs1h_s = 0;
+ unsigned int a, b, c, d, e, f, g, h;
+ unsigned int j;
+
+ j = k > 23 * sizeof(size_t) ? 23 : k/sizeof(size_t);
+ k -= j * sizeof(size_t);
+ /* add s1 to s1 round sum for rounds to come */
+ vs1s += j * vs1;
+ do {
+ size_t in8 = *(const size_t *)buf;
+ buf += sizeof(size_t);
+ /* add this s1 to s1 round sum */
+ vs1l_s += vs1l;
+ vs1h_s += vs1h;
+ /* add up input data to s1 */
+ vs1l += in8 & UINT64_C(0x00ff00ff00ff00ff);
+ vs1h += (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8;
+ } while(--j);
+
+ /* split s1 */
+ if(host_is_bigendian()) {
+ a = (vs1h >> 48) & 0x0000ffff;
+ b = (vs1l >> 48) & 0x0000ffff;
+ c = (vs1h >> 32) & 0x0000ffff;
+ d = (vs1l >> 32) & 0x0000ffff;
+ e = (vs1h >> 16) & 0x0000ffff;
+ f = (vs1l >> 16) & 0x0000ffff;
+ g = (vs1h ) & 0x0000ffff;
+ h = (vs1l ) & 0x0000ffff;
+ } else {
+ a = (vs1l ) & 0x0000ffff;
+ b = (vs1h ) & 0x0000ffff;
+ c = (vs1l >> 16) & 0x0000ffff;
+ d = (vs1h >> 16) & 0x0000ffff;
+ e = (vs1l >> 32) & 0x0000ffff;
+ f = (vs1h >> 32) & 0x0000ffff;
+ g = (vs1l >> 48) & 0x0000ffff;
+ h = (vs1h >> 48) & 0x0000ffff;
+ }
+
+ /* add s1 & s2 horiz. */
+ vs2 += 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + 1*h;
+ vs1 += a + b + c + d + e + f + g + h;
+
+ /* split and add up s1 round sum */
+ vs1l_s = ((vs1l_s ) & UINT64_C(0x0000ffff0000ffff)) +
+ ((vs1l_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+ vs1h_s = ((vs1h_s ) & UINT64_C(0x0000ffff0000ffff)) +
+ ((vs1h_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+ vs1l_s += vs1h_s;
+ vs1s += ((vs1l_s ) & UINT64_C(0x00000000ffffffff)) +
+ ((vs1l_s >> 32) & UINT64_C(0x00000000ffffffff));
+ } while (k >= sizeof(size_t));
+ reduce(vs1s);
+ s2 += vs1s * 8 + vs2;
+ reduce(s2);
+ s1 += vs1;
+ reduce(s1);
+ len += k;
+ k = len > VNMAX ? VNMAX : len;
+ len -= k;
+ } while (k >= sizeof(size_t));
+
+ /* handle trailer */
+ if (k) do {
+ s1 += *buf++;
+ s2 += s1;
+ } while (--k);
+ reduce(s1);
+ reduce(s2);
+ /* reduce() may leave one extra BASE in the sums */
+ if (s1 >= BASE) s1 -= BASE;
+ if (s2 >= BASE) s2 -= BASE;
+
+ /* return recombined sums */
+ return (s2 << 16) | s1;
+}
+
+# else
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
uLong adler;
const Bytef *buf;
uInt len;
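
To see why the weighted horizontal sum above reconstructs s2, here is a
minimal little-endian-only sketch (not part of the patch; the names are
made up) of the masked-lane trick on a single 8-byte word, checked
against the plain byte-at-a-time loop:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        const unsigned char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        uint64_t in8, vl, vh;
        unsigned a, b, c, d, e, f, g, h, vs1, vs2, s1, s2;
        int i;

        memcpy(&in8, buf, 8);                    /* assumes a little-endian host */
        vl = in8 & 0x00ff00ff00ff00ffULL;        /* bytes 0,2,4,6 in 16 bit lanes */
        vh = (in8 & 0xff00ff00ff00ff00ULL) >> 8; /* bytes 1,3,5,7 */

        a = vl & 0xffff; c = (vl >> 16) & 0xffff;
        e = (vl >> 32) & 0xffff; g = (vl >> 48) & 0xffff;
        b = vh & 0xffff; d = (vh >> 16) & 0xffff;
        f = (vh >> 32) & 0xffff; h = (vh >> 48) & 0xffff;

        vs1 = a + b + c + d + e + f + g + h;               /* feeds s1 */
        vs2 = 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + h; /* feeds s2 */

        /* scalar reference: s2 is the sum of the running s1 values */
        for (s1 = s2 = 0, i = 0; i < 8; i++) { s1 += buf[i]; s2 += s1; }
        assert(vs1 == s1 && vs2 == s2);
        return 0;
    }
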
@@ -69,33 +325,6 @@
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
- /* in case user likes doing a byte at a time, keep it fast */
- if (len == 1) {
- adler += buf[0];
- if (adler >= BASE)
- adler -= BASE;
- sum2 += adler;
- if (sum2 >= BASE)
- sum2 -= BASE;
- return adler | (sum2 << 16);
- }
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (buf == Z_NULL)
- return 1L;
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (len < 16) {
- while (len--) {
- adler += *buf++;
- sum2 += adler;
- }
- if (adler >= BASE)
- adler -= BASE;
- MOD4(sum2); /* only added so many BASE's */
- return adler | (sum2 << 16);
- }
-
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
@@ -104,8 +333,8 @@
DO16(buf); /* 16 sums unrolled */
buf += 16;
} while (--n);
- MOD(adler);
- MOD(sum2);
+ reduce_full(adler);
+ reduce_full(sum2);
}
/* do remaining bytes (less than NMAX, still just one modulo) */
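
As a standalone sanity check (not part of the patch), the shift-based
reduction agrees with a plain % BASE; it hinges on 2^16 mod 65521 == 15:

    #include <assert.h>

    #define BASE 65521UL

    static unsigned long reduce_full(unsigned long a)
    {
        while (a >> 16) {  /* fold: a becomes (a & 0xffff) + 15 * (a >> 16) */
            unsigned long b = a & 0x0000ffff;
            a >>= 16;
            b -= a;
            a <<= 4;
            a += b;
        }
        return a >= BASE ? a - BASE : a;
    }

    int main(void)
    {
        unsigned long a;
        for (a = 0; a < (1UL << 24); a++)
            assert(reduce_full(a) == a % BASE);
        return 0;
    }
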
@@ -119,13 +348,36 @@
adler += *buf++;
sum2 += adler;
}
- MOD(adler);
- MOD(sum2);
+ reduce_full(adler);
+ reduce_full(sum2);
}
/* return recombined sums */
return adler | (sum2 << 16);
}
+# endif
+#endif
+
+/* ========================================================================= */
+uLong ZEXPORT adler32(adler, buf, len)
+ uLong adler;
+ const Bytef *buf;
+ uInt len;
+{
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (len == 1)
+ return adler32_1(adler, buf, len); /* should create a fast tailcall */
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (buf == Z_NULL)
+ return 1L;
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (len < MIN_WORK)
+ return adler32_common(adler, buf, len);
+
+ return adler32_vec(adler, buf, len);
+}
/* ========================================================================= */
local uLong adler32_combine_(adler1, adler2, len2)
@@ -141,7 +393,7 @@
rem = (unsigned)(len2 % BASE);
sum1 = adler1 & 0xffff;
sum2 = rem * sum1;
- MOD(sum2);
+ reduce_full(sum2);
sum1 += (adler2 & 0xffff) + BASE - 1;
sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
if (sum1 >= BASE) sum1 -= BASE;
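
Callers see no change from the split into adler32_1/adler32_common/
adler32_vec: the public entry point behaves as before. A minimal usage
sketch, assuming the patched zlib builds and links as usual:

    #include <stdio.h>
    #include "zlib.h"

    int main(void)
    {
        const unsigned char part1[] = "hello, ";
        const unsigned char part2[] = "world";
        uLong a = adler32(0L, Z_NULL, 0);         /* initial value, 1 */
        a = adler32(a, part1, sizeof(part1) - 1); /* len 7 -> adler32_common */
        a = adler32(a, part2, sizeof(part2) - 1);
        printf("adler32 = %08lx\n", a);
        return 0;
    }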