From 737caa3fca5d49156f95fdb5a0d458b36f1e3b503742e5c99a5032fb22f2512c Mon Sep 17 00:00:00 2001 From: Sascha Peilicke Date: Mon, 16 May 2011 07:15:23 +0000 Subject: [PATCH] Accepting request 70149 from Base:System - Update SSE patches, fixes bugs in PPC implementation - X86 improvements. (forwarded request 70148 from elvigia) OBS-URL: https://build.opensuse.org/request/show/70149 OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/zlib?expand=0&rev=29 --- 01-prepare.patch | 423 ---------- 02-ppc_altivec.patch | 307 -------- 03-arm.patch | 400 ---------- 04-x86.patch | 1165 ---------------------------- zlib-1.2.5.tar.bz2 | 3 - zlib-1.2.5_git201105121450.tar.bz2 | 3 + zlib.changes | 20 + zlib.spec | 11 +- 8 files changed, 25 insertions(+), 2307 deletions(-) delete mode 100644 01-prepare.patch delete mode 100644 02-ppc_altivec.patch delete mode 100644 03-arm.patch delete mode 100644 04-x86.patch delete mode 100644 zlib-1.2.5.tar.bz2 create mode 100644 zlib-1.2.5_git201105121450.tar.bz2 diff --git a/01-prepare.patch b/01-prepare.patch deleted file mode 100644 index f04d874..0000000 --- a/01-prepare.patch +++ /dev/null @@ -1,423 +0,0 @@ -=== modified file 'Makefile.in' ---- Makefile.in 2011-03-14 01:01:37 +0000 -+++ Makefile.in 2011-03-14 02:19:21 +0000 -@@ -236,7 +236,8 @@ - - # DO NOT DELETE THIS LINE -- make depend depends on it. - --adler32.o zutil.o: zutil.h zlib.h zconf.h -+adler32.o: adler32.c zutil.h zlib.h zconf.h -+zutil.o: zutil.h zlib.h zconf.h - gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h - compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h - crc32.o: zutil.h zlib.h zconf.h crc32.h -@@ -246,7 +247,8 @@ - inftrees.o: zutil.h zlib.h zconf.h inftrees.h - trees.o: deflate.h zutil.h zlib.h zconf.h trees.h - --adler32.lo zutil.lo: zutil.h zlib.h zconf.h -+adler32.lo: adler32.c zutil.h zlib.h zconf.h -+zutil.lo: zutil.h zlib.h zconf.h - gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h - compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h - crc32.lo: zutil.h zlib.h zconf.h crc32.h - -=== modified file 'adler32.c' ---- adler32.c 2011-03-14 01:01:37 +0000 -+++ adler32.c 2011-03-30 13:38:42 +0000 -@@ -9,6 +9,35 @@ - - #define local static - -+#define GCC_VERSION_GE(x) ((__GNUC__-0) * 100 + __GNUC_MINOR__-0 >= x) -+ -+#if GCC_VERSION_GE(301) -+/* sometimes leakes out of old kernel header */ -+# undef noinline -+# define noinline __attribute__((__noinline__)) -+#else -+# ifndef noinline -+# define noinline -+# endif -+#endif -+ -+#if GCC_VERSION_GE(301) -+# define GCC_ATTR_UNUSED_PARAM __attribute__((__unused__)) -+#else -+# define GCC_ATTR_UNUSED_PARAM -+#endif -+ -+#if GCC_VERSION_GE(296) -+# define likely(x) __builtin_expect(!!(x), 1) -+# define unlikely(x) __builtin_expect(!!(x), 0) -+#else -+# define likely(x) (x) -+# define unlikely(x) (x) -+#endif -+ -+#define ROUND_TO(x , n) ((x) & ~((n) - 1L)) -+#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x)) -+ - local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2); - - #define BASE 65521UL /* largest prime smaller than 65536 */ -@@ -21,9 +50,20 @@ - #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); - #define DO16(buf) DO8(buf,0); DO8(buf,8); - -+#if defined(__alpha__) -+/* even if gcc can generate a mul by inverse, the code is really -+ * ugly (find global const pool pointer, load constant, a mul, lots -+ * of shifts/add/sub), up to 14 instructions. 
The replacement code -+ * only needs >= 5 instructions -+ */ -+# define NO_DIVIDE -+#endif -+ - /* use NO_DIVIDE if your processor does not do division in hardware */ - #ifdef NO_DIVIDE --# define MOD(a) \ -+/* use NO_SHIFT if your processor does shift > 1 by loop */ -+# ifdef NO_SHIFT -+# define reduce_full(a) \ - do { \ - if (a >= (BASE << 16)) a -= (BASE << 16); \ - if (a >= (BASE << 15)) a -= (BASE << 15); \ -@@ -43,21 +83,237 @@ - if (a >= (BASE << 1)) a -= (BASE << 1); \ - if (a >= BASE) a -= BASE; \ - } while (0) --# define MOD4(a) \ -+# define reduce_x(a) \ - do { \ -+ if (MIN_WORK >= (1 << 6) && a >= (BASE << 6)) a -= (BASE << 6); \ -+ if (MIN_WORK >= (1 << 5) && a >= (BASE << 5)) a -= (BASE << 5); \ - if (a >= (BASE << 4)) a -= (BASE << 4); \ - if (a >= (BASE << 3)) a -= (BASE << 3); \ - if (a >= (BASE << 2)) a -= (BASE << 2); \ - if (a >= (BASE << 1)) a -= (BASE << 1); \ - if (a >= BASE) a -= BASE; \ - } while (0) -+# define reduce(a) reduce_full(a) -+# else -+# define reduce_full(a) \ -+ do { \ -+ unsigned long b = a & 0x0000ffff; \ -+ a >>= 16; \ -+ b -= a; \ -+ a <<= 4; \ -+ a += b; \ -+ } while(a >= BASE) -+# define reduce_x(a) \ -+ do { \ -+ unsigned long b = a & 0x0000ffff; \ -+ a >>= 16; \ -+ b -= a; \ -+ a <<= 4; \ -+ a += b; \ -+ a = a >= BASE ? a - BASE : a; \ -+ } while(0) -+# define reduce(a) \ -+ do { \ -+ unsigned long b = a & 0x0000ffff; \ -+ a >>= 16; \ -+ b -= a; \ -+ a <<= 4; \ -+ a += b; \ -+ } while(0) -+# endif - #else --# define MOD(a) a %= BASE --# define MOD4(a) a %= BASE --#endif -- --/* ========================================================================= */ --uLong ZEXPORT adler32(adler, buf, len) -+# define reduce_full(a) a %= BASE -+# define reduce_x(a) a %= BASE -+# define reduce(a) a %= BASE -+#endif -+ -+local int host_is_bigendian() -+{ -+ local const union { -+ uInt d; -+ unsigned char endian[sizeof(uInt)]; -+ } x = {1}; -+ return x.endian[0] == 0; -+} -+ -+#ifndef MIN_WORK -+# define MIN_WORK 16 -+#endif -+ -+/* ========================================================================= */ -+local noinline uLong adler32_1(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len GCC_ATTR_UNUSED_PARAM; -+{ -+ unsigned long sum2; -+ -+ /* split Adler-32 into component sums */ -+ sum2 = (adler >> 16) & 0xffff; -+ adler &= 0xffff; -+ -+ adler += buf[0]; -+ if (adler >= BASE) -+ adler -= BASE; -+ sum2 += adler; -+ if (sum2 >= BASE) -+ sum2 -= BASE; -+ return adler | (sum2 << 16); -+} -+ -+/* ========================================================================= */ -+local noinline uLong adler32_common(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned long sum2; -+ -+ /* split Adler-32 into component sums */ -+ sum2 = (adler >> 16) & 0xffff; -+ adler &= 0xffff; -+ -+ while (len--) { -+ adler += *buf++; -+ sum2 += adler; -+ } -+ if (adler >= BASE) -+ adler -= BASE; -+ reduce_x(sum2); /* only added so many BASE's */ -+ return adler | (sum2 << 16); -+} -+ -+#ifndef HAVE_ADLER32_VEC -+# if (defined(__LP64__) || ((SIZE_MAX-0) >> 31) >= 2) && !defined(NO_ADLER32_VEC) -+ -+/* On 64 Bit archs, we can do pseudo SIMD with a nice win. -+ * This is esp. important for old Alphas, they do not have byte -+ * access. -+ * This needs some register but x86_64 is fine (>= 9 for the mainloop -+ * req.). If your 64 Bit arch is more limited, throw it away... 
-+ */ -+# ifndef UINT64_C -+# if defined(_MSC_VER) || defined(__BORLANDC__) -+# define UINT64_C(c) (c ## ui64) -+# else -+# define UINT64_C(c) (c ## ULL) -+# endif -+# endif -+ -+# undef VNMAX -+# define VNMAX (2*NMAX+((9*NMAX)/10)) -+ -+/* ========================================================================= */ -+local noinline uLong adler32_vec(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned int s1, s2; -+ unsigned int k; -+ -+ /* split Adler-32 into component sums */ -+ s1 = adler & 0xffff; -+ s2 = (adler >> 16) & 0xffff; -+ -+ /* align input data */ -+ k = ALIGN_DIFF(buf, sizeof(size_t)); -+ len -= k; -+ if (k) do { -+ s1 += *buf++; -+ s2 += s1; -+ } while(--k); -+ -+ k = len > VNMAX ? VNMAX : len; -+ len -= k; -+ if (likely(k >= 2 * sizeof(size_t))) do -+ { -+ unsigned int vs1, vs2; -+ unsigned int vs1s; -+ -+ /* add s1 to s2 for rounds to come */ -+ s2 += s1 * ROUND_TO(k, sizeof(size_t)); -+ vs1s = vs1 = vs2 = 0; -+ do { -+ size_t vs1l = 0, vs1h = 0, vs1l_s = 0, vs1h_s = 0; -+ unsigned int a, b, c, d, e, f, g, h; -+ unsigned int j; -+ -+ j = k > 23 * sizeof(size_t) ? 23 : k/sizeof(size_t); -+ k -= j * sizeof(size_t); -+ /* add s1 to s1 round sum for rounds to come */ -+ vs1s += j * vs1; -+ do { -+ size_t in8 = *(const size_t *)buf; -+ buf += sizeof(size_t); -+ /* add this s1 to s1 round sum */ -+ vs1l_s += vs1l; -+ vs1h_s += vs1h; -+ /* add up input data to s1 */ -+ vs1l += in8 & UINT64_C(0x00ff00ff00ff00ff); -+ vs1h += (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8; -+ } while(--j); -+ -+ /* split s1 */ -+ if(host_is_bigendian()) { -+ a = (vs1h >> 48) & 0x0000ffff; -+ b = (vs1l >> 48) & 0x0000ffff; -+ c = (vs1h >> 32) & 0x0000ffff; -+ d = (vs1l >> 32) & 0x0000ffff; -+ e = (vs1h >> 16) & 0x0000ffff; -+ f = (vs1l >> 16) & 0x0000ffff; -+ g = (vs1h ) & 0x0000ffff; -+ h = (vs1l ) & 0x0000ffff; -+ } else { -+ a = (vs1l ) & 0x0000ffff; -+ b = (vs1h ) & 0x0000ffff; -+ c = (vs1l >> 16) & 0x0000ffff; -+ d = (vs1h >> 16) & 0x0000ffff; -+ e = (vs1l >> 32) & 0x0000ffff; -+ f = (vs1h >> 32) & 0x0000ffff; -+ g = (vs1l >> 48) & 0x0000ffff; -+ h = (vs1h >> 48) & 0x0000ffff; -+ } -+ -+ /* add s1 & s2 horiz. */ -+ vs2 += 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + 1*h; -+ vs1 += a + b + c + d + e + f + g + h; -+ -+ /* split and add up s1 round sum */ -+ vs1l_s = ((vs1l_s ) & UINT64_C(0x0000ffff0000ffff)) + -+ ((vs1l_s >> 16) & UINT64_C(0x0000ffff0000ffff)); -+ vs1h_s = ((vs1h_s ) & UINT64_C(0x0000ffff0000ffff)) + -+ ((vs1h_s >> 16) & UINT64_C(0x0000ffff0000ffff)); -+ vs1l_s += vs1h_s; -+ vs1s += ((vs1l_s ) & UINT64_C(0x00000000ffffffff)) + -+ ((vs1l_s >> 32) & UINT64_C(0x00000000ffffffff)); -+ } while (k >= sizeof(size_t)); -+ reduce(vs1s); -+ s2 += vs1s * 8 + vs2; -+ reduce(s2); -+ s1 += vs1; -+ reduce(s1); -+ len += k; -+ k = len > VNMAX ? 
VNMAX : len; -+ len -= k; -+ } while (k >= sizeof(size_t)); -+ -+ /* handle trailer */ -+ if (k) do { -+ s1 += *buf++; -+ s2 += s1; -+ } while (--k); -+ reduce(s1); -+ reduce(s2); -+ -+ /* return recombined sums */ -+ return (s2 << 16) | s1; -+} -+ -+# else -+ -+/* ========================================================================= */ -+local noinline uLong adler32_vec(adler, buf, len) - uLong adler; - const Bytef *buf; - uInt len; -@@ -69,33 +325,6 @@ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - -- /* in case user likes doing a byte at a time, keep it fast */ -- if (len == 1) { -- adler += buf[0]; -- if (adler >= BASE) -- adler -= BASE; -- sum2 += adler; -- if (sum2 >= BASE) -- sum2 -= BASE; -- return adler | (sum2 << 16); -- } -- -- /* initial Adler-32 value (deferred check for len == 1 speed) */ -- if (buf == Z_NULL) -- return 1L; -- -- /* in case short lengths are provided, keep it somewhat fast */ -- if (len < 16) { -- while (len--) { -- adler += *buf++; -- sum2 += adler; -- } -- if (adler >= BASE) -- adler -= BASE; -- MOD4(sum2); /* only added so many BASE's */ -- return adler | (sum2 << 16); -- } -- - /* do length NMAX blocks -- requires just one modulo operation */ - while (len >= NMAX) { - len -= NMAX; -@@ -104,8 +333,8 @@ - DO16(buf); /* 16 sums unrolled */ - buf += 16; - } while (--n); -- MOD(adler); -- MOD(sum2); -+ reduce_full(adler); -+ reduce_full(sum2); - } - - /* do remaining bytes (less than NMAX, still just one modulo) */ -@@ -119,13 +348,36 @@ - adler += *buf++; - sum2 += adler; - } -- MOD(adler); -- MOD(sum2); -+ reduce_full(adler); -+ reduce_full(sum2); - } - - /* return recombined sums */ - return adler | (sum2 << 16); - } -+# endif -+#endif -+ -+/* ========================================================================= */ -+uLong ZEXPORT adler32(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ /* in case user likes doing a byte at a time, keep it fast */ -+ if (len == 1) -+ return adler32_1(adler, buf, len); /* should create a fast tailcall */ -+ -+ /* initial Adler-32 value (deferred check for len == 1 speed) */ -+ if (buf == Z_NULL) -+ return 1L; -+ -+ /* in case short lengths are provided, keep it somewhat fast */ -+ if (len < MIN_WORK) -+ return adler32_common(adler, buf, len); -+ -+ return adler32_vec(adler, buf, len); -+} - - /* ========================================================================= */ - local uLong adler32_combine_(adler1, adler2, len2) -@@ -141,7 +393,7 @@ - rem = (unsigned)(len2 % BASE); - sum1 = adler1 & 0xffff; - sum2 = rem * sum1; -- MOD(sum2); -+ reduce_full(sum2); - sum1 += (adler2 & 0xffff) + BASE - 1; - sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; - if (sum1 >= BASE) sum1 -= BASE; diff --git a/02-ppc_altivec.patch b/02-ppc_altivec.patch deleted file mode 100644 index 5aabbc3..0000000 --- a/02-ppc_altivec.patch +++ /dev/null @@ -1,307 +0,0 @@ -=== modified file 'Makefile.in' ---- Makefile.in 2011-03-14 02:19:21 +0000 -+++ Makefile.in 2011-03-14 03:06:03 +0000 -@@ -236,7 +236,7 @@ - - # DO NOT DELETE THIS LINE -- make depend depends on it. 
- --adler32.o: adler32.c zutil.h zlib.h zconf.h -+adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h - zutil.o: zutil.h zlib.h zconf.h - gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h - compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h -@@ -247,7 +247,7 @@ - inftrees.o: zutil.h zlib.h zconf.h inftrees.h - trees.o: deflate.h zutil.h zlib.h zconf.h trees.h - --adler32.lo: adler32.c zutil.h zlib.h zconf.h -+adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h - zutil.lo: zutil.h zlib.h zconf.h - gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h - compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h - -=== modified file 'adler32.c' ---- adler32.c 2011-03-30 13:38:42 +0000 -+++ adler32.c 2011-03-30 13:38:46 +0000 -@@ -36,7 +36,10 @@ - #endif - - #define ROUND_TO(x , n) ((x) & ~((n) - 1L)) -+#define DIV_ROUNDUP(a, b) (((a) + (b) - 1) / (b)) - #define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x)) -+#define ALIGN_DOWN(x, n) (((intptr_t)(x)) & ~((intptr_t)(n) - 1L)) -+#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L)) - - local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2); - -@@ -136,6 +139,12 @@ - return x.endian[0] == 0; - } - -+#ifndef NO_ADLER32_VEC -+# if defined(__powerpc__) || defined(__powerpc64__) -+# include "adler32_ppc.c" -+# endif -+#endif -+ - #ifndef MIN_WORK - # define MIN_WORK 16 - #endif - -=== added file 'adler32_ppc.c' ---- adler32_ppc.c 1970-01-01 00:00:00 +0000 -+++ adler32_ppc.c 2011-03-30 11:12:04 +0000 -@@ -0,0 +1,253 @@ -+/* -+ * adler32.c -- compute the Adler-32 checksum of a data stream -+ * ppc implementation -+ * Copyright (C) 1995-2007 Mark Adler -+ * Copyright (C) 2009-2011 Jan Seiffert -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* @(#) $Id$ */ -+ -+/* -+ * We use the Altivec PIM vector stuff, but still, this is only -+ * tested with GCC, and prop. uses some GCC specifics (like GCC -+ * understands vector types and you can simply write a += b) -+ */ -+#if defined(__ALTIVEC__) && defined(__GNUC__) -+# define HAVE_ADLER32_VEC -+/* it needs some bytes till the vec version gets up to speed... */ -+# define MIN_WORK 64 -+# include -+ -+/* -+ * Depending on length, this can be slower (short length < 64 bytes), -+ * much faster (our beloved 128kb 22.2s generic to 3.4s vec, but cache -+ * is important...), to a little faster (very long length, 1.6MB, 47.6s -+ * to 36s), which is prop. only capped by memory bandwith. -+ * (The orig. 128k case was slower in AltiVec, because AltiVec loads -+ * are always uncached and trigger no HW prefetching, because that is -+ * what you often need with mass data manipulation (not poisen your -+ * cache, movntq), instead you have to do it for yourself (data stream -+ * touch). With 128k it could be cleanly seen: no prefetch, half as slow -+ * as generic, but comment out the memory load -> 3s. With proper prefetch -+ * we are at 3.4s. So AltiVec can execute these "expensive" FMA quite -+ * fast (even without fancy unrolling), only the data does not arrive -+ * fast enough. In cases where the working set does not fit into cache -+ * it simply cannot be delivered fast enough over the FSB/Mem). -+ * Still we have to prefetch, or we are slow as hell. 
-+ */ -+ -+# define SOVUC (sizeof(vector unsigned char)) -+ -+/* can be propably more, since we do not have the x86 psadbw 64 bit sum */ -+# define VNMAX (6*NMAX) -+ -+/* ========================================================================= */ -+local inline vector unsigned char vec_identl(level) -+ unsigned int level; -+{ -+ return vec_lvsl(level, (const unsigned char *)0); -+} -+ -+/* ========================================================================= */ -+local inline vector unsigned char vec_ident_rev(void) -+{ -+ return vec_xor(vec_identl(0), vec_splat_u8(15)); -+} -+ -+/* ========================================================================= */ -+/* multiply two 32 bit ints, return the low 32 bit */ -+local inline vector unsigned int vec_mullw(vector unsigned int a, vector unsigned int b) -+{ -+ vector unsigned int v16 = vec_splat_u32(-16); -+ vector unsigned int v0_32 = vec_splat_u32(0); -+ vector unsigned int swap, low, high; -+ -+ swap = vec_rl(b, v16); -+ low = vec_mulo((vector unsigned short)a, (vector unsigned short)b); -+ high = vec_msum((vector unsigned short)a, (vector unsigned short)swap, v0_32); -+ high = vec_sl(high, v16); -+ return vec_add(low, high); -+} -+ -+/* ========================================================================= */ -+local inline vector unsigned int vector_reduce(vector unsigned int x) -+{ -+ vector unsigned int y; -+ vector unsigned int vsh; -+ -+ vsh = vec_splat_u32(1); -+ vsh = vec_sl(vsh, vec_splat_u32(4)); -+ -+ y = vec_sl(x, vsh); -+ y = vec_sr(y, vsh); -+ x = vec_sr(x, vsh); -+ y = vec_sub(y, x); -+ x = vec_sl(x, vec_splat_u32(4)); -+ x = vec_add(x, y); -+ return x; -+} -+ -+/* ========================================================================= */ -+local noinline uLong adler32_vec(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned int s1, s2; -+ -+ s1 = adler & 0xffff; -+ s2 = (adler >> 16) & 0xffff; -+ -+ if (likely(len >= 2*SOVUC)) { -+ vector unsigned int v0_32 = vec_splat_u32(0); -+ vector unsigned int vsh = vec_splat_u32(4); -+ vector unsigned char v1 = vec_splat_u8(1); -+ vector unsigned char vord; -+ vector unsigned char v0 = vec_splat_u8(0); -+ vector unsigned int vs1, vs2; -+ vector unsigned char in16, vord_a, v1_a, vperm; -+ unsigned int f, n; -+ unsigned int k, block_num; -+ -+ /* -+ * if i understand the Altivec PEM right, little -+ * endian impl. should have the data reversed on -+ * load, so the big endian vorder works. -+ */ -+ vord = vec_ident_rev() + v1; -+ block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */ -+ f = 512; -+ f |= block_num >= 256 ? 0 : block_num << 16; -+ vec_dst(buf, f, 2); -+ /* -+ * Add stuff to achieve alignment -+ */ -+ /* swizzle masks in place */ -+ vperm = vec_lvsl(0, buf); -+ vord_a = vec_perm(vord, v0, vperm); -+ v1_a = vec_perm(v1, v0, vperm); -+ vperm = vec_lvsr(0, buf); -+ vord_a = vec_perm(v0, vord_a, vperm); -+ v1_a = vec_perm(v0, v1_a, vperm); -+ -+ /* align hard down */ -+ f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUC); -+ n = SOVUC - f; -+ buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUC); -+ -+ /* add n times s1 to s2 for start round */ -+ s2 += s1 * n; -+ -+ /* set sums 0 */ -+ vs1 = v0_32; -+ vs2 = v0_32; -+ -+ k = len < VNMAX ? 
(unsigned)len : VNMAX; -+ len -= k; -+ -+ /* insert scalar start somewhere */ -+ vs1 = vec_lde(0, &s1); -+ vs2 = vec_lde(0, &s2); -+ -+ /* get input data */ -+ in16 = vec_ldl(0, buf); -+ -+ /* mask out excess data, add 4 byte horizontal and add to old dword */ -+ vs1 = vec_msum(in16, v1_a, vs1); -+ -+ /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */ -+ vs2 = vec_msum(in16, vord_a, vs2); -+ -+ buf += SOVUC; -+ k -= n; -+ -+ if (likely(k >= SOVUC)) do { -+ vector unsigned int vs1_r = v0_32; -+ f = 512; -+ f |= block_num >= 256 ? 0 : block_num << 16; -+ vec_dst(buf, f, 2); -+ -+ do { -+ /* get input data */ -+ in16 = vec_ldl(0, buf); -+ -+ /* add vs1 for this round */ -+ vs1_r += vs1; -+ -+ /* add 4 byte horizontal and add to old dword */ -+ vs1 = vec_sum4s(in16, vs1); -+ /* apply order, add 4 byte horizontal and add to old dword */ -+ vs2 = vec_msum(in16, vord, vs2); -+ -+ buf += SOVUC; -+ k -= SOVUC; -+ } while (k >= SOVUC); -+ /* reduce vs1 round sum before multiplying by 16 */ -+ vs1_r = vector_reduce(vs1_r); -+ /* add all vs1 for 16 times */ -+ vs2 += vec_sl(vs1_r, vsh); -+ /* reduce the vectors to something in the range of BASE */ -+ vs2 = vector_reduce(vs2); -+ vs1 = vector_reduce(vs1); -+ len += k; -+ k = len < VNMAX ? (unsigned)len : VNMAX; -+ block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */ -+ len -= k; -+ } while (likely(k >= SOVUC)); -+ -+ if (likely(k)) { -+ vector unsigned int vk; -+ /* -+ * handle trailer -+ */ -+ f = SOVUC - k; -+ /* swizzle masks in place */ -+ vperm = vec_identl(f); -+ vord_a = vec_perm(vord, v0, vperm); -+ v1_a = vec_perm(v1, v0, vperm); -+ -+ /* add k times vs1 for this trailer */ -+ vk = (vector unsigned int)vec_lvsl(0, (unsigned *)(intptr_t)k); -+ vk = (vector unsigned)vec_mergeh(v0, (vector unsigned char)vk); -+ vk = (vector unsigned)vec_mergeh((vector unsigned short)v0, (vector unsigned short)vk); -+ vk = vec_splat(vk, 0); -+ vs2 += vec_mullw(vs1, vk); -+ -+ /* get input data */ -+ in16 = vec_ldl(0, buf); -+ -+ /* mask out excess data, add 4 byte horizontal and add to old dword */ -+ vs1 = vec_msum(in16, v1_a, vs1); -+ /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */ -+ vs2 = vec_msum(in16, vord_a, vs2); -+ -+ buf += k; -+ k -= k; -+ } -+ -+ vec_dss(2); -+ -+ /* add horizontal */ -+ /* stuff should be reduced so no proplem with signed sature */ -+ vs1 = (vector unsigned)vec_sums((vector int)vs1, (vector int)v0_32); -+ vs2 = (vector unsigned)vec_sums((vector int)vs2, (vector int)v0_32); -+ /* shake and roll */ -+ vs1 = vec_splat(vs1, 3); -+ vs2 = vec_splat(vs2, 3); -+ vec_ste(vs1, 0, &s1); -+ vec_ste(vs2, 0, &s2); -+ /* after horizontal add, reduce again in scalar code */ -+ } -+ -+ if (unlikely(len)) do { -+ s1 += *buf++; -+ s2 += s1; -+ } while (--len); -+ reduce(s1); -+ reduce(s2); -+ -+ return (s2 << 16) | s1; -+} -+ -+#endif diff --git a/03-arm.patch b/03-arm.patch deleted file mode 100644 index 6b9878b..0000000 --- a/03-arm.patch +++ /dev/null @@ -1,400 +0,0 @@ -=== modified file 'Makefile.in' ---- Makefile.in 2011-03-14 03:06:03 +0000 -+++ Makefile.in 2011-03-14 14:39:24 +0000 -@@ -236,7 +236,7 @@ - - # DO NOT DELETE THIS LINE -- make depend depends on it. 
- --adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h -+adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h - zutil.o: zutil.h zlib.h zconf.h - gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h - compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h -@@ -247,7 +247,7 @@ - inftrees.o: zutil.h zlib.h zconf.h inftrees.h - trees.o: deflate.h zutil.h zlib.h zconf.h trees.h - --adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h -+adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h - zutil.lo: zutil.h zlib.h zconf.h - gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h - compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h - -=== modified file 'adler32.c' ---- adler32.c 2011-03-30 13:38:46 +0000 -+++ adler32.c 2011-03-30 13:38:46 +0000 -@@ -140,7 +140,9 @@ - } - - #ifndef NO_ADLER32_VEC --# if defined(__powerpc__) || defined(__powerpc64__) -+# if defined(__arm__) -+# include "adler32_arm.c" -+# elif defined(__powerpc__) || defined(__powerpc64__) - # include "adler32_ppc.c" - # endif - #endif - -=== added file 'adler32_arm.c' ---- adler32_arm.c 1970-01-01 00:00:00 +0000 -+++ adler32_arm.c 2011-03-30 11:18:49 +0000 -@@ -0,0 +1,359 @@ -+/* -+ * adler32.c -- compute the Adler-32 checksum of a data stream -+ * arm implementation -+ * Copyright (C) 1995-2007 Mark Adler -+ * Copyright (C) 2009-2011 Jan Seiffert -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* @(#) $Id$ */ -+ -+#if defined(__ARM_NEON__) -+// TODO: need byte order define -+/* -+ * Big endian NEON qwords are kind of broken. -+ * They are big endian within the dwords, but WRONG -+ * (really??) way round between lo and hi. -+ * Creating some kind of PDP11 middle endian. -+ * -+ * This is madness and unsupportable. For this reason -+ * GCC wants to disable qword endian specific patterns. -+ * We would need a Preprocessor define which endian we -+ * have to disable this code. -+ */ -+# include -+ -+# define SOVUCQ sizeof(uint8x16_t) -+# define SOVUC sizeof(uint8x8_t) -+/* since we do not have the 64bit psadbw sum, we could prop. 
do a little more */ -+# define VNMAX (6*NMAX) -+# define HAVE_ADLER32_VEC -+# define MIN_WORK 32 -+ -+/* ========================================================================= */ -+local inline uint8x16_t neon_simple_alignq(uint8x16_t a, uint8x16_t b, unsigned amount) -+{ -+ switch(amount % SOVUCQ) -+ { -+ case 0: return a; -+ case 1: return vextq_u8(a, b, 1); -+ case 2: return vextq_u8(a, b, 2); -+ case 3: return vextq_u8(a, b, 3); -+ case 4: return vextq_u8(a, b, 4); -+ case 5: return vextq_u8(a, b, 5); -+ case 6: return vextq_u8(a, b, 6); -+ case 7: return vextq_u8(a, b, 7); -+ case 8: return vextq_u8(a, b, 8); -+ case 9: return vextq_u8(a, b, 9); -+ case 10: return vextq_u8(a, b, 10); -+ case 11: return vextq_u8(a, b, 11); -+ case 12: return vextq_u8(a, b, 12); -+ case 13: return vextq_u8(a, b, 13); -+ case 14: return vextq_u8(a, b, 14); -+ case 15: return vextq_u8(a, b, 15); -+ } -+ return b; -+} -+ -+/* ========================================================================= */ -+local inline uint32x4_t vector_reduce(uint32x4_t x) -+{ -+ uint32x4_t y; -+ -+ y = vshlq_n_u32(x, 16); -+ x = vshrq_n_u32(x, 16); -+ y = vshrq_n_u32(y, 16); -+ y = vsubq_u32(y, x); -+ x = vaddq_u32(y, vshlq_n_u32(x, 4)); -+ return x; -+} -+ -+/* ========================================================================= */ -+local noinline uLong adler32_vec(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ uint32x4_t v0_32 = (uint32x4_t){0,0,0,0}; -+ uint8x16_t v0 = (uint8x16_t)v0_32; -+ uint8x16_t vord, vord_a; -+ uint32x4_t vs1, vs2; -+ uint32x2_t v_tsum; -+ uint8x16_t in16; -+ uint32_t s1, s2; -+ unsigned k; -+ -+ s1 = adler & 0xffff; -+ s2 = (adler >> 16) & 0xffff; -+ -+// TODO: big endian mask is prop. wrong -+ if (host_is_bigendian()) -+ vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}; -+ else -+ vord = (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; -+ -+ if (likely(len >= 2*SOVUCQ)) { -+ unsigned f, n; -+ -+ /* -+ * Add stuff to achieve alignment -+ */ -+ /* align hard down */ -+ f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUCQ); -+ n = SOVUCQ - f; -+ buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUCQ); -+ -+ /* add n times s1 to s2 for start round */ -+ s2 += s1 * n; -+ -+ /* set sums 0 */ -+ vs1 = v0_32; -+ vs2 = v0_32; -+ /* -+ * the accumulation of s1 for every round grows very fast -+ * (quadratic?), even if we accumulate in 4 dwords, more -+ * rounds means nonlinear growth. -+ * We already split it out of s2, normaly it would be in -+ * s2 times 16... and even grow faster. -+ * Thanks to this split and vector reduction, we can stay -+ * longer in the loops. But we have to prepare for the worst -+ * (all 0xff), only do 6 times the work. -+ * (we could prop. stay a little longer since we have 4 sums, -+ * not 2 like on x86). -+ */ -+ k = len < VNMAX ? 
(unsigned)len : VNMAX; -+ len -= k; -+ /* insert scalar start somewhere */ -+ vs1 = vsetq_lane_u32(s1, vs1, 0); -+ vs2 = vsetq_lane_u32(s2, vs2, 0); -+ -+ /* get input data */ -+ in16 = *(const uint8x16_t *)buf; -+ /* mask out excess data */ -+ if(host_is_bigendian()) { -+ in16 = neon_simple_alignq(v0, in16, n); -+ vord_a = neon_simple_alignq(v0, vord, n); -+ } else { -+ in16 = neon_simple_alignq(in16, v0, f); -+ vord_a = neon_simple_alignq(vord, v0, f); -+ } -+ -+ /* pairwise add bytes and long, pairwise add word long acc */ -+ vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); -+ /* apply order, add words, pairwise add word long acc */ -+ vs2 = vpadalq_u16(vs2, -+ vmlal_u8( -+ vmull_u8(vget_low_u8(in16), vget_low_u8(vord_a)), -+ vget_high_u8(in16), vget_high_u8(vord_a) -+ ) -+ ); -+ -+ buf += SOVUCQ; -+ k -= n; -+ -+ if (likely(k >= SOVUCQ)) do { -+ uint32x4_t vs1_r = v0_32; -+ do { -+ /* add vs1 for this round */ -+ vs1_r = vaddq_u32(vs1_r, vs1); -+ -+ /* get input data */ -+ in16 = *(const uint8x16_t *)buf; -+ -+// TODO: make work in inner loop more tight -+ /* -+ * decompose partial sums, so we do less instructions and -+ * build loops around it to do acc and so on only from time -+ * to time. -+ * This is hard with NEON, because the instruction are nice: -+ * we have the stuff in widening and with acc (practicaly -+ * for free...) -+ */ -+ /* pairwise add bytes and long, pairwise add word long acc */ -+ vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); -+ /* apply order, add words, pairwise add word long acc */ -+ vs2 = vpadalq_u16(vs2, -+ vmlal_u8( -+ vmull_u8(vget_low_u8(in16), vget_low_u8(vord)), -+ vget_high_u8(in16), vget_high_u8(vord) -+ ) -+ ); -+ -+ buf += SOVUCQ; -+ k -= SOVUCQ; -+ } while (k >= SOVUCQ); -+ /* reduce vs1 round sum before multiplying by 16 */ -+ vs1_r = vector_reduce(vs1_r); -+ /* add vs1 for this round (16 times) */ -+ /* they have shift right and accummulate, where is shift left and acc?? */ -+ vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_r, 4)); -+ /* reduce both vectors to something within 16 bit */ -+ vs2 = vector_reduce(vs2); -+ vs1 = vector_reduce(vs1); -+ len += k; -+ k = len < VNMAX ? 
(unsigned) len : VNMAX; -+ len -= k; -+ } while (likely(k >= SOVUC)); -+ -+ if (likely(k)) { -+ /* -+ * handle trailer -+ */ -+ f = SOVUCQ - k; -+ /* add k times vs1 for this trailer */ -+ vs2 = vmlaq_u32(vs2, vs1, vdupq_n_u32(k)); -+ -+ /* get input data */ -+ in16 = *(const uint8x16_t *)buf; -+ /* masks out bad data */ -+ if(host_is_bigendian()) -+ in16 = neon_simple_alignq(in16, v0, f); -+ else -+ in16 = neon_simple_alignq(v0, in16, k); -+ -+ /* pairwise add bytes and long, pairwise add word long acc */ -+ vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); -+ /* apply order, add words, pairwise add word long acc */ -+ vs2 = vpadalq_u16(vs2, -+ vmlal_u8( -+ vmull_u8(vget_low_u8(in16), vget_low_u8(vord)), -+ vget_high_u8(in16), vget_high_u8(vord) -+ ) -+ ); -+ -+ buf += k; -+ k -= k; -+ } -+ -+ /* add horizontal */ -+ v_tsum = vpadd_u32(vget_high_u32(vs1), vget_low_u32(vs1)); -+ v_tsum = vpadd_u32(v_tsum, v_tsum); -+ s1 = vget_lane_u32(v_tsum, 0); -+ v_tsum = vpadd_u32(vget_high_u32(vs2), vget_low_u32(vs2)); -+ v_tsum = vpadd_u32(v_tsum, v_tsum); -+ s2 = vget_lane_u32(v_tsum, 0); -+ } -+ -+ if (unlikely(len)) do { -+ s1 += *buf++; -+ s2 += s1; -+ } while (--len); -+ reduce_x(s1); -+ reduce_x(s2); -+ -+ return (s2 << 16) | s1; -+} -+ -+/* inline asm, so only on GCC (or compatible) && ARM v6 or better */ -+#elif defined(__GNUC__) && ( \ -+ defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ -+ defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ -+ defined(__ARM_ARCH_7A__) \ -+ ) -+# define SOU32 (sizeof(unsigned int)) -+# define HAVE_ADLER32_VEC -+# define MIN_WORK 16 -+ -+/* ========================================================================= */ -+local noinline uLong adler32_vec(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned int s1, s2; -+ unsigned int k; -+ -+ s1 = adler & 0xffff; -+ s2 = (adler >> 16) & 0xffff; -+ -+ k = ALIGN_DIFF(buf, SOU32); -+ len -= k; -+ if (k) do { -+ s1 += *buf++; -+ s2 += s1; -+ } while (--k); -+ -+ if (likely(len >= 4 * SOU32)) { -+ unsigned int vs1 = s1, vs2 = s2; -+ unsigned int order_lo, order_hi; -+ -+// TODO: byte order? -+ if(host_is_bigendian()) { -+ order_lo = 0x00030001; -+ order_hi = 0x00040002; -+ } else { -+ order_lo = 0x00020004; -+ order_hi = 0x00010003; -+ } -+// TODO: we could go over NMAX, since we have split the vs2 sum -+ /* something around (NMAX+(NMAX/3)+302) */ -+ k = len < NMAX ? len : NMAX; -+ len -= k; -+ -+ do { -+ unsigned int vs1_r = 0; -+ do { -+ unsigned int t21, t22, in; -+ -+ /* get input data */ -+ in = *(const unsigned int *)buf; -+ -+ /* add vs1 for this round */ -+ vs1_r += vs1; -+ -+ /* add horizontal and acc */ -+ asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1)); -+ /* widen bytes to words, apply order, add and acc */ -+ asm ("uxtb16 %0, %1" : "=r" (t21) : "r" (in)); -+ asm ("uxtb16 %0, %1, ror #8" : "=r" (t22) : "r" (in)); -+// TODO: instruction result latency -+ /* -+ * The same problem like the classic serial sum: -+ * Chip maker sell us 1-cycle instructions, but that is not the -+ * whole story. Nearly all 1-cycle chips are pipelined, so -+ * you can get one result per cycle, but only if _they_ (plural) -+ * are independent. -+ * If you are depending on the result of an preciding instruction, -+ * in the worst case you hit the instruction latency which is worst -+ * case >= pipeline length. On the other hand there are result-fast-paths. 
-+ * This could all be a wash with the classic sum (4 * 2 instructions, -+ * + dependence), since smald is: -+ * - 2 cycle issue -+ * - needs the acc in pipeline step E1, instead of E2 -+ * But the Cortex has a fastpath for acc. -+ * I don't know. -+ * We can not even unroll, we would need 4 order vars, return ENOREGISTER. -+ */ -+ asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t21) , "r" (order_lo), "r" (vs2)); -+ asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t22) , "r" (order_hi), "r" (vs2)); -+ -+ buf += SOU32; -+ k -= SOU32; -+ } while (k >= SOU32); -+ /* reduce vs1 round sum before multiplying by 4 */ -+ reduce(vs1_r); -+ /* add vs1 for this round (4 times) */ -+ vs2 += vs1_r * 4; -+ /* reduce both sums to something within 16 bit */ -+ reduce(vs2); -+ reduce(vs1); -+ len += k; -+ k = len < NMAX ? len : NMAX; -+ len -= k; -+ } while (likely(k >= 4 * SOU32)); -+ len += k; -+ s1 = vs1; -+ s2 = vs2; -+ } -+ -+ if (unlikely(len)) do { -+ s1 += *buf++; -+ s2 += s1; -+ } while (--len); -+ /* at this point we should no have so big s1 & s2 */ -+ reduce_x(s1); -+ reduce_x(s2); -+ -+ return (s2 << 16) | s1; -+} -+#endif diff --git a/04-x86.patch b/04-x86.patch deleted file mode 100644 index f2d489e..0000000 --- a/04-x86.patch +++ /dev/null @@ -1,1165 +0,0 @@ -=== modified file 'Makefile.in' ---- Makefile.in 2011-03-14 14:39:24 +0000 -+++ Makefile.in 2011-03-14 16:46:30 +0000 -@@ -236,7 +236,7 @@ - - # DO NOT DELETE THIS LINE -- make depend depends on it. - --adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h -+adler32.o: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h - zutil.o: zutil.h zlib.h zconf.h - gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h - compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h -@@ -247,7 +247,7 @@ - inftrees.o: zutil.h zlib.h zconf.h inftrees.h - trees.o: deflate.h zutil.h zlib.h zconf.h trees.h - --adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h -+adler32.lo: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h - zutil.lo: zutil.h zlib.h zconf.h - gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h - compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h - -=== modified file 'adler32.c' ---- adler32.c 2011-03-30 13:38:46 +0000 -+++ adler32.c 2011-03-30 13:38:46 +0000 -@@ -144,6 +144,8 @@ - # include "adler32_arm.c" - # elif defined(__powerpc__) || defined(__powerpc64__) - # include "adler32_ppc.c" -+# elif defined(__i386__) || defined(__x86_64__) -+# include "adler32_x86.c" - # endif - #endif - - -=== added file 'adler32_x86.c' ---- adler32_x86.c 1970-01-01 00:00:00 +0000 -+++ adler32_x86.c 2011-03-15 23:15:36 +0000 -@@ -0,0 +1,1125 @@ -+/* -+ * adler32.c -- compute the Adler-32 checksum of a data stream -+ * x86 implementation -+ * Copyright (C) 1995-2007 Mark Adler -+ * Copyright (C) 2009-2011 Jan Seiffert -+ * For conditions of distribution and use, see copyright notice in zlib.h -+ */ -+ -+/* @(#) $Id$ */ -+ -+#if GCC_VERSION_GE(207) -+# define GCC_ATTR_CONSTRUCTOR __attribute__((__constructor__)) -+#else -+# define VEC_NO_GO -+#endif -+ -+#if GCC_VERSION_GE(203) -+# define GCC_ATTR_ALIGNED(x) __attribute__((__aligned__(x))) -+#else -+# define VEC_NO_GO -+#endif -+ -+/* inline asm, so only on GCC (or compatible) */ -+#if defined(__GNUC__) && !defined(VEC_NO_GO) -+# define HAVE_ADLER32_VEC -+# define MIN_WORK 64 -+ -+# ifdef __x86_64__ -+# define PICREG "%%rbx" -+# else -+# define PICREG "%%ebx" -+# endif -+ -+/* 
========================================================================= */ -+local const struct { short d[24]; } vord GCC_ATTR_ALIGNED(16) = { -+ {1,1,1,1,1,1,1,1,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1} -+}; -+ -+/* ========================================================================= */ -+local const struct { char d[16]; } vord_b GCC_ATTR_ALIGNED(16) = { -+ {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1} -+}; -+ -+/* ========================================================================= */ -+local noinline const Bytef *adler32_jumped(buf, s1, s2, k) -+ const Bytef *buf; -+ unsigned int *s1; -+ unsigned int *s2; -+ unsigned int k; -+{ -+ unsigned int t; -+ unsigned n = k % 16; -+ buf += n; -+ k = (k / 16) + 1; -+ -+ __asm__ __volatile__ ( -+# ifdef __x86_64__ -+# define CLOB "&" -+ "lea 1f(%%rip), %q4\n\t" -+ "lea (%q4,%q5,8), %q4\n\t" -+ "jmp *%q4\n\t" -+# else -+# ifndef __PIC__ -+# define CLOB -+ "lea 1f(,%5,8), %4\n\t" -+# else -+# define CLOB -+ "lea 1f-3f(,%5,8), %4\n\t" -+ "call 9f\n" -+ "3:\n\t" -+# endif -+ "jmp *%4\n\t" -+# ifdef __PIC__ -+ ".p2align 1\n" -+ "9:\n\t" -+ "addl (%%esp), %4\n\t" -+ "ret\n\t" -+# endif -+# endif -+ ".p2align 1\n" -+ "2:\n\t" -+# ifdef __i386 -+ ".byte 0x3e\n\t" -+# endif -+ "add $0x10, %2\n\t" -+ ".p2align 1\n" -+ "1:\n\t" -+ /* 128 */ -+ "movzbl -16(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 120 */ -+ "movzbl -15(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 112 */ -+ "movzbl -14(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 104 */ -+ "movzbl -13(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 96 */ -+ "movzbl -12(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 88 */ -+ "movzbl -11(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 80 */ -+ "movzbl -10(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 72 */ -+ "movzbl -9(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 64 */ -+ "movzbl -8(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 56 */ -+ "movzbl -7(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 48 */ -+ "movzbl -6(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 40 */ -+ "movzbl -5(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 32 */ -+ "movzbl -4(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 24 */ -+ "movzbl -3(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 16 */ -+ "movzbl -2(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 8 */ -+ "movzbl -1(%2), %4\n\t" /* 4 */ -+ "add %4, %0\n\t" /* 2 */ -+ "add %0, %1\n\t" /* 2 */ -+ /* 0 */ -+ "dec %3\n\t" -+ "jnz 2b" -+ : /* %0 */ "=R" (*s1), -+ /* %1 */ "=R" (*s2), -+ /* %2 */ "=abdSD" (buf), -+ /* %3 */ "=c" (k), -+ /* %4 */ "="CLOB"R" (t) -+ : /* %5 */ "r" (16 - n), -+ /* */ "0" (*s1), -+ /* */ "1" (*s2), -+ /* */ "2" (buf), -+ /* */ "3" (k) -+ : "cc", "memory" -+ ); -+ -+ return buf; -+} -+ -+#if 0 -+ /* -+ * Will XOP processors have SSSE3/AVX?? -+ * And what is the unaligned load performance? 
-+ */ -+ "prefetchnta 0x70(%0)\n\t" -+ "lddqu (%0), %%xmm0\n\t" -+ "vpaddd %%xmm3, %%xmm5, %%xmm5\n\t" -+ "sub $16, %3\n\t" -+ "add $16, %0\n\t" -+ "cmp $15, %3\n\t" -+ "vphaddubd %%xmm0, %%xmm1\n\t" /* A */ -+ "vpmaddubsw %%xmm4, %%xmm0, %%xmm0\n\t"/* AVX! */ /* 1 */ -+ "vphadduwd %%xmm0, %%xmm0\n\t" /* 2 */ -+ "vpaddd %%xmm1, %%xmm3, %%xmm3\n\t" /* B: A+B => hadd+acc or vpmadcubd w. mul = 1 */ -+ "vpaddd %%xmm0, %%xmm2, %%xmm2\n\t" /* 3: 1+2+3 => vpmadcubd w. mul = 16,15,14... */ -+ "jg 1b\n\t" -+ xop_reduce -+ xop_reduce -+ xop_reduce -+ setup -+ "jg 1b\n\t" -+ "vphaddudq %%xmm2, %%xmm0\n\t" -+ "vphaddudq %%xmm3, %%xmm1\n\t" -+ "pshufd $0xE6, %%xmm0, %%xmm2\n\t" -+ "pshufd $0xE6, %%xmm1, %%xmm3\n\t" -+ "paddd %%xmm0, %%xmm2\n\t" -+ "paddd %%xmm1, %%xmm3\n\t" -+ "movd %%xmm2, %2\n\t" -+ "movd %%xmm3, %1\n\t" -+#endif -+ -+/* ========================================================================= */ -+local uLong adler32_SSSE3(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned int s1 = adler & 0xffff; -+ unsigned int s2 = (adler >> 16) & 0xffff; -+ unsigned int k; -+ -+ k = ALIGN_DIFF(buf, 16); -+ len -= k; -+ if (k) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ -+ __asm__ __volatile__ ( -+ "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */ -+ "cmp %3, %4\n\t" -+ "cmovb %4, %3\n\t" /* k = len >= VNMAX ? k : len */ -+ "sub %3, %4\n\t" /* len -= k */ -+ "cmp $16, %3\n\t" -+ "jb 88f\n\t" /* if(k < 16) goto OUT */ -+#ifdef __ELF__ -+ ".subsection 2\n\t" -+#else -+ "jmp 77f\n\t" -+#endif -+ ".p2align 2\n" -+ /* -+ * reduction function to bring a vector sum within the range of BASE -+ * This does no full reduction! When the sum is large, a number > BASE -+ * is the result. To do a full reduction call multiple times. -+ */ -+ "sse2_reduce:\n\t" -+ "movdqa %%xmm0, %%xmm1\n\t" /* y = x */ -+ "pslld $16, %%xmm1\n\t" /* y <<= 16 */ -+ "psrld $16, %%xmm0\n\t" /* x >>= 16 */ -+ "psrld $16, %%xmm1\n\t" /* y >>= 16 */ -+ "psubd %%xmm0, %%xmm1\n\t" /* y -= x */ -+ "pslld $4, %%xmm0\n\t" /* x <<= 4 */ -+ "paddd %%xmm1, %%xmm0\n\t" /* x += y */ -+ "ret\n\t" -+#ifdef __ELF__ -+ ".previous\n\t" -+#else -+ "77:\n\t" -+#endif -+ "movdqa %5, %%xmm5\n\t" /* get vord_b */ -+ "prefetchnta 0x70(%0)\n\t" -+ "movd %2, %%xmm2\n\t" /* init vector sum vs2 with s2 */ -+ "movd %1, %%xmm3\n\t" /* init vector sum vs1 with s1 */ -+ "pxor %%xmm4, %%xmm4\n" /* zero */ -+ "3:\n\t" -+ "pxor %%xmm7, %%xmm7\n\t" /* zero vs1_round_sum */ -+ ".p2align 3,,3\n\t" -+ ".p2align 2\n" -+ "2:\n\t" -+ "mov $128, %1\n\t" /* inner_k = 128 bytes till vs2_i overflows */ -+ "cmp %1, %3\n\t" -+ "cmovb %3, %1\n\t" /* inner_k = k >= inner_k ? 
inner_k : k */ -+ "and $-16, %1\n\t" /* inner_k = ROUND_TO(inner_k, 16) */ -+ "sub %1, %3\n\t" /* k -= inner_k */ -+ "shr $4, %1\n\t" /* inner_k /= 16 */ -+ "pxor %%xmm6, %%xmm6\n\t" /* zero vs2_i */ -+ ".p2align 4,,7\n" -+ ".p2align 3\n" -+ "1:\n\t" -+ "movdqa (%0), %%xmm0\n\t" /* fetch input data */ -+ "prefetchnta 0x70(%0)\n\t" -+ "paddd %%xmm3, %%xmm7\n\t" /* vs1_round_sum += vs1 */ -+ "add $16, %0\n\t" /* advance input data pointer */ -+ "dec %1\n\t" /* decrement inner_k */ -+ "movdqa %%xmm0, %%xmm1\n\t" /* make a copy of the input data */ -+# if (HAVE_BINUTILS-0) >= 217 -+ "pmaddubsw %%xmm5, %%xmm0\n\t" /* multiply all input bytes by vord_b bytes, add adjecent results to words */ -+# else -+ ".byte 0x66, 0x0f, 0x38, 0x04, 0xc5\n\t" /* pmaddubsw %%xmm5, %%xmm0 */ -+# endif -+ "psadbw %%xmm4, %%xmm1\n\t" /* subtract zero from every byte, add 8 bytes to a sum */ -+ "paddw %%xmm0, %%xmm6\n\t" /* vs2_i += in * vorder_b */ -+ "paddd %%xmm1, %%xmm3\n\t" /* vs1 += psadbw */ -+ "jnz 1b\n\t" /* repeat if inner_k != 0 */ -+ "movdqa %%xmm6, %%xmm0\n\t" /* copy vs2_i */ -+ "punpckhwd %%xmm4, %%xmm0\n\t" /* zero extent vs2_i upper words to dwords */ -+ "punpcklwd %%xmm4, %%xmm6\n\t" /* zero extent vs2_i lower words to dwords */ -+ "paddd %%xmm0, %%xmm2\n\t" /* vs2 += vs2_i.upper */ -+ "paddd %%xmm6, %%xmm2\n\t" /* vs2 += vs2_i.lower */ -+ "cmp $15, %3\n\t" -+ "jg 2b\n\t" /* if(k > 15) repeat */ -+ "movdqa %%xmm7, %%xmm0\n\t" /* move vs1_round_sum */ -+ "call sse2_reduce\n\t" /* reduce vs1_round_sum */ -+ "pslld $4, %%xmm0\n\t" /* vs1_round_sum *= 16 */ -+ "paddd %%xmm2, %%xmm0\n\t" /* vs2 += vs1_round_sum */ -+ "call sse2_reduce\n\t" /* reduce again */ -+ "movdqa %%xmm0, %%xmm2\n\t" /* move vs2 back in place */ -+ "movdqa %%xmm3, %%xmm0\n\t" /* move vs1 */ -+ "call sse2_reduce\n\t" /* reduce */ -+ "movdqa %%xmm0, %%xmm3\n\t" /* move vs1 back in place */ -+ "add %3, %4\n\t" /* len += k */ -+ "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */ -+ "cmp %3, %4\n\t" -+ "cmovb %4, %3\n\t" /* k = len >= VNMAX ? 
k : len */ -+ "sub %3, %4\n\t" /* len -= k */ -+ "cmp $15, %3\n\t" -+ "jg 3b\n\t" /* if(k > 15) repeat */ -+ "pshufd $0xEE, %%xmm3, %%xmm1\n\t" /* collect vs1 & vs2 in lowest vector member */ -+ "pshufd $0xEE, %%xmm2, %%xmm0\n\t" -+ "paddd %%xmm3, %%xmm1\n\t" -+ "paddd %%xmm2, %%xmm0\n\t" -+ "pshufd $0xE5, %%xmm0, %%xmm2\n\t" -+ "paddd %%xmm0, %%xmm2\n\t" -+ "movd %%xmm1, %1\n\t" /* mov vs1 to s1 */ -+ "movd %%xmm2, %2\n" /* mov vs2 to s2 */ -+ "88:" -+ : /* %0 */ "=r" (buf), -+ /* %1 */ "=r" (s1), -+ /* %2 */ "=r" (s2), -+ /* %3 */ "=r" (k), -+ /* %4 */ "=r" (len) -+ : /* %5 */ "m" (vord_b), -+ /* -+ * somewhere between 5 & 6, psadbw 64 bit sums ruin the party -+ * spreading the sums with palignr only brings it to 7 (?), -+ * while introducing an op into the main loop (2800 ms -> 3200 ms) -+ */ -+ /* %6 */ "i" (5*NMAX), -+ /* */ "0" (buf), -+ /* */ "1" (s1), -+ /* */ "2" (s2), -+ /* */ "4" (len) -+ : "cc", "memory" -+# ifdef __SSE__ -+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -+# endif -+ ); -+ -+ if (unlikely(k)) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ reduce(s1); -+ reduce(s2); -+ return (s2 << 16) | s1; -+} -+ -+/* ========================================================================= */ -+local uLong adler32_SSE2(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned int s1 = adler & 0xffff; -+ unsigned int s2 = (adler >> 16) & 0xffff; -+ unsigned int k; -+ -+ k = ALIGN_DIFF(buf, 16); -+ len -= k; -+ if (k) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ -+ __asm__ __volatile__ ( -+ "mov %6, %3\n\t" -+ "cmp %3, %4\n\t" -+ "cmovb %4, %3\n\t" -+ "sub %3, %4\n\t" -+ "cmp $16, %3\n\t" -+ "jb 88f\n\t" -+ "prefetchnta 0x70(%0)\n\t" -+ "movd %1, %%xmm4\n\t" -+ "movd %2, %%xmm3\n\t" -+ "pxor %%xmm2, %%xmm2\n\t" -+ "pxor %%xmm5, %%xmm5\n\t" -+ ".p2align 2\n" -+ "3:\n\t" -+ "pxor %%xmm6, %%xmm6\n\t" -+ "pxor %%xmm7, %%xmm7\n\t" -+ "mov $2048, %1\n\t" /* get byte count till vs2_{l|h}_word overflows */ -+ "cmp %1, %3\n\t" -+ "cmovb %3, %1\n" -+ "and $-16, %1\n\t" -+ "sub %1, %3\n\t" -+ "shr $4, %1\n\t" -+ ".p2align 4,,7\n" -+ ".p2align 3\n" -+ "1:\n\t" -+ "prefetchnta 0x70(%0)\n\t" -+ "movdqa (%0), %%xmm0\n\t" /* fetch input data */ -+ "paddd %%xmm4, %%xmm5\n\t" /* vs1_round_sum += vs1 */ -+ "add $16, %0\n\t" -+ "dec %1\n\t" -+ "movdqa %%xmm0, %%xmm1\n\t" /* copy input data */ -+ "psadbw %%xmm2, %%xmm0\n\t" /* add all bytes horiz. 
*/ -+ "paddd %%xmm0, %%xmm4\n\t" /* add that to vs1 */ -+ "movdqa %%xmm1, %%xmm0\n\t" /* copy input data */ -+ "punpckhbw %%xmm2, %%xmm1\n\t" /* zero extent input upper bytes to words */ -+ "punpcklbw %%xmm2, %%xmm0\n\t" /* zero extent input lower bytes to words */ -+ "paddw %%xmm1, %%xmm7\n\t" /* vs2_h_words += in_high_words */ -+ "paddw %%xmm0, %%xmm6\n\t" /* vs2_l_words += in_low_words */ -+ "jnz 1b\n\t" -+ "cmp $15, %3\n\t" -+ "pmaddwd 32+%5, %%xmm7\n\t" /* multiply vs2_h_words with order, add adjecend results */ -+ "pmaddwd 16+%5, %%xmm6\n\t" /* multiply vs2_l_words with order, add adjecend results */ -+ "paddd %%xmm7, %%xmm3\n\t" /* add to vs2 */ -+ "paddd %%xmm6, %%xmm3\n\t" /* add to vs2 */ -+ "jg 3b\n\t" -+ "movdqa %%xmm5, %%xmm0\n\t" -+ "pxor %%xmm5, %%xmm5\n\t" -+ "call sse2_reduce\n\t" -+ "pslld $4, %%xmm0\n\t" -+ "paddd %%xmm3, %%xmm0\n\t" -+ "call sse2_reduce\n\t" -+ "movdqa %%xmm0, %%xmm3\n\t" -+ "movdqa %%xmm4, %%xmm0\n\t" -+ "call sse2_reduce\n\t" -+ "movdqa %%xmm0, %%xmm4\n\t" -+ "add %3, %4\n\t" -+ "mov %6, %3\n\t" -+ "cmp %3, %4\n\t" -+ "cmovb %4, %3\n" -+ "sub %3, %4\n\t" -+ "cmp $15, %3\n\t" -+ "jg 3b\n\t" -+ "pshufd $0xEE, %%xmm4, %%xmm1\n\t" -+ "pshufd $0xEE, %%xmm3, %%xmm0\n\t" -+ "paddd %%xmm4, %%xmm1\n\t" -+ "paddd %%xmm3, %%xmm0\n\t" -+ "pshufd $0xE5, %%xmm0, %%xmm3\n\t" -+ "paddd %%xmm0, %%xmm3\n\t" -+ "movd %%xmm1, %1\n\t" -+ "movd %%xmm3, %2\n" -+ "88:\n\t" -+ : /* %0 */ "=r" (buf), -+ /* %1 */ "=r" (s1), -+ /* %2 */ "=r" (s2), -+ /* %3 */ "=r" (k), -+ /* %4 */ "=r" (len) -+ : /* %5 */ "m" (vord), -+ /* %6 */ "i" (5*NMAX), -+ /* */ "0" (buf), -+ /* */ "1" (s1), -+ /* */ "2" (s2), -+ /* */ "3" (k), -+ /* */ "4" (len) -+ : "cc", "memory" -+# ifdef __SSE__ -+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -+# endif -+ ); -+ -+ if (unlikely(k)) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ reduce(s1); -+ reduce(s2); -+ return (s2 << 16) | s1; -+} -+ -+# if 0 -+/* ========================================================================= */ -+/* -+ * The SSE2 version above is faster on my CPUs (Athlon64, Core2, -+ * P4 Xeon, K10 Sempron), but has instruction stalls only a -+ * Out-Of-Order-Execution CPU can solve. -+ * So this Version _may_ be better for the new old thing, Atom. 
-+ */ -+local noinline uLong adler32_SSE2_no_oooe(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned int s1 = adler & 0xffff; -+ unsigned int s2 = (adler >> 16) & 0xffff; -+ unsigned int k; -+ -+ k = ALIGN_DIFF(buf, 16); -+ len -= k; -+ if (k) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ -+ __asm__ __volatile__ ( -+ "mov %6, %3\n\t" -+ "cmp %3, %4\n\t" -+ "cmovb %4, %3\n\t" -+ "sub %3, %4\n\t" -+ "cmp $16, %3\n\t" -+ "jb 88f\n\t" -+ "movdqa 16+%5, %%xmm6\n\t" -+ "movdqa 32+%5, %%xmm5\n\t" -+ "prefetchnta 16(%0)\n\t" -+ "pxor %%xmm7, %%xmm7\n\t" -+ "movd %1, %%xmm4\n\t" -+ "movd %2, %%xmm3\n\t" -+ ".p2align 3,,3\n\t" -+ ".p2align 2\n" -+ "1:\n\t" -+ "prefetchnta 32(%0)\n\t" -+ "movdqa (%0), %%xmm1\n\t" -+ "sub $16, %3\n\t" -+ "movdqa %%xmm4, %%xmm2\n\t" -+ "add $16, %0\n\t" -+ "movdqa %%xmm1, %%xmm0\n\t" -+ "cmp $15, %3\n\t" -+ "pslld $4, %%xmm2\n\t" -+ "paddd %%xmm3, %%xmm2\n\t" -+ "psadbw %%xmm7, %%xmm0\n\t" -+ "paddd %%xmm0, %%xmm4\n\t" -+ "movdqa %%xmm1, %%xmm0\n\t" -+ "punpckhbw %%xmm7, %%xmm1\n\t" -+ "punpcklbw %%xmm7, %%xmm0\n\t" -+ "movdqa %%xmm1, %%xmm3\n\t" -+ "pmaddwd %%xmm6, %%xmm0\n\t" -+ "paddd %%xmm2, %%xmm0\n\t" -+ "pmaddwd %%xmm5, %%xmm3\n\t" -+ "paddd %%xmm0, %%xmm3\n\t" -+ "jg 1b\n\t" -+ "movdqa %%xmm3, %%xmm0\n\t" -+ "call sse2_reduce\n\t" -+ "call sse2_reduce\n\t" -+ "movdqa %%xmm0, %%xmm3\n\t" -+ "movdqa %%xmm4, %%xmm0\n\t" -+ "call sse2_reduce\n\t" -+ "movdqa %%xmm0, %%xmm4\n\t" -+ "add %3, %4\n\t" -+ "mov %6, %3\n\t" -+ "cmp %3, %4\n\t" -+ "cmovb %4, %3\n\t" -+ "sub %3, %4\n\t" -+ "cmp $15, %3\n\t" -+ "jg 1b\n\t" -+ "pshufd $0xEE, %%xmm3, %%xmm0\n\t" -+ "pshufd $0xEE, %%xmm4, %%xmm1\n\t" -+ "paddd %%xmm3, %%xmm0\n\t" -+ "pshufd $0xE5, %%xmm0, %%xmm2\n\t" -+ "paddd %%xmm4, %%xmm1\n\t" -+ "movd %%xmm1, %1\n\t" -+ "paddd %%xmm0, %%xmm2\n\t" -+ "movd %%xmm2, %2\n" -+ "88:" -+ : /* %0 */ "=r" (buf), -+ /* %1 */ "=r" (s1), -+ /* %2 */ "=r" (s2), -+ /* %3 */ "=r" (k), -+ /* %4 */ "=r" (len) -+ : /* %5 */ "m" (vord), -+ /* %6 */ "i" (NMAX + NMAX/3), -+ /* */ "0" (buf), -+ /* */ "1" (s1), -+ /* */ "2" (s2), -+ /* */ "4" (len) -+ : "cc", "memory" -+# ifdef __SSE__ -+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -+# endif -+ ); -+ -+ if (unlikely(k)) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ reduce(s1); -+ reduce(s2); -+ return (s2 << 16) | s1; -+} -+# endif -+ -+# ifndef __x86_64__ -+/* ========================================================================= */ -+/* -+ * SSE version to help VIA-C3_2, P2 & P3 -+ */ -+local uLong adler32_SSE(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned int s1 = adler & 0xffff; -+ unsigned int s2 = (adler >> 16) & 0xffff; -+ unsigned int k; -+ -+ k = ALIGN_DIFF(buf, 8); -+ len -= k; -+ if (k) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ -+ __asm__ __volatile__ ( -+ "mov %6, %3\n\t" -+ "cmp %3, %4\n\t" -+ "cmovb %4, %3\n" -+ "sub %3, %4\n\t" -+ "cmp $8, %3\n\t" -+ "jb 88f\n\t" -+ "movd %1, %%mm4\n\t" -+ "movd %2, %%mm3\n\t" -+ "pxor %%mm2, %%mm2\n\t" -+ "pxor %%mm5, %%mm5\n\t" -+# ifdef __ELF__ -+ ".subsection 2\n\t" -+# else -+ "jmp 77f\n\t" -+# endif -+ ".p2align 2\n" -+ "mmx_reduce:\n\t" -+ "movq %%mm0, %%mm1\n\t" -+ "pslld $16, %%mm1\n\t" -+ "psrld $16, %%mm0\n\t" -+ "psrld $16, %%mm1\n\t" -+ "psubd %%mm0, %%mm1\n\t" -+ "pslld $4, %%mm0\n\t" -+ "paddd %%mm1, %%mm0\n\t" -+ "ret\n\t" -+# ifdef __ELF__ -+ ".previous\n\t" -+# else -+ "77:\n\t" -+# endif -+ ".p2align 2\n" -+ "3:\n\t" -+ "pxor %%mm6, %%mm6\n\t" -+ "pxor %%mm7, %%mm7\n\t" -+ "mov $1024, 
%1\n\t" -+ "cmp %1, %3\n\t" -+ "cmovb %3, %1\n" -+ "and $-8, %1\n\t" -+ "sub %1, %3\n\t" -+ "shr $3, %1\n\t" -+ ".p2align 4,,7\n" -+ ".p2align 3\n" -+ "1:\n\t" -+ "movq (%0), %%mm0\n\t" -+ "paddd %%mm4, %%mm5\n\t" -+ "add $8, %0\n\t" -+ "dec %1\n\t" -+ "movq %%mm0, %%mm1\n\t" -+ "psadbw %%mm2, %%mm0\n\t" -+ "paddd %%mm0, %%mm4\n\t" -+ "movq %%mm1, %%mm0\n\t" -+ "punpckhbw %%mm2, %%mm1\n\t" -+ "punpcklbw %%mm2, %%mm0\n\t" -+ "paddw %%mm1, %%mm7\n\t" -+ "paddw %%mm0, %%mm6\n\t" -+ "jnz 1b\n\t" -+ "cmp $7, %3\n\t" -+ "pmaddwd 40+%5, %%mm7\n\t" -+ "pmaddwd 32+%5, %%mm6\n\t" -+ "paddd %%mm7, %%mm3\n\t" -+ "paddd %%mm6, %%mm3\n\t" -+ "jg 3b\n\t" -+ "movq %%mm5, %%mm0\n\t" -+ "pxor %%mm5, %%mm5\n\t" -+ "call mmx_reduce\n\t" -+ "pslld $3, %%mm0\n\t" -+ "paddd %%mm3, %%mm0\n\t" -+ "call mmx_reduce\n\t" -+ "movq %%mm0, %%mm3\n\t" -+ "movq %%mm4, %%mm0\n\t" -+ "call mmx_reduce\n\t" -+ "movq %%mm0, %%mm4\n\t" -+ "add %3, %4\n\t" -+ "mov %6, %3\n\t" -+ "cmp %3, %4\n\t" -+ "cmovb %4, %3\n" -+ "sub %3, %4\n\t" -+ "cmp $7, %3\n\t" -+ "jg 3b\n\t" -+ "movd %%mm4, %1\n\t" -+ "psrlq $32, %%mm4\n\t" -+ "movd %%mm3, %2\n\t" -+ "psrlq $32, %%mm3\n\t" -+ "movd %%mm4, %4\n\t" -+ "add %4, %1\n\t" -+ "movd %%mm3, %4\n\t" -+ "add %4, %2\n" -+ "emms\n\t" -+ "88:\n\t" -+ : /* %0 */ "=r" (buf), -+ /* %1 */ "=r" (s1), -+ /* %2 */ "=r" (s2), -+ /* %3 */ "=r" (k), -+ /* %4 */ "=r" (len) -+ : /* %5 */ "m" (vord), -+ /* %6 */ "i" ((5*NMAX)/2), -+ /* */ "0" (buf), -+ /* */ "1" (s1), -+ /* */ "2" (s2), -+ /* */ "3" (k), -+ /* */ "4" (len) -+ : "cc", "memory" -+# ifdef __MMX__ -+ , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" -+# endif -+ ); -+ -+ if (unlikely(k)) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ reduce(s1); -+ reduce(s2); -+ return (s2 << 16) | s1; -+} -+ -+/* ========================================================================= */ -+/* -+ * Processors which only have MMX will prop. not like this -+ * code, they are so old, they are not Out-Of-Order -+ * (maybe except AMD K6, Cyrix, Winchip/VIA). 
-+ * I did my best to get at least 1 instruction between result -> use -+ */ -+local uLong adler32_MMX(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned int s1 = adler & 0xffff; -+ unsigned int s2 = (adler >> 16) & 0xffff; -+ unsigned int k; -+ -+ k = ALIGN_DIFF(buf, 8); -+ len -= k; -+ if (k) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ -+ __asm__ __volatile__ ( -+ "mov %6, %3\n\t" -+ "cmp %3, %4\n\t" -+ "jae 11f\n\t" -+ "mov %4, %3\n" -+ "11:\n\t" -+ "sub %3, %4\n\t" -+ "cmp $8, %3\n\t" -+ "jb 88f\n\t" -+ "sub $8, %%esp\n\t" -+ "movd %1, %%mm4\n\t" -+ "movd %2, %%mm2\n\t" -+ "movq %5, %%mm3\n" -+ "33:\n\t" -+ "movq %%mm2, %%mm0\n\t" -+ "pxor %%mm2, %%mm2\n\t" -+ "pxor %%mm5, %%mm5\n\t" -+ ".p2align 2\n" -+ "3:\n\t" -+ "movq %%mm0, (%%esp)\n\t" -+ "pxor %%mm6, %%mm6\n\t" -+ "pxor %%mm7, %%mm7\n\t" -+ "mov $1024, %1\n\t" -+ "cmp %1, %3\n\t" -+ "jae 44f\n\t" -+ "mov %3, %1\n" -+ "44:\n\t" -+ "and $-8, %1\n\t" -+ "sub %1, %3\n\t" -+ "shr $3, %1\n\t" -+ ".p2align 4,,7\n" -+ ".p2align 3\n" -+ "1:\n\t" -+ "movq (%0), %%mm0\n\t" -+ "paddd %%mm4, %%mm5\n\t" -+ "add $8, %0\n\t" -+ "dec %1\n\t" -+ "movq %%mm0, %%mm1\n\t" -+ "punpcklbw %%mm2, %%mm0\n\t" -+ "punpckhbw %%mm2, %%mm1\n\t" -+ "paddw %%mm0, %%mm6\n\t" -+ "paddw %%mm1, %%mm0\n\t" -+ "paddw %%mm1, %%mm7\n\t" -+ "pmaddwd %%mm3, %%mm0\n\t" -+ "paddd %%mm0, %%mm4\n\t" -+ "jnz 1b\n\t" -+ "movq (%%esp), %%mm0\n\t" -+ "cmp $7, %3\n\t" -+ "pmaddwd 32+%5, %%mm6\n\t" -+ "pmaddwd 40+%5, %%mm7\n\t" -+ "paddd %%mm6, %%mm0\n\t" -+ "paddd %%mm7, %%mm0\n\t" -+ "jg 3b\n\t" -+ "movq %%mm0, %%mm2\n\t" -+ "movq %%mm5, %%mm0\n\t" -+ "call mmx_reduce\n\t" -+ "pslld $3, %%mm0\n\t" -+ "paddd %%mm2, %%mm0\n\t" -+ "call mmx_reduce\n\t" -+ "movq %%mm0, %%mm2\n\t" -+ "movq %%mm4, %%mm0\n\t" -+ "call mmx_reduce\n\t" -+ "movq %%mm0, %%mm4\n\t" -+ "add %3, %4\n\t" -+ "mov %6, %3\n\t" -+ "cmp %3, %4\n\t" -+ "jae 22f\n\t" -+ "mov %4, %3\n" -+ "22:\n\t" -+ "sub %3, %4\n\t" -+ "cmp $7, %3\n\t" -+ "jg 33b\n\t" -+ "add $8, %%esp\n\t" -+ "movd %%mm4, %1\n\t" -+ "psrlq $32, %%mm4\n\t" -+ "movd %%mm2, %2\n\t" -+ "psrlq $32, %%mm2\n\t" -+ "movd %%mm4, %4\n\t" -+ "add %4, %1\n\t" -+ "movd %%mm2, %4\n\t" -+ "add %4, %2\n" -+ "emms\n\t" -+ "88:\n\t" -+ : /* %0 */ "=r" (buf), -+ /* %1 */ "=r" (s1), -+ /* %2 */ "=r" (s2), -+ /* %3 */ "=r" (k), -+ /* %4 */ "=r" (len) -+ : /* %5 */ "m" (vord), -+ /* %6 */ "i" (4*NMAX), -+ /* */ "0" (buf), -+ /* */ "1" (s1), -+ /* */ "2" (s2), -+ /* */ "3" (k), -+ /* */ "4" (len) -+ : "cc", "memory" -+# ifdef __MMX__ -+ , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" -+# endif -+ ); -+ -+ if (unlikely(k)) -+ buf = adler32_jumped(buf, &s1, &s2, k); -+ reduce(s1); -+ reduce(s2); -+ return (s2 << 16) | s1; -+} -+# endif -+ -+/* ========================================================================= */ -+local uLong adler32_x86(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ unsigned int s1 = adler & 0xffff; -+ unsigned int s2 = (adler >> 16) & 0xffff; -+ unsigned int n; -+ -+ while (likely(len)) { -+# ifndef __x86_64__ -+# define LOOP_COUNT 4 -+# else -+# define LOOP_COUNT 8 -+# endif -+ unsigned int k; -+ n = len < NMAX ? len : NMAX; -+ len -= n; -+ k = n / LOOP_COUNT; -+ n %= LOOP_COUNT; -+ -+ if (likely(k)) do { -+ /* -+ * Modern compiler can do "wonders". -+ * Only if they would not "trick them self" sometime. -+ * This was unrolled 16 times not because someone -+ * anticipated autovectorizing compiler, but the -+ * classical "avoid loop overhead". 
-+ * -+ * But things get tricky if the compiler starts to see: -+ * "hey lets disambiguate one sum step from the other", -+ * the classical prevent-pipeline-stalls-thing. -+ * -+ * Suddenly we have 16 temporary sums, which unfortunatly -+ * blows x86 limited register set... -+ * -+ * Loopunrolling is also a little bad for the I-cache. -+ * -+ * So tune this down for x86. -+ * Instead we try to keep it in the register set. 4 sums fits -+ * into i386 register set with no framepointer. -+ * x86_64 is a little more splendit, but still we can not -+ * take 16, so take 8 sums. -+ */ -+ s1 += buf[0]; s2 += s1; -+ s1 += buf[1]; s2 += s1; -+ s1 += buf[2]; s2 += s1; -+ s1 += buf[3]; s2 += s1; -+# ifdef __x86_64__ -+ s1 += buf[4]; s2 += s1; -+ s1 += buf[5]; s2 += s1; -+ s1 += buf[6]; s2 += s1; -+ s1 += buf[7]; s2 += s1; -+# endif -+ buf += LOOP_COUNT; -+ } while(likely(--k)); -+ if (n) do { -+ s1 += *buf++; -+ s2 += s1; -+ } while (--n); -+ reduce_full(s1); -+ reduce_full(s2); -+ } -+ return (s2 << 16) | s1; -+} -+ -+/* ========================================================================= */ -+/* -+ * Knot it all together with a runtime switch -+ */ -+ -+/* Flags */ -+# define CFF_DEFAULT (1 << 0) -+/* Processor features */ -+# define CFEATURE_CMOV (15 + 0) -+# define CFEATURE_MMX (23 + 0) -+# define CFEATURE_SSE (25 + 0) -+# define CFEATURE_SSE2 (26 + 0) -+# define CFEATURE_SSSE3 ( 9 + 32) -+ -+# define CFB(x) (1 << ((x)%32)) -+ -+# define FEATURE_WORDS 2 -+ -+/* data structure */ -+struct test_cpu_feature -+{ -+ void (*func)(void); -+ int flags; -+ unsigned int features[FEATURE_WORDS]; -+}; -+ -+/* ========================================================================= */ -+/* -+ * Decision table -+ */ -+local const struct test_cpu_feature tfeat_adler32_vec[] = -+{ -+ /* func flags features */ -+ {(void (*)(void))adler32_SSSE3, 0, {CFB(CFEATURE_CMOV), CFB(CFEATURE_SSSE3)}}, -+ {(void (*)(void))adler32_SSE2, 0, {CFB(CFEATURE_SSE2)|CFB(CFEATURE_CMOV), 0}}, -+# ifndef __x86_64__ -+ {(void (*)(void))adler32_SSE, 0, {CFB(CFEATURE_SSE)|CFB(CFEATURE_CMOV), 0}}, -+ {(void (*)(void))adler32_MMX, 0, {CFB(CFEATURE_MMX), 0}}, -+# endif -+ {(void (*)(void))adler32_x86, CFF_DEFAULT, { 0, 0}}, -+}; -+ -+/* ========================================================================= */ -+/* Prototypes */ -+local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l); -+local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len); -+ -+/* ========================================================================= */ -+/* -+ * Runtime Function pointer -+ */ -+local uLong (*adler32_vec_ptr)(uLong adler, const Bytef *buf, uInt len) = adler32_vec_runtimesw; -+ -+/* ========================================================================= */ -+/* -+ * Constructor to init the pointer early -+ */ -+local GCC_ATTR_CONSTRUCTOR void adler32_vec_select(void) -+{ -+ adler32_vec_ptr = test_cpu_feature(tfeat_adler32_vec, sizeof (tfeat_adler32_vec)/sizeof (tfeat_adler32_vec[0])); -+} -+ -+/* ========================================================================= */ -+/* -+ * Jump function -+ */ -+local noinline uLong adler32_vec(adler, buf, len) -+ uLong adler; -+ const Bytef *buf; -+ uInt len; -+{ -+ return adler32_vec_ptr(adler, buf, len); -+} -+ -+/* ========================================================================= */ -+/* -+ * the runtime switcher is a little racy, it should normaly not run if the constructor works -+ */ -+local uLong adler32_vec_runtimesw(uLong adler, const 
Bytef *buf, uInt len) -+{ -+ adler32_vec_select(); -+ return adler32_vec(adler, buf, len); -+} -+ -+ -+/* ========================================================================= */ -+/* Internal data types */ -+struct cpuid_regs -+{ -+ unsigned long eax, ebx, ecx, edx; -+}; -+ -+local struct -+{ -+ unsigned int max_basic; -+ unsigned int features[FEATURE_WORDS]; -+ int init_done; -+} our_cpu; -+ -+/* ========================================================================= */ -+local inline unsigned long read_flags(void) -+{ -+ unsigned long f; -+ __asm__ __volatile__ ( -+ "pushf\n\t" -+ "pop %0\n\t" -+ : "=r" (f) -+ ); -+ return f; -+} -+ -+/* ========================================================================= */ -+local inline void write_flags(unsigned long f) -+{ -+ __asm__ __volatile__ ( -+ "push %0\n\t" -+ "popf\n\t" -+ : : "ri" (f) : "cc" -+ ); -+} -+ -+/* ========================================================================= */ -+local inline void cpuid(struct cpuid_regs *regs, unsigned long func) -+{ -+ /* save ebx around cpuid call, PIC code needs it */ -+ __asm__ __volatile__ ( -+ "xchg %1, " PICREG "\n\t" -+ "cpuid\n\t" -+ "xchg %1, " PICREG "\n" -+ : /* %0 */ "=a" (regs->eax), -+ /* %1 */ "=r" (regs->ebx), -+ /* %2 */ "=c" (regs->ecx), -+ /* %4 */ "=d" (regs->edx) -+ : /* %5 */ "0" (func), -+ /* %6 */ "2" (regs->ecx) -+ : "cc" -+ ); -+} -+ -+/* ========================================================================= */ -+local inline void cpuids(struct cpuid_regs *regs, unsigned long func) -+{ -+ regs->ecx = 0; -+ cpuid(regs, func); -+} -+ -+/* ========================================================================= */ -+local inline int toggle_eflags_test(const unsigned long mask) -+{ -+ unsigned long f; -+ int result; -+ -+ f = read_flags(); -+ write_flags(f ^ mask); -+ result = !!((f ^ read_flags()) & mask); -+ /* -+ * restore the old flags, the test for i486 tests the alignment -+ * check bit, and left set will confuse the x86 software world. -+ */ -+ write_flags(f); -+ return result; -+} -+ -+/* ========================================================================= */ -+local inline int is_486(void) -+{ -+ return toggle_eflags_test(1 << 18); -+} -+ -+/* ========================================================================= */ -+local inline int has_cpuid(void) -+{ -+ return toggle_eflags_test(1 << 21); -+} -+ -+/* ========================================================================= */ -+local void identify_cpu(void) -+{ -+ struct cpuid_regs a; -+ -+ if (our_cpu.init_done) -+ return; -+ -+ our_cpu.init_done = -1; -+ /* force a write out to memory */ -+ __asm__ __volatile__ ("" : : "m" (our_cpu.init_done)); -+ -+ if (!is_486()) -+ return; -+ -+ if (!has_cpuid()) -+ return; -+ -+ /* get the maximum basic leaf number */ -+ cpuids(&a, 0x00000000); -+ our_cpu.max_basic = (unsigned int)a.eax; -+ /* we could get the vendor string from ebx, edx, ecx */ -+ -+ /* get the first basic leaf, if it is avail. 
*/ -+ if (our_cpu.max_basic >= 0x00000001) -+ cpuids(&a, 0x00000001); -+ else -+ a.eax = a.ebx = a.ecx = a.edx = 0; -+ -+ /* we could extract family, model, stepping from eax */ -+ -+ /* there is the first set of features */ -+ our_cpu.features[0] = a.edx; -+ our_cpu.features[1] = a.ecx; -+ -+ /* now we could test the extended features, but is not needed, for now */ -+} -+ -+/* ========================================================================= */ -+local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l) -+{ -+ unsigned int i, j, f; -+ identify_cpu(); -+ -+ for (i = 0; i < l; i++) { -+ if (t[i].flags & CFF_DEFAULT) -+ return t[i].func; -+ for (f = 0, j = 0; j < FEATURE_WORDS; j++) -+ f |= (our_cpu.features[j] & t[i].features[j]) ^ t[i].features[j]; -+ if (f) -+ continue; -+ return t[i].func; -+ } -+ return NULL; /* die! */ -+} -+ -+#endif - diff --git a/zlib-1.2.5.tar.bz2 b/zlib-1.2.5.tar.bz2 deleted file mode 100644 index d589b2b..0000000 --- a/zlib-1.2.5.tar.bz2 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94e7564a74e24859a27c1d41d24fb9998356b2d3b659cd4810455c5e6668854b -size 486372 diff --git a/zlib-1.2.5_git201105121450.tar.bz2 b/zlib-1.2.5_git201105121450.tar.bz2 new file mode 100644 index 0000000..f701d70 --- /dev/null +++ b/zlib-1.2.5_git201105121450.tar.bz2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d25e8fdeba3b8244511db10c87f146c01373dcf61f4a39cb82b133e15340c6e4 +size 513566 diff --git a/zlib.changes b/zlib.changes index ba28ccc..bdfd407 100644 --- a/zlib.changes +++ b/zlib.changes @@ -1,3 +1,23 @@ +------------------------------------------------------------------- +Thu May 12 20:02:26 UTC 2011 - crrodriguez@opensuse.org + +- Update SSE patches, fixes bugs in PPC implementation +- X86 improvements. + +------------------------------------------------------------------- +Sat May 7 18:25:48 UTC 2011 - crrodriguez@opensuse.org + +- Update SSE2/MMX patches to their current version. + per request of the author. + * This are integrated now,including support for a number + of additional archs and fixes ARM patches bugs. + +------------------------------------------------------------------- +Mon Apr 18 18:02:50 UTC 2011 - crrodriguez@opensuse.org + +- Update SSE2/MMX patches tp version 3 + now with comments,performance numbers,and ia64 support + ------------------------------------------------------------------- Wed Mar 30 19:47:30 UTC 2011 - crrodriguez@opensuse.org diff --git a/zlib.spec b/zlib.spec index 0525442..47587e7 100644 --- a/zlib.spec +++ b/zlib.spec @@ -27,10 +27,11 @@ Obsoletes: libz Obsoletes: zlib-64bit %endif # -Version: 1.2.5 +Version: 1.2.5_git201105121450 Release: 11 Summary: Data Compression Library Url: http://www.zlib.net/ +# git://github.com/kaffeemonster/zlib_adler32_vec.git Source: zlib-%{version}.tar.bz2 Source1: LICENSE Source2: baselibs.conf @@ -40,10 +41,6 @@ Patch0: zlib-1.2.2-format.patch Patch1: zlib-lfs.patch # PATCH-FIX-JENGELH-PARALLEL-MAKE zlib-parallel.patch meissner@novell.com -- shared library links with libz.a Patch2: zlib-parallel.patch -Patch3: 01-prepare.patch -Patch4: 02-ppc_altivec.patch -Patch5: 03-arm.patch -Patch6: 04-x86.patch BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: pkgconfig @@ -88,10 +85,6 @@ libraries. %patch0 %patch1 %patch2 -p1 -%patch3 -%patch4 -%patch5 -%patch6 %build # Marcus: breaks example64 in 32bit builds.
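For reference, the arithmetic that the deleted vector kernels implement is a per-block identity rather than anything x86-specific: after a block of k bytes, s1 grows by the plain byte sum and s2 grows by k times the old s1 plus a positionally weighted byte sum (weights k down to 1). In the MMX/SSE hunks above, the psadbw / punpck+pmaddwd sequences appear to compute those two sums, and the "pslld $3" supplies the k*s1 term for 8-byte blocks. The portable C sketch below only checks the identity against the byte-at-a-time definition; the function names and test data are illustrative, not part of zlib's API.

/* minimal sketch: block-update identity for Adler-32 (illustrative only) */
#include <stdio.h>
#include <stddef.h>

#define BASE 65521UL            /* largest prime smaller than 65536 */

/* reference: one byte at a time */
static unsigned long adler32_scalar(unsigned long adler,
                                    const unsigned char *buf, size_t len)
{
    unsigned long s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
    while (len--) {
        s1 = (s1 + *buf++) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    return (s2 << 16) | s1;
}

/* blocked: fold k bytes into (s1, s2) in one step, as the SIMD code does
 * per 8- or 16-byte chunk */
static unsigned long adler32_blocked(unsigned long adler,
                                     const unsigned char *buf, size_t len,
                                     size_t blk)
{
    unsigned long s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
    while (len) {
        size_t k = len < blk ? len : blk, i;
        unsigned long bsum = 0, wsum = 0;   /* plain and weighted byte sums */
        for (i = 0; i < k; i++) {
            bsum += buf[i];
            wsum += (unsigned long)(k - i) * buf[i];
        }
        s2 = (s2 + k * s1 + wsum) % BASE;   /* k copies of the old s1 */
        s1 = (s1 + bsum) % BASE;
        buf += k;
        len -= k;
    }
    return (s2 << 16) | s1;
}

int main(void)
{
    unsigned char data[1000];
    size_t i;
    for (i = 0; i < sizeof(data); i++)
        data[i] = (unsigned char)(i * 37 + 11);
    printf("scalar  %08lx\n", adler32_scalar(1, data, sizeof(data)));
    printf("blocked %08lx\n", adler32_blocked(1, data, sizeof(data), 8));
    return 0;                   /* both lines should print the same value */
}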
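The plain-integer fallback (adler32_x86) additionally leans on zlib's usual deferred reduction: starting from sums below BASE, up to NMAX = 5552 bytes can be accumulated before a single modulo, NMAX being the largest n for which 255*n*(n+1)/2 + (n+1)*(BASE-1) still fits in 32 bits, and the unroll factor is chosen by register budget (4 sums on i386, 8 on x86_64) rather than the classic unroll-by-16. A compact sketch of that shape with the 4-way unroll follows; the NMAX value is assumed to match stock zlib and is not copied from the deleted hunk.

/* sketch: deferred-modulo Adler-32 with a register-friendly unroll */
#include <stddef.h>

#define BASE 65521UL
#define NMAX 5552   /* largest n: 255*n*(n+1)/2 + (n+1)*(BASE-1) <= 2^32-1 */

unsigned long adler32_deferred(unsigned long adler,
                               const unsigned char *buf, size_t len)
{
    unsigned long s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;

    while (len) {
        size_t n = len < NMAX ? len : NMAX;
        len -= n;
        /* 4 byte/sum steps per turn fit the i386 register set;
         * a 64-bit build could take 8, as the deleted comment explains */
        while (n >= 4) {
            s1 += buf[0]; s2 += s1;
            s1 += buf[1]; s2 += s1;
            s1 += buf[2]; s2 += s1;
            s1 += buf[3]; s2 += s1;
            buf += 4;
            n -= 4;
        }
        while (n--) {
            s1 += *buf++;
            s2 += s1;
        }
        s1 %= BASE;   /* at most NMAX additions since the last reduction */
        s2 %= BASE;
    }
    return (s2 << 16) | s1;
}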
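The decision table, function pointer and constructor amount to select-once dispatch. On toolchains that provide them, GCC's __builtin_cpu_init() / __builtin_cpu_supports() (GCC 4.8+ and clang, x86 only) cover the same ground without hand-written CPUID. The sketch below is a hedged equivalent with stand-in kernels and illustrative names, not the selector from the deleted patch.

/* sketch: runtime selection of an Adler-32 kernel, chosen once */
#include <stddef.h>

typedef unsigned long (*adler32_fn)(unsigned long adler,
                                    const unsigned char *buf, size_t len);

/* stand-in kernels; in the real patch these are the SSSE3/SSE2/MMX bodies */
static unsigned long adler32_plain(unsigned long adler,
                                   const unsigned char *buf, size_t len)
{
    unsigned long s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
    while (len--) {
        s1 = (s1 + *buf++) % 65521UL;
        s2 = (s2 + s1) % 65521UL;
    }
    return (s2 << 16) | s1;
}
static unsigned long adler32_ssse3(unsigned long a, const unsigned char *b, size_t l)
{ return adler32_plain(a, b, l); }
static unsigned long adler32_sse2(unsigned long a, const unsigned char *b, size_t l)
{ return adler32_plain(a, b, l); }

static adler32_fn adler32_impl;

/* runs before main(), mirroring the GCC_ATTR_CONSTRUCTOR approach */
__attribute__((constructor))
static void adler32_select(void)
{
    __builtin_cpu_init();
    if (__builtin_cpu_supports("ssse3"))
        adler32_impl = adler32_ssse3;
    else if (__builtin_cpu_supports("sse2"))
        adler32_impl = adler32_sse2;
    else
        adler32_impl = adler32_plain;   /* the CFF_DEFAULT fallback */
}

unsigned long adler32_dispatch(unsigned long adler,
                               const unsigned char *buf, size_t len)
{
    if (!adler32_impl)                  /* in case the constructor was skipped */
        adler32_select();
    return adler32_impl(adler, buf, len);
}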
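The hand-rolled probe in identify_cpu() only matters on 32-bit x86, where CPUID may be absent: toggling EFLAGS bit 18 (AC) separates a 386 from a 486, toggling bit 21 (ID) confirms CPUID support, and only then are leaf 0 (maximum basic leaf) and leaf 1 (feature words in EDX/ECX) read. Below is a self-contained GCC inline-asm sketch of just that flag-toggle test, i386 only, since on x86-64 CPUID is architecturally guaranteed; the helper names are illustrative.

/* sketch: can a given EFLAGS bit be toggled? (i386, GCC-style asm) */
#if defined(__i386__) && defined(__GNUC__)
static int flag_bit_toggles(unsigned long mask)
{
    unsigned long f, g;

    __asm__ __volatile__ ("pushf\n\tpop %0"  : "=r" (f));
    __asm__ __volatile__ ("push %0\n\tpopf"  : : "ri" (f ^ mask) : "cc");
    __asm__ __volatile__ ("pushf\n\tpop %0"  : "=r" (g));
    /* restore the original flags; a stray AC bit confuses old software */
    __asm__ __volatile__ ("push %0\n\tpopf"  : : "ri" (f) : "cc");
    return ((f ^ g) & mask) != 0;
}

static int cpu_has_cpuid(void)
{
    return flag_bit_toggles(1UL << 18)   /* AC flag: at least a 486 */
        && flag_bit_toggles(1UL << 21);  /* ID flag: CPUID available */
}
#endif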