diff --git a/01-prepare.patch b/01-prepare.patch new file mode 100644 index 0000000..f04d874 --- /dev/null +++ b/01-prepare.patch @@ -0,0 +1,423 @@ +=== modified file 'Makefile.in' +--- Makefile.in 2011-03-14 01:01:37 +0000 ++++ Makefile.in 2011-03-14 02:19:21 +0000 +@@ -236,7 +236,8 @@ + + # DO NOT DELETE THIS LINE -- make depend depends on it. + +-adler32.o zutil.o: zutil.h zlib.h zconf.h ++adler32.o: adler32.c zutil.h zlib.h zconf.h ++zutil.o: zutil.h zlib.h zconf.h + gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h + compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h + crc32.o: zutil.h zlib.h zconf.h crc32.h +@@ -246,7 +247,8 @@ + inftrees.o: zutil.h zlib.h zconf.h inftrees.h + trees.o: deflate.h zutil.h zlib.h zconf.h trees.h + +-adler32.lo zutil.lo: zutil.h zlib.h zconf.h ++adler32.lo: adler32.c zutil.h zlib.h zconf.h ++zutil.lo: zutil.h zlib.h zconf.h + gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h + compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h + crc32.lo: zutil.h zlib.h zconf.h crc32.h + +=== modified file 'adler32.c' +--- adler32.c 2011-03-14 01:01:37 +0000 ++++ adler32.c 2011-03-30 13:38:42 +0000 +@@ -9,6 +9,35 @@ + + #define local static + ++#define GCC_VERSION_GE(x) ((__GNUC__-0) * 100 + __GNUC_MINOR__-0 >= x) ++ ++#if GCC_VERSION_GE(301) ++/* sometimes leakes out of old kernel header */ ++# undef noinline ++# define noinline __attribute__((__noinline__)) ++#else ++# ifndef noinline ++# define noinline ++# endif ++#endif ++ ++#if GCC_VERSION_GE(301) ++# define GCC_ATTR_UNUSED_PARAM __attribute__((__unused__)) ++#else ++# define GCC_ATTR_UNUSED_PARAM ++#endif ++ ++#if GCC_VERSION_GE(296) ++# define likely(x) __builtin_expect(!!(x), 1) ++# define unlikely(x) __builtin_expect(!!(x), 0) ++#else ++# define likely(x) (x) ++# define unlikely(x) (x) ++#endif ++ ++#define ROUND_TO(x , n) ((x) & ~((n) - 1L)) ++#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x)) ++ + local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2); + + #define BASE 65521UL /* largest prime smaller than 65536 */ +@@ -21,9 +50,20 @@ + #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); + #define DO16(buf) DO8(buf,0); DO8(buf,8); + ++#if defined(__alpha__) ++/* even if gcc can generate a mul by inverse, the code is really ++ * ugly (find global const pool pointer, load constant, a mul, lots ++ * of shifts/add/sub), up to 14 instructions. The replacement code ++ * only needs >= 5 instructions ++ */ ++# define NO_DIVIDE ++#endif ++ + /* use NO_DIVIDE if your processor does not do division in hardware */ + #ifdef NO_DIVIDE +-# define MOD(a) \ ++/* use NO_SHIFT if your processor does shift > 1 by loop */ ++# ifdef NO_SHIFT ++# define reduce_full(a) \ + do { \ + if (a >= (BASE << 16)) a -= (BASE << 16); \ + if (a >= (BASE << 15)) a -= (BASE << 15); \ +@@ -43,21 +83,237 @@ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) +-# define MOD4(a) \ ++# define reduce_x(a) \ + do { \ ++ if (MIN_WORK >= (1 << 6) && a >= (BASE << 6)) a -= (BASE << 6); \ ++ if (MIN_WORK >= (1 << 5) && a >= (BASE << 5)) a -= (BASE << 5); \ + if (a >= (BASE << 4)) a -= (BASE << 4); \ + if (a >= (BASE << 3)) a -= (BASE << 3); \ + if (a >= (BASE << 2)) a -= (BASE << 2); \ + if (a >= (BASE << 1)) a -= (BASE << 1); \ + if (a >= BASE) a -= BASE; \ + } while (0) ++# define reduce(a) reduce_full(a) ++# else ++# define reduce_full(a) \ ++ do { \ ++ unsigned long b = a & 0x0000ffff; \ ++ a >>= 16; \ ++ b -= a; \ ++ a <<= 4; \ ++ a += b; \ ++ } while(a >= BASE) ++# define reduce_x(a) \ ++ do { \ ++ unsigned long b = a & 0x0000ffff; \ ++ a >>= 16; \ ++ b -= a; \ ++ a <<= 4; \ ++ a += b; \ ++ a = a >= BASE ? a - BASE : a; \ ++ } while(0) ++# define reduce(a) \ ++ do { \ ++ unsigned long b = a & 0x0000ffff; \ ++ a >>= 16; \ ++ b -= a; \ ++ a <<= 4; \ ++ a += b; \ ++ } while(0) ++# endif + #else +-# define MOD(a) a %= BASE +-# define MOD4(a) a %= BASE +-#endif +- +-/* ========================================================================= */ +-uLong ZEXPORT adler32(adler, buf, len) ++# define reduce_full(a) a %= BASE ++# define reduce_x(a) a %= BASE ++# define reduce(a) a %= BASE ++#endif ++ ++local int host_is_bigendian() ++{ ++ local const union { ++ uInt d; ++ unsigned char endian[sizeof(uInt)]; ++ } x = {1}; ++ return x.endian[0] == 0; ++} ++ ++#ifndef MIN_WORK ++# define MIN_WORK 16 ++#endif ++ ++/* ========================================================================= */ ++local noinline uLong adler32_1(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len GCC_ATTR_UNUSED_PARAM; ++{ ++ unsigned long sum2; ++ ++ /* split Adler-32 into component sums */ ++ sum2 = (adler >> 16) & 0xffff; ++ adler &= 0xffff; ++ ++ adler += buf[0]; ++ if (adler >= BASE) ++ adler -= BASE; ++ sum2 += adler; ++ if (sum2 >= BASE) ++ sum2 -= BASE; ++ return adler | (sum2 << 16); ++} ++ ++/* ========================================================================= */ ++local noinline uLong adler32_common(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned long sum2; ++ ++ /* split Adler-32 into component sums */ ++ sum2 = (adler >> 16) & 0xffff; ++ adler &= 0xffff; ++ ++ while (len--) { ++ adler += *buf++; ++ sum2 += adler; ++ } ++ if (adler >= BASE) ++ adler -= BASE; ++ reduce_x(sum2); /* only added so many BASE's */ ++ return adler | (sum2 << 16); ++} ++ ++#ifndef HAVE_ADLER32_VEC ++# if (defined(__LP64__) || ((SIZE_MAX-0) >> 31) >= 2) && !defined(NO_ADLER32_VEC) ++ ++/* On 64 Bit archs, we can do pseudo SIMD with a nice win. ++ * This is esp. important for old Alphas, they do not have byte ++ * access. ++ * This needs some register but x86_64 is fine (>= 9 for the mainloop ++ * req.). If your 64 Bit arch is more limited, throw it away... ++ */ ++# ifndef UINT64_C ++# if defined(_MSC_VER) || defined(__BORLANDC__) ++# define UINT64_C(c) (c ## ui64) ++# else ++# define UINT64_C(c) (c ## ULL) ++# endif ++# endif ++ ++# undef VNMAX ++# define VNMAX (2*NMAX+((9*NMAX)/10)) ++ ++/* ========================================================================= */ ++local noinline uLong adler32_vec(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned int s1, s2; ++ unsigned int k; ++ ++ /* split Adler-32 into component sums */ ++ s1 = adler & 0xffff; ++ s2 = (adler >> 16) & 0xffff; ++ ++ /* align input data */ ++ k = ALIGN_DIFF(buf, sizeof(size_t)); ++ len -= k; ++ if (k) do { ++ s1 += *buf++; ++ s2 += s1; ++ } while(--k); ++ ++ k = len > VNMAX ? VNMAX : len; ++ len -= k; ++ if (likely(k >= 2 * sizeof(size_t))) do ++ { ++ unsigned int vs1, vs2; ++ unsigned int vs1s; ++ ++ /* add s1 to s2 for rounds to come */ ++ s2 += s1 * ROUND_TO(k, sizeof(size_t)); ++ vs1s = vs1 = vs2 = 0; ++ do { ++ size_t vs1l = 0, vs1h = 0, vs1l_s = 0, vs1h_s = 0; ++ unsigned int a, b, c, d, e, f, g, h; ++ unsigned int j; ++ ++ j = k > 23 * sizeof(size_t) ? 23 : k/sizeof(size_t); ++ k -= j * sizeof(size_t); ++ /* add s1 to s1 round sum for rounds to come */ ++ vs1s += j * vs1; ++ do { ++ size_t in8 = *(const size_t *)buf; ++ buf += sizeof(size_t); ++ /* add this s1 to s1 round sum */ ++ vs1l_s += vs1l; ++ vs1h_s += vs1h; ++ /* add up input data to s1 */ ++ vs1l += in8 & UINT64_C(0x00ff00ff00ff00ff); ++ vs1h += (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8; ++ } while(--j); ++ ++ /* split s1 */ ++ if(host_is_bigendian()) { ++ a = (vs1h >> 48) & 0x0000ffff; ++ b = (vs1l >> 48) & 0x0000ffff; ++ c = (vs1h >> 32) & 0x0000ffff; ++ d = (vs1l >> 32) & 0x0000ffff; ++ e = (vs1h >> 16) & 0x0000ffff; ++ f = (vs1l >> 16) & 0x0000ffff; ++ g = (vs1h ) & 0x0000ffff; ++ h = (vs1l ) & 0x0000ffff; ++ } else { ++ a = (vs1l ) & 0x0000ffff; ++ b = (vs1h ) & 0x0000ffff; ++ c = (vs1l >> 16) & 0x0000ffff; ++ d = (vs1h >> 16) & 0x0000ffff; ++ e = (vs1l >> 32) & 0x0000ffff; ++ f = (vs1h >> 32) & 0x0000ffff; ++ g = (vs1l >> 48) & 0x0000ffff; ++ h = (vs1h >> 48) & 0x0000ffff; ++ } ++ ++ /* add s1 & s2 horiz. */ ++ vs2 += 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + 1*h; ++ vs1 += a + b + c + d + e + f + g + h; ++ ++ /* split and add up s1 round sum */ ++ vs1l_s = ((vs1l_s ) & UINT64_C(0x0000ffff0000ffff)) + ++ ((vs1l_s >> 16) & UINT64_C(0x0000ffff0000ffff)); ++ vs1h_s = ((vs1h_s ) & UINT64_C(0x0000ffff0000ffff)) + ++ ((vs1h_s >> 16) & UINT64_C(0x0000ffff0000ffff)); ++ vs1l_s += vs1h_s; ++ vs1s += ((vs1l_s ) & UINT64_C(0x00000000ffffffff)) + ++ ((vs1l_s >> 32) & UINT64_C(0x00000000ffffffff)); ++ } while (k >= sizeof(size_t)); ++ reduce(vs1s); ++ s2 += vs1s * 8 + vs2; ++ reduce(s2); ++ s1 += vs1; ++ reduce(s1); ++ len += k; ++ k = len > VNMAX ? VNMAX : len; ++ len -= k; ++ } while (k >= sizeof(size_t)); ++ ++ /* handle trailer */ ++ if (k) do { ++ s1 += *buf++; ++ s2 += s1; ++ } while (--k); ++ reduce(s1); ++ reduce(s2); ++ ++ /* return recombined sums */ ++ return (s2 << 16) | s1; ++} ++ ++# else ++ ++/* ========================================================================= */ ++local noinline uLong adler32_vec(adler, buf, len) + uLong adler; + const Bytef *buf; + uInt len; +@@ -69,33 +325,6 @@ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + +- /* in case user likes doing a byte at a time, keep it fast */ +- if (len == 1) { +- adler += buf[0]; +- if (adler >= BASE) +- adler -= BASE; +- sum2 += adler; +- if (sum2 >= BASE) +- sum2 -= BASE; +- return adler | (sum2 << 16); +- } +- +- /* initial Adler-32 value (deferred check for len == 1 speed) */ +- if (buf == Z_NULL) +- return 1L; +- +- /* in case short lengths are provided, keep it somewhat fast */ +- if (len < 16) { +- while (len--) { +- adler += *buf++; +- sum2 += adler; +- } +- if (adler >= BASE) +- adler -= BASE; +- MOD4(sum2); /* only added so many BASE's */ +- return adler | (sum2 << 16); +- } +- + /* do length NMAX blocks -- requires just one modulo operation */ + while (len >= NMAX) { + len -= NMAX; +@@ -104,8 +333,8 @@ + DO16(buf); /* 16 sums unrolled */ + buf += 16; + } while (--n); +- MOD(adler); +- MOD(sum2); ++ reduce_full(adler); ++ reduce_full(sum2); + } + + /* do remaining bytes (less than NMAX, still just one modulo) */ +@@ -119,13 +348,36 @@ + adler += *buf++; + sum2 += adler; + } +- MOD(adler); +- MOD(sum2); ++ reduce_full(adler); ++ reduce_full(sum2); + } + + /* return recombined sums */ + return adler | (sum2 << 16); + } ++# endif ++#endif ++ ++/* ========================================================================= */ ++uLong ZEXPORT adler32(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ /* in case user likes doing a byte at a time, keep it fast */ ++ if (len == 1) ++ return adler32_1(adler, buf, len); /* should create a fast tailcall */ ++ ++ /* initial Adler-32 value (deferred check for len == 1 speed) */ ++ if (buf == Z_NULL) ++ return 1L; ++ ++ /* in case short lengths are provided, keep it somewhat fast */ ++ if (len < MIN_WORK) ++ return adler32_common(adler, buf, len); ++ ++ return adler32_vec(adler, buf, len); ++} + + /* ========================================================================= */ + local uLong adler32_combine_(adler1, adler2, len2) +@@ -141,7 +393,7 @@ + rem = (unsigned)(len2 % BASE); + sum1 = adler1 & 0xffff; + sum2 = rem * sum1; +- MOD(sum2); ++ reduce_full(sum2); + sum1 += (adler2 & 0xffff) + BASE - 1; + sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; + if (sum1 >= BASE) sum1 -= BASE; diff --git a/02-ppc_altivec.patch b/02-ppc_altivec.patch new file mode 100644 index 0000000..5aabbc3 --- /dev/null +++ b/02-ppc_altivec.patch @@ -0,0 +1,307 @@ +=== modified file 'Makefile.in' +--- Makefile.in 2011-03-14 02:19:21 +0000 ++++ Makefile.in 2011-03-14 03:06:03 +0000 +@@ -236,7 +236,7 @@ + + # DO NOT DELETE THIS LINE -- make depend depends on it. + +-adler32.o: adler32.c zutil.h zlib.h zconf.h ++adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h + zutil.o: zutil.h zlib.h zconf.h + gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h + compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h +@@ -247,7 +247,7 @@ + inftrees.o: zutil.h zlib.h zconf.h inftrees.h + trees.o: deflate.h zutil.h zlib.h zconf.h trees.h + +-adler32.lo: adler32.c zutil.h zlib.h zconf.h ++adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h + zutil.lo: zutil.h zlib.h zconf.h + gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h + compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h + +=== modified file 'adler32.c' +--- adler32.c 2011-03-30 13:38:42 +0000 ++++ adler32.c 2011-03-30 13:38:46 +0000 +@@ -36,7 +36,10 @@ + #endif + + #define ROUND_TO(x , n) ((x) & ~((n) - 1L)) ++#define DIV_ROUNDUP(a, b) (((a) + (b) - 1) / (b)) + #define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x)) ++#define ALIGN_DOWN(x, n) (((intptr_t)(x)) & ~((intptr_t)(n) - 1L)) ++#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L)) + + local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2); + +@@ -136,6 +139,12 @@ + return x.endian[0] == 0; + } + ++#ifndef NO_ADLER32_VEC ++# if defined(__powerpc__) || defined(__powerpc64__) ++# include "adler32_ppc.c" ++# endif ++#endif ++ + #ifndef MIN_WORK + # define MIN_WORK 16 + #endif + +=== added file 'adler32_ppc.c' +--- adler32_ppc.c 1970-01-01 00:00:00 +0000 ++++ adler32_ppc.c 2011-03-30 11:12:04 +0000 +@@ -0,0 +1,253 @@ ++/* ++ * adler32.c -- compute the Adler-32 checksum of a data stream ++ * ppc implementation ++ * Copyright (C) 1995-2007 Mark Adler ++ * Copyright (C) 2009-2011 Jan Seiffert ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++/* @(#) $Id$ */ ++ ++/* ++ * We use the Altivec PIM vector stuff, but still, this is only ++ * tested with GCC, and prop. uses some GCC specifics (like GCC ++ * understands vector types and you can simply write a += b) ++ */ ++#if defined(__ALTIVEC__) && defined(__GNUC__) ++# define HAVE_ADLER32_VEC ++/* it needs some bytes till the vec version gets up to speed... */ ++# define MIN_WORK 64 ++# include ++ ++/* ++ * Depending on length, this can be slower (short length < 64 bytes), ++ * much faster (our beloved 128kb 22.2s generic to 3.4s vec, but cache ++ * is important...), to a little faster (very long length, 1.6MB, 47.6s ++ * to 36s), which is prop. only capped by memory bandwith. ++ * (The orig. 128k case was slower in AltiVec, because AltiVec loads ++ * are always uncached and trigger no HW prefetching, because that is ++ * what you often need with mass data manipulation (not poisen your ++ * cache, movntq), instead you have to do it for yourself (data stream ++ * touch). With 128k it could be cleanly seen: no prefetch, half as slow ++ * as generic, but comment out the memory load -> 3s. With proper prefetch ++ * we are at 3.4s. So AltiVec can execute these "expensive" FMA quite ++ * fast (even without fancy unrolling), only the data does not arrive ++ * fast enough. In cases where the working set does not fit into cache ++ * it simply cannot be delivered fast enough over the FSB/Mem). ++ * Still we have to prefetch, or we are slow as hell. ++ */ ++ ++# define SOVUC (sizeof(vector unsigned char)) ++ ++/* can be propably more, since we do not have the x86 psadbw 64 bit sum */ ++# define VNMAX (6*NMAX) ++ ++/* ========================================================================= */ ++local inline vector unsigned char vec_identl(level) ++ unsigned int level; ++{ ++ return vec_lvsl(level, (const unsigned char *)0); ++} ++ ++/* ========================================================================= */ ++local inline vector unsigned char vec_ident_rev(void) ++{ ++ return vec_xor(vec_identl(0), vec_splat_u8(15)); ++} ++ ++/* ========================================================================= */ ++/* multiply two 32 bit ints, return the low 32 bit */ ++local inline vector unsigned int vec_mullw(vector unsigned int a, vector unsigned int b) ++{ ++ vector unsigned int v16 = vec_splat_u32(-16); ++ vector unsigned int v0_32 = vec_splat_u32(0); ++ vector unsigned int swap, low, high; ++ ++ swap = vec_rl(b, v16); ++ low = vec_mulo((vector unsigned short)a, (vector unsigned short)b); ++ high = vec_msum((vector unsigned short)a, (vector unsigned short)swap, v0_32); ++ high = vec_sl(high, v16); ++ return vec_add(low, high); ++} ++ ++/* ========================================================================= */ ++local inline vector unsigned int vector_reduce(vector unsigned int x) ++{ ++ vector unsigned int y; ++ vector unsigned int vsh; ++ ++ vsh = vec_splat_u32(1); ++ vsh = vec_sl(vsh, vec_splat_u32(4)); ++ ++ y = vec_sl(x, vsh); ++ y = vec_sr(y, vsh); ++ x = vec_sr(x, vsh); ++ y = vec_sub(y, x); ++ x = vec_sl(x, vec_splat_u32(4)); ++ x = vec_add(x, y); ++ return x; ++} ++ ++/* ========================================================================= */ ++local noinline uLong adler32_vec(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned int s1, s2; ++ ++ s1 = adler & 0xffff; ++ s2 = (adler >> 16) & 0xffff; ++ ++ if (likely(len >= 2*SOVUC)) { ++ vector unsigned int v0_32 = vec_splat_u32(0); ++ vector unsigned int vsh = vec_splat_u32(4); ++ vector unsigned char v1 = vec_splat_u8(1); ++ vector unsigned char vord; ++ vector unsigned char v0 = vec_splat_u8(0); ++ vector unsigned int vs1, vs2; ++ vector unsigned char in16, vord_a, v1_a, vperm; ++ unsigned int f, n; ++ unsigned int k, block_num; ++ ++ /* ++ * if i understand the Altivec PEM right, little ++ * endian impl. should have the data reversed on ++ * load, so the big endian vorder works. ++ */ ++ vord = vec_ident_rev() + v1; ++ block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */ ++ f = 512; ++ f |= block_num >= 256 ? 0 : block_num << 16; ++ vec_dst(buf, f, 2); ++ /* ++ * Add stuff to achieve alignment ++ */ ++ /* swizzle masks in place */ ++ vperm = vec_lvsl(0, buf); ++ vord_a = vec_perm(vord, v0, vperm); ++ v1_a = vec_perm(v1, v0, vperm); ++ vperm = vec_lvsr(0, buf); ++ vord_a = vec_perm(v0, vord_a, vperm); ++ v1_a = vec_perm(v0, v1_a, vperm); ++ ++ /* align hard down */ ++ f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUC); ++ n = SOVUC - f; ++ buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUC); ++ ++ /* add n times s1 to s2 for start round */ ++ s2 += s1 * n; ++ ++ /* set sums 0 */ ++ vs1 = v0_32; ++ vs2 = v0_32; ++ ++ k = len < VNMAX ? (unsigned)len : VNMAX; ++ len -= k; ++ ++ /* insert scalar start somewhere */ ++ vs1 = vec_lde(0, &s1); ++ vs2 = vec_lde(0, &s2); ++ ++ /* get input data */ ++ in16 = vec_ldl(0, buf); ++ ++ /* mask out excess data, add 4 byte horizontal and add to old dword */ ++ vs1 = vec_msum(in16, v1_a, vs1); ++ ++ /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */ ++ vs2 = vec_msum(in16, vord_a, vs2); ++ ++ buf += SOVUC; ++ k -= n; ++ ++ if (likely(k >= SOVUC)) do { ++ vector unsigned int vs1_r = v0_32; ++ f = 512; ++ f |= block_num >= 256 ? 0 : block_num << 16; ++ vec_dst(buf, f, 2); ++ ++ do { ++ /* get input data */ ++ in16 = vec_ldl(0, buf); ++ ++ /* add vs1 for this round */ ++ vs1_r += vs1; ++ ++ /* add 4 byte horizontal and add to old dword */ ++ vs1 = vec_sum4s(in16, vs1); ++ /* apply order, add 4 byte horizontal and add to old dword */ ++ vs2 = vec_msum(in16, vord, vs2); ++ ++ buf += SOVUC; ++ k -= SOVUC; ++ } while (k >= SOVUC); ++ /* reduce vs1 round sum before multiplying by 16 */ ++ vs1_r = vector_reduce(vs1_r); ++ /* add all vs1 for 16 times */ ++ vs2 += vec_sl(vs1_r, vsh); ++ /* reduce the vectors to something in the range of BASE */ ++ vs2 = vector_reduce(vs2); ++ vs1 = vector_reduce(vs1); ++ len += k; ++ k = len < VNMAX ? (unsigned)len : VNMAX; ++ block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */ ++ len -= k; ++ } while (likely(k >= SOVUC)); ++ ++ if (likely(k)) { ++ vector unsigned int vk; ++ /* ++ * handle trailer ++ */ ++ f = SOVUC - k; ++ /* swizzle masks in place */ ++ vperm = vec_identl(f); ++ vord_a = vec_perm(vord, v0, vperm); ++ v1_a = vec_perm(v1, v0, vperm); ++ ++ /* add k times vs1 for this trailer */ ++ vk = (vector unsigned int)vec_lvsl(0, (unsigned *)(intptr_t)k); ++ vk = (vector unsigned)vec_mergeh(v0, (vector unsigned char)vk); ++ vk = (vector unsigned)vec_mergeh((vector unsigned short)v0, (vector unsigned short)vk); ++ vk = vec_splat(vk, 0); ++ vs2 += vec_mullw(vs1, vk); ++ ++ /* get input data */ ++ in16 = vec_ldl(0, buf); ++ ++ /* mask out excess data, add 4 byte horizontal and add to old dword */ ++ vs1 = vec_msum(in16, v1_a, vs1); ++ /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */ ++ vs2 = vec_msum(in16, vord_a, vs2); ++ ++ buf += k; ++ k -= k; ++ } ++ ++ vec_dss(2); ++ ++ /* add horizontal */ ++ /* stuff should be reduced so no proplem with signed sature */ ++ vs1 = (vector unsigned)vec_sums((vector int)vs1, (vector int)v0_32); ++ vs2 = (vector unsigned)vec_sums((vector int)vs2, (vector int)v0_32); ++ /* shake and roll */ ++ vs1 = vec_splat(vs1, 3); ++ vs2 = vec_splat(vs2, 3); ++ vec_ste(vs1, 0, &s1); ++ vec_ste(vs2, 0, &s2); ++ /* after horizontal add, reduce again in scalar code */ ++ } ++ ++ if (unlikely(len)) do { ++ s1 += *buf++; ++ s2 += s1; ++ } while (--len); ++ reduce(s1); ++ reduce(s2); ++ ++ return (s2 << 16) | s1; ++} ++ ++#endif diff --git a/03-arm.patch b/03-arm.patch new file mode 100644 index 0000000..6b9878b --- /dev/null +++ b/03-arm.patch @@ -0,0 +1,400 @@ +=== modified file 'Makefile.in' +--- Makefile.in 2011-03-14 03:06:03 +0000 ++++ Makefile.in 2011-03-14 14:39:24 +0000 +@@ -236,7 +236,7 @@ + + # DO NOT DELETE THIS LINE -- make depend depends on it. + +-adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h ++adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h + zutil.o: zutil.h zlib.h zconf.h + gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h + compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h +@@ -247,7 +247,7 @@ + inftrees.o: zutil.h zlib.h zconf.h inftrees.h + trees.o: deflate.h zutil.h zlib.h zconf.h trees.h + +-adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h ++adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h + zutil.lo: zutil.h zlib.h zconf.h + gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h + compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h + +=== modified file 'adler32.c' +--- adler32.c 2011-03-30 13:38:46 +0000 ++++ adler32.c 2011-03-30 13:38:46 +0000 +@@ -140,7 +140,9 @@ + } + + #ifndef NO_ADLER32_VEC +-# if defined(__powerpc__) || defined(__powerpc64__) ++# if defined(__arm__) ++# include "adler32_arm.c" ++# elif defined(__powerpc__) || defined(__powerpc64__) + # include "adler32_ppc.c" + # endif + #endif + +=== added file 'adler32_arm.c' +--- adler32_arm.c 1970-01-01 00:00:00 +0000 ++++ adler32_arm.c 2011-03-30 11:18:49 +0000 +@@ -0,0 +1,359 @@ ++/* ++ * adler32.c -- compute the Adler-32 checksum of a data stream ++ * arm implementation ++ * Copyright (C) 1995-2007 Mark Adler ++ * Copyright (C) 2009-2011 Jan Seiffert ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++/* @(#) $Id$ */ ++ ++#if defined(__ARM_NEON__) ++// TODO: need byte order define ++/* ++ * Big endian NEON qwords are kind of broken. ++ * They are big endian within the dwords, but WRONG ++ * (really??) way round between lo and hi. ++ * Creating some kind of PDP11 middle endian. ++ * ++ * This is madness and unsupportable. For this reason ++ * GCC wants to disable qword endian specific patterns. ++ * We would need a Preprocessor define which endian we ++ * have to disable this code. ++ */ ++# include ++ ++# define SOVUCQ sizeof(uint8x16_t) ++# define SOVUC sizeof(uint8x8_t) ++/* since we do not have the 64bit psadbw sum, we could prop. do a little more */ ++# define VNMAX (6*NMAX) ++# define HAVE_ADLER32_VEC ++# define MIN_WORK 32 ++ ++/* ========================================================================= */ ++local inline uint8x16_t neon_simple_alignq(uint8x16_t a, uint8x16_t b, unsigned amount) ++{ ++ switch(amount % SOVUCQ) ++ { ++ case 0: return a; ++ case 1: return vextq_u8(a, b, 1); ++ case 2: return vextq_u8(a, b, 2); ++ case 3: return vextq_u8(a, b, 3); ++ case 4: return vextq_u8(a, b, 4); ++ case 5: return vextq_u8(a, b, 5); ++ case 6: return vextq_u8(a, b, 6); ++ case 7: return vextq_u8(a, b, 7); ++ case 8: return vextq_u8(a, b, 8); ++ case 9: return vextq_u8(a, b, 9); ++ case 10: return vextq_u8(a, b, 10); ++ case 11: return vextq_u8(a, b, 11); ++ case 12: return vextq_u8(a, b, 12); ++ case 13: return vextq_u8(a, b, 13); ++ case 14: return vextq_u8(a, b, 14); ++ case 15: return vextq_u8(a, b, 15); ++ } ++ return b; ++} ++ ++/* ========================================================================= */ ++local inline uint32x4_t vector_reduce(uint32x4_t x) ++{ ++ uint32x4_t y; ++ ++ y = vshlq_n_u32(x, 16); ++ x = vshrq_n_u32(x, 16); ++ y = vshrq_n_u32(y, 16); ++ y = vsubq_u32(y, x); ++ x = vaddq_u32(y, vshlq_n_u32(x, 4)); ++ return x; ++} ++ ++/* ========================================================================= */ ++local noinline uLong adler32_vec(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ uint32x4_t v0_32 = (uint32x4_t){0,0,0,0}; ++ uint8x16_t v0 = (uint8x16_t)v0_32; ++ uint8x16_t vord, vord_a; ++ uint32x4_t vs1, vs2; ++ uint32x2_t v_tsum; ++ uint8x16_t in16; ++ uint32_t s1, s2; ++ unsigned k; ++ ++ s1 = adler & 0xffff; ++ s2 = (adler >> 16) & 0xffff; ++ ++// TODO: big endian mask is prop. wrong ++ if (host_is_bigendian()) ++ vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}; ++ else ++ vord = (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; ++ ++ if (likely(len >= 2*SOVUCQ)) { ++ unsigned f, n; ++ ++ /* ++ * Add stuff to achieve alignment ++ */ ++ /* align hard down */ ++ f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUCQ); ++ n = SOVUCQ - f; ++ buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUCQ); ++ ++ /* add n times s1 to s2 for start round */ ++ s2 += s1 * n; ++ ++ /* set sums 0 */ ++ vs1 = v0_32; ++ vs2 = v0_32; ++ /* ++ * the accumulation of s1 for every round grows very fast ++ * (quadratic?), even if we accumulate in 4 dwords, more ++ * rounds means nonlinear growth. ++ * We already split it out of s2, normaly it would be in ++ * s2 times 16... and even grow faster. ++ * Thanks to this split and vector reduction, we can stay ++ * longer in the loops. But we have to prepare for the worst ++ * (all 0xff), only do 6 times the work. ++ * (we could prop. stay a little longer since we have 4 sums, ++ * not 2 like on x86). ++ */ ++ k = len < VNMAX ? (unsigned)len : VNMAX; ++ len -= k; ++ /* insert scalar start somewhere */ ++ vs1 = vsetq_lane_u32(s1, vs1, 0); ++ vs2 = vsetq_lane_u32(s2, vs2, 0); ++ ++ /* get input data */ ++ in16 = *(const uint8x16_t *)buf; ++ /* mask out excess data */ ++ if(host_is_bigendian()) { ++ in16 = neon_simple_alignq(v0, in16, n); ++ vord_a = neon_simple_alignq(v0, vord, n); ++ } else { ++ in16 = neon_simple_alignq(in16, v0, f); ++ vord_a = neon_simple_alignq(vord, v0, f); ++ } ++ ++ /* pairwise add bytes and long, pairwise add word long acc */ ++ vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); ++ /* apply order, add words, pairwise add word long acc */ ++ vs2 = vpadalq_u16(vs2, ++ vmlal_u8( ++ vmull_u8(vget_low_u8(in16), vget_low_u8(vord_a)), ++ vget_high_u8(in16), vget_high_u8(vord_a) ++ ) ++ ); ++ ++ buf += SOVUCQ; ++ k -= n; ++ ++ if (likely(k >= SOVUCQ)) do { ++ uint32x4_t vs1_r = v0_32; ++ do { ++ /* add vs1 for this round */ ++ vs1_r = vaddq_u32(vs1_r, vs1); ++ ++ /* get input data */ ++ in16 = *(const uint8x16_t *)buf; ++ ++// TODO: make work in inner loop more tight ++ /* ++ * decompose partial sums, so we do less instructions and ++ * build loops around it to do acc and so on only from time ++ * to time. ++ * This is hard with NEON, because the instruction are nice: ++ * we have the stuff in widening and with acc (practicaly ++ * for free...) ++ */ ++ /* pairwise add bytes and long, pairwise add word long acc */ ++ vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); ++ /* apply order, add words, pairwise add word long acc */ ++ vs2 = vpadalq_u16(vs2, ++ vmlal_u8( ++ vmull_u8(vget_low_u8(in16), vget_low_u8(vord)), ++ vget_high_u8(in16), vget_high_u8(vord) ++ ) ++ ); ++ ++ buf += SOVUCQ; ++ k -= SOVUCQ; ++ } while (k >= SOVUCQ); ++ /* reduce vs1 round sum before multiplying by 16 */ ++ vs1_r = vector_reduce(vs1_r); ++ /* add vs1 for this round (16 times) */ ++ /* they have shift right and accummulate, where is shift left and acc?? */ ++ vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_r, 4)); ++ /* reduce both vectors to something within 16 bit */ ++ vs2 = vector_reduce(vs2); ++ vs1 = vector_reduce(vs1); ++ len += k; ++ k = len < VNMAX ? (unsigned) len : VNMAX; ++ len -= k; ++ } while (likely(k >= SOVUC)); ++ ++ if (likely(k)) { ++ /* ++ * handle trailer ++ */ ++ f = SOVUCQ - k; ++ /* add k times vs1 for this trailer */ ++ vs2 = vmlaq_u32(vs2, vs1, vdupq_n_u32(k)); ++ ++ /* get input data */ ++ in16 = *(const uint8x16_t *)buf; ++ /* masks out bad data */ ++ if(host_is_bigendian()) ++ in16 = neon_simple_alignq(in16, v0, f); ++ else ++ in16 = neon_simple_alignq(v0, in16, k); ++ ++ /* pairwise add bytes and long, pairwise add word long acc */ ++ vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16)); ++ /* apply order, add words, pairwise add word long acc */ ++ vs2 = vpadalq_u16(vs2, ++ vmlal_u8( ++ vmull_u8(vget_low_u8(in16), vget_low_u8(vord)), ++ vget_high_u8(in16), vget_high_u8(vord) ++ ) ++ ); ++ ++ buf += k; ++ k -= k; ++ } ++ ++ /* add horizontal */ ++ v_tsum = vpadd_u32(vget_high_u32(vs1), vget_low_u32(vs1)); ++ v_tsum = vpadd_u32(v_tsum, v_tsum); ++ s1 = vget_lane_u32(v_tsum, 0); ++ v_tsum = vpadd_u32(vget_high_u32(vs2), vget_low_u32(vs2)); ++ v_tsum = vpadd_u32(v_tsum, v_tsum); ++ s2 = vget_lane_u32(v_tsum, 0); ++ } ++ ++ if (unlikely(len)) do { ++ s1 += *buf++; ++ s2 += s1; ++ } while (--len); ++ reduce_x(s1); ++ reduce_x(s2); ++ ++ return (s2 << 16) | s1; ++} ++ ++/* inline asm, so only on GCC (or compatible) && ARM v6 or better */ ++#elif defined(__GNUC__) && ( \ ++ defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ ++ defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ ++ defined(__ARM_ARCH_7A__) \ ++ ) ++# define SOU32 (sizeof(unsigned int)) ++# define HAVE_ADLER32_VEC ++# define MIN_WORK 16 ++ ++/* ========================================================================= */ ++local noinline uLong adler32_vec(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned int s1, s2; ++ unsigned int k; ++ ++ s1 = adler & 0xffff; ++ s2 = (adler >> 16) & 0xffff; ++ ++ k = ALIGN_DIFF(buf, SOU32); ++ len -= k; ++ if (k) do { ++ s1 += *buf++; ++ s2 += s1; ++ } while (--k); ++ ++ if (likely(len >= 4 * SOU32)) { ++ unsigned int vs1 = s1, vs2 = s2; ++ unsigned int order_lo, order_hi; ++ ++// TODO: byte order? ++ if(host_is_bigendian()) { ++ order_lo = 0x00030001; ++ order_hi = 0x00040002; ++ } else { ++ order_lo = 0x00020004; ++ order_hi = 0x00010003; ++ } ++// TODO: we could go over NMAX, since we have split the vs2 sum ++ /* something around (NMAX+(NMAX/3)+302) */ ++ k = len < NMAX ? len : NMAX; ++ len -= k; ++ ++ do { ++ unsigned int vs1_r = 0; ++ do { ++ unsigned int t21, t22, in; ++ ++ /* get input data */ ++ in = *(const unsigned int *)buf; ++ ++ /* add vs1 for this round */ ++ vs1_r += vs1; ++ ++ /* add horizontal and acc */ ++ asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1)); ++ /* widen bytes to words, apply order, add and acc */ ++ asm ("uxtb16 %0, %1" : "=r" (t21) : "r" (in)); ++ asm ("uxtb16 %0, %1, ror #8" : "=r" (t22) : "r" (in)); ++// TODO: instruction result latency ++ /* ++ * The same problem like the classic serial sum: ++ * Chip maker sell us 1-cycle instructions, but that is not the ++ * whole story. Nearly all 1-cycle chips are pipelined, so ++ * you can get one result per cycle, but only if _they_ (plural) ++ * are independent. ++ * If you are depending on the result of an preciding instruction, ++ * in the worst case you hit the instruction latency which is worst ++ * case >= pipeline length. On the other hand there are result-fast-paths. ++ * This could all be a wash with the classic sum (4 * 2 instructions, ++ * + dependence), since smald is: ++ * - 2 cycle issue ++ * - needs the acc in pipeline step E1, instead of E2 ++ * But the Cortex has a fastpath for acc. ++ * I don't know. ++ * We can not even unroll, we would need 4 order vars, return ENOREGISTER. ++ */ ++ asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t21) , "r" (order_lo), "r" (vs2)); ++ asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t22) , "r" (order_hi), "r" (vs2)); ++ ++ buf += SOU32; ++ k -= SOU32; ++ } while (k >= SOU32); ++ /* reduce vs1 round sum before multiplying by 4 */ ++ reduce(vs1_r); ++ /* add vs1 for this round (4 times) */ ++ vs2 += vs1_r * 4; ++ /* reduce both sums to something within 16 bit */ ++ reduce(vs2); ++ reduce(vs1); ++ len += k; ++ k = len < NMAX ? len : NMAX; ++ len -= k; ++ } while (likely(k >= 4 * SOU32)); ++ len += k; ++ s1 = vs1; ++ s2 = vs2; ++ } ++ ++ if (unlikely(len)) do { ++ s1 += *buf++; ++ s2 += s1; ++ } while (--len); ++ /* at this point we should no have so big s1 & s2 */ ++ reduce_x(s1); ++ reduce_x(s2); ++ ++ return (s2 << 16) | s1; ++} ++#endif diff --git a/04-x86.patch b/04-x86.patch new file mode 100644 index 0000000..f2d489e --- /dev/null +++ b/04-x86.patch @@ -0,0 +1,1165 @@ +=== modified file 'Makefile.in' +--- Makefile.in 2011-03-14 14:39:24 +0000 ++++ Makefile.in 2011-03-14 16:46:30 +0000 +@@ -236,7 +236,7 @@ + + # DO NOT DELETE THIS LINE -- make depend depends on it. + +-adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h ++adler32.o: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h + zutil.o: zutil.h zlib.h zconf.h + gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h + compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h +@@ -247,7 +247,7 @@ + inftrees.o: zutil.h zlib.h zconf.h inftrees.h + trees.o: deflate.h zutil.h zlib.h zconf.h trees.h + +-adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h ++adler32.lo: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h + zutil.lo: zutil.h zlib.h zconf.h + gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h + compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h + +=== modified file 'adler32.c' +--- adler32.c 2011-03-30 13:38:46 +0000 ++++ adler32.c 2011-03-30 13:38:46 +0000 +@@ -144,6 +144,8 @@ + # include "adler32_arm.c" + # elif defined(__powerpc__) || defined(__powerpc64__) + # include "adler32_ppc.c" ++# elif defined(__i386__) || defined(__x86_64__) ++# include "adler32_x86.c" + # endif + #endif + + +=== added file 'adler32_x86.c' +--- adler32_x86.c 1970-01-01 00:00:00 +0000 ++++ adler32_x86.c 2011-03-15 23:15:36 +0000 +@@ -0,0 +1,1125 @@ ++/* ++ * adler32.c -- compute the Adler-32 checksum of a data stream ++ * x86 implementation ++ * Copyright (C) 1995-2007 Mark Adler ++ * Copyright (C) 2009-2011 Jan Seiffert ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++/* @(#) $Id$ */ ++ ++#if GCC_VERSION_GE(207) ++# define GCC_ATTR_CONSTRUCTOR __attribute__((__constructor__)) ++#else ++# define VEC_NO_GO ++#endif ++ ++#if GCC_VERSION_GE(203) ++# define GCC_ATTR_ALIGNED(x) __attribute__((__aligned__(x))) ++#else ++# define VEC_NO_GO ++#endif ++ ++/* inline asm, so only on GCC (or compatible) */ ++#if defined(__GNUC__) && !defined(VEC_NO_GO) ++# define HAVE_ADLER32_VEC ++# define MIN_WORK 64 ++ ++# ifdef __x86_64__ ++# define PICREG "%%rbx" ++# else ++# define PICREG "%%ebx" ++# endif ++ ++/* ========================================================================= */ ++local const struct { short d[24]; } vord GCC_ATTR_ALIGNED(16) = { ++ {1,1,1,1,1,1,1,1,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1} ++}; ++ ++/* ========================================================================= */ ++local const struct { char d[16]; } vord_b GCC_ATTR_ALIGNED(16) = { ++ {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1} ++}; ++ ++/* ========================================================================= */ ++local noinline const Bytef *adler32_jumped(buf, s1, s2, k) ++ const Bytef *buf; ++ unsigned int *s1; ++ unsigned int *s2; ++ unsigned int k; ++{ ++ unsigned int t; ++ unsigned n = k % 16; ++ buf += n; ++ k = (k / 16) + 1; ++ ++ __asm__ __volatile__ ( ++# ifdef __x86_64__ ++# define CLOB "&" ++ "lea 1f(%%rip), %q4\n\t" ++ "lea (%q4,%q5,8), %q4\n\t" ++ "jmp *%q4\n\t" ++# else ++# ifndef __PIC__ ++# define CLOB ++ "lea 1f(,%5,8), %4\n\t" ++# else ++# define CLOB ++ "lea 1f-3f(,%5,8), %4\n\t" ++ "call 9f\n" ++ "3:\n\t" ++# endif ++ "jmp *%4\n\t" ++# ifdef __PIC__ ++ ".p2align 1\n" ++ "9:\n\t" ++ "addl (%%esp), %4\n\t" ++ "ret\n\t" ++# endif ++# endif ++ ".p2align 1\n" ++ "2:\n\t" ++# ifdef __i386 ++ ".byte 0x3e\n\t" ++# endif ++ "add $0x10, %2\n\t" ++ ".p2align 1\n" ++ "1:\n\t" ++ /* 128 */ ++ "movzbl -16(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 120 */ ++ "movzbl -15(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 112 */ ++ "movzbl -14(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 104 */ ++ "movzbl -13(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 96 */ ++ "movzbl -12(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 88 */ ++ "movzbl -11(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 80 */ ++ "movzbl -10(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 72 */ ++ "movzbl -9(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 64 */ ++ "movzbl -8(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 56 */ ++ "movzbl -7(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 48 */ ++ "movzbl -6(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 40 */ ++ "movzbl -5(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 32 */ ++ "movzbl -4(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 24 */ ++ "movzbl -3(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 16 */ ++ "movzbl -2(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 8 */ ++ "movzbl -1(%2), %4\n\t" /* 4 */ ++ "add %4, %0\n\t" /* 2 */ ++ "add %0, %1\n\t" /* 2 */ ++ /* 0 */ ++ "dec %3\n\t" ++ "jnz 2b" ++ : /* %0 */ "=R" (*s1), ++ /* %1 */ "=R" (*s2), ++ /* %2 */ "=abdSD" (buf), ++ /* %3 */ "=c" (k), ++ /* %4 */ "="CLOB"R" (t) ++ : /* %5 */ "r" (16 - n), ++ /* */ "0" (*s1), ++ /* */ "1" (*s2), ++ /* */ "2" (buf), ++ /* */ "3" (k) ++ : "cc", "memory" ++ ); ++ ++ return buf; ++} ++ ++#if 0 ++ /* ++ * Will XOP processors have SSSE3/AVX?? ++ * And what is the unaligned load performance? ++ */ ++ "prefetchnta 0x70(%0)\n\t" ++ "lddqu (%0), %%xmm0\n\t" ++ "vpaddd %%xmm3, %%xmm5, %%xmm5\n\t" ++ "sub $16, %3\n\t" ++ "add $16, %0\n\t" ++ "cmp $15, %3\n\t" ++ "vphaddubd %%xmm0, %%xmm1\n\t" /* A */ ++ "vpmaddubsw %%xmm4, %%xmm0, %%xmm0\n\t"/* AVX! */ /* 1 */ ++ "vphadduwd %%xmm0, %%xmm0\n\t" /* 2 */ ++ "vpaddd %%xmm1, %%xmm3, %%xmm3\n\t" /* B: A+B => hadd+acc or vpmadcubd w. mul = 1 */ ++ "vpaddd %%xmm0, %%xmm2, %%xmm2\n\t" /* 3: 1+2+3 => vpmadcubd w. mul = 16,15,14... */ ++ "jg 1b\n\t" ++ xop_reduce ++ xop_reduce ++ xop_reduce ++ setup ++ "jg 1b\n\t" ++ "vphaddudq %%xmm2, %%xmm0\n\t" ++ "vphaddudq %%xmm3, %%xmm1\n\t" ++ "pshufd $0xE6, %%xmm0, %%xmm2\n\t" ++ "pshufd $0xE6, %%xmm1, %%xmm3\n\t" ++ "paddd %%xmm0, %%xmm2\n\t" ++ "paddd %%xmm1, %%xmm3\n\t" ++ "movd %%xmm2, %2\n\t" ++ "movd %%xmm3, %1\n\t" ++#endif ++ ++/* ========================================================================= */ ++local uLong adler32_SSSE3(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned int s1 = adler & 0xffff; ++ unsigned int s2 = (adler >> 16) & 0xffff; ++ unsigned int k; ++ ++ k = ALIGN_DIFF(buf, 16); ++ len -= k; ++ if (k) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ ++ __asm__ __volatile__ ( ++ "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */ ++ "cmp %3, %4\n\t" ++ "cmovb %4, %3\n\t" /* k = len >= VNMAX ? k : len */ ++ "sub %3, %4\n\t" /* len -= k */ ++ "cmp $16, %3\n\t" ++ "jb 88f\n\t" /* if(k < 16) goto OUT */ ++#ifdef __ELF__ ++ ".subsection 2\n\t" ++#else ++ "jmp 77f\n\t" ++#endif ++ ".p2align 2\n" ++ /* ++ * reduction function to bring a vector sum within the range of BASE ++ * This does no full reduction! When the sum is large, a number > BASE ++ * is the result. To do a full reduction call multiple times. ++ */ ++ "sse2_reduce:\n\t" ++ "movdqa %%xmm0, %%xmm1\n\t" /* y = x */ ++ "pslld $16, %%xmm1\n\t" /* y <<= 16 */ ++ "psrld $16, %%xmm0\n\t" /* x >>= 16 */ ++ "psrld $16, %%xmm1\n\t" /* y >>= 16 */ ++ "psubd %%xmm0, %%xmm1\n\t" /* y -= x */ ++ "pslld $4, %%xmm0\n\t" /* x <<= 4 */ ++ "paddd %%xmm1, %%xmm0\n\t" /* x += y */ ++ "ret\n\t" ++#ifdef __ELF__ ++ ".previous\n\t" ++#else ++ "77:\n\t" ++#endif ++ "movdqa %5, %%xmm5\n\t" /* get vord_b */ ++ "prefetchnta 0x70(%0)\n\t" ++ "movd %2, %%xmm2\n\t" /* init vector sum vs2 with s2 */ ++ "movd %1, %%xmm3\n\t" /* init vector sum vs1 with s1 */ ++ "pxor %%xmm4, %%xmm4\n" /* zero */ ++ "3:\n\t" ++ "pxor %%xmm7, %%xmm7\n\t" /* zero vs1_round_sum */ ++ ".p2align 3,,3\n\t" ++ ".p2align 2\n" ++ "2:\n\t" ++ "mov $128, %1\n\t" /* inner_k = 128 bytes till vs2_i overflows */ ++ "cmp %1, %3\n\t" ++ "cmovb %3, %1\n\t" /* inner_k = k >= inner_k ? inner_k : k */ ++ "and $-16, %1\n\t" /* inner_k = ROUND_TO(inner_k, 16) */ ++ "sub %1, %3\n\t" /* k -= inner_k */ ++ "shr $4, %1\n\t" /* inner_k /= 16 */ ++ "pxor %%xmm6, %%xmm6\n\t" /* zero vs2_i */ ++ ".p2align 4,,7\n" ++ ".p2align 3\n" ++ "1:\n\t" ++ "movdqa (%0), %%xmm0\n\t" /* fetch input data */ ++ "prefetchnta 0x70(%0)\n\t" ++ "paddd %%xmm3, %%xmm7\n\t" /* vs1_round_sum += vs1 */ ++ "add $16, %0\n\t" /* advance input data pointer */ ++ "dec %1\n\t" /* decrement inner_k */ ++ "movdqa %%xmm0, %%xmm1\n\t" /* make a copy of the input data */ ++# if (HAVE_BINUTILS-0) >= 217 ++ "pmaddubsw %%xmm5, %%xmm0\n\t" /* multiply all input bytes by vord_b bytes, add adjecent results to words */ ++# else ++ ".byte 0x66, 0x0f, 0x38, 0x04, 0xc5\n\t" /* pmaddubsw %%xmm5, %%xmm0 */ ++# endif ++ "psadbw %%xmm4, %%xmm1\n\t" /* subtract zero from every byte, add 8 bytes to a sum */ ++ "paddw %%xmm0, %%xmm6\n\t" /* vs2_i += in * vorder_b */ ++ "paddd %%xmm1, %%xmm3\n\t" /* vs1 += psadbw */ ++ "jnz 1b\n\t" /* repeat if inner_k != 0 */ ++ "movdqa %%xmm6, %%xmm0\n\t" /* copy vs2_i */ ++ "punpckhwd %%xmm4, %%xmm0\n\t" /* zero extent vs2_i upper words to dwords */ ++ "punpcklwd %%xmm4, %%xmm6\n\t" /* zero extent vs2_i lower words to dwords */ ++ "paddd %%xmm0, %%xmm2\n\t" /* vs2 += vs2_i.upper */ ++ "paddd %%xmm6, %%xmm2\n\t" /* vs2 += vs2_i.lower */ ++ "cmp $15, %3\n\t" ++ "jg 2b\n\t" /* if(k > 15) repeat */ ++ "movdqa %%xmm7, %%xmm0\n\t" /* move vs1_round_sum */ ++ "call sse2_reduce\n\t" /* reduce vs1_round_sum */ ++ "pslld $4, %%xmm0\n\t" /* vs1_round_sum *= 16 */ ++ "paddd %%xmm2, %%xmm0\n\t" /* vs2 += vs1_round_sum */ ++ "call sse2_reduce\n\t" /* reduce again */ ++ "movdqa %%xmm0, %%xmm2\n\t" /* move vs2 back in place */ ++ "movdqa %%xmm3, %%xmm0\n\t" /* move vs1 */ ++ "call sse2_reduce\n\t" /* reduce */ ++ "movdqa %%xmm0, %%xmm3\n\t" /* move vs1 back in place */ ++ "add %3, %4\n\t" /* len += k */ ++ "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */ ++ "cmp %3, %4\n\t" ++ "cmovb %4, %3\n\t" /* k = len >= VNMAX ? k : len */ ++ "sub %3, %4\n\t" /* len -= k */ ++ "cmp $15, %3\n\t" ++ "jg 3b\n\t" /* if(k > 15) repeat */ ++ "pshufd $0xEE, %%xmm3, %%xmm1\n\t" /* collect vs1 & vs2 in lowest vector member */ ++ "pshufd $0xEE, %%xmm2, %%xmm0\n\t" ++ "paddd %%xmm3, %%xmm1\n\t" ++ "paddd %%xmm2, %%xmm0\n\t" ++ "pshufd $0xE5, %%xmm0, %%xmm2\n\t" ++ "paddd %%xmm0, %%xmm2\n\t" ++ "movd %%xmm1, %1\n\t" /* mov vs1 to s1 */ ++ "movd %%xmm2, %2\n" /* mov vs2 to s2 */ ++ "88:" ++ : /* %0 */ "=r" (buf), ++ /* %1 */ "=r" (s1), ++ /* %2 */ "=r" (s2), ++ /* %3 */ "=r" (k), ++ /* %4 */ "=r" (len) ++ : /* %5 */ "m" (vord_b), ++ /* ++ * somewhere between 5 & 6, psadbw 64 bit sums ruin the party ++ * spreading the sums with palignr only brings it to 7 (?), ++ * while introducing an op into the main loop (2800 ms -> 3200 ms) ++ */ ++ /* %6 */ "i" (5*NMAX), ++ /* */ "0" (buf), ++ /* */ "1" (s1), ++ /* */ "2" (s2), ++ /* */ "4" (len) ++ : "cc", "memory" ++# ifdef __SSE__ ++ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ++# endif ++ ); ++ ++ if (unlikely(k)) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ reduce(s1); ++ reduce(s2); ++ return (s2 << 16) | s1; ++} ++ ++/* ========================================================================= */ ++local uLong adler32_SSE2(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned int s1 = adler & 0xffff; ++ unsigned int s2 = (adler >> 16) & 0xffff; ++ unsigned int k; ++ ++ k = ALIGN_DIFF(buf, 16); ++ len -= k; ++ if (k) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ ++ __asm__ __volatile__ ( ++ "mov %6, %3\n\t" ++ "cmp %3, %4\n\t" ++ "cmovb %4, %3\n\t" ++ "sub %3, %4\n\t" ++ "cmp $16, %3\n\t" ++ "jb 88f\n\t" ++ "prefetchnta 0x70(%0)\n\t" ++ "movd %1, %%xmm4\n\t" ++ "movd %2, %%xmm3\n\t" ++ "pxor %%xmm2, %%xmm2\n\t" ++ "pxor %%xmm5, %%xmm5\n\t" ++ ".p2align 2\n" ++ "3:\n\t" ++ "pxor %%xmm6, %%xmm6\n\t" ++ "pxor %%xmm7, %%xmm7\n\t" ++ "mov $2048, %1\n\t" /* get byte count till vs2_{l|h}_word overflows */ ++ "cmp %1, %3\n\t" ++ "cmovb %3, %1\n" ++ "and $-16, %1\n\t" ++ "sub %1, %3\n\t" ++ "shr $4, %1\n\t" ++ ".p2align 4,,7\n" ++ ".p2align 3\n" ++ "1:\n\t" ++ "prefetchnta 0x70(%0)\n\t" ++ "movdqa (%0), %%xmm0\n\t" /* fetch input data */ ++ "paddd %%xmm4, %%xmm5\n\t" /* vs1_round_sum += vs1 */ ++ "add $16, %0\n\t" ++ "dec %1\n\t" ++ "movdqa %%xmm0, %%xmm1\n\t" /* copy input data */ ++ "psadbw %%xmm2, %%xmm0\n\t" /* add all bytes horiz. */ ++ "paddd %%xmm0, %%xmm4\n\t" /* add that to vs1 */ ++ "movdqa %%xmm1, %%xmm0\n\t" /* copy input data */ ++ "punpckhbw %%xmm2, %%xmm1\n\t" /* zero extent input upper bytes to words */ ++ "punpcklbw %%xmm2, %%xmm0\n\t" /* zero extent input lower bytes to words */ ++ "paddw %%xmm1, %%xmm7\n\t" /* vs2_h_words += in_high_words */ ++ "paddw %%xmm0, %%xmm6\n\t" /* vs2_l_words += in_low_words */ ++ "jnz 1b\n\t" ++ "cmp $15, %3\n\t" ++ "pmaddwd 32+%5, %%xmm7\n\t" /* multiply vs2_h_words with order, add adjecend results */ ++ "pmaddwd 16+%5, %%xmm6\n\t" /* multiply vs2_l_words with order, add adjecend results */ ++ "paddd %%xmm7, %%xmm3\n\t" /* add to vs2 */ ++ "paddd %%xmm6, %%xmm3\n\t" /* add to vs2 */ ++ "jg 3b\n\t" ++ "movdqa %%xmm5, %%xmm0\n\t" ++ "pxor %%xmm5, %%xmm5\n\t" ++ "call sse2_reduce\n\t" ++ "pslld $4, %%xmm0\n\t" ++ "paddd %%xmm3, %%xmm0\n\t" ++ "call sse2_reduce\n\t" ++ "movdqa %%xmm0, %%xmm3\n\t" ++ "movdqa %%xmm4, %%xmm0\n\t" ++ "call sse2_reduce\n\t" ++ "movdqa %%xmm0, %%xmm4\n\t" ++ "add %3, %4\n\t" ++ "mov %6, %3\n\t" ++ "cmp %3, %4\n\t" ++ "cmovb %4, %3\n" ++ "sub %3, %4\n\t" ++ "cmp $15, %3\n\t" ++ "jg 3b\n\t" ++ "pshufd $0xEE, %%xmm4, %%xmm1\n\t" ++ "pshufd $0xEE, %%xmm3, %%xmm0\n\t" ++ "paddd %%xmm4, %%xmm1\n\t" ++ "paddd %%xmm3, %%xmm0\n\t" ++ "pshufd $0xE5, %%xmm0, %%xmm3\n\t" ++ "paddd %%xmm0, %%xmm3\n\t" ++ "movd %%xmm1, %1\n\t" ++ "movd %%xmm3, %2\n" ++ "88:\n\t" ++ : /* %0 */ "=r" (buf), ++ /* %1 */ "=r" (s1), ++ /* %2 */ "=r" (s2), ++ /* %3 */ "=r" (k), ++ /* %4 */ "=r" (len) ++ : /* %5 */ "m" (vord), ++ /* %6 */ "i" (5*NMAX), ++ /* */ "0" (buf), ++ /* */ "1" (s1), ++ /* */ "2" (s2), ++ /* */ "3" (k), ++ /* */ "4" (len) ++ : "cc", "memory" ++# ifdef __SSE__ ++ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ++# endif ++ ); ++ ++ if (unlikely(k)) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ reduce(s1); ++ reduce(s2); ++ return (s2 << 16) | s1; ++} ++ ++# if 0 ++/* ========================================================================= */ ++/* ++ * The SSE2 version above is faster on my CPUs (Athlon64, Core2, ++ * P4 Xeon, K10 Sempron), but has instruction stalls only a ++ * Out-Of-Order-Execution CPU can solve. ++ * So this Version _may_ be better for the new old thing, Atom. ++ */ ++local noinline uLong adler32_SSE2_no_oooe(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned int s1 = adler & 0xffff; ++ unsigned int s2 = (adler >> 16) & 0xffff; ++ unsigned int k; ++ ++ k = ALIGN_DIFF(buf, 16); ++ len -= k; ++ if (k) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ ++ __asm__ __volatile__ ( ++ "mov %6, %3\n\t" ++ "cmp %3, %4\n\t" ++ "cmovb %4, %3\n\t" ++ "sub %3, %4\n\t" ++ "cmp $16, %3\n\t" ++ "jb 88f\n\t" ++ "movdqa 16+%5, %%xmm6\n\t" ++ "movdqa 32+%5, %%xmm5\n\t" ++ "prefetchnta 16(%0)\n\t" ++ "pxor %%xmm7, %%xmm7\n\t" ++ "movd %1, %%xmm4\n\t" ++ "movd %2, %%xmm3\n\t" ++ ".p2align 3,,3\n\t" ++ ".p2align 2\n" ++ "1:\n\t" ++ "prefetchnta 32(%0)\n\t" ++ "movdqa (%0), %%xmm1\n\t" ++ "sub $16, %3\n\t" ++ "movdqa %%xmm4, %%xmm2\n\t" ++ "add $16, %0\n\t" ++ "movdqa %%xmm1, %%xmm0\n\t" ++ "cmp $15, %3\n\t" ++ "pslld $4, %%xmm2\n\t" ++ "paddd %%xmm3, %%xmm2\n\t" ++ "psadbw %%xmm7, %%xmm0\n\t" ++ "paddd %%xmm0, %%xmm4\n\t" ++ "movdqa %%xmm1, %%xmm0\n\t" ++ "punpckhbw %%xmm7, %%xmm1\n\t" ++ "punpcklbw %%xmm7, %%xmm0\n\t" ++ "movdqa %%xmm1, %%xmm3\n\t" ++ "pmaddwd %%xmm6, %%xmm0\n\t" ++ "paddd %%xmm2, %%xmm0\n\t" ++ "pmaddwd %%xmm5, %%xmm3\n\t" ++ "paddd %%xmm0, %%xmm3\n\t" ++ "jg 1b\n\t" ++ "movdqa %%xmm3, %%xmm0\n\t" ++ "call sse2_reduce\n\t" ++ "call sse2_reduce\n\t" ++ "movdqa %%xmm0, %%xmm3\n\t" ++ "movdqa %%xmm4, %%xmm0\n\t" ++ "call sse2_reduce\n\t" ++ "movdqa %%xmm0, %%xmm4\n\t" ++ "add %3, %4\n\t" ++ "mov %6, %3\n\t" ++ "cmp %3, %4\n\t" ++ "cmovb %4, %3\n\t" ++ "sub %3, %4\n\t" ++ "cmp $15, %3\n\t" ++ "jg 1b\n\t" ++ "pshufd $0xEE, %%xmm3, %%xmm0\n\t" ++ "pshufd $0xEE, %%xmm4, %%xmm1\n\t" ++ "paddd %%xmm3, %%xmm0\n\t" ++ "pshufd $0xE5, %%xmm0, %%xmm2\n\t" ++ "paddd %%xmm4, %%xmm1\n\t" ++ "movd %%xmm1, %1\n\t" ++ "paddd %%xmm0, %%xmm2\n\t" ++ "movd %%xmm2, %2\n" ++ "88:" ++ : /* %0 */ "=r" (buf), ++ /* %1 */ "=r" (s1), ++ /* %2 */ "=r" (s2), ++ /* %3 */ "=r" (k), ++ /* %4 */ "=r" (len) ++ : /* %5 */ "m" (vord), ++ /* %6 */ "i" (NMAX + NMAX/3), ++ /* */ "0" (buf), ++ /* */ "1" (s1), ++ /* */ "2" (s2), ++ /* */ "4" (len) ++ : "cc", "memory" ++# ifdef __SSE__ ++ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ++# endif ++ ); ++ ++ if (unlikely(k)) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ reduce(s1); ++ reduce(s2); ++ return (s2 << 16) | s1; ++} ++# endif ++ ++# ifndef __x86_64__ ++/* ========================================================================= */ ++/* ++ * SSE version to help VIA-C3_2, P2 & P3 ++ */ ++local uLong adler32_SSE(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned int s1 = adler & 0xffff; ++ unsigned int s2 = (adler >> 16) & 0xffff; ++ unsigned int k; ++ ++ k = ALIGN_DIFF(buf, 8); ++ len -= k; ++ if (k) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ ++ __asm__ __volatile__ ( ++ "mov %6, %3\n\t" ++ "cmp %3, %4\n\t" ++ "cmovb %4, %3\n" ++ "sub %3, %4\n\t" ++ "cmp $8, %3\n\t" ++ "jb 88f\n\t" ++ "movd %1, %%mm4\n\t" ++ "movd %2, %%mm3\n\t" ++ "pxor %%mm2, %%mm2\n\t" ++ "pxor %%mm5, %%mm5\n\t" ++# ifdef __ELF__ ++ ".subsection 2\n\t" ++# else ++ "jmp 77f\n\t" ++# endif ++ ".p2align 2\n" ++ "mmx_reduce:\n\t" ++ "movq %%mm0, %%mm1\n\t" ++ "pslld $16, %%mm1\n\t" ++ "psrld $16, %%mm0\n\t" ++ "psrld $16, %%mm1\n\t" ++ "psubd %%mm0, %%mm1\n\t" ++ "pslld $4, %%mm0\n\t" ++ "paddd %%mm1, %%mm0\n\t" ++ "ret\n\t" ++# ifdef __ELF__ ++ ".previous\n\t" ++# else ++ "77:\n\t" ++# endif ++ ".p2align 2\n" ++ "3:\n\t" ++ "pxor %%mm6, %%mm6\n\t" ++ "pxor %%mm7, %%mm7\n\t" ++ "mov $1024, %1\n\t" ++ "cmp %1, %3\n\t" ++ "cmovb %3, %1\n" ++ "and $-8, %1\n\t" ++ "sub %1, %3\n\t" ++ "shr $3, %1\n\t" ++ ".p2align 4,,7\n" ++ ".p2align 3\n" ++ "1:\n\t" ++ "movq (%0), %%mm0\n\t" ++ "paddd %%mm4, %%mm5\n\t" ++ "add $8, %0\n\t" ++ "dec %1\n\t" ++ "movq %%mm0, %%mm1\n\t" ++ "psadbw %%mm2, %%mm0\n\t" ++ "paddd %%mm0, %%mm4\n\t" ++ "movq %%mm1, %%mm0\n\t" ++ "punpckhbw %%mm2, %%mm1\n\t" ++ "punpcklbw %%mm2, %%mm0\n\t" ++ "paddw %%mm1, %%mm7\n\t" ++ "paddw %%mm0, %%mm6\n\t" ++ "jnz 1b\n\t" ++ "cmp $7, %3\n\t" ++ "pmaddwd 40+%5, %%mm7\n\t" ++ "pmaddwd 32+%5, %%mm6\n\t" ++ "paddd %%mm7, %%mm3\n\t" ++ "paddd %%mm6, %%mm3\n\t" ++ "jg 3b\n\t" ++ "movq %%mm5, %%mm0\n\t" ++ "pxor %%mm5, %%mm5\n\t" ++ "call mmx_reduce\n\t" ++ "pslld $3, %%mm0\n\t" ++ "paddd %%mm3, %%mm0\n\t" ++ "call mmx_reduce\n\t" ++ "movq %%mm0, %%mm3\n\t" ++ "movq %%mm4, %%mm0\n\t" ++ "call mmx_reduce\n\t" ++ "movq %%mm0, %%mm4\n\t" ++ "add %3, %4\n\t" ++ "mov %6, %3\n\t" ++ "cmp %3, %4\n\t" ++ "cmovb %4, %3\n" ++ "sub %3, %4\n\t" ++ "cmp $7, %3\n\t" ++ "jg 3b\n\t" ++ "movd %%mm4, %1\n\t" ++ "psrlq $32, %%mm4\n\t" ++ "movd %%mm3, %2\n\t" ++ "psrlq $32, %%mm3\n\t" ++ "movd %%mm4, %4\n\t" ++ "add %4, %1\n\t" ++ "movd %%mm3, %4\n\t" ++ "add %4, %2\n" ++ "emms\n\t" ++ "88:\n\t" ++ : /* %0 */ "=r" (buf), ++ /* %1 */ "=r" (s1), ++ /* %2 */ "=r" (s2), ++ /* %3 */ "=r" (k), ++ /* %4 */ "=r" (len) ++ : /* %5 */ "m" (vord), ++ /* %6 */ "i" ((5*NMAX)/2), ++ /* */ "0" (buf), ++ /* */ "1" (s1), ++ /* */ "2" (s2), ++ /* */ "3" (k), ++ /* */ "4" (len) ++ : "cc", "memory" ++# ifdef __MMX__ ++ , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" ++# endif ++ ); ++ ++ if (unlikely(k)) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ reduce(s1); ++ reduce(s2); ++ return (s2 << 16) | s1; ++} ++ ++/* ========================================================================= */ ++/* ++ * Processors which only have MMX will prop. not like this ++ * code, they are so old, they are not Out-Of-Order ++ * (maybe except AMD K6, Cyrix, Winchip/VIA). ++ * I did my best to get at least 1 instruction between result -> use ++ */ ++local uLong adler32_MMX(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned int s1 = adler & 0xffff; ++ unsigned int s2 = (adler >> 16) & 0xffff; ++ unsigned int k; ++ ++ k = ALIGN_DIFF(buf, 8); ++ len -= k; ++ if (k) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ ++ __asm__ __volatile__ ( ++ "mov %6, %3\n\t" ++ "cmp %3, %4\n\t" ++ "jae 11f\n\t" ++ "mov %4, %3\n" ++ "11:\n\t" ++ "sub %3, %4\n\t" ++ "cmp $8, %3\n\t" ++ "jb 88f\n\t" ++ "sub $8, %%esp\n\t" ++ "movd %1, %%mm4\n\t" ++ "movd %2, %%mm2\n\t" ++ "movq %5, %%mm3\n" ++ "33:\n\t" ++ "movq %%mm2, %%mm0\n\t" ++ "pxor %%mm2, %%mm2\n\t" ++ "pxor %%mm5, %%mm5\n\t" ++ ".p2align 2\n" ++ "3:\n\t" ++ "movq %%mm0, (%%esp)\n\t" ++ "pxor %%mm6, %%mm6\n\t" ++ "pxor %%mm7, %%mm7\n\t" ++ "mov $1024, %1\n\t" ++ "cmp %1, %3\n\t" ++ "jae 44f\n\t" ++ "mov %3, %1\n" ++ "44:\n\t" ++ "and $-8, %1\n\t" ++ "sub %1, %3\n\t" ++ "shr $3, %1\n\t" ++ ".p2align 4,,7\n" ++ ".p2align 3\n" ++ "1:\n\t" ++ "movq (%0), %%mm0\n\t" ++ "paddd %%mm4, %%mm5\n\t" ++ "add $8, %0\n\t" ++ "dec %1\n\t" ++ "movq %%mm0, %%mm1\n\t" ++ "punpcklbw %%mm2, %%mm0\n\t" ++ "punpckhbw %%mm2, %%mm1\n\t" ++ "paddw %%mm0, %%mm6\n\t" ++ "paddw %%mm1, %%mm0\n\t" ++ "paddw %%mm1, %%mm7\n\t" ++ "pmaddwd %%mm3, %%mm0\n\t" ++ "paddd %%mm0, %%mm4\n\t" ++ "jnz 1b\n\t" ++ "movq (%%esp), %%mm0\n\t" ++ "cmp $7, %3\n\t" ++ "pmaddwd 32+%5, %%mm6\n\t" ++ "pmaddwd 40+%5, %%mm7\n\t" ++ "paddd %%mm6, %%mm0\n\t" ++ "paddd %%mm7, %%mm0\n\t" ++ "jg 3b\n\t" ++ "movq %%mm0, %%mm2\n\t" ++ "movq %%mm5, %%mm0\n\t" ++ "call mmx_reduce\n\t" ++ "pslld $3, %%mm0\n\t" ++ "paddd %%mm2, %%mm0\n\t" ++ "call mmx_reduce\n\t" ++ "movq %%mm0, %%mm2\n\t" ++ "movq %%mm4, %%mm0\n\t" ++ "call mmx_reduce\n\t" ++ "movq %%mm0, %%mm4\n\t" ++ "add %3, %4\n\t" ++ "mov %6, %3\n\t" ++ "cmp %3, %4\n\t" ++ "jae 22f\n\t" ++ "mov %4, %3\n" ++ "22:\n\t" ++ "sub %3, %4\n\t" ++ "cmp $7, %3\n\t" ++ "jg 33b\n\t" ++ "add $8, %%esp\n\t" ++ "movd %%mm4, %1\n\t" ++ "psrlq $32, %%mm4\n\t" ++ "movd %%mm2, %2\n\t" ++ "psrlq $32, %%mm2\n\t" ++ "movd %%mm4, %4\n\t" ++ "add %4, %1\n\t" ++ "movd %%mm2, %4\n\t" ++ "add %4, %2\n" ++ "emms\n\t" ++ "88:\n\t" ++ : /* %0 */ "=r" (buf), ++ /* %1 */ "=r" (s1), ++ /* %2 */ "=r" (s2), ++ /* %3 */ "=r" (k), ++ /* %4 */ "=r" (len) ++ : /* %5 */ "m" (vord), ++ /* %6 */ "i" (4*NMAX), ++ /* */ "0" (buf), ++ /* */ "1" (s1), ++ /* */ "2" (s2), ++ /* */ "3" (k), ++ /* */ "4" (len) ++ : "cc", "memory" ++# ifdef __MMX__ ++ , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" ++# endif ++ ); ++ ++ if (unlikely(k)) ++ buf = adler32_jumped(buf, &s1, &s2, k); ++ reduce(s1); ++ reduce(s2); ++ return (s2 << 16) | s1; ++} ++# endif ++ ++/* ========================================================================= */ ++local uLong adler32_x86(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ unsigned int s1 = adler & 0xffff; ++ unsigned int s2 = (adler >> 16) & 0xffff; ++ unsigned int n; ++ ++ while (likely(len)) { ++# ifndef __x86_64__ ++# define LOOP_COUNT 4 ++# else ++# define LOOP_COUNT 8 ++# endif ++ unsigned int k; ++ n = len < NMAX ? len : NMAX; ++ len -= n; ++ k = n / LOOP_COUNT; ++ n %= LOOP_COUNT; ++ ++ if (likely(k)) do { ++ /* ++ * Modern compiler can do "wonders". ++ * Only if they would not "trick them self" sometime. ++ * This was unrolled 16 times not because someone ++ * anticipated autovectorizing compiler, but the ++ * classical "avoid loop overhead". ++ * ++ * But things get tricky if the compiler starts to see: ++ * "hey lets disambiguate one sum step from the other", ++ * the classical prevent-pipeline-stalls-thing. ++ * ++ * Suddenly we have 16 temporary sums, which unfortunatly ++ * blows x86 limited register set... ++ * ++ * Loopunrolling is also a little bad for the I-cache. ++ * ++ * So tune this down for x86. ++ * Instead we try to keep it in the register set. 4 sums fits ++ * into i386 register set with no framepointer. ++ * x86_64 is a little more splendit, but still we can not ++ * take 16, so take 8 sums. ++ */ ++ s1 += buf[0]; s2 += s1; ++ s1 += buf[1]; s2 += s1; ++ s1 += buf[2]; s2 += s1; ++ s1 += buf[3]; s2 += s1; ++# ifdef __x86_64__ ++ s1 += buf[4]; s2 += s1; ++ s1 += buf[5]; s2 += s1; ++ s1 += buf[6]; s2 += s1; ++ s1 += buf[7]; s2 += s1; ++# endif ++ buf += LOOP_COUNT; ++ } while(likely(--k)); ++ if (n) do { ++ s1 += *buf++; ++ s2 += s1; ++ } while (--n); ++ reduce_full(s1); ++ reduce_full(s2); ++ } ++ return (s2 << 16) | s1; ++} ++ ++/* ========================================================================= */ ++/* ++ * Knot it all together with a runtime switch ++ */ ++ ++/* Flags */ ++# define CFF_DEFAULT (1 << 0) ++/* Processor features */ ++# define CFEATURE_CMOV (15 + 0) ++# define CFEATURE_MMX (23 + 0) ++# define CFEATURE_SSE (25 + 0) ++# define CFEATURE_SSE2 (26 + 0) ++# define CFEATURE_SSSE3 ( 9 + 32) ++ ++# define CFB(x) (1 << ((x)%32)) ++ ++# define FEATURE_WORDS 2 ++ ++/* data structure */ ++struct test_cpu_feature ++{ ++ void (*func)(void); ++ int flags; ++ unsigned int features[FEATURE_WORDS]; ++}; ++ ++/* ========================================================================= */ ++/* ++ * Decision table ++ */ ++local const struct test_cpu_feature tfeat_adler32_vec[] = ++{ ++ /* func flags features */ ++ {(void (*)(void))adler32_SSSE3, 0, {CFB(CFEATURE_CMOV), CFB(CFEATURE_SSSE3)}}, ++ {(void (*)(void))adler32_SSE2, 0, {CFB(CFEATURE_SSE2)|CFB(CFEATURE_CMOV), 0}}, ++# ifndef __x86_64__ ++ {(void (*)(void))adler32_SSE, 0, {CFB(CFEATURE_SSE)|CFB(CFEATURE_CMOV), 0}}, ++ {(void (*)(void))adler32_MMX, 0, {CFB(CFEATURE_MMX), 0}}, ++# endif ++ {(void (*)(void))adler32_x86, CFF_DEFAULT, { 0, 0}}, ++}; ++ ++/* ========================================================================= */ ++/* Prototypes */ ++local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l); ++local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len); ++ ++/* ========================================================================= */ ++/* ++ * Runtime Function pointer ++ */ ++local uLong (*adler32_vec_ptr)(uLong adler, const Bytef *buf, uInt len) = adler32_vec_runtimesw; ++ ++/* ========================================================================= */ ++/* ++ * Constructor to init the pointer early ++ */ ++local GCC_ATTR_CONSTRUCTOR void adler32_vec_select(void) ++{ ++ adler32_vec_ptr = test_cpu_feature(tfeat_adler32_vec, sizeof (tfeat_adler32_vec)/sizeof (tfeat_adler32_vec[0])); ++} ++ ++/* ========================================================================= */ ++/* ++ * Jump function ++ */ ++local noinline uLong adler32_vec(adler, buf, len) ++ uLong adler; ++ const Bytef *buf; ++ uInt len; ++{ ++ return adler32_vec_ptr(adler, buf, len); ++} ++ ++/* ========================================================================= */ ++/* ++ * the runtime switcher is a little racy, it should normaly not run if the constructor works ++ */ ++local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len) ++{ ++ adler32_vec_select(); ++ return adler32_vec(adler, buf, len); ++} ++ ++ ++/* ========================================================================= */ ++/* Internal data types */ ++struct cpuid_regs ++{ ++ unsigned long eax, ebx, ecx, edx; ++}; ++ ++local struct ++{ ++ unsigned int max_basic; ++ unsigned int features[FEATURE_WORDS]; ++ int init_done; ++} our_cpu; ++ ++/* ========================================================================= */ ++local inline unsigned long read_flags(void) ++{ ++ unsigned long f; ++ __asm__ __volatile__ ( ++ "pushf\n\t" ++ "pop %0\n\t" ++ : "=r" (f) ++ ); ++ return f; ++} ++ ++/* ========================================================================= */ ++local inline void write_flags(unsigned long f) ++{ ++ __asm__ __volatile__ ( ++ "push %0\n\t" ++ "popf\n\t" ++ : : "ri" (f) : "cc" ++ ); ++} ++ ++/* ========================================================================= */ ++local inline void cpuid(struct cpuid_regs *regs, unsigned long func) ++{ ++ /* save ebx around cpuid call, PIC code needs it */ ++ __asm__ __volatile__ ( ++ "xchg %1, " PICREG "\n\t" ++ "cpuid\n\t" ++ "xchg %1, " PICREG "\n" ++ : /* %0 */ "=a" (regs->eax), ++ /* %1 */ "=r" (regs->ebx), ++ /* %2 */ "=c" (regs->ecx), ++ /* %4 */ "=d" (regs->edx) ++ : /* %5 */ "0" (func), ++ /* %6 */ "2" (regs->ecx) ++ : "cc" ++ ); ++} ++ ++/* ========================================================================= */ ++local inline void cpuids(struct cpuid_regs *regs, unsigned long func) ++{ ++ regs->ecx = 0; ++ cpuid(regs, func); ++} ++ ++/* ========================================================================= */ ++local inline int toggle_eflags_test(const unsigned long mask) ++{ ++ unsigned long f; ++ int result; ++ ++ f = read_flags(); ++ write_flags(f ^ mask); ++ result = !!((f ^ read_flags()) & mask); ++ /* ++ * restore the old flags, the test for i486 tests the alignment ++ * check bit, and left set will confuse the x86 software world. ++ */ ++ write_flags(f); ++ return result; ++} ++ ++/* ========================================================================= */ ++local inline int is_486(void) ++{ ++ return toggle_eflags_test(1 << 18); ++} ++ ++/* ========================================================================= */ ++local inline int has_cpuid(void) ++{ ++ return toggle_eflags_test(1 << 21); ++} ++ ++/* ========================================================================= */ ++local void identify_cpu(void) ++{ ++ struct cpuid_regs a; ++ ++ if (our_cpu.init_done) ++ return; ++ ++ our_cpu.init_done = -1; ++ /* force a write out to memory */ ++ __asm__ __volatile__ ("" : : "m" (our_cpu.init_done)); ++ ++ if (!is_486()) ++ return; ++ ++ if (!has_cpuid()) ++ return; ++ ++ /* get the maximum basic leaf number */ ++ cpuids(&a, 0x00000000); ++ our_cpu.max_basic = (unsigned int)a.eax; ++ /* we could get the vendor string from ebx, edx, ecx */ ++ ++ /* get the first basic leaf, if it is avail. */ ++ if (our_cpu.max_basic >= 0x00000001) ++ cpuids(&a, 0x00000001); ++ else ++ a.eax = a.ebx = a.ecx = a.edx = 0; ++ ++ /* we could extract family, model, stepping from eax */ ++ ++ /* there is the first set of features */ ++ our_cpu.features[0] = a.edx; ++ our_cpu.features[1] = a.ecx; ++ ++ /* now we could test the extended features, but is not needed, for now */ ++} ++ ++/* ========================================================================= */ ++local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l) ++{ ++ unsigned int i, j, f; ++ identify_cpu(); ++ ++ for (i = 0; i < l; i++) { ++ if (t[i].flags & CFF_DEFAULT) ++ return t[i].func; ++ for (f = 0, j = 0; j < FEATURE_WORDS; j++) ++ f |= (our_cpu.features[j] & t[i].features[j]) ^ t[i].features[j]; ++ if (f) ++ continue; ++ return t[i].func; ++ } ++ return NULL; /* die! */ ++} ++ ++#endif + diff --git a/zlib.changes b/zlib.changes index e0aa3dd..ba28ccc 100644 --- a/zlib.changes +++ b/zlib.changes @@ -1,3 +1,14 @@ +------------------------------------------------------------------- +Wed Mar 30 19:47:30 UTC 2011 - crrodriguez@opensuse.org + +- Update SSE2/MMX patches to version 2. + +------------------------------------------------------------------- +Tue Mar 15 22:38:32 UTC 2011 - crrodriguez@opensuse.org + +- Add highly experimental patches to use SSE2/SSSE3/MMX in zlib + this makes the library up to 6 times faster. + ------------------------------------------------------------------- Sun Jan 9 14:33:08 CET 2011 - meissner@suse.de diff --git a/zlib.spec b/zlib.spec index 614d198..3e2f0ac 100644 --- a/zlib.spec +++ b/zlib.spec @@ -40,6 +40,10 @@ Patch0: zlib-1.2.2-format.patch Patch1: zlib-lfs.patch # PATCH-FIX-JENGELH-PARALLEL-MAKE zlib-parallel.patch meissner@novell.com -- shared library links with libz.a Patch2: zlib-parallel.patch +Patch3: 01-prepare.patch +Patch4: 02-ppc_altivec.patch +Patch5: 03-arm.patch +Patch6: 04-x86.patch BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: pkgconfig @@ -84,7 +88,10 @@ libraries. %patch0 %patch1 %patch2 -p1 - +%patch3 +%patch4 +%patch5 +%patch6 %build # Marcus: breaks example64 in 32bit builds. %define do_profiling 0