Accepting request 65792 from Base:System
Accepted submit request 65792 from user coolo

OBS-URL: https://build.opensuse.org/request/show/65792
OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/zlib?expand=0&rev=27
parent ac90384018
commit 12e1d1e53d
423  01-prepare.patch  Normal file
@@ -0,0 +1,423 @@
=== modified file 'Makefile.in'
--- Makefile.in	2011-03-14 01:01:37 +0000
+++ Makefile.in	2011-03-14 02:19:21 +0000
@@ -236,7 +236,8 @@

# DO NOT DELETE THIS LINE -- make depend depends on it.

-adler32.o zutil.o: zutil.h zlib.h zconf.h
+adler32.o: adler32.c zutil.h zlib.h zconf.h
+zutil.o: zutil.h zlib.h zconf.h
gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
crc32.o: zutil.h zlib.h zconf.h crc32.h
@@ -246,7 +247,8 @@
inftrees.o: zutil.h zlib.h zconf.h inftrees.h
trees.o: deflate.h zutil.h zlib.h zconf.h trees.h

-adler32.lo zutil.lo: zutil.h zlib.h zconf.h
+adler32.lo: adler32.c zutil.h zlib.h zconf.h
+zutil.lo: zutil.h zlib.h zconf.h
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
crc32.lo: zutil.h zlib.h zconf.h crc32.h

=== modified file 'adler32.c'
--- adler32.c	2011-03-14 01:01:37 +0000
+++ adler32.c	2011-03-30 13:38:42 +0000
@@ -9,6 +9,35 @@

#define local static

+#define GCC_VERSION_GE(x) ((__GNUC__-0) * 100 + __GNUC_MINOR__-0 >= x)
+
+#if GCC_VERSION_GE(301)
+/* sometimes leakes out of old kernel header */
+# undef noinline
+# define noinline __attribute__((__noinline__))
+#else
+# ifndef noinline
+#  define noinline
+# endif
+#endif
+
+#if GCC_VERSION_GE(301)
+# define GCC_ATTR_UNUSED_PARAM __attribute__((__unused__))
+#else
+# define GCC_ATTR_UNUSED_PARAM
+#endif
+
+#if GCC_VERSION_GE(296)
+# define likely(x)   __builtin_expect(!!(x), 1)
+# define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+# define likely(x)   (x)
+# define unlikely(x) (x)
+#endif
+
+#define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+
local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);

#define BASE 65521UL    /* largest prime smaller than 65536 */
@@ -21,9 +50,20 @@
#define DO8(buf,i)  DO4(buf,i); DO4(buf,i+4);
#define DO16(buf)   DO8(buf,0); DO8(buf,8);

+#if defined(__alpha__)
+/* even if gcc can generate a mul by inverse, the code is really
+ * ugly (find global const pool pointer, load constant, a mul, lots
+ * of shifts/add/sub), up to 14 instructions. The replacement code
+ * only needs >= 5 instructions
+ */
+# define NO_DIVIDE
+#endif
+
/* use NO_DIVIDE if your processor does not do division in hardware */
#ifdef NO_DIVIDE
-# define MOD(a) \
+/* use NO_SHIFT if your processor does shift > 1 by loop */
+# ifdef NO_SHIFT
+#  define reduce_full(a) \
    do { \
        if (a >= (BASE << 16)) a -= (BASE << 16); \
        if (a >= (BASE << 15)) a -= (BASE << 15); \
@@ -43,21 +83,237 @@
        if (a >= (BASE << 1)) a -= (BASE << 1); \
        if (a >= BASE) a -= BASE; \
    } while (0)
-# define MOD4(a) \
+#  define reduce_x(a) \
    do { \
+        if (MIN_WORK >= (1 << 6) && a >= (BASE << 6)) a -= (BASE << 6); \
+        if (MIN_WORK >= (1 << 5) && a >= (BASE << 5)) a -= (BASE << 5); \
        if (a >= (BASE << 4)) a -= (BASE << 4); \
        if (a >= (BASE << 3)) a -= (BASE << 3); \
        if (a >= (BASE << 2)) a -= (BASE << 2); \
        if (a >= (BASE << 1)) a -= (BASE << 1); \
        if (a >= BASE) a -= BASE; \
    } while (0)
+#  define reduce(a) reduce_full(a)
+# else
+#  define reduce_full(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+    } while(a >= BASE)
+#  define reduce_x(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+        a = a >= BASE ? a - BASE : a; \
+    } while(0)
+#  define reduce(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+    } while(0)
+# endif
#else
-# define MOD(a) a %= BASE
-# define MOD4(a) a %= BASE
-#endif
-
-/* ========================================================================= */
-uLong ZEXPORT adler32(adler, buf, len)
+# define reduce_full(a) a %= BASE
+# define reduce_x(a) a %= BASE
+# define reduce(a) a %= BASE
+#endif
+
+local int host_is_bigendian()
+{
+    local const union {
+        uInt d;
+        unsigned char endian[sizeof(uInt)];
+    } x = {1};
+    return x.endian[0] == 0;
+}
+
+#ifndef MIN_WORK
+# define MIN_WORK 16
+#endif
+
+/* ========================================================================= */
+local noinline uLong adler32_1(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len GCC_ATTR_UNUSED_PARAM;
+{
+    unsigned long sum2;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    adler += buf[0];
+    if (adler >= BASE)
+        adler -= BASE;
+    sum2 += adler;
+    if (sum2 >= BASE)
+        sum2 -= BASE;
+    return adler | (sum2 << 16);
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_common(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned long sum2;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    while (len--) {
+        adler += *buf++;
+        sum2 += adler;
+    }
+    if (adler >= BASE)
+        adler -= BASE;
+    reduce_x(sum2); /* only added so many BASE's */
+    return adler | (sum2 << 16);
+}
+
+#ifndef HAVE_ADLER32_VEC
+# if (defined(__LP64__) || ((SIZE_MAX-0) >> 31) >= 2) && !defined(NO_ADLER32_VEC)
+
+/* On 64 Bit archs, we can do pseudo SIMD with a nice win.
+ * This is esp. important for old Alphas, they do not have byte
+ * access.
+ * This needs some register but x86_64 is fine (>= 9 for the mainloop
+ * req.). If your 64 Bit arch is more limited, throw it away...
+ */
+# ifndef UINT64_C
+#  if defined(_MSC_VER) || defined(__BORLANDC__)
+#   define UINT64_C(c) (c ## ui64)
+#  else
+#   define UINT64_C(c) (c ## ULL)
+#  endif
+# endif
+
+# undef VNMAX
+# define VNMAX (2*NMAX+((9*NMAX)/10))
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+    unsigned int k;
+
+    /* split Adler-32 into component sums */
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    /* align input data */
+    k    = ALIGN_DIFF(buf, sizeof(size_t));
+    len -= k;
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while(--k);
+
+    k = len > VNMAX ? VNMAX : len;
+    len -= k;
+    if (likely(k >= 2 * sizeof(size_t))) do
+    {
+        unsigned int vs1, vs2;
+        unsigned int vs1s;
+
+        /* add s1 to s2 for rounds to come */
+        s2 += s1 * ROUND_TO(k, sizeof(size_t));
+        vs1s = vs1 = vs2 = 0;
+        do {
+            size_t vs1l = 0, vs1h = 0, vs1l_s = 0, vs1h_s = 0;
+            unsigned int a, b, c, d, e, f, g, h;
+            unsigned int j;
+
+            j = k > 23 * sizeof(size_t) ? 23 : k/sizeof(size_t);
+            k -= j * sizeof(size_t);
+            /* add s1 to s1 round sum for rounds to come */
+            vs1s += j * vs1;
+            do {
+                size_t in8 = *(const size_t *)buf;
+                buf += sizeof(size_t);
+                /* add this s1 to s1 round sum */
+                vs1l_s += vs1l;
+                vs1h_s += vs1h;
+                /* add up input data to s1 */
+                vs1l += in8 & UINT64_C(0x00ff00ff00ff00ff);
+                vs1h += (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8;
+            } while(--j);
+
+            /* split s1 */
+            if(host_is_bigendian()) {
+                a = (vs1h >> 48) & 0x0000ffff;
+                b = (vs1l >> 48) & 0x0000ffff;
+                c = (vs1h >> 32) & 0x0000ffff;
+                d = (vs1l >> 32) & 0x0000ffff;
+                e = (vs1h >> 16) & 0x0000ffff;
+                f = (vs1l >> 16) & 0x0000ffff;
+                g = (vs1h      ) & 0x0000ffff;
+                h = (vs1l      ) & 0x0000ffff;
+            } else {
+                a = (vs1l      ) & 0x0000ffff;
+                b = (vs1h      ) & 0x0000ffff;
+                c = (vs1l >> 16) & 0x0000ffff;
+                d = (vs1h >> 16) & 0x0000ffff;
+                e = (vs1l >> 32) & 0x0000ffff;
+                f = (vs1h >> 32) & 0x0000ffff;
+                g = (vs1l >> 48) & 0x0000ffff;
+                h = (vs1h >> 48) & 0x0000ffff;
+            }
+
+            /* add s1 & s2 horiz. */
+            vs2 += 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + 1*h;
+            vs1 += a + b + c + d + e + f + g + h;
+
+            /* split and add up s1 round sum */
+            vs1l_s = ((vs1l_s      ) & UINT64_C(0x0000ffff0000ffff)) +
+                     ((vs1l_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+            vs1h_s = ((vs1h_s      ) & UINT64_C(0x0000ffff0000ffff)) +
+                     ((vs1h_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+            vs1l_s += vs1h_s;
+            vs1s += ((vs1l_s      ) & UINT64_C(0x00000000ffffffff)) +
+                    ((vs1l_s >> 32) & UINT64_C(0x00000000ffffffff));
+        } while (k >= sizeof(size_t));
+        reduce(vs1s);
+        s2 += vs1s * 8 + vs2;
+        reduce(s2);
+        s1 += vs1;
+        reduce(s1);
+        len += k;
+        k = len > VNMAX ? VNMAX : len;
+        len -= k;
+    } while (k >= sizeof(size_t));
+
+    /* handle trailer */
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--k);
+    reduce(s1);
+    reduce(s2);
+
+    /* return recombined sums */
+    return (s2 << 16) | s1;
+}
+
+# else
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
    uLong adler;
    const Bytef *buf;
    uInt len;
@@ -69,33 +325,6 @@
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

-    /* in case user likes doing a byte at a time, keep it fast */
-    if (len == 1) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (buf == Z_NULL)
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (len < 16) {
-        while (len--) {
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD4(sum2);             /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
-
    /* do length NMAX blocks -- requires just one modulo operation */
    while (len >= NMAX) {
        len -= NMAX;
@@ -104,8 +333,8 @@
            DO16(buf);          /* 16 sums unrolled */
            buf += 16;
        } while (--n);
-        MOD(adler);
-        MOD(sum2);
+        reduce_full(adler);
+        reduce_full(sum2);
    }

    /* do remaining bytes (less than NMAX, still just one modulo) */
@@ -119,13 +348,36 @@
            adler += *buf++;
            sum2 += adler;
        }
-        MOD(adler);
-        MOD(sum2);
+        reduce_full(adler);
+        reduce_full(sum2);
    }

    /* return recombined sums */
    return adler | (sum2 << 16);
}
+# endif
+#endif
+
+/* ========================================================================= */
+uLong ZEXPORT adler32(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1)
+        return adler32_1(adler, buf, len); /* should create a fast tailcall */
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == Z_NULL)
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < MIN_WORK)
+        return adler32_common(adler, buf, len);
+
+    return adler32_vec(adler, buf, len);
+}

/* ========================================================================= */
local uLong adler32_combine_(adler1, adler2, len2)
@@ -141,7 +393,7 @@
    rem = (unsigned)(len2 % BASE);
    sum1 = adler1 & 0xffff;
    sum2 = rem * sum1;
-    MOD(sum2);
+    reduce_full(sum2);
    sum1 += (adler2 & 0xffff) + BASE - 1;
    sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
    if (sum1 >= BASE) sum1 -= BASE;
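A note on the reduce*() macros this patch introduces: BASE is 65521 and 2^16 mod 65521 is 15, so on a NO_DIVIDE target the high halfword of an accumulator can be folded back into the low halfword with weight 15 (the macros compute that as a shift left by 4 minus the original high part). The standalone sketch below, with illustrative names that are not part of the patch, checks this congruence against the plain % operator; the patch's reduce() keeps values only partially reduced, while reduce_full()/reduce_x() finish the job.

/* Standalone sketch (not from the patch): why the NO_DIVIDE folding works. */
#include <assert.h>
#include <stdio.h>

#define BASE 65521UL   /* largest prime smaller than 65536 */

static unsigned long mod_base_no_divide(unsigned long a)
{
    while (a >> 16)                        /* fold: a = low + 15 * high, since 2^16 == 15 (mod BASE) */
        a = (a & 0xffff) + 15 * (a >> 16);
    if (a >= BASE)                         /* now a < 65536, so at most one final subtraction */
        a -= BASE;
    return a;
}

int main(void)
{
    unsigned long a;
    for (a = 0; a < (1UL << 26); a += 977)   /* arbitrary sample step, just a spot check */
        assert(mod_base_no_divide(a) == a % BASE);
    puts("folding reduction matches a % BASE");
    return 0;
}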
307  02-ppc_altivec.patch  Normal file
@@ -0,0 +1,307 @@
=== modified file 'Makefile.in'
--- Makefile.in	2011-03-14 02:19:21 +0000
+++ Makefile.in	2011-03-14 03:06:03 +0000
@@ -236,7 +236,7 @@

# DO NOT DELETE THIS LINE -- make depend depends on it.

-adler32.o: adler32.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
zutil.o: zutil.h zlib.h zconf.h
gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
inftrees.o: zutil.h zlib.h zconf.h inftrees.h
trees.o: deflate.h zutil.h zlib.h zconf.h trees.h

-adler32.lo: adler32.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
zutil.lo: zutil.h zlib.h zconf.h
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h

=== modified file 'adler32.c'
--- adler32.c	2011-03-30 13:38:42 +0000
+++ adler32.c	2011-03-30 13:38:46 +0000
@@ -36,7 +36,10 @@
#endif

#define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define DIV_ROUNDUP(a, b) (((a) + (b) - 1) / (b))
#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+#define ALIGN_DOWN(x, n) (((intptr_t)(x)) & ~((intptr_t)(n) - 1L))
+#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L))

local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);

@@ -136,6 +139,12 @@
    return x.endian[0] == 0;
}

+#ifndef NO_ADLER32_VEC
+# if defined(__powerpc__) || defined(__powerpc64__)
+#  include "adler32_ppc.c"
+# endif
+#endif
+
#ifndef MIN_WORK
# define MIN_WORK 16
#endif

=== added file 'adler32_ppc.c'
--- adler32_ppc.c	1970-01-01 00:00:00 +0000
+++ adler32_ppc.c	2011-03-30 11:12:04 +0000
@@ -0,0 +1,253 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ *   ppc implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+/*
+ * We use the Altivec PIM vector stuff, but still, this is only
+ * tested with GCC, and prop. uses some GCC specifics (like GCC
+ * understands vector types and you can simply write a += b)
+ */
+#if defined(__ALTIVEC__) && defined(__GNUC__)
+# define HAVE_ADLER32_VEC
+/* it needs some bytes till the vec version gets up to speed... */
+# define MIN_WORK 64
+# include <altivec.h>
+
+/*
+ * Depending on length, this can be slower (short length < 64 bytes),
+ * much faster (our beloved 128kb 22.2s generic to 3.4s vec, but cache
+ * is important...), to a little faster (very long length, 1.6MB, 47.6s
+ * to 36s), which is prop. only capped by memory bandwith.
+ * (The orig. 128k case was slower in AltiVec, because AltiVec loads
+ * are always uncached and trigger no HW prefetching, because that is
+ * what you often need with mass data manipulation (not poisen your
+ * cache, movntq), instead you have to do it for yourself (data stream
+ * touch). With 128k it could be cleanly seen: no prefetch, half as slow
+ * as generic, but comment out the memory load -> 3s. With proper prefetch
+ * we are at 3.4s. So AltiVec can execute these "expensive" FMA quite
+ * fast (even without fancy unrolling), only the data does not arrive
+ * fast enough. In cases where the working set does not fit into cache
+ * it simply cannot be delivered fast enough over the FSB/Mem).
+ * Still we have to prefetch, or we are slow as hell.
+ */
+
+# define SOVUC (sizeof(vector unsigned char))
+
+/* can be propably more, since we do not have the x86 psadbw 64 bit sum */
+# define VNMAX (6*NMAX)
+
+/* ========================================================================= */
+local inline vector unsigned char vec_identl(level)
+    unsigned int level;
+{
+    return vec_lvsl(level, (const unsigned char *)0);
+}
+
+/* ========================================================================= */
+local inline vector unsigned char vec_ident_rev(void)
+{
+    return vec_xor(vec_identl(0), vec_splat_u8(15));
+}
+
+/* ========================================================================= */
+/* multiply two 32 bit ints, return the low 32 bit */
+local inline vector unsigned int vec_mullw(vector unsigned int a, vector unsigned int b)
+{
+    vector unsigned int v16 = vec_splat_u32(-16);
+    vector unsigned int v0_32 = vec_splat_u32(0);
+    vector unsigned int swap, low, high;
+
+    swap = vec_rl(b, v16);
+    low = vec_mulo((vector unsigned short)a, (vector unsigned short)b);
+    high = vec_msum((vector unsigned short)a, (vector unsigned short)swap, v0_32);
+    high = vec_sl(high, v16);
+    return vec_add(low, high);
+}
+
+/* ========================================================================= */
+local inline vector unsigned int vector_reduce(vector unsigned int x)
+{
+    vector unsigned int y;
+    vector unsigned int vsh;
+
+    vsh = vec_splat_u32(1);
+    vsh = vec_sl(vsh, vec_splat_u32(4));
+
+    y = vec_sl(x, vsh);
+    y = vec_sr(y, vsh);
+    x = vec_sr(x, vsh);
+    y = vec_sub(y, x);
+    x = vec_sl(x, vec_splat_u32(4));
+    x = vec_add(x, y);
+    return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    if (likely(len >= 2*SOVUC)) {
+        vector unsigned int v0_32 = vec_splat_u32(0);
+        vector unsigned int vsh = vec_splat_u32(4);
+        vector unsigned char v1 = vec_splat_u8(1);
+        vector unsigned char vord;
+        vector unsigned char v0 = vec_splat_u8(0);
+        vector unsigned int vs1, vs2;
+        vector unsigned char in16, vord_a, v1_a, vperm;
+        unsigned int f, n;
+        unsigned int k, block_num;
+
+        /*
+         * if i understand the Altivec PEM right, little
+         * endian impl. should have the data reversed on
+         * load, so the big endian vorder works.
+         */
+        vord = vec_ident_rev() + v1;
+        block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+        f = 512;
+        f |= block_num >= 256 ? 0 : block_num << 16;
+        vec_dst(buf, f, 2);
+        /*
+         * Add stuff to achieve alignment
+         */
+        /* swizzle masks in place */
+        vperm = vec_lvsl(0, buf);
+        vord_a = vec_perm(vord, v0, vperm);
+        v1_a = vec_perm(v1, v0, vperm);
+        vperm = vec_lvsr(0, buf);
+        vord_a = vec_perm(v0, vord_a, vperm);
+        v1_a = vec_perm(v0, v1_a, vperm);
+
+        /* align hard down */
+        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUC);
+        n = SOVUC - f;
+        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUC);
+
+        /* add n times s1 to s2 for start round */
+        s2 += s1 * n;
+
+        /* set sums 0 */
+        vs1 = v0_32;
+        vs2 = v0_32;
+
+        k = len < VNMAX ? (unsigned)len : VNMAX;
+        len -= k;
+
+        /* insert scalar start somewhere */
+        vs1 = vec_lde(0, &s1);
+        vs2 = vec_lde(0, &s2);
+
+        /* get input data */
+        in16 = vec_ldl(0, buf);
+
+        /* mask out excess data, add 4 byte horizontal and add to old dword */
+        vs1 = vec_msum(in16, v1_a, vs1);
+
+        /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+        vs2 = vec_msum(in16, vord_a, vs2);
+
+        buf += SOVUC;
+        k -= n;
+
+        if (likely(k >= SOVUC)) do {
+            vector unsigned int vs1_r = v0_32;
+            f = 512;
+            f |= block_num >= 256 ? 0 : block_num << 16;
+            vec_dst(buf, f, 2);
+
+            do {
+                /* get input data */
+                in16 = vec_ldl(0, buf);
+
+                /* add vs1 for this round */
+                vs1_r += vs1;
+
+                /* add 4 byte horizontal and add to old dword */
+                vs1 = vec_sum4s(in16, vs1);
+                /* apply order, add 4 byte horizontal and add to old dword */
+                vs2 = vec_msum(in16, vord, vs2);
+
+                buf += SOVUC;
+                k -= SOVUC;
+            } while (k >= SOVUC);
+            /* reduce vs1 round sum before multiplying by 16 */
+            vs1_r = vector_reduce(vs1_r);
+            /* add all vs1 for 16 times */
+            vs2 += vec_sl(vs1_r, vsh);
+            /* reduce the vectors to something in the range of BASE */
+            vs2 = vector_reduce(vs2);
+            vs1 = vector_reduce(vs1);
+            len += k;
+            k = len < VNMAX ? (unsigned)len : VNMAX;
+            block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+            len -= k;
+        } while (likely(k >= SOVUC));
+
+        if (likely(k)) {
+            vector unsigned int vk;
+            /*
+             * handle trailer
+             */
+            f = SOVUC - k;
+            /* swizzle masks in place */
+            vperm = vec_identl(f);
+            vord_a = vec_perm(vord, v0, vperm);
+            v1_a = vec_perm(v1, v0, vperm);
+
+            /* add k times vs1 for this trailer */
+            vk = (vector unsigned int)vec_lvsl(0, (unsigned *)(intptr_t)k);
+            vk = (vector unsigned)vec_mergeh(v0, (vector unsigned char)vk);
+            vk = (vector unsigned)vec_mergeh((vector unsigned short)v0, (vector unsigned short)vk);
+            vk = vec_splat(vk, 0);
+            vs2 += vec_mullw(vs1, vk);
+
+            /* get input data */
+            in16 = vec_ldl(0, buf);
+
+            /* mask out excess data, add 4 byte horizontal and add to old dword */
+            vs1 = vec_msum(in16, v1_a, vs1);
+            /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+            vs2 = vec_msum(in16, vord_a, vs2);
+
+            buf += k;
+            k -= k;
+        }
+
+        vec_dss(2);
+
+        /* add horizontal */
+        /* stuff should be reduced so no proplem with signed sature */
+        vs1 = (vector unsigned)vec_sums((vector int)vs1, (vector int)v0_32);
+        vs2 = (vector unsigned)vec_sums((vector int)vs2, (vector int)v0_32);
+        /* shake and roll */
+        vs1 = vec_splat(vs1, 3);
+        vs2 = vec_splat(vs2, 3);
+        vec_ste(vs1, 0, &s1);
+        vec_ste(vs2, 0, &s2);
+        /* after horizontal add, reduce again in scalar code */
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    reduce(s1);
+    reduce(s2);
+
+    return (s2 << 16) | s1;
+}
+
+#endif
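The AltiVec loop above and the other vector variants in this series rest on the same bookkeeping identity: pushing a block of N bytes through the serial `s1 += byte; s2 += s1;` recurrence adds N times the incoming s1 plus a weighted byte sum (weight N - i for byte i) to s2, and the plain byte sum to s1. That is what vec_msum against the descending vord = {16,...,1} weights and the later vs1_r shift-by-4 term compute per 16-byte vector. A small scalar cross-check of the identity, with hypothetical helper names that are not part of the patch:

#include <assert.h>
#include <stddef.h>

/* byte-at-a-time reference: the classic Adler-32 inner loop */
static void serial(unsigned long *s1, unsigned long *s2,
                   const unsigned char *b, size_t n)
{
    size_t i;
    for (i = 0; i < n; i++) {
        *s1 += b[i];
        *s2 += *s1;
    }
}

/* block form: s2 += n*s1_in + sum((n - i) * b[i]); s1 += sum(b[i]) */
static void blockwise(unsigned long *s1, unsigned long *s2,
                      const unsigned char *b, size_t n)
{
    unsigned long sum = 0, weighted = 0;
    size_t i;
    for (i = 0; i < n; i++) {
        sum      += b[i];
        weighted += (unsigned long)(n - i) * b[i];
    }
    *s2 += n * *s1 + weighted;   /* uses s1 before it is updated */
    *s1 += sum;
}

int main(void)
{
    unsigned char buf[16];
    unsigned long a1 = 1, a2 = 0, b1 = 1, b2 = 0;
    size_t i;
    for (i = 0; i < sizeof(buf); i++)
        buf[i] = (unsigned char)(i * 37 + 11);   /* arbitrary test data */
    serial(&a1, &a2, buf, sizeof(buf));
    blockwise(&b1, &b2, buf, sizeof(buf));
    assert(a1 == b1 && a2 == b2);
    return 0;
}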
400  03-arm.patch  Normal file
@@ -0,0 +1,400 @@
=== modified file 'Makefile.in'
--- Makefile.in	2011-03-14 03:06:03 +0000
+++ Makefile.in	2011-03-14 14:39:24 +0000
@@ -236,7 +236,7 @@

# DO NOT DELETE THIS LINE -- make depend depends on it.

-adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
zutil.o: zutil.h zlib.h zconf.h
gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
inftrees.o: zutil.h zlib.h zconf.h inftrees.h
trees.o: deflate.h zutil.h zlib.h zconf.h trees.h

-adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
zutil.lo: zutil.h zlib.h zconf.h
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h

=== modified file 'adler32.c'
--- adler32.c	2011-03-30 13:38:46 +0000
+++ adler32.c	2011-03-30 13:38:46 +0000
@@ -140,7 +140,9 @@
}

#ifndef NO_ADLER32_VEC
-# if defined(__powerpc__) || defined(__powerpc64__)
+# if defined(__arm__)
+#  include "adler32_arm.c"
+# elif defined(__powerpc__) || defined(__powerpc64__)
#  include "adler32_ppc.c"
# endif
#endif

=== added file 'adler32_arm.c'
--- adler32_arm.c	1970-01-01 00:00:00 +0000
+++ adler32_arm.c	2011-03-30 11:18:49 +0000
@@ -0,0 +1,359 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ *   arm implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#if defined(__ARM_NEON__)
+// TODO: need byte order define
+/*
+ * Big endian NEON qwords are kind of broken.
+ * They are big endian within the dwords, but WRONG
+ * (really??) way round between lo and hi.
+ * Creating some kind of PDP11 middle endian.
+ *
+ * This is madness and unsupportable. For this reason
+ * GCC wants to disable qword endian specific patterns.
+ * We would need a Preprocessor define which endian we
+ * have to disable this code.
+ */
+# include <arm_neon.h>
+
+# define SOVUCQ sizeof(uint8x16_t)
+# define SOVUC sizeof(uint8x8_t)
+/* since we do not have the 64bit psadbw sum, we could prop. do a little more */
+# define VNMAX (6*NMAX)
+# define HAVE_ADLER32_VEC
+# define MIN_WORK 32
+
+/* ========================================================================= */
+local inline uint8x16_t neon_simple_alignq(uint8x16_t a, uint8x16_t b, unsigned amount)
+{
+    switch(amount % SOVUCQ)
+    {
+    case 0: return a;
+    case 1: return vextq_u8(a, b, 1);
+    case 2: return vextq_u8(a, b, 2);
+    case 3: return vextq_u8(a, b, 3);
+    case 4: return vextq_u8(a, b, 4);
+    case 5: return vextq_u8(a, b, 5);
+    case 6: return vextq_u8(a, b, 6);
+    case 7: return vextq_u8(a, b, 7);
+    case 8: return vextq_u8(a, b, 8);
+    case 9: return vextq_u8(a, b, 9);
+    case 10: return vextq_u8(a, b, 10);
+    case 11: return vextq_u8(a, b, 11);
+    case 12: return vextq_u8(a, b, 12);
+    case 13: return vextq_u8(a, b, 13);
+    case 14: return vextq_u8(a, b, 14);
+    case 15: return vextq_u8(a, b, 15);
+    }
+    return b;
+}
+
+/* ========================================================================= */
+local inline uint32x4_t vector_reduce(uint32x4_t x)
+{
+    uint32x4_t y;
+
+    y = vshlq_n_u32(x, 16);
+    x = vshrq_n_u32(x, 16);
+    y = vshrq_n_u32(y, 16);
+    y = vsubq_u32(y, x);
+    x = vaddq_u32(y, vshlq_n_u32(x, 4));
+    return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    uint32x4_t v0_32 = (uint32x4_t){0,0,0,0};
+    uint8x16_t v0 = (uint8x16_t)v0_32;
+    uint8x16_t vord, vord_a;
+    uint32x4_t vs1, vs2;
+    uint32x2_t v_tsum;
+    uint8x16_t in16;
+    uint32_t s1, s2;
+    unsigned k;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+// TODO: big endian mask is prop. wrong
+    if (host_is_bigendian())
+        vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
+    else
+        vord = (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+
+    if (likely(len >= 2*SOVUCQ)) {
+        unsigned f, n;
+
+        /*
+         * Add stuff to achieve alignment
+         */
+        /* align hard down */
+        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUCQ);
+        n = SOVUCQ - f;
+        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUCQ);
+
+        /* add n times s1 to s2 for start round */
+        s2 += s1 * n;
+
+        /* set sums 0 */
+        vs1 = v0_32;
+        vs2 = v0_32;
+        /*
+         * the accumulation of s1 for every round grows very fast
+         * (quadratic?), even if we accumulate in 4 dwords, more
+         * rounds means nonlinear growth.
+         * We already split it out of s2, normaly it would be in
+         * s2 times 16... and even grow faster.
+         * Thanks to this split and vector reduction, we can stay
+         * longer in the loops. But we have to prepare for the worst
+         * (all 0xff), only do 6 times the work.
+         * (we could prop. stay a little longer since we have 4 sums,
+         * not 2 like on x86).
+         */
+        k = len < VNMAX ? (unsigned)len : VNMAX;
+        len -= k;
+        /* insert scalar start somewhere */
+        vs1 = vsetq_lane_u32(s1, vs1, 0);
+        vs2 = vsetq_lane_u32(s2, vs2, 0);
+
+        /* get input data */
+        in16 = *(const uint8x16_t *)buf;
+        /* mask out excess data */
+        if(host_is_bigendian()) {
+            in16 = neon_simple_alignq(v0, in16, n);
+            vord_a = neon_simple_alignq(v0, vord, n);
+        } else {
+            in16 = neon_simple_alignq(in16, v0, f);
+            vord_a = neon_simple_alignq(vord, v0, f);
+        }
+
+        /* pairwise add bytes and long, pairwise add word long acc */
+        vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+        /* apply order, add words, pairwise add word long acc */
+        vs2 = vpadalq_u16(vs2,
+                vmlal_u8(
+                    vmull_u8(vget_low_u8(in16), vget_low_u8(vord_a)),
+                    vget_high_u8(in16), vget_high_u8(vord_a)
+                )
+            );
+
+        buf += SOVUCQ;
+        k -= n;
+
+        if (likely(k >= SOVUCQ)) do {
+            uint32x4_t vs1_r = v0_32;
+            do {
+                /* add vs1 for this round */
+                vs1_r = vaddq_u32(vs1_r, vs1);
+
+                /* get input data */
+                in16 = *(const uint8x16_t *)buf;
+
+// TODO: make work in inner loop more tight
+                /*
+                 * decompose partial sums, so we do less instructions and
+                 * build loops around it to do acc and so on only from time
+                 * to time.
+                 * This is hard with NEON, because the instruction are nice:
+                 * we have the stuff in widening and with acc (practicaly
+                 * for free...)
+                 */
+                /* pairwise add bytes and long, pairwise add word long acc */
+                vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+                /* apply order, add words, pairwise add word long acc */
+                vs2 = vpadalq_u16(vs2,
+                        vmlal_u8(
+                            vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
+                            vget_high_u8(in16), vget_high_u8(vord)
+                        )
+                    );
+
+                buf += SOVUCQ;
+                k -= SOVUCQ;
+            } while (k >= SOVUCQ);
+            /* reduce vs1 round sum before multiplying by 16 */
+            vs1_r = vector_reduce(vs1_r);
+            /* add vs1 for this round (16 times) */
+            /* they have shift right and accummulate, where is shift left and acc?? */
+            vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_r, 4));
+            /* reduce both vectors to something within 16 bit */
+            vs2 = vector_reduce(vs2);
+            vs1 = vector_reduce(vs1);
+            len += k;
+            k = len < VNMAX ? (unsigned) len : VNMAX;
+            len -= k;
+        } while (likely(k >= SOVUC));
+
+        if (likely(k)) {
+            /*
+             * handle trailer
+             */
+            f = SOVUCQ - k;
+            /* add k times vs1 for this trailer */
+            vs2 = vmlaq_u32(vs2, vs1, vdupq_n_u32(k));
+
+            /* get input data */
+            in16 = *(const uint8x16_t *)buf;
+            /* masks out bad data */
+            if(host_is_bigendian())
+                in16 = neon_simple_alignq(in16, v0, f);
+            else
+                in16 = neon_simple_alignq(v0, in16, k);
+
+            /* pairwise add bytes and long, pairwise add word long acc */
+            vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+            /* apply order, add words, pairwise add word long acc */
+            vs2 = vpadalq_u16(vs2,
+                    vmlal_u8(
+                        vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
+                        vget_high_u8(in16), vget_high_u8(vord)
+                    )
+                );
+
+            buf += k;
+            k -= k;
+        }
+
+        /* add horizontal */
+        v_tsum = vpadd_u32(vget_high_u32(vs1), vget_low_u32(vs1));
+        v_tsum = vpadd_u32(v_tsum, v_tsum);
+        s1 = vget_lane_u32(v_tsum, 0);
+        v_tsum = vpadd_u32(vget_high_u32(vs2), vget_low_u32(vs2));
+        v_tsum = vpadd_u32(v_tsum, v_tsum);
+        s2 = vget_lane_u32(v_tsum, 0);
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    reduce_x(s1);
+    reduce_x(s2);
+
+    return (s2 << 16) | s1;
+}
+
+/* inline asm, so only on GCC (or compatible) && ARM v6 or better */
+#elif defined(__GNUC__) && ( \
+    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+    defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
+    defined(__ARM_ARCH_7A__) \
+    )
+# define SOU32 (sizeof(unsigned int))
+# define HAVE_ADLER32_VEC
+# define MIN_WORK 16
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+    unsigned int k;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    k = ALIGN_DIFF(buf, SOU32);
+    len -= k;
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--k);
+
+    if (likely(len >= 4 * SOU32)) {
+        unsigned int vs1 = s1, vs2 = s2;
+        unsigned int order_lo, order_hi;
+
+// TODO: byte order?
+        if(host_is_bigendian()) {
+            order_lo = 0x00030001;
+            order_hi = 0x00040002;
+        } else {
+            order_lo = 0x00020004;
+            order_hi = 0x00010003;
+        }
+// TODO: we could go over NMAX, since we have split the vs2 sum
+        /* something around (NMAX+(NMAX/3)+302) */
+        k = len < NMAX ? len : NMAX;
+        len -= k;
+
+        do {
+            unsigned int vs1_r = 0;
+            do {
+                unsigned int t21, t22, in;
+
+                /* get input data */
+                in = *(const unsigned int *)buf;
+
+                /* add vs1 for this round */
+                vs1_r += vs1;
+
+                /* add horizontal and acc */
+                asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), "r" (vs1));
+                /* widen bytes to words, apply order, add and acc */
+                asm ("uxtb16 %0, %1" : "=r" (t21) : "r" (in));
+                asm ("uxtb16 %0, %1, ror #8" : "=r" (t22) : "r" (in));
+// TODO: instruction result latency
+                /*
+                 * The same problem like the classic serial sum:
+                 * Chip maker sell us 1-cycle instructions, but that is not the
+                 * whole story. Nearly all 1-cycle chips are pipelined, so
+                 * you can get one result per cycle, but only if _they_ (plural)
+                 * are independent.
+                 * If you are depending on the result of an preciding instruction,
+                 * in the worst case you hit the instruction latency which is worst
+                 * case >= pipeline length. On the other hand there are result-fast-paths.
+                 * This could all be a wash with the classic sum (4 * 2 instructions,
+                 * + dependence), since smald is:
+                 * - 2 cycle issue
+                 * - needs the acc in pipeline step E1, instead of E2
+                 * But the Cortex has a fastpath for acc.
+                 * I don't know.
+                 * We can not even unroll, we would need 4 order vars, return ENOREGISTER.
+                 */
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t21) , "r" (order_lo), "r" (vs2));
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t22) , "r" (order_hi), "r" (vs2));
+
+                buf += SOU32;
+                k -= SOU32;
+            } while (k >= SOU32);
+            /* reduce vs1 round sum before multiplying by 4 */
+            reduce(vs1_r);
+            /* add vs1 for this round (4 times) */
+            vs2 += vs1_r * 4;
+            /* reduce both sums to something within 16 bit */
+            reduce(vs2);
+            reduce(vs1);
+            len += k;
+            k = len < NMAX ? len : NMAX;
+            len -= k;
+        } while (likely(k >= 4 * SOU32));
+        len += k;
+        s1 = vs1;
+        s2 = vs2;
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    /* at this point we should no have so big s1 & s2 */
+    reduce_x(s1);
+    reduce_x(s2);
+
+    return (s2 << 16) | s1;
+}
+#endif
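For the ARMv6 path just above, each 4-byte step uses usada8 against zero to add the word's four bytes into vs1, and uxtb16/smlad with the order constants to add the weighted sum 4*b0 + 3*b1 + 2*b2 + 1*b3 into vs2 (on a little-endian host; the per-round vs1_r term supplies the 4*s1 part once per block). A portable C sketch of one such step, folding the vs1_r contribution directly into the word step for simplicity — the helper is illustrative, not from the patch:

#include <assert.h>
#include <stdint.h>

/* one little-endian 4-byte step, written without the inline asm */
static void word_step_le(uint32_t in, uint32_t *vs1, uint32_t *vs2)
{
    uint32_t b0 = in & 0xff, b1 = (in >> 8) & 0xff,
             b2 = (in >> 16) & 0xff, b3 = (in >> 24) & 0xff;

    *vs2 += 4 * *vs1;                   /* the vs1_r accumulation, folded in here */
    *vs1 += b0 + b1 + b2 + b3;          /* usada8 against zero: plain byte sum    */
    *vs2 += 4*b0 + 3*b1 + 2*b2 + b3;    /* the two smlad's with order_lo/order_hi */
}

int main(void)
{
    /* cross-check against the byte-at-a-time definition */
    uint32_t s1 = 1, s2 = 0, v1 = 1, v2 = 0, in = 0x04030201;
    unsigned i;
    for (i = 0; i < 4; i++) {
        s1 += (in >> (8 * i)) & 0xff;   /* bytes in memory order on LE */
        s2 += s1;
    }
    word_step_le(in, &v1, &v2);
    assert(s1 == v1 && s2 == v2);
    return 0;
}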
1165  04-x86.patch  Normal file
(File diff suppressed because it is too large)
11  zlib.changes
@@ -1,3 +1,14 @@
-------------------------------------------------------------------
Wed Mar 30 19:47:30 UTC 2011 - crrodriguez@opensuse.org

- Update SSE2/MMX patches to version 2.

-------------------------------------------------------------------
Tue Mar 15 22:38:32 UTC 2011 - crrodriguez@opensuse.org

- Add highly experimental patches to use SSE2/SSSE3/MMX in zlib
  this makes the library up to 6 times faster.

-------------------------------------------------------------------
Sun Jan 9 14:33:08 CET 2011 - meissner@suse.de

zlib.spec
@@ -40,6 +40,10 @@ Patch0: zlib-1.2.2-format.patch
Patch1: zlib-lfs.patch
# PATCH-FIX-JENGELH-PARALLEL-MAKE zlib-parallel.patch meissner@novell.com -- shared library links with libz.a
Patch2: zlib-parallel.patch
Patch3: 01-prepare.patch
Patch4: 02-ppc_altivec.patch
Patch5: 03-arm.patch
Patch6: 04-x86.patch
BuildRoot: %{_tmppath}/%{name}-%{version}-build
BuildRequires: pkgconfig

@@ -84,7 +88,10 @@ libraries.
%patch0
%patch1
%patch2 -p1

%patch3
%patch4
%patch5
%patch6
%build
# Marcus: breaks example64 in 32bit builds.
%define do_profiling 0