=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 02:19:21 +0000
+++ Makefile.in 2011-03-14 03:06:03 +0000
@@ -236,7 +236,7 @@

 # DO NOT DELETE THIS LINE -- make depend depends on it.

-adler32.o: adler32.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
 zutil.o: zutil.h zlib.h zconf.h
 gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
 compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
 inftrees.o: zutil.h zlib.h zconf.h inftrees.h
 trees.o: deflate.h zutil.h zlib.h zconf.h trees.h

-adler32.lo: adler32.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
 zutil.lo: zutil.h zlib.h zconf.h
 gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
 compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h

=== modified file 'adler32.c'
--- adler32.c 2011-03-30 13:38:42 +0000
+++ adler32.c 2011-03-30 13:38:46 +0000
@@ -36,7 +36,10 @@
 #endif

 #define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define DIV_ROUNDUP(a, b) (((a) + (b) - 1) / (b))
 #define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+#define ALIGN_DOWN(x, n) (((intptr_t)(x)) & ~((intptr_t)(n) - 1L))
+#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L))

 local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);

@@ -136,6 +139,12 @@
     return x.endian[0] == 0;
 }

+#ifndef NO_ADLER32_VEC
+# if defined(__powerpc__) || defined(__powerpc64__)
+#  include "adler32_ppc.c"
+# endif
+#endif
+
 #ifndef MIN_WORK
 # define MIN_WORK 16
 #endif

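The DIV_ROUNDUP, ALIGN_DOWN and ALIGN_DOWN_DIFF macros added above appear to exist so the PowerPC code can cope with an unaligned buffer start: the pointer is rounded down to a 16-byte boundary, and the offset past that boundary says how many leading bytes have to be masked off in the first vector. A small standalone sketch of what the two alignment macros evaluate to, using a made-up pointer value that is not part of the patch:

#include <stdint.h>
#include <stdio.h>

#define ALIGN_DOWN(x, n)      (((intptr_t)(x)) & ~((intptr_t)(n) - 1L))
#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L))

int main(void)
{
    /* hypothetical buffer address, 5 bytes past a 16-byte boundary */
    const unsigned char *buf = (const unsigned char *)(intptr_t)0x1005;

    /* distance past the previous 16-byte boundary: 5 */
    unsigned f = (unsigned)ALIGN_DOWN_DIFF(buf, 16);
    /* pointer rounded down to that boundary: 0x1000 */
    const unsigned char *aligned = (const unsigned char *)ALIGN_DOWN(buf, 16);

    printf("f = %u, aligned = %p\n", f, (void *)aligned);
    return 0;
}
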
=== added file 'adler32_ppc.c'
--- adler32_ppc.c 1970-01-01 00:00:00 +0000
+++ adler32_ppc.c 2011-03-30 11:12:04 +0000
@@ -0,0 +1,253 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ *   ppc implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+/*
+ * We use the AltiVec PIM vector stuff, but still, this is only
+ * tested with GCC, and probably uses some GCC specifics (like GCC
+ * understanding vector types and letting you simply write a += b).
+ */
+#if defined(__ALTIVEC__) && defined(__GNUC__)
+# define HAVE_ADLER32_VEC
+/* it needs some bytes till the vec version gets up to speed... */
+# define MIN_WORK 64
+# include <altivec.h>
+
+/*
+ * Depending on length, this can be slower (short lengths < 64 bytes),
+ * much faster (our beloved 128kb case: 22.2s generic to 3.4s vec, but
+ * cache is important...), or a little faster (very long lengths, 1.6MB:
+ * 47.6s to 36s), which is probably only capped by memory bandwidth.
+ * (The original 128k case was slower in AltiVec because AltiVec loads
+ * are always uncached and trigger no HW prefetching; that is what you
+ * often want with mass data manipulation (not poisoning your cache,
+ * movntq), instead you have to do it yourself (data stream touch).
+ * With 128k it could be clearly seen: no prefetch, half as slow as
+ * generic, but comment out the memory load -> 3s. With proper prefetch
+ * we are at 3.4s. So AltiVec can execute these "expensive" FMAs quite
+ * fast (even without fancy unrolling), only the data does not arrive
+ * fast enough. In cases where the working set does not fit into cache
+ * it simply cannot be delivered fast enough over the FSB/Mem.)
+ * Still we have to prefetch, or we are slow as hell.
+ */
+
+# define SOVUC (sizeof(vector unsigned char))
+
+/* can probably be more, since we do not have the x86 psadbw 64 bit sum */
+# define VNMAX (6*NMAX)
+
+/* ========================================================================= */
+local inline vector unsigned char vec_identl(level)
+    unsigned int level;
+{
+    return vec_lvsl(level, (const unsigned char *)0);
+}
+
+/* ========================================================================= */
+local inline vector unsigned char vec_ident_rev(void)
+{
+    return vec_xor(vec_identl(0), vec_splat_u8(15));
+}
+
+/* ========================================================================= */
+/* multiply two 32 bit ints, return the low 32 bit */
+local inline vector unsigned int vec_mullw(vector unsigned int a, vector unsigned int b)
+{
+    vector unsigned int v16 = vec_splat_u32(-16);
+    vector unsigned int v0_32 = vec_splat_u32(0);
+    vector unsigned int swap, low, high;
+
+    swap = vec_rl(b, v16);
+    low = vec_mulo((vector unsigned short)a, (vector unsigned short)b);
+    high = vec_msum((vector unsigned short)a, (vector unsigned short)swap, v0_32);
+    high = vec_sl(high, v16);
+    return vec_add(low, high);
+}
+
+/* ========================================================================= */
+local inline vector unsigned int vector_reduce(vector unsigned int x)
+{
+    vector unsigned int y;
+    vector unsigned int vsh;
+
+    vsh = vec_splat_u32(1);
+    vsh = vec_sl(vsh, vec_splat_u32(4));
+
+    y = vec_sl(x, vsh);
+    y = vec_sr(y, vsh);
+    x = vec_sr(x, vsh);
+    y = vec_sub(y, x);
+    x = vec_sl(x, vec_splat_u32(4));
+    x = vec_add(x, y);
+    return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    if (likely(len >= 2*SOVUC)) {
+        vector unsigned int v0_32 = vec_splat_u32(0);
+        vector unsigned int vsh = vec_splat_u32(4);
+        vector unsigned char v1 = vec_splat_u8(1);
+        vector unsigned char vord;
+        vector unsigned char v0 = vec_splat_u8(0);
+        vector unsigned int vs1, vs2;
+        vector unsigned char in16, vord_a, v1_a, vperm;
+        unsigned int f, n;
+        unsigned int k, block_num;
+
+        /*
+         * If I understand the AltiVec PEM right, a little
+         * endian impl. should have the data reversed on
+         * load, so the big endian vord works.
+         */
+        vord = vec_ident_rev() + v1;
+        block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+        f = 512;
+        f |= block_num >= 256 ? 0 : block_num << 16;
+        vec_dst(buf, f, 2);
+        /*
+         * Add stuff to achieve alignment
+         */
+        /* swizzle masks in place */
+        vperm = vec_lvsl(0, buf);
+        vord_a = vec_perm(vord, v0, vperm);
+        v1_a = vec_perm(v1, v0, vperm);
+        vperm = vec_lvsr(0, buf);
+        vord_a = vec_perm(v0, vord_a, vperm);
+        v1_a = vec_perm(v0, v1_a, vperm);
+
+        /* align hard down */
+        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUC);
+        n = SOVUC - f;
+        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUC);
+
+        /* add n times s1 to s2 for start round */
+        s2 += s1 * n;
+
+        /* set sums 0 */
+        vs1 = v0_32;
+        vs2 = v0_32;
+
+        k = len < VNMAX ? (unsigned)len : VNMAX;
+        len -= k;
+
+        /* insert scalar start somewhere */
+        vs1 = vec_lde(0, &s1);
+        vs2 = vec_lde(0, &s2);
+
+        /* get input data */
+        in16 = vec_ldl(0, buf);
+
+        /* mask out excess data, add 4 byte horizontal and add to old dword */
+        vs1 = vec_msum(in16, v1_a, vs1);
+
+        /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+        vs2 = vec_msum(in16, vord_a, vs2);
+
+        buf += SOVUC;
+        k -= n;
+
+        if (likely(k >= SOVUC)) do {
+            vector unsigned int vs1_r = v0_32;
+            f = 512;
+            f |= block_num >= 256 ? 0 : block_num << 16;
+            vec_dst(buf, f, 2);
+
+            do {
+                /* get input data */
+                in16 = vec_ldl(0, buf);
+
+                /* add vs1 for this round */
+                vs1_r += vs1;
+
+                /* add 4 byte horizontal and add to old dword */
+                vs1 = vec_sum4s(in16, vs1);
+                /* apply order, add 4 byte horizontal and add to old dword */
+                vs2 = vec_msum(in16, vord, vs2);
+
+                buf += SOVUC;
+                k -= SOVUC;
+            } while (k >= SOVUC);
+            /* reduce vs1 round sum before multiplying by 16 */
+            vs1_r = vector_reduce(vs1_r);
+            /* add all vs1 for 16 times */
+            vs2 += vec_sl(vs1_r, vsh);
+            /* reduce the vectors to something in the range of BASE */
+            vs2 = vector_reduce(vs2);
+            vs1 = vector_reduce(vs1);
+            len += k;
+            k = len < VNMAX ? (unsigned)len : VNMAX;
+            block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+            len -= k;
+        } while (likely(k >= SOVUC));
+
+        if (likely(k)) {
+            vector unsigned int vk;
+            /*
+             * handle trailer
+             */
+            f = SOVUC - k;
+            /* swizzle masks in place */
+            vperm = vec_identl(f);
+            vord_a = vec_perm(vord, v0, vperm);
+            v1_a = vec_perm(v1, v0, vperm);
+
+            /* add k times vs1 for this trailer */
+            vk = (vector unsigned int)vec_lvsl(0, (unsigned *)(intptr_t)k);
+            vk = (vector unsigned)vec_mergeh(v0, (vector unsigned char)vk);
+            vk = (vector unsigned)vec_mergeh((vector unsigned short)v0, (vector unsigned short)vk);
+            vk = vec_splat(vk, 0);
+            vs2 += vec_mullw(vs1, vk);
+
+            /* get input data */
+            in16 = vec_ldl(0, buf);
+
+            /* mask out excess data, add 4 byte horizontal and add to old dword */
+            vs1 = vec_msum(in16, v1_a, vs1);
+            /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+            vs2 = vec_msum(in16, vord_a, vs2);
+
+            buf += k;
+            k -= k;
+        }
+
+        vec_dss(2);
+
+        /* add horizontal */
+        /* values should already be reduced, so no problem with signed saturation */
+        vs1 = (vector unsigned)vec_sums((vector int)vs1, (vector int)v0_32);
+        vs2 = (vector unsigned)vec_sums((vector int)vs2, (vector int)v0_32);
+        /* shake and roll */
+        vs1 = vec_splat(vs1, 3);
+        vs2 = vec_splat(vs2, 3);
+        vec_ste(vs1, 0, &s1);
+        vec_ste(vs2, 0, &s2);
+        /* after horizontal add, reduce again in scalar code */
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    reduce(s1);
+    reduce(s2);
+
+    return (s2 << 16) | s1;
+}
+
+#endif
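
The vector_reduce() helper in the patch does not compute a full modulo; it only folds each 32-bit lane into a smaller value that stays congruent mod BASE (65521), using 65536 = BASE + 15, so x = hi*65536 + lo is congruent to lo + 15*hi, written as (hi << 4) + (lo - hi). A scalar sketch of my reading of that folding step; fold_mod_base is a hypothetical name, not code from the patch:

#include <stdio.h>

#define BASE 65521U  /* largest prime smaller than 65536 */

/* Fold x into a smaller value that is congruent to x mod BASE.
 * Like vector_reduce() in the patch, this is not a full reduction. */
static unsigned fold_mod_base(unsigned x)
{
    unsigned lo = x & 0xffffU;  /* low 16 bits */
    unsigned hi = x >> 16;      /* each unit of hi is worth 65536 = BASE + 15 */
    return (hi << 4) + (lo - hi);  /* == lo + 15*hi, same trick as the vector code */
}

int main(void)
{
    unsigned x = 0x89abcdefU;  /* arbitrary test value */
    /* both lines print the same residue */
    printf("%u\n", x % BASE);
    printf("%u\n", fold_mod_base(x) % BASE);
    return 0;
}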
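The core of adler32_vec(), as far as I read it, is the weighted multiply-sum: vord holds the descending weights 16..1, vec_msum(in16, vord, vs2) adds the weighted bytes of one 16-byte vector into vs2, vec_sum4s(in16, vs1) adds the plain byte sum into vs1, and the running vs1 values collected in vs1_r are folded into vs2 as 16*vs1_r (the vec_sl by vsh) after each inner loop. A plain scalar model of how one 16-byte block updates (s1, s2); this is an illustration with hypothetical names, not code from the patch:

#include <stddef.h>
#include <stdio.h>

/* Scalar model of one 16-byte step of the vectorised loop:
 * byte-serial Adler-32 over a block of 16 bytes satisfies
 *   s1' = s1 + sum(b[i])
 *   s2' = s2 + 16*s1 + sum((16 - i) * b[i])   (i = 0 .. 15)
 * which is what vec_sum4s (plain sum), vec_msum with the 16..1
 * weight vector, and the "vs2 += 16 * vs1" step compute together. */
static void adler_block16(unsigned *s1, unsigned *s2, const unsigned char b[16])
{
    unsigned plain = 0, weighted = 0;
    size_t i;

    for (i = 0; i < 16; i++) {
        plain    += b[i];
        weighted += (16 - (unsigned)i) * b[i];
    }
    *s2 += 16 * *s1 + weighted;  /* order matters: use the old s1 */
    *s1 += plain;
    /* s1 and s2 still need reducing mod 65521 now and then, which the
     * vector code defers via vector_reduce() and the VNMAX bound */
}

int main(void)
{
    unsigned char buf[16];
    unsigned s1 = 1, s2 = 0, t1 = 1, t2 = 0;
    size_t i;

    for (i = 0; i < 16; i++)
        buf[i] = (unsigned char)(i * 7 + 3);  /* arbitrary test data */

    /* reference: byte-serial update */
    for (i = 0; i < 16; i++) {
        t1 += buf[i];
        t2 += t1;
    }
    /* block update */
    adler_block16(&s1, &s2, buf);

    printf("%s\n", (s1 == t1 && s2 == t2) ? "match" : "mismatch");
    return 0;
}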