zlib/02-ppc_altivec.patch


=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 02:19:21 +0000
+++ Makefile.in 2011-03-14 03:06:03 +0000
@@ -236,7 +236,7 @@
# DO NOT DELETE THIS LINE -- make depend depends on it.
-adler32.o: adler32.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
zutil.o: zutil.h zlib.h zconf.h
gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
inftrees.o: zutil.h zlib.h zconf.h inftrees.h
trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
-adler32.lo: adler32.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
zutil.lo: zutil.h zlib.h zconf.h
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
=== modified file 'adler32.c'
--- adler32.c 2011-03-30 13:38:42 +0000
+++ adler32.c 2011-03-30 13:38:46 +0000
@@ -36,7 +36,10 @@
#endif
#define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define DIV_ROUNDUP(a, b) (((a) + (b) - 1) / (b))
#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+#define ALIGN_DOWN(x, n) (((intptr_t)(x)) & ~((intptr_t)(n) - 1L))
+#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L))
local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
@@ -136,6 +139,12 @@
return x.endian[0] == 0;
}
+#ifndef NO_ADLER32_VEC
+# if defined(__powerpc__) || defined(__powerpc64__)
+# include "adler32_ppc.c"
+# endif
+#endif
+
#ifndef MIN_WORK
# define MIN_WORK 16
#endif
=== added file 'adler32_ppc.c'
--- adler32_ppc.c 1970-01-01 00:00:00 +0000
+++ adler32_ppc.c 2011-03-30 11:12:04 +0000
@@ -0,0 +1,253 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ * ppc implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+/*
+ * We use the AltiVec PIM vector intrinsics, but still, this is only
+ * tested with GCC and probably relies on some GCC specifics (e.g. GCC
+ * understands vector types, so you can simply write a += b).
+ */
+#if defined(__ALTIVEC__) && defined(__GNUC__)
+# define HAVE_ADLER32_VEC
+/* it takes some bytes until the vector version gets up to speed... */
+# define MIN_WORK 64
+# include <altivec.h>
+
+/*
+ * Depending on the length, this can be slower (short lengths < 64 bytes),
+ * much faster (our beloved 128 KiB case: 22.2s generic down to 3.4s
+ * vectorized, but cache is important...), or a little faster (very long
+ * lengths, 1.6 MB, 47.6s down to 36s), the latter probably only capped
+ * by memory bandwidth.
+ * (The original 128 KiB case was slower with AltiVec, because AltiVec
+ * loads are always uncached and trigger no hardware prefetching. That is
+ * often what you want for mass data manipulation (do not poison your
+ * cache, like movntq), but it means you have to prefetch yourself (data
+ * stream touch). With 128 KiB it could be seen cleanly: without prefetch
+ * we ran at half the speed of the generic code, but comment out the
+ * memory load and we are at 3s. With proper prefetch we are at 3.4s.
+ * So AltiVec can execute these "expensive" multiply-sums quite fast
+ * (even without fancy unrolling), the data just does not arrive fast
+ * enough. When the working set does not fit into the cache, it simply
+ * cannot be delivered fast enough over the FSB/memory bus.)
+ * Still, we have to prefetch, or we are dead slow.
+ */
+
+# define SOVUC (sizeof(vector unsigned char))
+
+/* can probably be more, since we do not have the x86 psadbw 64 bit sum */
+# define VNMAX (6*NMAX)
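+/*
+ * VNMAX bounds how many bytes the inner loop may consume before the
+ * 32 bit lane sums have to be reduced, so they cannot overflow.
+ */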
+
+/* ========================================================================= */
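+/*
+ * vec_identl(level): return vec_lvsl's permute vector for offset level,
+ * i.e. the byte sequence {n, n+1, ..., n+15} with n = level & 15;
+ * vec_identl(0) is the identity {0, 1, ..., 15}.
+ */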
+local inline vector unsigned char vec_identl(level)
+ unsigned int level;
+{
+ return vec_lvsl(level, (const unsigned char *)0);
+}
+
+/* ========================================================================= */
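+/* the reversed identity {15, 14, ..., 0}: each lane of {0..15} XORed with 15 */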
+local inline vector unsigned char vec_ident_rev(void)
+{
+ return vec_xor(vec_identl(0), vec_splat_u8(15));
+}
+
+/* ========================================================================= */
+/* multiply two 32 bit ints, return the low 32 bit */
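+/*
+ * a * b mod 2^32 = a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 16):
+ * vec_mulo on the odd (low, big endian) halfwords gives the first term,
+ * vec_msum against the halfword-swapped b gives both cross products.
+ */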
+local inline vector unsigned int vec_mullw(vector unsigned int a, vector unsigned int b)
+{
+ vector unsigned int v16 = vec_splat_u32(-16);
+ vector unsigned int v0_32 = vec_splat_u32(0);
+ vector unsigned int swap, low, high;
+
+ swap = vec_rl(b, v16);
+ low = vec_mulo((vector unsigned short)a, (vector unsigned short)b);
+ high = vec_msum((vector unsigned short)a, (vector unsigned short)swap, v0_32);
+ high = vec_sl(high, v16);
+ return vec_add(low, high);
+}
+
+/* ========================================================================= */
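+/*
+ * partially reduce a vector of 32 bit sums towards BASE (65521):
+ * with x = hi * 2^16 + lo and 2^16 mod 65521 == 15, we get
+ * x mod 65521 == (lo + 15 * hi) mod 65521 == (lo - hi + 16 * hi) mod 65521,
+ * which is what the shifts and adds below compute.
+ */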
+local inline vector unsigned int vector_reduce(vector unsigned int x)
+{
+ vector unsigned int y;
+ vector unsigned int vsh;
+
+ vsh = vec_splat_u32(1);
+ vsh = vec_sl(vsh, vec_splat_u32(4));
+
+ y = vec_sl(x, vsh);
+ y = vec_sr(y, vsh);
+ x = vec_sr(x, vsh);
+ y = vec_sub(y, x);
+ x = vec_sl(x, vec_splat_u32(4));
+ x = vec_add(x, y);
+ return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+ uLong adler;
+ const Bytef *buf;
+ uInt len;
+{
+ unsigned int s1, s2;
+
+ s1 = adler & 0xffff;
+ s2 = (adler >> 16) & 0xffff;
+
+ if (likely(len >= 2*SOVUC)) {
+ vector unsigned int v0_32 = vec_splat_u32(0);
+ vector unsigned int vsh = vec_splat_u32(4);
+ vector unsigned char v1 = vec_splat_u8(1);
+ vector unsigned char vord;
+ vector unsigned char v0 = vec_splat_u8(0);
+ vector unsigned int vs1, vs2;
+ vector unsigned char in16, vord_a, v1_a, vperm;
+ unsigned int f, n;
+ unsigned int k, block_num;
+
+ /*
+ * If I understand the AltiVec PEM right, a little
+ * endian implementation should have the data reversed on
+ * load, so the big endian vord order works.
+ */
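+ /* vord = {16, 15, ..., 1}: the weight of each byte position in s2 */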
+ vord = vec_ident_rev() + v1;
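+ /*
+ * start the data stream touch: per the AltiVec PEM the control word
+ * is (block size << 24) | (block count << 16) | stride, the block
+ * size counted in 16 byte units with 0 meaning 32 (= 512 bytes) and
+ * a count of 0 meaning 256 blocks.
+ */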
+ block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+ f = 512;
+ f |= block_num >= 256 ? 0 : block_num << 16;
+ vec_dst(buf, f, 2);
+ /*
+ * Add stuff to achieve alignment
+ */
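+ /*
+ * AltiVec loads ignore the low 4 address bits, so we load from the
+ * 16 byte aligned address below buf and shift the weight vectors
+ * (vord_a, v1_a) with lvsl/lvsr so the leading bytes which do not
+ * belong to the buffer get weight 0 in the msums below.
+ */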
+ /* swizzle masks in place */
+ vperm = vec_lvsl(0, buf);
+ vord_a = vec_perm(vord, v0, vperm);
+ v1_a = vec_perm(v1, v0, vperm);
+ vperm = vec_lvsr(0, buf);
+ vord_a = vec_perm(v0, vord_a, vperm);
+ v1_a = vec_perm(v0, v1_a, vperm);
+
+ /* align hard down */
+ f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUC);
+ n = SOVUC - f;
+ buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUC);
+
+ /* add n times s1 to s2 for start round */
+ s2 += s1 * n;
+
+ /* set sums to 0 */
+ vs1 = v0_32;
+ vs2 = v0_32;
+
+ k = len < VNMAX ? (unsigned)len : VNMAX;
+ len -= k;
+
+ /* insert scalar start somewhere */
+ vs1 = vec_lde(0, &s1);
+ vs2 = vec_lde(0, &s2);
+
+ /* get input data */
+ in16 = vec_ldl(0, buf);
+
+ /* mask out excess data, add 4 byte horizontal and add to old dword */
+ vs1 = vec_msum(in16, v1_a, vs1);
+
+ /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+ vs2 = vec_msum(in16, vord_a, vs2);
+
+ buf += SOVUC;
+ k -= n;
+
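+ /*
+ * Main loop: for every 16 byte block the scalar code would add the
+ * old s1 sixteen times to s2, plus the position weighted byte sums.
+ * So we accumulate the pre-block vs1 once per block in vs1_r and add
+ * it 16 times (<< 4) to vs2 after the inner loop, while vec_sum4s
+ * collects the plain byte sums into vs1 and vec_msum the vord
+ * weighted sums into vs2.
+ */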
+ if (likely(k >= SOVUC)) do {
+ vector unsigned int vs1_r = v0_32;
+ f = 512;
+ f |= block_num >= 256 ? 0 : block_num << 16;
+ vec_dst(buf, f, 2);
+
+ do {
+ /* get input data */
+ in16 = vec_ldl(0, buf);
+
+ /* add vs1 for this round */
+ vs1_r += vs1;
+
+ /* add 4 byte horizontal and add to old dword */
+ vs1 = vec_sum4s(in16, vs1);
+ /* apply order, add 4 byte horizontal and add to old dword */
+ vs2 = vec_msum(in16, vord, vs2);
+
+ buf += SOVUC;
+ k -= SOVUC;
+ } while (k >= SOVUC);
+ /* reduce vs1 round sum before multiplying by 16 */
+ vs1_r = vector_reduce(vs1_r);
+ /* add the accumulated vs1 sums 16 times (once per byte of a block) */
+ vs2 += vec_sl(vs1_r, vsh);
+ /* reduce the vectors to something in the range of BASE */
+ vs2 = vector_reduce(vs2);
+ vs1 = vector_reduce(vs1);
+ len += k;
+ k = len < VNMAX ? (unsigned)len : VNMAX;
+ block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+ len -= k;
+ } while (likely(k >= SOVUC));
+
+ if (likely(k)) {
+ vector unsigned int vk;
+ /*
+ * handle trailer
+ */
+ f = SOVUC - k;
+ /* swizzle masks in place */
+ vperm = vec_identl(f);
+ vord_a = vec_perm(vord, v0, vperm);
+ v1_a = vec_perm(v1, v0, vperm);
+
+ /* add k times vs1 for this trailer */
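+ /*
+ * build a vector with k in every lane without touching memory:
+ * vec_lvsl yields the bytes {k, k+1, ...}, the two merges widen the
+ * leading bytes to 32 bit lanes and vec_splat broadcasts lane 0 (= k).
+ */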
+ vk = (vector unsigned int)vec_lvsl(0, (unsigned *)(intptr_t)k);
+ vk = (vector unsigned)vec_mergeh(v0, (vector unsigned char)vk);
+ vk = (vector unsigned)vec_mergeh((vector unsigned short)v0, (vector unsigned short)vk);
+ vk = vec_splat(vk, 0);
+ vs2 += vec_mullw(vs1, vk);
+
+ /* get input data */
+ in16 = vec_ldl(0, buf);
+
+ /* mask out excess data, add 4 byte horizontal and add to old dword */
+ vs1 = vec_msum(in16, v1_a, vs1);
+ /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+ vs2 = vec_msum(in16, vord_a, vs2);
+
+ buf += k;
+ k -= k;
+ }
+
+ vec_dss(2);
+
+ /* add horizontal */
+ /* everything should be reduced, so no problem with signed saturation */
+ vs1 = (vector unsigned)vec_sums((vector int)vs1, (vector int)v0_32);
+ vs2 = (vector unsigned)vec_sums((vector int)vs2, (vector int)v0_32);
+ /* shake and roll */
+ vs1 = vec_splat(vs1, 3);
+ vs2 = vec_splat(vs2, 3);
+ vec_ste(vs1, 0, &s1);
+ vec_ste(vs2, 0, &s2);
+ /* after horizontal add, reduce again in scalar code */
+ }
+
+ if (unlikely(len)) do {
+ s1 += *buf++;
+ s2 += s1;
+ } while (--len);
+ reduce(s1);
+ reduce(s2);
+
+ return (s2 << 16) | s1;
+}
+
+#endif