=== modified file 'Makefile.in'
--- Makefile.in	2011-03-14 02:19:21 +0000
+++ Makefile.in	2011-03-14 03:06:03 +0000
@@ -236,7 +236,7 @@
 # DO NOT DELETE THIS LINE -- make depend depends on it.
-adler32.o: adler32.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
 zutil.o: zutil.h zlib.h zconf.h
 gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
 compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
 inftrees.o: zutil.h zlib.h zconf.h inftrees.h
 trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
-adler32.lo: adler32.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
 zutil.lo: zutil.h zlib.h zconf.h
 gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
 compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
=== modified file 'adler32.c'
--- adler32.c	2011-03-30 13:38:42 +0000
+++ adler32.c	2011-03-30 13:38:46 +0000
@@ -36,7 +36,10 @@
 #endif
 #define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define DIV_ROUNDUP(a, b) (((a) + (b) - 1) / (b))
 #define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+#define ALIGN_DOWN(x, n) (((intptr_t)(x)) & ~((intptr_t)(n) - 1L))
+#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L))
 local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
@@ -136,6 +139,12 @@
     return x.endian[0] == 0;
 }
+#ifndef NO_ADLER32_VEC
+# if defined(__powerpc__) || defined(__powerpc64__)
+# include "adler32_ppc.c"
+# endif
+#endif
+
 #ifndef MIN_WORK
 # define MIN_WORK 16
 #endif
=== added file 'adler32_ppc.c'
--- adler32_ppc.c	1970-01-01 00:00:00 +0000
+++ adler32_ppc.c	2011-03-30 11:12:04 +0000
@@ -0,0 +1,253 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ *              ppc implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+/*
+ * We use the AltiVec PIM vector stuff, but still, this is only
+ * tested with GCC, and probably uses some GCC specifics (like GCC
+ * understanding vector types, so you can simply write a += b).
+ */
+#if defined(__ALTIVEC__) && defined(__GNUC__)
+# define HAVE_ADLER32_VEC
+/* it takes some bytes until the vec version gets up to speed... */
+# define MIN_WORK 64
+# include <altivec.h>
+
+/*
+ * Depending on the length, this can be slower (short lengths < 64 bytes),
+ * much faster (our beloved 128kb case: 22.2s generic down to 3.4s vec,
+ * but cache is important...), or a little faster (very long lengths,
+ * 1.6MB, 47.6s down to 36s), which is probably only capped by memory
+ * bandwidth. (The original 128k case was slower with AltiVec, because
+ * AltiVec loads are always uncached and trigger no HW prefetching; that
+ * is often what you want for mass data manipulation (do not poison your
+ * cache, movntq), but it means you have to prefetch yourself (data
+ * stream touch). With 128k it could be clearly seen: no prefetch, half
+ * as slow as generic, but comment out the memory load -> 3s. With proper
+ * prefetch we are at 3.4s. So AltiVec can execute these "expensive" FMAs
+ * quite fast (even without fancy unrolling), the data simply does not
+ * arrive fast enough. When the working set does not fit into the cache,
+ * it simply cannot be delivered fast enough over the FSB/memory.)
+ * Still, we have to prefetch, or we are slow as hell.
+ */
+
+# define SOVUC (sizeof(vector unsigned char))
+
+/* can probably be more, since we do not have the x86 psadbw 64 bit sum */
+# define VNMAX (6*NMAX)
+
+/* ========================================================================= */
+local inline vector unsigned char vec_identl(level)
+    unsigned int level;
+{
+    return vec_lvsl(level, (const unsigned char *)0);
+}
+
+/* ========================================================================= */
+local inline vector unsigned char vec_ident_rev(void)
+{
+    return vec_xor(vec_identl(0), vec_splat_u8(15));
+}
+
+/* ========================================================================= */
+/* multiply two 32 bit ints, return the low 32 bits */
+local inline vector unsigned int vec_mullw(vector unsigned int a, vector unsigned int b)
+{
+    vector unsigned int v16 = vec_splat_u32(-16);
+    vector unsigned int v0_32 = vec_splat_u32(0);
+    vector unsigned int swap, low, high;
+
+    swap = vec_rl(b, v16);
+    low = vec_mulo((vector unsigned short)a, (vector unsigned short)b);
+    high = vec_msum((vector unsigned short)a, (vector unsigned short)swap, v0_32);
+    high = vec_sl(high, v16);
+    return vec_add(low, high);
+}
+
+/* ========================================================================= */
+local inline vector unsigned int vector_reduce(vector unsigned int x)
+{
+    vector unsigned int y;
+    vector unsigned int vsh;
+
+    vsh = vec_splat_u32(1);
+    vsh = vec_sl(vsh, vec_splat_u32(4));
+
+    y = vec_sl(x, vsh);
+    y = vec_sr(y, vsh);
+    x = vec_sr(x, vsh);
+    y = vec_sub(y, x);
+    x = vec_sl(x, vec_splat_u32(4));
+    x = vec_add(x, y);
+    return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    if (likely(len >= 2*SOVUC)) {
+        vector unsigned int v0_32 = vec_splat_u32(0);
+        vector unsigned int vsh = vec_splat_u32(4);
+        vector unsigned char v1 = vec_splat_u8(1);
+        vector unsigned char vord;
+        vector unsigned char v0 = vec_splat_u8(0);
+        vector unsigned int vs1, vs2;
+        vector unsigned char in16, vord_a, v1_a, vperm;
+        unsigned int f, n;
+        unsigned int k, block_num;
+
+        /*
+         * If I understand the AltiVec PEM right, a little
+         * endian implementation should have the data reversed
+         * on load, so the big endian vord order works.
+         */
+        vord = vec_ident_rev() + v1;
+        block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+        f = 512;
+        f |= block_num >= 256 ? 0 : block_num << 16;
+        vec_dst(buf, f, 2);
+        /*
+         * Add stuff to achieve alignment
+         */
+        /* swizzle masks in place */
+        vperm = vec_lvsl(0, buf);
+        vord_a = vec_perm(vord, v0, vperm);
+        v1_a = vec_perm(v1, v0, vperm);
+        vperm = vec_lvsr(0, buf);
+        vord_a = vec_perm(v0, vord_a, vperm);
+        v1_a = vec_perm(v0, v1_a, vperm);
+
+        /* align hard down */
+        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUC);
+        n = SOVUC - f;
+        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUC);
+
+        /* add n times s1 to s2 for the start round */
+        s2 += s1 * n;
+
+        /* set sums to 0 */
+        vs1 = v0_32;
+        vs2 = v0_32;
+
+        k = len < VNMAX ? (unsigned)len : VNMAX;
+        len -= k;
+
+        /* insert scalar start somewhere */
+        vs1 = vec_lde(0, &s1);
+        vs2 = vec_lde(0, &s2);
+
+        /* get input data */
+        in16 = vec_ldl(0, buf);
+
+        /* mask out excess data, do a 4 byte horizontal add and add to the old dword */
+        vs1 = vec_msum(in16, v1_a, vs1);
+        /* apply order, mask out excess data, do a 4 byte horizontal add and add to the old dword */
+        vs2 = vec_msum(in16, vord_a, vs2);
+
+        buf += SOVUC;
+        k -= n;
+
+        if (likely(k >= SOVUC)) do {
+            vector unsigned int vs1_r = v0_32;
+            f = 512;
+            f |= block_num >= 256 ? 0 : block_num << 16;
+            vec_dst(buf, f, 2);
+
+            do {
+                /* get input data */
+                in16 = vec_ldl(0, buf);
+
+                /* add vs1 for this round */
+                vs1_r += vs1;
+
+                /* do a 4 byte horizontal add and add to the old dword */
+                vs1 = vec_sum4s(in16, vs1);
+                /* apply order, do a 4 byte horizontal add and add to the old dword */
+                vs2 = vec_msum(in16, vord, vs2);
+
+                buf += SOVUC;
+                k -= SOVUC;
+            } while (k >= SOVUC);
+            /* reduce vs1 round sum before multiplying by 16 */
+            vs1_r = vector_reduce(vs1_r);
+            /* add the accumulated vs1 sums 16 times */
+            vs2 += vec_sl(vs1_r, vsh);
+            /* reduce the vectors to something in the range of BASE */
+            vs2 = vector_reduce(vs2);
+            vs1 = vector_reduce(vs1);
+            len += k;
+            k = len < VNMAX ? (unsigned)len : VNMAX;
+            block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+            len -= k;
+        } while (likely(k >= SOVUC));
+
+        if (likely(k)) {
+            vector unsigned int vk;
+            /*
+             * handle trailer
+             */
+            f = SOVUC - k;
+            /* swizzle masks in place */
+            vperm = vec_identl(f);
+            vord_a = vec_perm(vord, v0, vperm);
+            v1_a = vec_perm(v1, v0, vperm);
+
+            /* add k times vs1 for this trailer */
+            vk = (vector unsigned int)vec_lvsl(0, (unsigned *)(intptr_t)k);
+            vk = (vector unsigned int)vec_mergeh(v0, (vector unsigned char)vk);
+            vk = (vector unsigned int)vec_mergeh((vector unsigned short)v0, (vector unsigned short)vk);
+            vk = vec_splat(vk, 0);
+            vs2 += vec_mullw(vs1, vk);
+
+            /* get input data */
+            in16 = vec_ldl(0, buf);
+
+            /* mask out excess data, do a 4 byte horizontal add and add to the old dword */
+            vs1 = vec_msum(in16, v1_a, vs1);
+            /* apply order, mask out excess data, do a 4 byte horizontal add and add to the old dword */
+            vs2 = vec_msum(in16, vord_a, vs2);
+
+            buf += k;
+            k -= k;
+        }
+
+        vec_dss(2);
+
+        /* add horizontally */
+        /* values should already be reduced, so no problem with signed saturation */
+        vs1 = (vector unsigned int)vec_sums((vector int)vs1, (vector int)v0_32);
+        vs2 = (vector unsigned int)vec_sums((vector int)vs2, (vector int)v0_32);
+        /* shake and roll */
+        vs1 = vec_splat(vs1, 3);
+        vs2 = vec_splat(vs2, 3);
+        vec_ste(vs1, 0, &s1);
+        vec_ste(vs2, 0, &s2);
+        /* after the horizontal add, reduce again in scalar code */
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    reduce(s1);
+    reduce(s2);
+
+    return (s2 << 16) | s1;
+}
+
+#endif
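
For reference, below is a scalar model of what one aligned 16-byte AltiVec round of adler32_vec() computes, plus the congruence behind vector_reduce(): since 65536 mod 65521 = 15, any 32 bit sum x satisfies x == (x & 0xffff) + 15 * (x >> 16) (mod BASE), which is exactly the shift/subtract/add sequence done per vector lane. This is a minimal standalone sketch, not part of the patch; the file comment and helper names (reduce_model, adler32_chunk16_model) are made up for illustration, and it assumes zlib's BASE of 65521.

/* adler32_model.c -- scalar model of the AltiVec inner loop (illustrative only) */
#include <stdio.h>
#include <stdint.h>

#define BASE 65521u     /* largest prime smaller than 65536 */

/* the congruence vector_reduce() relies on: 65536 == 15 (mod BASE), so
 * x == (x & 0xffff) + 15 * (x >> 16) (mod BASE); this does not fully
 * reduce x, it only shrinks it enough that the running sums stay well
 * inside 32 bits between rounds */
static uint32_t reduce_model(uint32_t x)
{
    return (x & 0xffff) + 15 * (x >> 16);
}

/* what one aligned 16 byte round contributes to the running (s1, s2):
 * s2 absorbs 16 * s1 once per round (the vector code batches this by
 * accumulating vs1 into vs1_r and adding vs1_r << 4 to vs2 at the end
 * of a chunk), then every byte b[i] adds b[i] to s1 (vec_sum4s) and
 * (16 - i) * b[i] to s2 (vec_msum with vord = 16,15,...,1) */
static uint32_t adler32_chunk16_model(uint32_t adler, const unsigned char *b)
{
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    unsigned i;

    s2 += 16 * s1;
    for (i = 0; i < 16; i++) {
        s1 += b[i];
        s2 += (16 - i) * b[i];
    }
    /* the vector code defers the modulo; done here for clarity */
    s1 = reduce_model(s1) % BASE;
    s2 = reduce_model(s2) % BASE;
    return (s2 << 16) | s1;
}

int main(void)
{
    static const unsigned char buf[16] = "0123456789abcdef";

    /* should print the same value as zlib's adler32(1, buf, 16) */
    printf("%08x\n", (unsigned)adler32_chunk16_model(1, buf));
    return 0;
}

In the real code the per-lane modulo is deferred: vs1 and vs2 are only passed through vector_reduce() once per VNMAX-sized chunk to keep the 32 bit lanes from overflowing, and the final vec_sums/vec_splat/vec_ste sequence folds the four lanes back into the scalar s1/s2, which are reduced one last time by reduce().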