=== modified file 'Makefile.in'
--- Makefile.in	2011-03-14 01:01:37 +0000
+++ Makefile.in	2011-03-14 02:19:21 +0000
@@ -236,7 +236,8 @@
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-adler32.o zutil.o: zutil.h zlib.h zconf.h
+adler32.o: adler32.c zutil.h zlib.h zconf.h
+zutil.o: zutil.h zlib.h zconf.h
 gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
 compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
 crc32.o: zutil.h zlib.h zconf.h crc32.h
@@ -246,7 +247,8 @@
 inftrees.o: zutil.h zlib.h zconf.h inftrees.h
 trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
 
-adler32.lo zutil.lo: zutil.h zlib.h zconf.h
+adler32.lo: adler32.c zutil.h zlib.h zconf.h
+zutil.lo: zutil.h zlib.h zconf.h
 gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
 compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
 crc32.lo: zutil.h zlib.h zconf.h crc32.h

=== modified file 'adler32.c'
--- adler32.c	2011-03-14 01:01:37 +0000
+++ adler32.c	2011-03-30 13:38:42 +0000
@@ -9,6 +9,35 @@
 
 #define local static
 
+#define GCC_VERSION_GE(x) ((__GNUC__-0) * 100 + __GNUC_MINOR__-0 >= x)
+
+#if GCC_VERSION_GE(301)
+/* sometimes leaks out of old kernel headers */
+# undef noinline
+# define noinline __attribute__((__noinline__))
+#else
+# ifndef noinline
+#  define noinline
+# endif
+#endif
+
+#if GCC_VERSION_GE(301)
+# define GCC_ATTR_UNUSED_PARAM __attribute__((__unused__))
+#else
+# define GCC_ATTR_UNUSED_PARAM
+#endif
+
+#if GCC_VERSION_GE(296)
+# define likely(x)   __builtin_expect(!!(x), 1)
+# define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+# define likely(x)   (x)
+# define unlikely(x) (x)
+#endif
+
+#define ROUND_TO(x, n)   ((x) & ~((n) - 1L))
+#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+
 local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
 
 #define BASE 65521UL    /* largest prime smaller than 65536 */
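Note: the two helpers added at the end of this hunk carry the new alignment logic. ALIGN_DIFF(p, n) yields how many bytes p must be advanced to reach the next n-byte boundary (0 if already aligned), and ROUND_TO(x, n) rounds x down to a multiple of n; the vector loop later uses it as s2 += s1 * ROUND_TO(k, sizeof(size_t)). A self-contained sketch of what they compute (not part of the patch; the driver is made up, the macros are copied from the hunk above):

    #include <stdint.h>
    #include <stdio.h>

    #define ROUND_TO(x, n)   ((x) & ~((n) - 1L))
    #define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))

    int main(void)
    {
        char buf[32];
        /* 0..7: bytes needed to advance buf+3 to a sizeof(size_t) boundary */
        printf("%ld\n", (long)ALIGN_DIFF(buf + 3, sizeof(size_t)));
        /* 21 rounded down to a multiple of 8 gives 16 (on an LP64 target) */
        printf("%lu\n", (unsigned long)ROUND_TO(21UL, sizeof(size_t)));
        return 0;
    }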
@@ -21,9 +50,20 @@
 #define DO8(buf,i)  DO4(buf,i); DO4(buf,i+4);
 #define DO16(buf)   DO8(buf,0); DO8(buf,8);
 
+#if defined(__alpha__)
+/* even if gcc can generate a mul by inverse, the code is really
+ * ugly (find the global const pool pointer, load a constant, a mul,
+ * lots of shifts/adds/subs), up to 14 instructions. The replacement
+ * code needs only >= 5 instructions
+ */
+# define NO_DIVIDE
+#endif
+
 /* use NO_DIVIDE if your processor does not do division in hardware */
 #ifdef NO_DIVIDE
-#  define MOD(a) \
+/* use NO_SHIFT if your processor implements shifts > 1 as a loop */
+# ifdef NO_SHIFT
+# define reduce_full(a) \
     do { \
         if (a >= (BASE << 16)) a -= (BASE << 16); \
         if (a >= (BASE << 15)) a -= (BASE << 15); \
@@ -43,21 +83,237 @@
         if (a >= (BASE << 1)) a -= (BASE << 1); \
         if (a >= BASE) a -= BASE; \
     } while (0)
-#  define MOD4(a) \
+# define reduce_x(a) \
     do { \
+        if (MIN_WORK >= (1 << 6) && a >= (BASE << 6)) a -= (BASE << 6); \
+        if (MIN_WORK >= (1 << 5) && a >= (BASE << 5)) a -= (BASE << 5); \
         if (a >= (BASE << 4)) a -= (BASE << 4); \
         if (a >= (BASE << 3)) a -= (BASE << 3); \
         if (a >= (BASE << 2)) a -= (BASE << 2); \
         if (a >= (BASE << 1)) a -= (BASE << 1); \
         if (a >= BASE) a -= BASE; \
     } while (0)
+# define reduce(a) reduce_full(a)
+# else
+# define reduce_full(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+    } while(a >= BASE)
+# define reduce_x(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+        a = a >= BASE ? a - BASE : a; \
+    } while(0)
+# define reduce(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+    } while(0)
+# endif
 #else
-#  define MOD(a) a %= BASE
-#  define MOD4(a) a %= BASE
-#endif
-
-/* ========================================================================= */
-uLong ZEXPORT adler32(adler, buf, len)
+# define reduce_full(a) a %= BASE
+# define reduce_x(a) a %= BASE
+# define reduce(a) a %= BASE
+#endif
+
+local int host_is_bigendian()
+{
+    local const union {
+        uInt d;
+        unsigned char endian[sizeof(uInt)];
+    } x = {1};
+    return x.endian[0] == 0;
+}
+
+#ifndef MIN_WORK
+# define MIN_WORK 16
+#endif
+
+/* ========================================================================= */
+local noinline uLong adler32_1(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len GCC_ATTR_UNUSED_PARAM;
+{
+    unsigned long sum2;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    adler += buf[0];
+    if (adler >= BASE)
+        adler -= BASE;
+    sum2 += adler;
+    if (sum2 >= BASE)
+        sum2 -= BASE;
+    return adler | (sum2 << 16);
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_common(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned long sum2;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    while (len--) {
+        adler += *buf++;
+        sum2 += adler;
+    }
+    if (adler >= BASE)
+        adler -= BASE;
+    reduce_x(sum2);             /* only added so many BASE's */
+    return adler | (sum2 << 16);
+}
+
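Note on the shift-free reduction introduced above: since 2^16 mod 65521 = 15, splitting a into its low 16 bits and the remaining high bits and recombining as low + 15*high preserves the value modulo BASE while shrinking it. The macro computes exactly that: b holds the low half, b -= a subtracts the high half, a <<= 4 gives 16*high, and a += b lands on low + 15*high. reduce_full() repeats this until the value drops below BASE, reduce() does a single pass for use inside hot loops, and reduce_x() adds one final conditional subtract. A standalone check of one folding step (not part of the patch; fold_once is a hypothetical name):

    #include <assert.h>
    #include <stdio.h>

    #define BASE 65521UL

    static unsigned long fold_once(unsigned long a)
    {
        unsigned long b = a & 0x0000ffff;   /* low 16 bits */
        a >>= 16;                           /* high bits */
        b -= a;                             /* low - high (may wrap, cancels below) */
        a <<= 4;                            /* 16 * high */
        return a + b;                       /* low + 15 * high */
    }

    int main(void)
    {
        unsigned long a;
        for (a = 0; a < (1UL << 26); a += 12345)
            assert(fold_once(a) % BASE == a % BASE);
        puts("fold ok");
        return 0;
    }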
+#ifndef HAVE_ADLER32_VEC
+# if (defined(__LP64__) || ((SIZE_MAX-0) >> 31) >= 2) && !defined(NO_ADLER32_VEC)
+
+/* On 64 bit archs we can do pseudo-SIMD with a nice win.
+ * This is especially important for old Alphas, since they do not
+ * have byte access.
+ * This needs some registers, but x86_64 is fine (>= 9 are needed
+ * for the main loop). If your 64 bit arch is more limited, throw
+ * this away...
+ */
+# ifndef UINT64_C
+#  if defined(_MSC_VER) || defined(__BORLANDC__)
+#   define UINT64_C(c) (c ## ui64)
+#  else
+#   define UINT64_C(c) (c ## ULL)
+#  endif
+# endif
+
+# undef VNMAX
+# define VNMAX (2*NMAX+((9*NMAX)/10))
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+    unsigned int k;
+
+    /* split Adler-32 into component sums */
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    /* align input data */
+    k    = ALIGN_DIFF(buf, sizeof(size_t));
+    len -= k;
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--k);
+
+    k    = len > VNMAX ? VNMAX : len;
+    len -= k;
+    if (likely(k >= 2 * sizeof(size_t))) do {
+        unsigned int vs1, vs2;
+        unsigned int vs1s;
+
+        /* add s1 to s2 for rounds to come */
+        s2 += s1 * ROUND_TO(k, sizeof(size_t));
+        vs1s = vs1 = vs2 = 0;
+        do {
+            size_t vs1l = 0, vs1h = 0, vs1l_s = 0, vs1h_s = 0;
+            unsigned int a, b, c, d, e, f, g, h;
+            unsigned int j;
+
+            j = k > 23 * sizeof(size_t) ? 23 : k/sizeof(size_t);
+            k -= j * sizeof(size_t);
+            /* add s1 to s1 round sum for rounds to come */
+            vs1s += j * vs1;
+            do {
+                size_t in8 = *(const size_t *)buf;
+                buf += sizeof(size_t);
+                /* add this s1 to s1 round sum */
+                vs1l_s += vs1l;
+                vs1h_s += vs1h;
+                /* add up input data to s1 */
+                vs1l += in8 & UINT64_C(0x00ff00ff00ff00ff);
+                vs1h += (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8;
+            } while (--j);
+
+            /* split s1 */
+            if (host_is_bigendian()) {
+                a = (vs1h >> 48) & 0x0000ffff;
+                b = (vs1l >> 48) & 0x0000ffff;
+                c = (vs1h >> 32) & 0x0000ffff;
+                d = (vs1l >> 32) & 0x0000ffff;
+                e = (vs1h >> 16) & 0x0000ffff;
+                f = (vs1l >> 16) & 0x0000ffff;
+                g = (vs1h      ) & 0x0000ffff;
+                h = (vs1l      ) & 0x0000ffff;
+            } else {
+                a = (vs1l      ) & 0x0000ffff;
+                b = (vs1h      ) & 0x0000ffff;
+                c = (vs1l >> 16) & 0x0000ffff;
+                d = (vs1h >> 16) & 0x0000ffff;
+                e = (vs1l >> 32) & 0x0000ffff;
+                f = (vs1h >> 32) & 0x0000ffff;
+                g = (vs1l >> 48) & 0x0000ffff;
+                h = (vs1h >> 48) & 0x0000ffff;
+            }
+
+            /* add s1 & s2 horizontally */
+            vs2 += 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + 1*h;
+            vs1 += a + b + c + d + e + f + g + h;
+
+            /* split and add up s1 round sum */
+            vs1l_s = ((vs1l_s      ) & UINT64_C(0x0000ffff0000ffff)) +
+                     ((vs1l_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+            vs1h_s = ((vs1h_s      ) & UINT64_C(0x0000ffff0000ffff)) +
+                     ((vs1h_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+            vs1l_s += vs1h_s;
+            vs1s += ((vs1l_s      ) & UINT64_C(0x00000000ffffffff)) +
+                    ((vs1l_s >> 32) & UINT64_C(0x00000000ffffffff));
+        } while (k >= sizeof(size_t));
+        reduce(vs1s);
+        s2 += vs1s * 8 + vs2;
+        reduce(s2);
+        s1 += vs1;
+        reduce(s1);
+        len += k;
+        k    = len > VNMAX ? VNMAX : len;
+        len -= k;
+    } while (k >= sizeof(size_t));
+
+    /* handle trailer */
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--k);
+    reduce(s1);
+    reduce(s2);
+
+    /* return recombined sums */
+    return (s2 << 16) | s1;
+}
+
+# else
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
     uLong adler;
     const Bytef *buf;
     uInt len;
@@ -69,33 +325,6 @@
     sum2 = (adler >> 16) & 0xffff;
     adler &= 0xffff;
 
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (len == 1) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (buf == Z_NULL)
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (len < 16) {
-        while (len--) {
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD4(sum2);             /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
-
     /* do length NMAX blocks -- requires just one modulo operation */
     while (len >= NMAX) {
         len -= NMAX;
@@ -104,8 +333,8 @@
             DO16(buf);          /* 16 sums unrolled */
             buf += 16;
         } while (--n);
-        MOD(adler);
-        MOD(sum2);
+        reduce_full(adler);
+        reduce_full(sum2);
     }
 
     /* do remaining bytes (less than NMAX, still just one modulo) */
@@ -119,13 +348,36 @@
             adler += *buf++;
             sum2 += adler;
         }
-        MOD(adler);
-        MOD(sum2);
+        reduce_full(adler);
+        reduce_full(sum2);
     }
 
     /* return recombined sums */
     return adler | (sum2 << 16);
 }
+# endif
+#endif
+
+/* ========================================================================= */
+uLong ZEXPORT adler32(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1)
+        return adler32_1(adler, buf, len); /* should create a fast tailcall */
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == Z_NULL)
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < MIN_WORK)
+        return adler32_common(adler, buf, len);
+
+    return adler32_vec(adler, buf, len);
+}
 
 /* ========================================================================= */
 local uLong adler32_combine_(adler1, adler2, len2)
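Note: the heart of the 64-bit path above is that the two masked accumulators vs1l/vs1h hold the eight bytes of each loaded word spread across 16-bit lanes, so a single 64-bit add advances the s1 side by eight input bytes at once; the endian-dependent a..h split then recovers the per-byte positions so s2 can apply the weights 8..1, and the j <= 23 cap keeps the lanes from overflowing. A port with real SIMD can pre-define HAVE_ADLER32_VEC and supply its own adler32_vec() instead of either branch here. A standalone sketch of the lane trick (not part of the patch; plain ISO C):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        unsigned char buf[8] = {1, 2, 3, 4, 5, 250, 251, 252};
        uint64_t in8, lo, hi;
        unsigned int plain = 0, lanes = 0, i;

        memcpy(&in8, buf, 8);                           /* endian-neutral load */
        lo = in8 & UINT64_C(0x00ff00ff00ff00ff);        /* four bytes in 16-bit lanes */
        hi = (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8; /* the other four bytes */

        for (i = 0; i < 8; i++)
            plain += buf[i];                            /* bytewise s1 contribution */
        for (i = 0; i < 4; i++) {
            lanes += (lo >> (16 * i)) & 0xffff;         /* horizontal lane sum */
            lanes += (hi >> (16 * i)) & 0xffff;
        }
        assert(plain == lanes);                         /* identical s1 contribution */
        return 0;
    }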
@@ -141,7 +393,7 @@
     rem = (unsigned)(len2 % BASE);
     sum1 = adler1 & 0xffff;
     sum2 = rem * sum1;
-    MOD(sum2);
+    reduce_full(sum2);
     sum1 += (adler2 & 0xffff) + BASE - 1;
     sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
     if (sum1 >= BASE) sum1 -= BASE;
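Note: since the patch reroutes every call through adler32_1(), adler32_common() or adler32_vec(), a quick harness like the following (not part of the patch; adler32_ref is a made-up naive reference) can be linked against the patched library to confirm all three dispatch paths still agree:

    #include <assert.h>
    #include <stdio.h>
    #include "zlib.h"

    static unsigned long adler32_ref(unsigned long adler,
                                     const unsigned char *buf,
                                     unsigned int len)
    {
        unsigned long s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
        while (len--) {
            s1 = (s1 + *buf++) % 65521;
            s2 = (s2 + s1) % 65521;
        }
        return (s2 << 16) | s1;
    }

    int main(void)
    {
        unsigned char buf[8192];
        unsigned int i, len;

        for (i = 0; i < sizeof(buf); i++)
            buf[i] = (unsigned char)(i * 131 + 7);
        /* exercise the len == 1, len < MIN_WORK and vector paths */
        for (len = 0; len <= sizeof(buf); len = len ? len * 2 : 1) {
            uLong a = adler32(adler32(0L, Z_NULL, 0), buf, len);
            assert(a == adler32_ref(1L, buf, len));
        }
        puts("adler32 paths match");
        return 0;
    }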