424 lines
12 KiB
Diff
424 lines
12 KiB
Diff
|
=== modified file 'Makefile.in'
|
||
|
--- Makefile.in 2011-03-14 01:01:37 +0000
|
||
|
+++ Makefile.in 2011-03-14 02:19:21 +0000
|
||
|
@@ -236,7 +236,8 @@
|
||
|
|
||
|
# DO NOT DELETE THIS LINE -- make depend depends on it.
|
||
|
|
||
|
-adler32.o zutil.o: zutil.h zlib.h zconf.h
|
||
|
+adler32.o: adler32.c zutil.h zlib.h zconf.h
|
||
|
+zutil.o: zutil.h zlib.h zconf.h
|
||
|
gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
|
||
|
compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
|
||
|
crc32.o: zutil.h zlib.h zconf.h crc32.h
|
||
|
@@ -246,7 +247,8 @@
|
||
|
inftrees.o: zutil.h zlib.h zconf.h inftrees.h
|
||
|
trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
|
||
|
|
||
|
-adler32.lo zutil.lo: zutil.h zlib.h zconf.h
|
||
|
+adler32.lo: adler32.c zutil.h zlib.h zconf.h
|
||
|
+zutil.lo: zutil.h zlib.h zconf.h
|
||
|
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
|
||
|
compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
|
||
|
crc32.lo: zutil.h zlib.h zconf.h crc32.h
|
||
|
|
||
|
=== modified file 'adler32.c'
|
||
|
--- adler32.c 2011-03-14 01:01:37 +0000
|
||
|
+++ adler32.c 2011-03-30 13:38:42 +0000
|
||
|
@@ -9,6 +9,35 @@
|
||
|
|
||
|
#define local static
|
||
|
|
||
|
+#define GCC_VERSION_GE(x) ((__GNUC__-0) * 100 + __GNUC_MINOR__-0 >= x)
|
||
|
+
|
||
|
+#if GCC_VERSION_GE(301)
|
||
|
+/* sometimes leaks out of old kernel headers */
|
||
|
+# undef noinline
|
||
|
+# define noinline __attribute__((__noinline__))
|
||
|
+#else
|
||
|
+# ifndef noinline
|
||
|
+# define noinline
|
||
|
+# endif
|
||
|
+#endif
|
||
|
+
|
||
|
+#if GCC_VERSION_GE(301)
|
||
|
+# define GCC_ATTR_UNUSED_PARAM __attribute__((__unused__))
|
||
|
+#else
|
||
|
+# define GCC_ATTR_UNUSED_PARAM
|
||
|
+#endif
|
||
|
+
|
||
|
+#if GCC_VERSION_GE(296)
|
||
|
+# define likely(x) __builtin_expect(!!(x), 1)
|
||
|
+# define unlikely(x) __builtin_expect(!!(x), 0)
|
||
|
+#else
|
||
|
+# define likely(x) (x)
|
||
|
+# define unlikely(x) (x)
|
||
|
+#endif
|
||
|
+
|
||
|
+#define ROUND_TO(x , n) ((x) & ~((n) - 1L))
|
||
|
+#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
|
||
|
+
|
||
|
local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
|
||
|
|
||
|
#define BASE 65521UL /* largest prime smaller than 65536 */
|
||
|
@@ -21,9 +50,20 @@
|
||
|
#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
|
||
|
#define DO16(buf) DO8(buf,0); DO8(buf,8);
|
||
|
|
||
|
+#if defined(__alpha__)
|
||
|
+/* even if gcc can generate a mul by inverse, the code is really
|
||
|
+ * ugly (find global const pool pointer, load constant, a mul, lots
|
||
|
+ * of shifts/add/sub), up to 14 instructions. The replacement code
|
||
|
+ * needs as few as 5 instructions
|
||
|
+ */
|
||
|
+# define NO_DIVIDE
|
||
|
+#endif
|
||
|
+
|
||
|
/* use NO_DIVIDE if your processor does not do division in hardware */
|
||
|
#ifdef NO_DIVIDE
|
||
|
-# define MOD(a) \
|
||
|
+/* use NO_SHIFT if your processor performs shifts by more than 1 bit in a loop */
|
||
|
+# ifdef NO_SHIFT
|
||
|
+# define reduce_full(a) \
|
||
|
do { \
|
||
|
if (a >= (BASE << 16)) a -= (BASE << 16); \
|
||
|
if (a >= (BASE << 15)) a -= (BASE << 15); \
|
||
|
@@ -43,21 +83,237 @@
|
||
|
if (a >= (BASE << 1)) a -= (BASE << 1); \
|
||
|
if (a >= BASE) a -= BASE; \
|
||
|
} while (0)
|
||
|
-# define MOD4(a) \
|
||
|
+# define reduce_x(a) \
|
||
|
do { \
|
||
|
+ if (MIN_WORK >= (1 << 6) && a >= (BASE << 6)) a -= (BASE << 6); \
|
||
|
+ if (MIN_WORK >= (1 << 5) && a >= (BASE << 5)) a -= (BASE << 5); \
|
||
|
if (a >= (BASE << 4)) a -= (BASE << 4); \
|
||
|
if (a >= (BASE << 3)) a -= (BASE << 3); \
|
||
|
if (a >= (BASE << 2)) a -= (BASE << 2); \
|
||
|
if (a >= (BASE << 1)) a -= (BASE << 1); \
|
||
|
if (a >= BASE) a -= BASE; \
|
||
|
} while (0)
|
||
|
+# define reduce(a) reduce_full(a)
|
||
|
+# else
|
||
|
+# define reduce_full(a) \
|
||
|
+ do { \
|
||
|
+ unsigned long b = a & 0x0000ffff; \
|
||
|
+ a >>= 16; \
|
||
|
+ b -= a; \
|
||
|
+ a <<= 4; \
|
||
|
+ a += b; \
|
||
|
+ } while(a >= BASE)
|
||
|
+# define reduce_x(a) \
|
||
|
+ do { \
|
||
|
+ unsigned long b = a & 0x0000ffff; \
|
||
|
+ a >>= 16; \
|
||
|
+ b -= a; \
|
||
|
+ a <<= 4; \
|
||
|
+ a += b; \
|
||
|
+ a = a >= BASE ? a - BASE : a; \
|
||
|
+ } while(0)
|
||
|
+# define reduce(a) \
|
||
|
+ do { \
|
||
|
+ unsigned long b = a & 0x0000ffff; \
|
||
|
+ a >>= 16; \
|
||
|
+ b -= a; \
|
||
|
+ a <<= 4; \
|
||
|
+ a += b; \
|
||
|
+ } while(0)
|
||
|
+# endif
|
||
|
#else
|
||
|
-# define MOD(a) a %= BASE
|
||
|
-# define MOD4(a) a %= BASE
|
||
|
-#endif
|
||
|
-
|
||
|
-/* ========================================================================= */
|
||
|
-uLong ZEXPORT adler32(adler, buf, len)
|
||
|
+# define reduce_full(a) a %= BASE
|
||
|
+# define reduce_x(a) a %= BASE
|
||
|
+# define reduce(a) a %= BASE
|
||
|
+#endif
|
||
|
+
|
||
|
+local int host_is_bigendian()
|
||
|
+{
|
||
|
+ local const union {
|
||
|
+ uInt d;
|
||
|
+ unsigned char endian[sizeof(uInt)];
|
||
|
+ } x = {1};
|
||
|
+ return x.endian[0] == 0;
|
||
|
+}
|
||
|
+
|
||
|
+#ifndef MIN_WORK
|
||
|
+# define MIN_WORK 16
|
||
|
+#endif
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local noinline uLong adler32_1(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len GCC_ATTR_UNUSED_PARAM;
|
||
|
+{
|
||
|
+ unsigned long sum2;
|
||
|
+
|
||
|
+ /* split Adler-32 into component sums */
|
||
|
+ sum2 = (adler >> 16) & 0xffff;
|
||
|
+ adler &= 0xffff;
|
||
|
+
|
||
|
+ adler += buf[0];
|
||
|
+ if (adler >= BASE)
|
||
|
+ adler -= BASE;
|
||
|
+ sum2 += adler;
|
||
|
+ if (sum2 >= BASE)
|
||
|
+ sum2 -= BASE;
|
||
|
+ return adler | (sum2 << 16);
|
||
|
+}
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local noinline uLong adler32_common(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ unsigned long sum2;
|
||
|
+
|
||
|
+ /* split Adler-32 into component sums */
|
||
|
+ sum2 = (adler >> 16) & 0xffff;
|
||
|
+ adler &= 0xffff;
|
||
|
+
|
||
|
+ while (len--) {
|
||
|
+ adler += *buf++;
|
||
|
+ sum2 += adler;
|
||
|
+ }
|
||
|
+ if (adler >= BASE)
|
||
|
+ adler -= BASE;
|
||
|
+ reduce_x(sum2); /* only added so many BASE's */
|
||
|
+ return adler | (sum2 << 16);
|
||
|
+}
|
||
|
+
|
||
|
+#ifndef HAVE_ADLER32_VEC
|
||
|
+# if (defined(__LP64__) || ((SIZE_MAX-0) >> 31) >= 2) && !defined(NO_ADLER32_VEC)
|
||
|
+
|
||
|
+/* On 64 Bit archs, we can do pseudo SIMD with a nice win.
|
||
|
+ * This is esp. important for old Alphas, they do not have byte
|
||
|
+ * access.
|
||
|
+ * This needs some registers, but x86_64 is fine (>= 9 for the main loop
|
||
|
+ * req.). If your 64 Bit arch is more limited, throw it away...
|
||
|
+ */
|
||
|
+# ifndef UINT64_C
|
||
|
+# if defined(_MSC_VER) || defined(__BORLANDC__)
|
||
|
+# define UINT64_C(c) (c ## ui64)
|
||
|
+# else
|
||
|
+# define UINT64_C(c) (c ## ULL)
|
||
|
+# endif
|
||
|
+# endif
|
||
|
+
|
||
|
+# undef VNMAX
|
||
|
+# define VNMAX (2*NMAX+((9*NMAX)/10))
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local noinline uLong adler32_vec(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ unsigned int s1, s2;
|
||
|
+ unsigned int k;
|
||
|
+
|
||
|
+ /* split Adler-32 into component sums */
|
||
|
+ s1 = adler & 0xffff;
|
||
|
+ s2 = (adler >> 16) & 0xffff;
|
||
|
+
|
||
|
+ /* align input data */
|
||
|
+ k = ALIGN_DIFF(buf, sizeof(size_t));
|
||
|
+ len -= k;
|
||
|
+ if (k) do {
|
||
|
+ s1 += *buf++;
|
||
|
+ s2 += s1;
|
||
|
+ } while(--k);
|
||
|
+
|
||
|
+ k = len > VNMAX ? VNMAX : len;
|
||
|
+ len -= k;
|
||
|
+ if (likely(k >= 2 * sizeof(size_t))) do
|
||
|
+ {
|
||
|
+ unsigned int vs1, vs2;
|
||
|
+ unsigned int vs1s;
|
||
|
+
|
||
|
+ /* add s1 to s2 for rounds to come */
|
||
|
+ s2 += s1 * ROUND_TO(k, sizeof(size_t));
|
||
|
+ vs1s = vs1 = vs2 = 0;
|
||
|
+ do {
|
||
|
+ size_t vs1l = 0, vs1h = 0, vs1l_s = 0, vs1h_s = 0;
|
||
|
+ unsigned int a, b, c, d, e, f, g, h;
|
||
|
+ unsigned int j;
|
||
|
+
|
||
|
+ j = k > 23 * sizeof(size_t) ? 23 : k/sizeof(size_t);
|
||
|
+ k -= j * sizeof(size_t);
|
||
|
+ /* add s1 to s1 round sum for rounds to come */
|
||
|
+ vs1s += j * vs1;
|
||
|
+ do {
|
||
|
+ size_t in8 = *(const size_t *)buf;
|
||
|
+ buf += sizeof(size_t);
|
||
|
+ /* add this s1 to s1 round sum */
|
||
|
+ vs1l_s += vs1l;
|
||
|
+ vs1h_s += vs1h;
|
||
|
+ /* add up input data to s1 */
|
||
|
+ vs1l += in8 & UINT64_C(0x00ff00ff00ff00ff);
|
||
|
+ vs1h += (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8;
|
||
|
+ } while(--j);
|
||
|
+
|
||
|
+ /* split s1 */
|
||
|
+ if(host_is_bigendian()) {
|
||
|
+ a = (vs1h >> 48) & 0x0000ffff;
|
||
|
+ b = (vs1l >> 48) & 0x0000ffff;
|
||
|
+ c = (vs1h >> 32) & 0x0000ffff;
|
||
|
+ d = (vs1l >> 32) & 0x0000ffff;
|
||
|
+ e = (vs1h >> 16) & 0x0000ffff;
|
||
|
+ f = (vs1l >> 16) & 0x0000ffff;
|
||
|
+ g = (vs1h ) & 0x0000ffff;
|
||
|
+ h = (vs1l ) & 0x0000ffff;
|
||
|
+ } else {
|
||
|
+ a = (vs1l ) & 0x0000ffff;
|
||
|
+ b = (vs1h ) & 0x0000ffff;
|
||
|
+ c = (vs1l >> 16) & 0x0000ffff;
|
||
|
+ d = (vs1h >> 16) & 0x0000ffff;
|
||
|
+ e = (vs1l >> 32) & 0x0000ffff;
|
||
|
+ f = (vs1h >> 32) & 0x0000ffff;
|
||
|
+ g = (vs1l >> 48) & 0x0000ffff;
|
||
|
+ h = (vs1h >> 48) & 0x0000ffff;
|
||
|
+ }
|
||
|
+
|
||
|
+ /* add s1 & s2 horiz. */
|
||
|
+ vs2 += 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + 1*h;
|
||
|
+ vs1 += a + b + c + d + e + f + g + h;
|
||
|
+
|
||
|
+ /* split and add up s1 round sum */
|
||
|
+ vs1l_s = ((vs1l_s ) & UINT64_C(0x0000ffff0000ffff)) +
|
||
|
+ ((vs1l_s >> 16) & UINT64_C(0x0000ffff0000ffff));
|
||
|
+ vs1h_s = ((vs1h_s ) & UINT64_C(0x0000ffff0000ffff)) +
|
||
|
+ ((vs1h_s >> 16) & UINT64_C(0x0000ffff0000ffff));
|
||
|
+ vs1l_s += vs1h_s;
|
||
|
+ vs1s += ((vs1l_s ) & UINT64_C(0x00000000ffffffff)) +
|
||
|
+ ((vs1l_s >> 32) & UINT64_C(0x00000000ffffffff));
|
||
|
+ } while (k >= sizeof(size_t));
|
||
|
+ reduce(vs1s);
|
||
|
+ s2 += vs1s * 8 + vs2;
|
||
|
+ reduce(s2);
|
||
|
+ s1 += vs1;
|
||
|
+ reduce(s1);
|
||
|
+ len += k;
|
||
|
+ k = len > VNMAX ? VNMAX : len;
|
||
|
+ len -= k;
|
||
|
+ } while (k >= sizeof(size_t));
|
||
|
+
|
||
|
+ /* handle trailer */
|
||
|
+ if (k) do {
|
||
|
+ s1 += *buf++;
|
||
|
+ s2 += s1;
|
||
|
+ } while (--k);
|
||
|
+ reduce(s1);
|
||
|
+ reduce(s2);
|
||
|
+
|
||
|
+ /* return recombined sums */
|
||
|
+ return (s2 << 16) | s1;
|
||
|
+}
|
||
|
+
|
||
|
+# else
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local noinline uLong adler32_vec(adler, buf, len)
|
||
|
uLong adler;
|
||
|
const Bytef *buf;
|
||
|
uInt len;
|
||
|
@@ -69,33 +325,6 @@
|
||
|
sum2 = (adler >> 16) & 0xffff;
|
||
|
adler &= 0xffff;
|
||
|
|
||
|
- /* in case user likes doing a byte at a time, keep it fast */
|
||
|
- if (len == 1) {
|
||
|
- adler += buf[0];
|
||
|
- if (adler >= BASE)
|
||
|
- adler -= BASE;
|
||
|
- sum2 += adler;
|
||
|
- if (sum2 >= BASE)
|
||
|
- sum2 -= BASE;
|
||
|
- return adler | (sum2 << 16);
|
||
|
- }
|
||
|
-
|
||
|
- /* initial Adler-32 value (deferred check for len == 1 speed) */
|
||
|
- if (buf == Z_NULL)
|
||
|
- return 1L;
|
||
|
-
|
||
|
- /* in case short lengths are provided, keep it somewhat fast */
|
||
|
- if (len < 16) {
|
||
|
- while (len--) {
|
||
|
- adler += *buf++;
|
||
|
- sum2 += adler;
|
||
|
- }
|
||
|
- if (adler >= BASE)
|
||
|
- adler -= BASE;
|
||
|
- MOD4(sum2); /* only added so many BASE's */
|
||
|
- return adler | (sum2 << 16);
|
||
|
- }
|
||
|
-
|
||
|
/* do length NMAX blocks -- requires just one modulo operation */
|
||
|
while (len >= NMAX) {
|
||
|
len -= NMAX;
|
||
|
@@ -104,8 +333,8 @@
|
||
|
DO16(buf); /* 16 sums unrolled */
|
||
|
buf += 16;
|
||
|
} while (--n);
|
||
|
- MOD(adler);
|
||
|
- MOD(sum2);
|
||
|
+ reduce_full(adler);
|
||
|
+ reduce_full(sum2);
|
||
|
}
|
||
|
|
||
|
/* do remaining bytes (less than NMAX, still just one modulo) */
|
||
|
@@ -119,13 +348,36 @@
|
||
|
adler += *buf++;
|
||
|
sum2 += adler;
|
||
|
}
|
||
|
- MOD(adler);
|
||
|
- MOD(sum2);
|
||
|
+ reduce_full(adler);
|
||
|
+ reduce_full(sum2);
|
||
|
}
|
||
|
|
||
|
/* return recombined sums */
|
||
|
return adler | (sum2 << 16);
|
||
|
}
|
||
|
+# endif
|
||
|
+#endif
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+uLong ZEXPORT adler32(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ /* in case user likes doing a byte at a time, keep it fast */
|
||
|
+ if (len == 1)
|
||
|
+ return adler32_1(adler, buf, len); /* should create a fast tailcall */
|
||
|
+
|
||
|
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
|
||
|
+ if (buf == Z_NULL)
|
||
|
+ return 1L;
|
||
|
+
|
||
|
+ /* in case short lengths are provided, keep it somewhat fast */
|
||
|
+ if (len < MIN_WORK)
|
||
|
+ return adler32_common(adler, buf, len);
|
||
|
+
|
||
|
+ return adler32_vec(adler, buf, len);
|
||
|
+}
|
||
|
|
||
|
/* ========================================================================= */
|
||
|
local uLong adler32_combine_(adler1, adler2, len2)
|
||
|
@@ -141,7 +393,7 @@
|
||
|
rem = (unsigned)(len2 % BASE);
|
||
|
sum1 = adler1 & 0xffff;
|
||
|
sum2 = rem * sum1;
|
||
|
- MOD(sum2);
|
||
|
+ reduce_full(sum2);
|
||
|
sum1 += (adler2 & 0xffff) + BASE - 1;
|
||
|
sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
|
||
|
if (sum1 >= BASE) sum1 -= BASE;
|