SHA256
3
0
forked from pool/zlib
zlib/04-x86.patch

1166 lines
37 KiB
Diff
Raw Normal View History

=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 14:39:24 +0000
+++ Makefile.in 2011-03-14 16:46:30 +0000
@@ -236,7 +236,7 @@
# DO NOT DELETE THIS LINE -- make depend depends on it.
-adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h
zutil.o: zutil.h zlib.h zconf.h
gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
inftrees.o: zutil.h zlib.h zconf.h inftrees.h
trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
-adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h
zutil.lo: zutil.h zlib.h zconf.h
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
=== modified file 'adler32.c'
--- adler32.c 2011-03-30 13:38:46 +0000
+++ adler32.c 2011-03-30 13:38:46 +0000
@@ -144,6 +144,8 @@
# include "adler32_arm.c"
# elif defined(__powerpc__) || defined(__powerpc64__)
# include "adler32_ppc.c"
+# elif defined(__i386__) || defined(__x86_64__)
+# include "adler32_x86.c"
# endif
#endif
=== added file 'adler32_x86.c'
--- adler32_x86.c 1970-01-01 00:00:00 +0000
+++ adler32_x86.c 2011-03-15 23:15:36 +0000
@@ -0,0 +1,1125 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ * x86 implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#if GCC_VERSION_GE(207) /* constructor attribute is used by adler32_vec_select() */
+# define GCC_ATTR_CONSTRUCTOR __attribute__((__constructor__))
+#else
+# define VEC_NO_GO /* switches off everything guarded by the #if further down */
+#endif
+
+#if GCC_VERSION_GE(203) /* aligned attribute is used by the vord/vord_b tables */
+# define GCC_ATTR_ALIGNED(x) __attribute__((__aligned__(x)))
+#else
+# define VEC_NO_GO /* switches off everything guarded by the #if further down */
+#endif
+
+/* inline asm, so only on GCC (or compatible) */
+#if defined(__GNUC__) && !defined(VEC_NO_GO)
+# define HAVE_ADLER32_VEC
+# define MIN_WORK 64 /* not read in this file; presumably consumed by adler32.c -- TODO confirm */
+
+# ifdef __x86_64__
+# define PICREG "%%rbx" /* register cpuid() swaps out around the cpuid instruction */
+# else
+# define PICREG "%%ebx" /* register cpuid() swaps out around the cpuid instruction */
+# endif
+
+/* ========================================================================= */
+local const struct { short d[24]; } vord GCC_ATTR_ALIGNED(16) = { /* 16 bit factors, read at byte offsets 0, 16, 32 and 40 */
+ {1,1,1,1,1,1,1,1,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}
+}; /* d[0..7]: all ones, d[8..15]: 16..9, d[16..23]: 8..1 (pmaddwd operands) */
+
+/* ========================================================================= */
+local const struct { char d[16]; } vord_b GCC_ATTR_ALIGNED(16) = { /* byte factors 16..1, loaded into xmm5 by adler32_SSSE3 */
+ {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}
+}; /* used as the pmaddubsw multiplier for one 16 byte block */
+
+/* ========================================================================= */
+local noinline const Bytef *adler32_jumped(buf, s1, s2, k)
+ const Bytef *buf; /* first byte to add */
+ unsigned int *s1; /* in/out: low sum, no modulo reduction is done here */
+ unsigned int *s2; /* in/out: high sum, no modulo reduction is done here */
+ unsigned int k; /* number of bytes to add */
+{ /* adds k bytes one at a time; returns buf advanced by k */
+ unsigned int t;
+ unsigned n = k % 16; /* bytes for the partial first pass */
+ buf += n; /* the asm reads at negative offsets from buf */
+ k = (k / 16) + 1; /* pass count: the partial pass plus all full 16 byte passes */
+
+ __asm__ __volatile__ (
+# ifdef __x86_64__
+# define CLOB "&" /* %4 is written before %5 is read, so it must be earlyclobber */
+ "lea 1f(%%rip), %q4\n\t" /* t = address of label 1 */
+ "lea (%q4,%q5,8), %q4\n\t" /* t += (16 - n) * 8: skip 16 - n steps */
+ "jmp *%q4\n\t" /* enter the unrolled steps */
+# else
+# ifndef __PIC__
+# define CLOB /* %4 is written by the same lea that reads %5 */
+ "lea 1f(,%5,8), %4\n\t" /* t = label 1 + (16 - n) * 8 */
+# else
+# define CLOB /* %4 is written by the same lea that reads %5 */
+ "lea 1f-3f(,%5,8), %4\n\t" /* t = target offset relative to label 3 */
+ "call 9f\n" /* pushes the address of label 3 */
+ "3:\n\t"
+# endif
+ "jmp *%4\n\t" /* enter the unrolled steps */
+# ifdef __PIC__
+ ".p2align 1\n"
+ "9:\n\t"
+ "addl (%%esp), %4\n\t" /* t += return address, i.e. label 3 */
+ "ret\n\t" /* continue at label 3 */
+# endif
+# endif
+ ".p2align 1\n"
+ "2:\n\t"
+# ifdef __i386
+ ".byte 0x3e\n\t" /* NOTE(review): a lone prefix byte in front of the add, presumably padding -- confirm */
+# endif
+ "add $0x10, %2\n\t" /* buf += 16 for a full pass */
+ ".p2align 1\n"
+ "1:\n\t"
+ /* 128: code bytes left up to the dec below, every 3 instruction step takes 8 */
+ "movzbl -16(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 120 */
+ "movzbl -15(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 112 */
+ "movzbl -14(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 104 */
+ "movzbl -13(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 96 */
+ "movzbl -12(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 88 */
+ "movzbl -11(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 80 */
+ "movzbl -10(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 72 */
+ "movzbl -9(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 64 */
+ "movzbl -8(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 56 */
+ "movzbl -7(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 48 */
+ "movzbl -6(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 40 */
+ "movzbl -5(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 32 */
+ "movzbl -4(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 24 */
+ "movzbl -3(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 16 */
+ "movzbl -2(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 8 */
+ "movzbl -1(%2), %4\n\t" /* 4 */
+ "add %4, %0\n\t" /* 2 */
+ "add %0, %1\n\t" /* 2 */
+ /* 0 */
+ "dec %3\n\t" /* one pass done */
+ "jnz 2b" /* passes left: advance buf and run all 16 steps */
+ : /* %0 */ "=R" (*s1),
+ /* %1 */ "=R" (*s2),
+ /* %2 */ "=abdSD" (buf),
+ /* %3 */ "=c" (k),
+ /* %4 */ "="CLOB"R" (t)
+ : /* %5 */ "r" (16 - n), /* number of steps to skip in the first pass */
+ /* */ "0" (*s1),
+ /* */ "1" (*s2),
+ /* */ "2" (buf),
+ /* */ "3" (k)
+ : "cc", "memory"
+ );
+
+ return buf;
+}
+
+#if 0
+ /*
+ * Will XOP processors have SSSE3/AVX??
+ * And what is the unaligned load performance?
+ */
+ "prefetchnta 0x70(%0)\n\t"
+ "lddqu (%0), %%xmm0\n\t"
+ "vpaddd %%xmm3, %%xmm5, %%xmm5\n\t"
+ "sub $16, %3\n\t"
+ "add $16, %0\n\t"
+ "cmp $15, %3\n\t"
+ "vphaddubd %%xmm0, %%xmm1\n\t" /* A */
+ "vpmaddubsw %%xmm4, %%xmm0, %%xmm0\n\t"/* AVX! */ /* 1 */
+ "vphadduwd %%xmm0, %%xmm0\n\t" /* 2 */
+ "vpaddd %%xmm1, %%xmm3, %%xmm3\n\t" /* B: A+B => hadd+acc or vpmadcubd w. mul = 1 */
+ "vpaddd %%xmm0, %%xmm2, %%xmm2\n\t" /* 3: 1+2+3 => vpmadcubd w. mul = 16,15,14... */
+ "jg 1b\n\t"
+ xop_reduce
+ xop_reduce
+ xop_reduce
+ setup
+ "jg 1b\n\t"
+ "vphaddudq %%xmm2, %%xmm0\n\t"
+ "vphaddudq %%xmm3, %%xmm1\n\t"
+ "pshufd $0xE6, %%xmm0, %%xmm2\n\t"
+ "pshufd $0xE6, %%xmm1, %%xmm3\n\t"
+ "paddd %%xmm0, %%xmm2\n\t"
+ "paddd %%xmm1, %%xmm3\n\t"
+ "movd %%xmm2, %2\n\t"
+ "movd %%xmm3, %1\n\t"
+#endif
+
+/* ========================================================================= */
+local uLong adler32_SSSE3(adler, buf, len)
+ uLong adler; /* running checksum: s2 in bits 16..31, s1 in bits 0..15 */
+ const Bytef *buf; /* input data */
+ uInt len; /* input length; NOTE(review): must be >= the lead-in k, presumably ensured by the caller */
+{ /* Adler-32 over 16 byte blocks with pmaddubsw/psadbw; returns the updated checksum */
+ unsigned int s1 = adler & 0xffff;
+ unsigned int s2 = (adler >> 16) & 0xffff;
+ unsigned int k;
+
+ k = ALIGN_DIFF(buf, 16); /* lead-in byte count; the asm loads with movdqa */
+ len -= k;
+ if (k)
+ buf = adler32_jumped(buf, &s1, &s2, k);
+
+ __asm__ __volatile__ (
+ "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */
+ "cmp %3, %4\n\t"
+ "cmovb %4, %3\n\t" /* k = len < VNMAX ? len : VNMAX */
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $16, %3\n\t"
+ "jb 88f\n\t" /* if(k < 16) goto OUT */
+#ifdef __ELF__
+ ".subsection 2\n\t"
+#else
+ "jmp 77f\n\t"
+#endif
+ ".p2align 2\n"
+ /*
+ * reduction function to bring a vector sum within the range of BASE
+ * This does no full reduction! When the sum is large, a number > BASE
+ * is the result. To do a full reduction call multiple times.
+ */
+ "sse2_reduce:\n\t" /* also called from the asm in adler32_SSE2 */
+ "movdqa %%xmm0, %%xmm1\n\t" /* y = x */
+ "pslld $16, %%xmm1\n\t" /* y <<= 16 */
+ "psrld $16, %%xmm0\n\t" /* x >>= 16 */
+ "psrld $16, %%xmm1\n\t" /* y >>= 16 */
+ "psubd %%xmm0, %%xmm1\n\t" /* y -= x */
+ "pslld $4, %%xmm0\n\t" /* x <<= 4 */
+ "paddd %%xmm1, %%xmm0\n\t" /* x += y */
+ "ret\n\t"
+#ifdef __ELF__
+ ".previous\n\t"
+#else
+ "77:\n\t"
+#endif
+ "movdqa %5, %%xmm5\n\t" /* get vord_b */
+ "prefetchnta 0x70(%0)\n\t"
+ "movd %2, %%xmm2\n\t" /* init vector sum vs2 with s2 */
+ "movd %1, %%xmm3\n\t" /* init vector sum vs1 with s1 */
+ "pxor %%xmm4, %%xmm4\n" /* zero */
+ "3:\n\t"
+ "pxor %%xmm7, %%xmm7\n\t" /* zero vs1_round_sum */
+ ".p2align 3,,3\n\t"
+ ".p2align 2\n"
+ "2:\n\t"
+ "mov $128, %1\n\t" /* inner_k = 128 bytes till vs2_i overflows */
+ "cmp %1, %3\n\t"
+ "cmovb %3, %1\n\t" /* inner_k = k >= inner_k ? inner_k : k */
+ "and $-16, %1\n\t" /* inner_k = ROUND_TO(inner_k, 16) */
+ "sub %1, %3\n\t" /* k -= inner_k */
+ "shr $4, %1\n\t" /* inner_k /= 16 */
+ "pxor %%xmm6, %%xmm6\n\t" /* zero vs2_i */
+ ".p2align 4,,7\n"
+ ".p2align 3\n"
+ "1:\n\t"
+ "movdqa (%0), %%xmm0\n\t" /* fetch input data */
+ "prefetchnta 0x70(%0)\n\t"
+ "paddd %%xmm3, %%xmm7\n\t" /* vs1_round_sum += vs1 */
+ "add $16, %0\n\t" /* advance input data pointer */
+ "dec %1\n\t" /* decrement inner_k */
+ "movdqa %%xmm0, %%xmm1\n\t" /* make a copy of the input data */
+# if (HAVE_BINUTILS-0) >= 217
+ "pmaddubsw %%xmm5, %%xmm0\n\t" /* multiply all input bytes by vord_b bytes, add adjacent results to words */
+# else
+ ".byte 0x66, 0x0f, 0x38, 0x04, 0xc5\n\t" /* pmaddubsw %%xmm5, %%xmm0 */
+# endif
+ "psadbw %%xmm4, %%xmm1\n\t" /* subtract zero from every byte, add 8 bytes to a sum */
+ "paddw %%xmm0, %%xmm6\n\t" /* vs2_i += in * vorder_b */
+ "paddd %%xmm1, %%xmm3\n\t" /* vs1 += psadbw */
+ "jnz 1b\n\t" /* repeat if inner_k != 0 */
+ "movdqa %%xmm6, %%xmm0\n\t" /* copy vs2_i */
+ "punpckhwd %%xmm4, %%xmm0\n\t" /* zero extend vs2_i upper words to dwords */
+ "punpcklwd %%xmm4, %%xmm6\n\t" /* zero extend vs2_i lower words to dwords */
+ "paddd %%xmm0, %%xmm2\n\t" /* vs2 += vs2_i.upper */
+ "paddd %%xmm6, %%xmm2\n\t" /* vs2 += vs2_i.lower */
+ "cmp $15, %3\n\t"
+ "jg 2b\n\t" /* if(k > 15) repeat */
+ "movdqa %%xmm7, %%xmm0\n\t" /* move vs1_round_sum */
+ "call sse2_reduce\n\t" /* reduce vs1_round_sum */
+ "pslld $4, %%xmm0\n\t" /* vs1_round_sum *= 16 */
+ "paddd %%xmm2, %%xmm0\n\t" /* vs2 += vs1_round_sum */
+ "call sse2_reduce\n\t" /* reduce again */
+ "movdqa %%xmm0, %%xmm2\n\t" /* move vs2 back in place */
+ "movdqa %%xmm3, %%xmm0\n\t" /* move vs1 */
+ "call sse2_reduce\n\t" /* reduce */
+ "movdqa %%xmm0, %%xmm3\n\t" /* move vs1 back in place */
+ "add %3, %4\n\t" /* len += k */
+ "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */
+ "cmp %3, %4\n\t"
+ "cmovb %4, %3\n\t" /* k = len < VNMAX ? len : VNMAX */
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $15, %3\n\t"
+ "jg 3b\n\t" /* if(k > 15) repeat */
+ "pshufd $0xEE, %%xmm3, %%xmm1\n\t" /* collect vs1 & vs2 in lowest vector member */
+ "pshufd $0xEE, %%xmm2, %%xmm0\n\t"
+ "paddd %%xmm3, %%xmm1\n\t"
+ "paddd %%xmm2, %%xmm0\n\t"
+ "pshufd $0xE5, %%xmm0, %%xmm2\n\t"
+ "paddd %%xmm0, %%xmm2\n\t"
+ "movd %%xmm1, %1\n\t" /* mov vs1 to s1 */
+ "movd %%xmm2, %2\n" /* mov vs2 to s2 */
+ "88:"
+ : /* %0 */ "=r" (buf),
+ /* %1 */ "=r" (s1),
+ /* %2 */ "=r" (s2),
+ /* %3 */ "=&r" (k), /* earlyclobber: written first, while %5 is read later; must not share its address register */
+ /* %4 */ "=r" (len)
+ : /* %5 */ "m" (vord_b),
+ /*
+ * somewhere between 5 & 6, psadbw 64 bit sums ruin the party
+ * spreading the sums with palignr only brings it to 7 (?),
+ * while introducing an op into the main loop (2800 ms -> 3200 ms)
+ */
+ /* %6 */ "i" (5*NMAX),
+ /* */ "0" (buf),
+ /* */ "1" (s1),
+ /* */ "2" (s2),
+ /* */ "4" (len)
+ : "cc", "memory"
+# ifdef __SSE__
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+# endif
+ );
+
+ if (unlikely(k)) /* fewer than 16 bytes are left over for the scalar code */
+ buf = adler32_jumped(buf, &s1, &s2, k);
+ reduce(s1);
+ reduce(s2);
+ return (s2 << 16) | s1;
+}
+
+/* ========================================================================= */
+local uLong adler32_SSE2(adler, buf, len)
+ uLong adler; /* running checksum: s2 in bits 16..31, s1 in bits 0..15 */
+ const Bytef *buf; /* input data */
+ uInt len; /* input length; NOTE(review): must be >= the lead-in k, presumably ensured by the caller */
+{ /* Adler-32 over 16 byte blocks with psadbw/pmaddwd; returns the updated checksum */
+ unsigned int s1 = adler & 0xffff;
+ unsigned int s2 = (adler >> 16) & 0xffff;
+ unsigned int k;
+
+ k = ALIGN_DIFF(buf, 16); /* lead-in byte count; the asm loads with movdqa */
+ len -= k;
+ if (k)
+ buf = adler32_jumped(buf, &s1, &s2, k);
+
+ __asm__ __volatile__ (
+ "mov %6, %3\n\t" /* k = VNMAX, the byte count per outer round */
+ "cmp %3, %4\n\t"
+ "cmovb %4, %3\n\t" /* k = len < VNMAX ? len : VNMAX */
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $16, %3\n\t"
+ "jb 88f\n\t" /* if(k < 16) goto OUT */
+ "prefetchnta 0x70(%0)\n\t"
+ "movd %1, %%xmm4\n\t" /* init vector sum vs1 with s1 */
+ "movd %2, %%xmm3\n\t" /* init vector sum vs2 with s2 */
+ "pxor %%xmm2, %%xmm2\n\t" /* zero */
+ "pxor %%xmm5, %%xmm5\n\t" /* zero vs1_round_sum */
+ ".p2align 2\n"
+ "3:\n\t"
+ "pxor %%xmm6, %%xmm6\n\t" /* zero vs2_l_words */
+ "pxor %%xmm7, %%xmm7\n\t" /* zero vs2_h_words */
+ "mov $2048, %1\n\t" /* get byte count till vs2_{l|h}_word overflows */
+ "cmp %1, %3\n\t"
+ "cmovb %3, %1\n" /* inner_k = k < 2048 ? k : 2048 */
+ "and $-16, %1\n\t" /* round inner_k down to a multiple of 16 */
+ "sub %1, %3\n\t" /* k -= inner_k */
+ "shr $4, %1\n\t" /* inner_k /= 16 */
+ ".p2align 4,,7\n"
+ ".p2align 3\n"
+ "1:\n\t"
+ "prefetchnta 0x70(%0)\n\t"
+ "movdqa (%0), %%xmm0\n\t" /* fetch input data */
+ "paddd %%xmm4, %%xmm5\n\t" /* vs1_round_sum += vs1 */
+ "add $16, %0\n\t" /* advance input data pointer */
+ "dec %1\n\t" /* decrement inner_k */
+ "movdqa %%xmm0, %%xmm1\n\t" /* copy input data */
+ "psadbw %%xmm2, %%xmm0\n\t" /* add all bytes horiz. */
+ "paddd %%xmm0, %%xmm4\n\t" /* add that to vs1 */
+ "movdqa %%xmm1, %%xmm0\n\t" /* copy input data */
+ "punpckhbw %%xmm2, %%xmm1\n\t" /* zero extend input upper bytes to words */
+ "punpcklbw %%xmm2, %%xmm0\n\t" /* zero extend input lower bytes to words */
+ "paddw %%xmm1, %%xmm7\n\t" /* vs2_h_words += in_high_words */
+ "paddw %%xmm0, %%xmm6\n\t" /* vs2_l_words += in_low_words */
+ "jnz 1b\n\t" /* repeat if inner_k != 0 */
+ "cmp $15, %3\n\t" /* set the flags for the jg below */
+ "pmaddwd 32+%5, %%xmm7\n\t" /* multiply vs2_h_words with order, add adjacent results */
+ "pmaddwd 16+%5, %%xmm6\n\t" /* multiply vs2_l_words with order, add adjacent results */
+ "paddd %%xmm7, %%xmm3\n\t" /* add to vs2 */
+ "paddd %%xmm6, %%xmm3\n\t" /* add to vs2 */
+ "jg 3b\n\t" /* if(k > 15) repeat */
+ "movdqa %%xmm5, %%xmm0\n\t" /* move vs1_round_sum */
+ "pxor %%xmm5, %%xmm5\n\t" /* zero vs1_round_sum for the next outer round */
+ "call sse2_reduce\n\t" /* label is emitted by the asm in adler32_SSSE3 */
+ "pslld $4, %%xmm0\n\t" /* vs1_round_sum *= 16 */
+ "paddd %%xmm3, %%xmm0\n\t" /* vs2 += vs1_round_sum */
+ "call sse2_reduce\n\t" /* reduce again */
+ "movdqa %%xmm0, %%xmm3\n\t" /* move vs2 back in place */
+ "movdqa %%xmm4, %%xmm0\n\t" /* move vs1 */
+ "call sse2_reduce\n\t" /* reduce */
+ "movdqa %%xmm0, %%xmm4\n\t" /* move vs1 back in place */
+ "add %3, %4\n\t" /* len += k */
+ "mov %6, %3\n\t" /* k = VNMAX */
+ "cmp %3, %4\n\t"
+ "cmovb %4, %3\n" /* k = len < VNMAX ? len : VNMAX */
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $15, %3\n\t"
+ "jg 3b\n\t" /* if(k > 15) repeat */
+ "pshufd $0xEE, %%xmm4, %%xmm1\n\t" /* collect vs1 & vs2 in lowest vector member */
+ "pshufd $0xEE, %%xmm3, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm1\n\t"
+ "paddd %%xmm3, %%xmm0\n\t"
+ "pshufd $0xE5, %%xmm0, %%xmm3\n\t"
+ "paddd %%xmm0, %%xmm3\n\t"
+ "movd %%xmm1, %1\n\t" /* mov vs1 to s1 */
+ "movd %%xmm3, %2\n" /* mov vs2 to s2 */
+ "88:\n\t"
+ : /* %0 */ "=r" (buf),
+ /* %1 */ "=r" (s1),
+ /* %2 */ "=r" (s2),
+ /* %3 */ "=r" (k),
+ /* %4 */ "=r" (len)
+ : /* %5 */ "m" (vord),
+ /* %6 */ "i" (5*NMAX),
+ /* */ "0" (buf),
+ /* */ "1" (s1),
+ /* */ "2" (s2),
+ /* */ "3" (k),
+ /* */ "4" (len)
+ : "cc", "memory"
+# ifdef __SSE__
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+# endif
+ );
+
+ if (unlikely(k)) /* fewer than 16 bytes are left over for the scalar code */
+ buf = adler32_jumped(buf, &s1, &s2, k);
+ reduce(s1);
+ reduce(s2);
+ return (s2 << 16) | s1;
+}
+
+# if 0
+/* ========================================================================= */
+/*
+ * The SSE2 version above is faster on my CPUs (Athlon64, Core2,
+ * P4 Xeon, K10 Sempron), but has instruction stalls only a
+ * Out-Of-Order-Execution CPU can solve.
+ * So this Version _may_ be better for the new old thing, Atom.
+ */
+local noinline uLong adler32_SSE2_no_oooe(adler, buf, len)
+ uLong adler; /* running checksum: s2 in bits 16..31, s1 in bits 0..15 */
+ const Bytef *buf; /* input data */
+ uInt len; /* input length */
+{ /* currently compiled out by the surrounding "# if 0" */
+ unsigned int s1 = adler & 0xffff;
+ unsigned int s2 = (adler >> 16) & 0xffff;
+ unsigned int k;
+
+ k = ALIGN_DIFF(buf, 16); /* lead-in byte count; the asm loads with movdqa */
+ len -= k;
+ if (k)
+ buf = adler32_jumped(buf, &s1, &s2, k);
+
+ __asm__ __volatile__ (
+ "mov %6, %3\n\t" /* k = byte count per outer round */
+ "cmp %3, %4\n\t"
+ "cmovb %4, %3\n\t" /* k = len < max ? len : max */
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $16, %3\n\t"
+ "jb 88f\n\t" /* if(k < 16) goto OUT */
+ "movdqa 16+%5, %%xmm6\n\t" /* factors 16..9 for the low 8 bytes */
+ "movdqa 32+%5, %%xmm5\n\t" /* factors 8..1 for the high 8 bytes */
+ "prefetchnta 16(%0)\n\t"
+ "pxor %%xmm7, %%xmm7\n\t" /* zero */
+ "movd %1, %%xmm4\n\t" /* init vector sum vs1 with s1 */
+ "movd %2, %%xmm3\n\t" /* init vector sum vs2 with s2 */
+ ".p2align 3,,3\n\t"
+ ".p2align 2\n"
+ "1:\n\t"
+ "prefetchnta 32(%0)\n\t"
+ "movdqa (%0), %%xmm1\n\t" /* fetch input data */
+ "sub $16, %3\n\t" /* k -= 16 */
+ "movdqa %%xmm4, %%xmm2\n\t" /* copy vs1 */
+ "add $16, %0\n\t" /* advance input data pointer */
+ "movdqa %%xmm1, %%xmm0\n\t" /* copy input data */
+ "cmp $15, %3\n\t" /* set the flags for the jg below */
+ "pslld $4, %%xmm2\n\t" /* vs1 * 16 */
+ "paddd %%xmm3, %%xmm2\n\t" /* + vs2 */
+ "psadbw %%xmm7, %%xmm0\n\t" /* add all bytes horiz. */
+ "paddd %%xmm0, %%xmm4\n\t" /* add that to vs1 */
+ "movdqa %%xmm1, %%xmm0\n\t" /* copy input data */
+ "punpckhbw %%xmm7, %%xmm1\n\t" /* zero extend input upper bytes to words */
+ "punpcklbw %%xmm7, %%xmm0\n\t" /* zero extend input lower bytes to words */
+ "movdqa %%xmm1, %%xmm3\n\t"
+ "pmaddwd %%xmm6, %%xmm0\n\t" /* weight the low words */
+ "paddd %%xmm2, %%xmm0\n\t"
+ "pmaddwd %%xmm5, %%xmm3\n\t" /* weight the high words */
+ "paddd %%xmm0, %%xmm3\n\t" /* new vs2 */
+ "jg 1b\n\t" /* if(k > 15) repeat */
+ "movdqa %%xmm3, %%xmm0\n\t"
+ "call sse2_reduce\n\t" /* label is emitted by the asm in adler32_SSSE3 */
+ "call sse2_reduce\n\t"
+ "movdqa %%xmm0, %%xmm3\n\t"
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "call sse2_reduce\n\t"
+ "movdqa %%xmm0, %%xmm4\n\t"
+ "add %3, %4\n\t" /* len += k */
+ "mov %6, %3\n\t"
+ "cmp %3, %4\n\t"
+ "cmovb %4, %3\n\t" /* k = len < max ? len : max */
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $15, %3\n\t"
+ "jg 1b\n\t" /* if(k > 15) repeat */
+ "pshufd $0xEE, %%xmm3, %%xmm0\n\t" /* collect vs1 & vs2 in lowest vector member */
+ "pshufd $0xEE, %%xmm4, %%xmm1\n\t"
+ "paddd %%xmm3, %%xmm0\n\t"
+ "pshufd $0xE5, %%xmm0, %%xmm2\n\t"
+ "paddd %%xmm4, %%xmm1\n\t"
+ "movd %%xmm1, %1\n\t" /* mov vs1 to s1 */
+ "paddd %%xmm0, %%xmm2\n\t"
+ "movd %%xmm2, %2\n" /* mov vs2 to s2 */
+ "88:"
+ : /* %0 */ "=r" (buf),
+ /* %1 */ "=r" (s1),
+ /* %2 */ "=r" (s2),
+ /* %3 */ "=&r" (k), /* earlyclobber: written first, while %5 is read later; must not share its address register */
+ /* %4 */ "=r" (len)
+ : /* %5 */ "m" (vord),
+ /* %6 */ "i" (NMAX + NMAX/3),
+ /* */ "0" (buf),
+ /* */ "1" (s1),
+ /* */ "2" (s2),
+ /* */ "4" (len)
+ : "cc", "memory"
+# ifdef __SSE__
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+# endif
+ );
+
+ if (unlikely(k)) /* fewer than 16 bytes are left over for the scalar code */
+ buf = adler32_jumped(buf, &s1, &s2, k);
+ reduce(s1);
+ reduce(s2);
+ return (s2 << 16) | s1;
+}
+# endif
+
+# ifndef __x86_64__
+/* ========================================================================= */
+/*
+ * SSE version to help VIA-C3_2, P2 & P3
+ */
+local uLong adler32_SSE(adler, buf, len)
+ uLong adler; /* running checksum: s2 in bits 16..31, s1 in bits 0..15 */
+ const Bytef *buf; /* input data */
+ uInt len; /* input length; NOTE(review): must be >= the lead-in k, presumably ensured by the caller */
+{ /* Adler-32 over 8 byte blocks in the mm registers, using psadbw; returns the updated checksum */
+ unsigned int s1 = adler & 0xffff;
+ unsigned int s2 = (adler >> 16) & 0xffff;
+ unsigned int k;
+
+ k = ALIGN_DIFF(buf, 8); /* lead-in byte count handled by the scalar code */
+ len -= k;
+ if (k)
+ buf = adler32_jumped(buf, &s1, &s2, k);
+
+ __asm__ __volatile__ (
+ "mov %6, %3\n\t" /* k = VNMAX, the byte count per outer round */
+ "cmp %3, %4\n\t"
+ "cmovb %4, %3\n" /* k = len < VNMAX ? len : VNMAX */
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $8, %3\n\t"
+ "jb 88f\n\t" /* if(k < 8) goto OUT */
+ "movd %1, %%mm4\n\t" /* init vector sum vs1 with s1 */
+ "movd %2, %%mm3\n\t" /* init vector sum vs2 with s2 */
+ "pxor %%mm2, %%mm2\n\t" /* zero */
+ "pxor %%mm5, %%mm5\n\t" /* zero vs1_round_sum */
+# ifdef __ELF__
+ ".subsection 2\n\t"
+# else
+ "jmp 77f\n\t"
+# endif
+ ".p2align 2\n"
+ "mmx_reduce:\n\t" /* same steps as sse2_reduce on mm0; also called from adler32_MMX */
+ "movq %%mm0, %%mm1\n\t" /* y = x */
+ "pslld $16, %%mm1\n\t" /* y <<= 16 */
+ "psrld $16, %%mm0\n\t" /* x >>= 16 */
+ "psrld $16, %%mm1\n\t" /* y >>= 16 */
+ "psubd %%mm0, %%mm1\n\t" /* y -= x */
+ "pslld $4, %%mm0\n\t" /* x <<= 4 */
+ "paddd %%mm1, %%mm0\n\t" /* x += y */
+ "ret\n\t"
+# ifdef __ELF__
+ ".previous\n\t"
+# else
+ "77:\n\t"
+# endif
+ ".p2align 2\n"
+ "3:\n\t"
+ "pxor %%mm6, %%mm6\n\t" /* zero vs2_l_words */
+ "pxor %%mm7, %%mm7\n\t" /* zero vs2_h_words */
+ "mov $1024, %1\n\t" /* byte count for one inner round */
+ "cmp %1, %3\n\t"
+ "cmovb %3, %1\n" /* inner_k = k < 1024 ? k : 1024 */
+ "and $-8, %1\n\t" /* round inner_k down to a multiple of 8 */
+ "sub %1, %3\n\t" /* k -= inner_k */
+ "shr $3, %1\n\t" /* inner_k /= 8 */
+ ".p2align 4,,7\n"
+ ".p2align 3\n"
+ "1:\n\t"
+ "movq (%0), %%mm0\n\t" /* fetch input data */
+ "paddd %%mm4, %%mm5\n\t" /* vs1_round_sum += vs1 */
+ "add $8, %0\n\t" /* advance input data pointer */
+ "dec %1\n\t" /* decrement inner_k */
+ "movq %%mm0, %%mm1\n\t" /* copy input data */
+ "psadbw %%mm2, %%mm0\n\t" /* add all bytes horiz. */
+ "paddd %%mm0, %%mm4\n\t" /* add that to vs1 */
+ "movq %%mm1, %%mm0\n\t" /* copy input data */
+ "punpckhbw %%mm2, %%mm1\n\t" /* zero extend input upper bytes to words */
+ "punpcklbw %%mm2, %%mm0\n\t" /* zero extend input lower bytes to words */
+ "paddw %%mm1, %%mm7\n\t" /* vs2_h_words += in_high_words */
+ "paddw %%mm0, %%mm6\n\t" /* vs2_l_words += in_low_words */
+ "jnz 1b\n\t" /* repeat if inner_k != 0 */
+ "cmp $7, %3\n\t" /* set the flags for the jg below */
+ "pmaddwd 40+%5, %%mm7\n\t" /* multiply vs2_h_words with factors 4..1 */
+ "pmaddwd 32+%5, %%mm6\n\t" /* multiply vs2_l_words with factors 8..5 */
+ "paddd %%mm7, %%mm3\n\t" /* add to vs2 */
+ "paddd %%mm6, %%mm3\n\t" /* add to vs2 */
+ "jg 3b\n\t" /* if(k > 7) repeat */
+ "movq %%mm5, %%mm0\n\t" /* move vs1_round_sum */
+ "pxor %%mm5, %%mm5\n\t" /* zero vs1_round_sum for the next outer round */
+ "call mmx_reduce\n\t" /* reduce vs1_round_sum */
+ "pslld $3, %%mm0\n\t" /* vs1_round_sum *= 8 */
+ "paddd %%mm3, %%mm0\n\t" /* vs2 += vs1_round_sum */
+ "call mmx_reduce\n\t" /* reduce again */
+ "movq %%mm0, %%mm3\n\t" /* move vs2 back in place */
+ "movq %%mm4, %%mm0\n\t" /* move vs1 */
+ "call mmx_reduce\n\t" /* reduce */
+ "movq %%mm0, %%mm4\n\t" /* move vs1 back in place */
+ "add %3, %4\n\t" /* len += k */
+ "mov %6, %3\n\t" /* k = VNMAX */
+ "cmp %3, %4\n\t"
+ "cmovb %4, %3\n" /* k = len < VNMAX ? len : VNMAX */
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $7, %3\n\t"
+ "jg 3b\n\t" /* if(k > 7) repeat */
+ "movd %%mm4, %1\n\t" /* s1 = low dword of vs1 */
+ "psrlq $32, %%mm4\n\t"
+ "movd %%mm3, %2\n\t" /* s2 = low dword of vs2 */
+ "psrlq $32, %%mm3\n\t"
+ "movd %%mm4, %4\n\t" /* %4 (len) serves as scratch from here on */
+ "add %4, %1\n\t" /* s1 += high dword of vs1 */
+ "movd %%mm3, %4\n\t"
+ "add %4, %2\n" /* s2 += high dword of vs2 */
+ "emms\n\t" /* leave MMX state */
+ "88:\n\t"
+ : /* %0 */ "=r" (buf),
+ /* %1 */ "=r" (s1),
+ /* %2 */ "=r" (s2),
+ /* %3 */ "=r" (k),
+ /* %4 */ "=r" (len)
+ : /* %5 */ "m" (vord),
+ /* %6 */ "i" ((5*NMAX)/2),
+ /* */ "0" (buf),
+ /* */ "1" (s1),
+ /* */ "2" (s2),
+ /* */ "3" (k),
+ /* */ "4" (len)
+ : "cc", "memory"
+# ifdef __MMX__
+ , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
+# endif
+ );
+
+ if (unlikely(k)) /* fewer than 8 bytes are left over for the scalar code */
+ buf = adler32_jumped(buf, &s1, &s2, k);
+ reduce(s1);
+ reduce(s2);
+ return (s2 << 16) | s1;
+}
+
+/* ========================================================================= */
+/*
+ * Processors which only have MMX will prop. not like this
+ * code, they are so old, they are not Out-Of-Order
+ * (maybe except AMD K6, Cyrix, Winchip/VIA).
+ * I did my best to get at least 1 instruction between result -> use
+ */
+local uLong adler32_MMX(adler, buf, len)
+ uLong adler; /* running checksum: s2 in bits 16..31, s1 in bits 0..15 */
+ const Bytef *buf; /* input data */
+ uInt len; /* input length; NOTE(review): must be >= the lead-in k, presumably ensured by the caller */
+{ /* Adler-32 over 8 byte blocks without cmov and psadbw; returns the updated checksum */
+ unsigned int s1 = adler & 0xffff;
+ unsigned int s2 = (adler >> 16) & 0xffff;
+ unsigned int k;
+
+ k = ALIGN_DIFF(buf, 8); /* lead-in byte count handled by the scalar code */
+ len -= k;
+ if (k)
+ buf = adler32_jumped(buf, &s1, &s2, k);
+
+ __asm__ __volatile__ (
+ "mov %6, %3\n\t" /* k = VNMAX, the byte count per outer round */
+ "cmp %3, %4\n\t"
+ "jae 11f\n\t" /* branch instead of cmov */
+ "mov %4, %3\n" /* k = len when len < VNMAX */
+ "11:\n\t"
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $8, %3\n\t"
+ "jb 88f\n\t" /* if(k < 8) goto OUT */
+ "sub $8, %%esp\n\t" /* reserve 8 bytes of stack to park vs2 */
+ "movd %1, %%mm4\n\t" /* init vector sum vs1 with s1 */
+ "movd %2, %%mm2\n\t" /* init vector sum vs2 with s2 */
+ "movq %5, %%mm3\n" /* first four words of vord: all ones */
+ "33:\n\t"
+ "movq %%mm2, %%mm0\n\t" /* vs2 moves to mm0 ... */
+ "pxor %%mm2, %%mm2\n\t" /* ... because mm2 is the zero register in the loop */
+ "pxor %%mm5, %%mm5\n\t" /* zero vs1_round_sum */
+ ".p2align 2\n"
+ "3:\n\t"
+ "movq %%mm0, (%%esp)\n\t" /* park vs2 on the stack, mm0 is needed for input */
+ "pxor %%mm6, %%mm6\n\t" /* zero vs2_l_words */
+ "pxor %%mm7, %%mm7\n\t" /* zero vs2_h_words */
+ "mov $1024, %1\n\t" /* byte count for one inner round */
+ "cmp %1, %3\n\t"
+ "jae 44f\n\t" /* branch instead of cmov */
+ "mov %3, %1\n" /* inner_k = k when k < 1024 */
+ "44:\n\t"
+ "and $-8, %1\n\t" /* round inner_k down to a multiple of 8 */
+ "sub %1, %3\n\t" /* k -= inner_k */
+ "shr $3, %1\n\t" /* inner_k /= 8 */
+ ".p2align 4,,7\n"
+ ".p2align 3\n"
+ "1:\n\t"
+ "movq (%0), %%mm0\n\t" /* fetch input data */
+ "paddd %%mm4, %%mm5\n\t" /* vs1_round_sum += vs1 */
+ "add $8, %0\n\t" /* advance input data pointer */
+ "dec %1\n\t" /* decrement inner_k */
+ "movq %%mm0, %%mm1\n\t" /* copy input data */
+ "punpcklbw %%mm2, %%mm0\n\t" /* zero extend input lower bytes to words */
+ "punpckhbw %%mm2, %%mm1\n\t" /* zero extend input upper bytes to words */
+ "paddw %%mm0, %%mm6\n\t" /* vs2_l_words += in_low_words */
+ "paddw %%mm1, %%mm0\n\t" /* in_low_words + in_high_words */
+ "paddw %%mm1, %%mm7\n\t" /* vs2_h_words += in_high_words */
+ "pmaddwd %%mm3, %%mm0\n\t" /* times one, add adjacent words: the byte sum */
+ "paddd %%mm0, %%mm4\n\t" /* add that to vs1 */
+ "jnz 1b\n\t" /* repeat if inner_k != 0 */
+ "movq (%%esp), %%mm0\n\t" /* fetch vs2 back from the stack */
+ "cmp $7, %3\n\t" /* set the flags for the jg below */
+ "pmaddwd 32+%5, %%mm6\n\t" /* multiply vs2_l_words with factors 8..5 */
+ "pmaddwd 40+%5, %%mm7\n\t" /* multiply vs2_h_words with factors 4..1 */
+ "paddd %%mm6, %%mm0\n\t" /* add to vs2 */
+ "paddd %%mm7, %%mm0\n\t" /* add to vs2 */
+ "jg 3b\n\t" /* if(k > 7) repeat */
+ "movq %%mm0, %%mm2\n\t" /* vs2 back to mm2, mm0 is the reduce argument */
+ "movq %%mm5, %%mm0\n\t" /* move vs1_round_sum */
+ "call mmx_reduce\n\t" /* label is emitted by the asm in adler32_SSE */
+ "pslld $3, %%mm0\n\t" /* vs1_round_sum *= 8 */
+ "paddd %%mm2, %%mm0\n\t" /* vs2 += vs1_round_sum */
+ "call mmx_reduce\n\t" /* reduce again */
+ "movq %%mm0, %%mm2\n\t" /* move vs2 back in place */
+ "movq %%mm4, %%mm0\n\t" /* move vs1 */
+ "call mmx_reduce\n\t" /* reduce */
+ "movq %%mm0, %%mm4\n\t" /* move vs1 back in place */
+ "add %3, %4\n\t" /* len += k */
+ "mov %6, %3\n\t" /* k = VNMAX */
+ "cmp %3, %4\n\t"
+ "jae 22f\n\t" /* branch instead of cmov */
+ "mov %4, %3\n" /* k = len when len < VNMAX */
+ "22:\n\t"
+ "sub %3, %4\n\t" /* len -= k */
+ "cmp $7, %3\n\t"
+ "jg 33b\n\t" /* if(k > 7) repeat */
+ "add $8, %%esp\n\t" /* release the stack slot */
+ "movd %%mm4, %1\n\t" /* s1 = low dword of vs1 */
+ "psrlq $32, %%mm4\n\t"
+ "movd %%mm2, %2\n\t" /* s2 = low dword of vs2 */
+ "psrlq $32, %%mm2\n\t"
+ "movd %%mm4, %4\n\t" /* %4 (len) serves as scratch from here on */
+ "add %4, %1\n\t" /* s1 += high dword of vs1 */
+ "movd %%mm2, %4\n\t"
+ "add %4, %2\n" /* s2 += high dword of vs2 */
+ "emms\n\t" /* leave MMX state */
+ "88:\n\t"
+ : /* %0 */ "=r" (buf),
+ /* %1 */ "=r" (s1),
+ /* %2 */ "=r" (s2),
+ /* %3 */ "=r" (k),
+ /* %4 */ "=r" (len)
+ : /* %5 */ "m" (vord),
+ /* %6 */ "i" (4*NMAX),
+ /* */ "0" (buf),
+ /* */ "1" (s1),
+ /* */ "2" (s2),
+ /* */ "3" (k),
+ /* */ "4" (len)
+ : "cc", "memory"
+# ifdef __MMX__
+ , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
+# endif
+ );
+
+ if (unlikely(k)) /* fewer than 8 bytes are left over for the scalar code */
+ buf = adler32_jumped(buf, &s1, &s2, k);
+ reduce(s1);
+ reduce(s2);
+ return (s2 << 16) | s1;
+}
+# endif
+
+/* ========================================================================= */
+local uLong adler32_x86(adler, buf, len)
+ uLong adler;
+ const Bytef *buf;
+ uInt len;
+{
+ /* unpack the two 16 bit halves of the running checksum */
+ unsigned int s1 = adler & 0xffff;
+ unsigned int s2 = (adler >> 16) & 0xffff;
+
+ while (likely(len)) {
+# ifdef __x86_64__
+# define LOOP_COUNT 8
+# else
+# define LOOP_COUNT 4
+# endif
+ /* at most NMAX bytes are summed between two reductions */
+ unsigned int chunk = len < NMAX ? len : NMAX;
+ unsigned int rounds = chunk / LOOP_COUNT;
+ unsigned int rest = chunk % LOOP_COUNT;
+
+ len -= chunk;
+ for (; likely(rounds); rounds--) {
+ /*
+ * Why only LOOP_COUNT steps per round instead of 16:
+ *
+ * The classic 16-fold unrolling was meant to avoid loop
+ * overhead, not to feed an autovectorizing compiler.
+ *
+ * A modern compiler may try to disambiguate the
+ * individual sum steps to prevent pipeline stalls and
+ * ends up with 16 temporary sums, which is more than
+ * the limited x86 register set can hold.
+ *
+ * Heavy unrolling is also a little bad for the I-cache.
+ *
+ * So the unrolling is tuned down for x86 to keep
+ * everything in registers: 4 sums fit into the i386
+ * register set without a frame pointer, x86_64 has
+ * more registers but still not enough for 16, so it
+ * takes 8 sums.
+ */
+ s1 += buf[0]; s2 += s1;
+ s1 += buf[1]; s2 += s1;
+ s1 += buf[2]; s2 += s1;
+ s1 += buf[3]; s2 += s1;
+# if LOOP_COUNT == 8
+ s1 += buf[4]; s2 += s1;
+ s1 += buf[5]; s2 += s1;
+ s1 += buf[6]; s2 += s1;
+ s1 += buf[7]; s2 += s1;
+# endif
+ buf += LOOP_COUNT;
+ }
+ /* the last chunk % LOOP_COUNT bytes, one at a time */
+ for (; rest; rest--) {
+ s1 += *buf++;
+ s2 += s1;
+ }
+ /* reduce_full() is defined outside this file; run once per chunk */
+ reduce_full(s1);
+ reduce_full(s2);
+ }
+ /* pack s2 into the upper and s1 into the lower 16 bits */
+ return (s2 << 16) | s1;
+}
+
+/* ========================================================================= */
+/*
+ * Knot it all together with a runtime switch
+ */
+
+/* Flags for the flags member of struct test_cpu_feature */
+# define CFF_DEFAULT (1 << 0) /* entry is returned without any feature test */
+/* Processor features: bit number + 32 * index of the features[] word */
+# define CFEATURE_CMOV (15 + 0) /* features[0] (cpuid leaf 1 edx), bit 15 */
+# define CFEATURE_MMX (23 + 0) /* features[0] (cpuid leaf 1 edx), bit 23 */
+# define CFEATURE_SSE (25 + 0) /* features[0] (cpuid leaf 1 edx), bit 25 */
+# define CFEATURE_SSE2 (26 + 0) /* features[0] (cpuid leaf 1 edx), bit 26 */
+# define CFEATURE_SSSE3 ( 9 + 32) /* features[1] (cpuid leaf 1 ecx), bit 9 */
+
+# define CFB(x) (1 << ((x)%32)) /* mask of feature x inside its 32 bit word */
+
+# define FEATURE_WORDS 2 /* identify_cpu() fills [0] from edx and [1] from ecx */
+
+/* data structure: one candidate implementation and the features it needs */
+struct test_cpu_feature
+{
+ void (*func)(void); /* implementation, cast to a generic function pointer */
+ int flags; /* CFF_* bits */
+ unsigned int features[FEATURE_WORDS]; /* feature bits which all have to be set in our_cpu.features */
+};
+
+/* ========================================================================= */
+/*
+ * Decision table, scanned top down by test_cpu_feature(): first match wins
+ */
+local const struct test_cpu_feature tfeat_adler32_vec[] =
+{
+ /* func flags features */
+ {(void (*)(void))adler32_SSSE3, 0, {CFB(CFEATURE_CMOV), CFB(CFEATURE_SSSE3)}},
+ {(void (*)(void))adler32_SSE2, 0, {CFB(CFEATURE_SSE2)|CFB(CFEATURE_CMOV), 0}},
+# ifndef __x86_64__
+ {(void (*)(void))adler32_SSE, 0, {CFB(CFEATURE_SSE)|CFB(CFEATURE_CMOV), 0}},
+ {(void (*)(void))adler32_MMX, 0, {CFB(CFEATURE_MMX), 0}},
+# endif
+ {(void (*)(void))adler32_x86, CFF_DEFAULT, { 0, 0}}, /* plain C fallback, has to stay last */
+};
+
+/* ========================================================================= */
+/* Prototypes */
+local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l);
+local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len);
+
+/* ========================================================================= */
+/*
+ * Runtime Function pointer, points to the switcher until adler32_vec_select() ran
+ */
+local uLong (*adler32_vec_ptr)(uLong adler, const Bytef *buf, uInt len) = adler32_vec_runtimesw;
+
+/* ========================================================================= */
+/*
+ * Constructor to init the pointer early, with the first matching table entry
+ */
+local GCC_ATTR_CONSTRUCTOR void adler32_vec_select(void)
+{
+ adler32_vec_ptr = test_cpu_feature(tfeat_adler32_vec, sizeof (tfeat_adler32_vec)/sizeof (tfeat_adler32_vec[0]));
+}
+
+/* ========================================================================= */
+/*
+ * Jump function: forwards all arguments to whatever adler32_vec_ptr points to
+ */
+local noinline uLong adler32_vec(adler, buf, len)
+ uLong adler;
+ const Bytef *buf;
+ uInt len;
+{
+ return adler32_vec_ptr(adler, buf, len);
+}
+
+/* ========================================================================= */
+/*
+ * the runtime switcher is a little racy, it should normally not run if the constructor works
+ */
+local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len)
+{
+ adler32_vec_select(); /* replaces adler32_vec_ptr, so the call below no longer lands here */
+ return adler32_vec(adler, buf, len);
+}
+
+
+/* ========================================================================= */
+/* Internal data types: the four registers written by cpuid() */
+struct cpuid_regs
+{
+ unsigned long eax, ebx, ecx, edx; /* ecx is also an input of cpuid() */
+};
+
+local struct
+{
+ unsigned int max_basic; /* eax of cpuid leaf 0 */
+ unsigned int features[FEATURE_WORDS]; /* [0] = edx, [1] = ecx of cpuid leaf 1 */
+ int init_done; /* set to -1 when identify_cpu() starts its work */
+} our_cpu;
+
+/* ========================================================================= */
+local inline unsigned long read_flags(void) /* returns the current flags register */
+{
+ unsigned long f;
+ __asm__ __volatile__ (
+ "pushf\n\t" /* flags register onto the stack */
+ "pop %0\n\t" /* and from there into f */
+ : "=r" (f)
+ );
+ return f;
+}
+
+/* ========================================================================= */
+local inline void write_flags(unsigned long f) /* loads f into the flags register */
+{
+ __asm__ __volatile__ (
+ "push %0\n\t" /* f onto the stack */
+ "popf\n\t" /* and from there into the flags register */
+ : : "ri" (f) : "cc"
+ );
+}
+
+/* ========================================================================= */
+local inline void cpuid(struct cpuid_regs *regs, unsigned long func) /* func: leaf, goes into eax; regs->ecx goes into ecx */
+{
+ /* save ebx around cpuid call, PIC code needs it */
+ __asm__ __volatile__ (
+ "xchg %1, " PICREG "\n\t" /* park the caller's ebx/rbx in %1 */
+ "cpuid\n\t"
+ "xchg %1, " PICREG "\n" /* %1 = ebx result, ebx/rbx is restored */
+ : /* %0 */ "=a" (regs->eax),
+ /* %1 */ "=r" (regs->ebx),
+ /* %2 */ "=c" (regs->ecx),
+ /* %3 */ "=d" (regs->edx)
+ : /* %4 */ "0" (func),
+ /* %5 */ "2" (regs->ecx)
+ : "cc"
+ );
+}
+
+/* ========================================================================= */
+local inline void cpuids(struct cpuid_regs *regs, unsigned long func) /* cpuid() with ecx = 0 */
+{
+ regs->ecx = 0; /* cpuid() hands regs->ecx to the instruction as input */
+ cpuid(regs, func);
+}
+
+/* ========================================================================= */
+local inline int toggle_eflags_test(const unsigned long mask) /* returns 1 if a flag bit in mask can be changed, else 0 */
+{
+ unsigned long f;
+ int result;
+
+ f = read_flags(); /* remember the original flags */
+ write_flags(f ^ mask); /* try to flip the bits in mask */
+ result = !!((f ^ read_flags()) & mask); /* did any of them change? */
+ /*
+ * restore the old flags, the test for i486 tests the alignment
+ * check bit, and left set will confuse the x86 software world.
+ */
+ write_flags(f);
+ return result;
+}
+
+/* ========================================================================= */
+local inline int is_486(void) /* nonzero if flags bit 18 can be toggled */
+{
+ return toggle_eflags_test(1 << 18); /* bit 18: the alignment check bit */
+}
+
+/* ========================================================================= */
+local inline int has_cpuid(void) /* nonzero if flags bit 21 can be toggled */
+{
+ return toggle_eflags_test(1 << 21); /* bit 21: the EFLAGS ID bit, see the Intel SDM */
+}
+
+/* ========================================================================= */
+local void identify_cpu(void) /* fills our_cpu on the first call */
+{
+ struct cpuid_regs a;
+
+ if (our_cpu.init_done) /* a call already ran or has started */
+ return;
+
+ our_cpu.init_done = -1; /* set before the work is done, so not a completion flag */
+ /* force a write out to memory */
+ __asm__ __volatile__ ("" : : "m" (our_cpu.init_done));
+
+ if (!is_486()) /* our_cpu.features is left untouched */
+ return;
+
+ if (!has_cpuid()) /* our_cpu.features is left untouched */
+ return;
+
+ /* get the maximum basic leaf number */
+ cpuids(&a, 0x00000000);
+ our_cpu.max_basic = (unsigned int)a.eax;
+ /* we could get the vendor string from ebx, edx, ecx */
+
+ /* get the first basic leaf, if it is avail. */
+ if (our_cpu.max_basic >= 0x00000001)
+ cpuids(&a, 0x00000001);
+ else
+ a.eax = a.ebx = a.ecx = a.edx = 0; /* no leaf 1: report no features */
+
+ /* we could extract family, model, stepping from eax */
+
+ /* there is the first set of features */
+ our_cpu.features[0] = a.edx; /* tested with the CFEATURE_* values below 32 */
+ our_cpu.features[1] = a.ecx; /* tested with CFEATURE_SSSE3 */
+
+ /* now we could test the extended features, but is not needed, for now */
+}
+
+/* ========================================================================= */
+local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l)
+{
+ const struct test_cpu_feature *const end = t + l;
+ unsigned int w, missing;
+
+ identify_cpu();
+ for (; t != end; t++) {
+ /* collect every requested feature bit this CPU does not report */
+ for (missing = 0, w = 0; w < FEATURE_WORDS; w++)
+ missing |= t->features[w] & ~our_cpu.features[w];
+ /* a default entry always matches, others only with nothing missing */
+ if ((t->flags & CFF_DEFAULT) || !missing)
+ return t->func;
+ }
+ return NULL; /* die! */
+}
+
+#endif