1166 lines
37 KiB
Diff
1166 lines
37 KiB
Diff
|
=== modified file 'Makefile.in'
|
||
|
--- Makefile.in 2011-03-14 14:39:24 +0000
|
||
|
+++ Makefile.in 2011-03-14 16:46:30 +0000
|
||
|
@@ -236,7 +236,7 @@
|
||
|
|
||
|
# DO NOT DELETE THIS LINE -- make depend depends on it.
|
||
|
|
||
|
-adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
|
||
|
+adler32.o: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h
|
||
|
zutil.o: zutil.h zlib.h zconf.h
|
||
|
gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
|
||
|
compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
|
||
|
@@ -247,7 +247,7 @@
|
||
|
inftrees.o: zutil.h zlib.h zconf.h inftrees.h
|
||
|
trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
|
||
|
|
||
|
-adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
|
||
|
+adler32.lo: adler32.c adler32_ppc.c adler32_arm.c adler32_x86.c zutil.h zlib.h zconf.h
|
||
|
zutil.lo: zutil.h zlib.h zconf.h
|
||
|
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
|
||
|
compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
|
||
|
|
||
|
=== modified file 'adler32.c'
|
||
|
--- adler32.c 2011-03-30 13:38:46 +0000
|
||
|
+++ adler32.c 2011-03-30 13:38:46 +0000
|
||
|
@@ -144,6 +144,8 @@
|
||
|
# include "adler32_arm.c"
|
||
|
# elif defined(__powerpc__) || defined(__powerpc64__)
|
||
|
# include "adler32_ppc.c"
|
||
|
+# elif defined(__i386__) || defined(__x86_64__)
|
||
|
+# include "adler32_x86.c"
|
||
|
# endif
|
||
|
#endif
|
||
|
|
||
|
|
||
|
=== added file 'adler32_x86.c'
|
||
|
--- adler32_x86.c 1970-01-01 00:00:00 +0000
|
||
|
+++ adler32_x86.c 2011-03-15 23:15:36 +0000
|
||
|
@@ -0,0 +1,1125 @@
|
||
|
+/*
|
||
|
+ * adler32.c -- compute the Adler-32 checksum of a data stream
|
||
|
+ * x86 implementation
|
||
|
+ * Copyright (C) 1995-2007 Mark Adler
|
||
|
+ * Copyright (C) 2009-2011 Jan Seiffert
|
||
|
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||
|
+ */
|
||
|
+
|
||
|
+/* @(#) $Id$ */
|
||
|
+
|
||
|
+#if GCC_VERSION_GE(207)
|
||
|
+# define GCC_ATTR_CONSTRUCTOR __attribute__((__constructor__))
|
||
|
+#else
|
||
|
+# define VEC_NO_GO
|
||
|
+#endif
|
||
|
+
|
||
|
+#if GCC_VERSION_GE(203)
|
||
|
+# define GCC_ATTR_ALIGNED(x) __attribute__((__aligned__(x)))
|
||
|
+#else
|
||
|
+# define VEC_NO_GO
|
||
|
+#endif
|
||
|
+
|
||
|
+/* inline asm, so only on GCC (or compatible) */
|
||
|
+#if defined(__GNUC__) && !defined(VEC_NO_GO)
|
||
|
+# define HAVE_ADLER32_VEC
|
||
|
+# define MIN_WORK 64
|
||
|
+
|
||
|
+# ifdef __x86_64__
|
||
|
+# define PICREG "%%rbx"
|
||
|
+# else
|
||
|
+# define PICREG "%%ebx"
|
||
|
+# endif
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local const struct { short d[24]; } vord GCC_ATTR_ALIGNED(16) = {
|
||
|
+ {1,1,1,1,1,1,1,1,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}
|
||
|
+};
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local const struct { char d[16]; } vord_b GCC_ATTR_ALIGNED(16) = {
|
||
|
+ {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1}
|
||
|
+};
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local noinline const Bytef *adler32_jumped(buf, s1, s2, k)
|
||
|
+ const Bytef *buf;
|
||
|
+ unsigned int *s1;
|
||
|
+ unsigned int *s2;
|
||
|
+ unsigned int k;
|
||
|
+{
|
||
|
+ unsigned int t;
|
||
|
+ unsigned n = k % 16;
|
||
|
+ buf += n;
|
||
|
+ k = (k / 16) + 1;
|
||
|
+
|
||
|
+ __asm__ __volatile__ (
|
||
|
+# ifdef __x86_64__
|
||
|
+# define CLOB "&"
|
||
|
+ "lea 1f(%%rip), %q4\n\t"
|
||
|
+ "lea (%q4,%q5,8), %q4\n\t"
|
||
|
+ "jmp *%q4\n\t"
|
||
|
+# else
|
||
|
+# ifndef __PIC__
|
||
|
+# define CLOB
|
||
|
+ "lea 1f(,%5,8), %4\n\t"
|
||
|
+# else
|
||
|
+# define CLOB
|
||
|
+ "lea 1f-3f(,%5,8), %4\n\t"
|
||
|
+ "call 9f\n"
|
||
|
+ "3:\n\t"
|
||
|
+# endif
|
||
|
+ "jmp *%4\n\t"
|
||
|
+# ifdef __PIC__
|
||
|
+ ".p2align 1\n"
|
||
|
+ "9:\n\t"
|
||
|
+ "addl (%%esp), %4\n\t"
|
||
|
+ "ret\n\t"
|
||
|
+# endif
|
||
|
+# endif
|
||
|
+ ".p2align 1\n"
|
||
|
+ "2:\n\t"
|
||
|
+# ifdef __i386
|
||
|
+ ".byte 0x3e\n\t"
|
||
|
+# endif
|
||
|
+ "add $0x10, %2\n\t"
|
||
|
+ ".p2align 1\n"
|
||
|
+ "1:\n\t"
|
||
|
+ /* 128 */
|
||
|
+ "movzbl -16(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 120 */
|
||
|
+ "movzbl -15(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 112 */
|
||
|
+ "movzbl -14(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 104 */
|
||
|
+ "movzbl -13(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 96 */
|
||
|
+ "movzbl -12(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 88 */
|
||
|
+ "movzbl -11(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 80 */
|
||
|
+ "movzbl -10(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 72 */
|
||
|
+ "movzbl -9(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 64 */
|
||
|
+ "movzbl -8(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 56 */
|
||
|
+ "movzbl -7(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 48 */
|
||
|
+ "movzbl -6(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 40 */
|
||
|
+ "movzbl -5(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 32 */
|
||
|
+ "movzbl -4(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 24 */
|
||
|
+ "movzbl -3(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 16 */
|
||
|
+ "movzbl -2(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 8 */
|
||
|
+ "movzbl -1(%2), %4\n\t" /* 4 */
|
||
|
+ "add %4, %0\n\t" /* 2 */
|
||
|
+ "add %0, %1\n\t" /* 2 */
|
||
|
+ /* 0 */
|
||
|
+ "dec %3\n\t"
|
||
|
+ "jnz 2b"
|
||
|
+ : /* %0 */ "=R" (*s1),
|
||
|
+ /* %1 */ "=R" (*s2),
|
||
|
+ /* %2 */ "=abdSD" (buf),
|
||
|
+ /* %3 */ "=c" (k),
|
||
|
+ /* %4 */ "="CLOB"R" (t)
|
||
|
+ : /* %5 */ "r" (16 - n),
|
||
|
+ /* */ "0" (*s1),
|
||
|
+ /* */ "1" (*s2),
|
||
|
+ /* */ "2" (buf),
|
||
|
+ /* */ "3" (k)
|
||
|
+ : "cc", "memory"
|
||
|
+ );
|
||
|
+
|
||
|
+ return buf;
|
||
|
+}
|
||
|
+
|
||
|
+#if 0
|
||
|
+ /*
|
||
|
+ * Will XOP processors have SSSE3/AVX??
|
||
|
+ * And what is the unaligned load performance?
|
||
|
+ */
|
||
|
+ "prefetchnta 0x70(%0)\n\t"
|
||
|
+ "lddqu (%0), %%xmm0\n\t"
|
||
|
+ "vpaddd %%xmm3, %%xmm5, %%xmm5\n\t"
|
||
|
+ "sub $16, %3\n\t"
|
||
|
+ "add $16, %0\n\t"
|
||
|
+ "cmp $15, %3\n\t"
|
||
|
+ "vphaddubd %%xmm0, %%xmm1\n\t" /* A */
|
||
|
+ "vpmaddubsw %%xmm4, %%xmm0, %%xmm0\n\t"/* AVX! */ /* 1 */
|
||
|
+ "vphadduwd %%xmm0, %%xmm0\n\t" /* 2 */
|
||
|
+ "vpaddd %%xmm1, %%xmm3, %%xmm3\n\t" /* B: A+B => hadd+acc or vpmadcubd w. mul = 1 */
|
||
|
+ "vpaddd %%xmm0, %%xmm2, %%xmm2\n\t" /* 3: 1+2+3 => vpmadcubd w. mul = 16,15,14... */
|
||
|
+ "jg 1b\n\t"
|
||
|
+ xop_reduce
|
||
|
+ xop_reduce
|
||
|
+ xop_reduce
|
||
|
+ setup
|
||
|
+ "jg 1b\n\t"
|
||
|
+ "vphaddudq %%xmm2, %%xmm0\n\t"
|
||
|
+ "vphaddudq %%xmm3, %%xmm1\n\t"
|
||
|
+ "pshufd $0xE6, %%xmm0, %%xmm2\n\t"
|
||
|
+ "pshufd $0xE6, %%xmm1, %%xmm3\n\t"
|
||
|
+ "paddd %%xmm0, %%xmm2\n\t"
|
||
|
+ "paddd %%xmm1, %%xmm3\n\t"
|
||
|
+ "movd %%xmm2, %2\n\t"
|
||
|
+ "movd %%xmm3, %1\n\t"
|
||
|
+#endif
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local uLong adler32_SSSE3(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ unsigned int s1 = adler & 0xffff;
|
||
|
+ unsigned int s2 = (adler >> 16) & 0xffff;
|
||
|
+ unsigned int k;
|
||
|
+
|
||
|
+ k = ALIGN_DIFF(buf, 16);
|
||
|
+ len -= k;
|
||
|
+ if (k)
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+
|
||
|
+ __asm__ __volatile__ (
|
||
|
+ "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "cmovb %4, %3\n\t" /* k = len >= VNMAX ? k : len */
|
||
|
+ "sub %3, %4\n\t" /* len -= k */
|
||
|
+ "cmp $16, %3\n\t"
|
||
|
+ "jb 88f\n\t" /* if(k < 16) goto OUT */
|
||
|
+#ifdef __ELF__
|
||
|
+ ".subsection 2\n\t"
|
||
|
+#else
|
||
|
+ "jmp 77f\n\t"
|
||
|
+#endif
|
||
|
+ ".p2align 2\n"
|
||
|
+ /*
|
||
|
+ * reduction function to bring a vector sum within the range of BASE
|
||
|
+ * This does no full reduction! When the sum is large, a number > BASE
|
||
|
+ * is the result. To do a full reduction call multiple times.
|
||
|
+ */
|
||
|
+ "sse2_reduce:\n\t"
|
||
|
+ "movdqa %%xmm0, %%xmm1\n\t" /* y = x */
|
||
|
+ "pslld $16, %%xmm1\n\t" /* y <<= 16 */
|
||
|
+ "psrld $16, %%xmm0\n\t" /* x >>= 16 */
|
||
|
+ "psrld $16, %%xmm1\n\t" /* y >>= 16 */
|
||
|
+ "psubd %%xmm0, %%xmm1\n\t" /* y -= x */
|
||
|
+ "pslld $4, %%xmm0\n\t" /* x <<= 4 */
|
||
|
+ "paddd %%xmm1, %%xmm0\n\t" /* x += y */
|
||
|
+ "ret\n\t"
|
||
|
+#ifdef __ELF__
|
||
|
+ ".previous\n\t"
|
||
|
+#else
|
||
|
+ "77:\n\t"
|
||
|
+#endif
|
||
|
+ "movdqa %5, %%xmm5\n\t" /* get vord_b */
|
||
|
+ "prefetchnta 0x70(%0)\n\t"
|
||
|
+ "movd %2, %%xmm2\n\t" /* init vector sum vs2 with s2 */
|
||
|
+ "movd %1, %%xmm3\n\t" /* init vector sum vs1 with s1 */
|
||
|
+ "pxor %%xmm4, %%xmm4\n" /* zero */
|
||
|
+ "3:\n\t"
|
||
|
+ "pxor %%xmm7, %%xmm7\n\t" /* zero vs1_round_sum */
|
||
|
+ ".p2align 3,,3\n\t"
|
||
|
+ ".p2align 2\n"
|
||
|
+ "2:\n\t"
|
||
|
+ "mov $128, %1\n\t" /* inner_k = 128 bytes till vs2_i overflows */
|
||
|
+ "cmp %1, %3\n\t"
|
||
|
+ "cmovb %3, %1\n\t" /* inner_k = k >= inner_k ? inner_k : k */
|
||
|
+ "and $-16, %1\n\t" /* inner_k = ROUND_TO(inner_k, 16) */
|
||
|
+ "sub %1, %3\n\t" /* k -= inner_k */
|
||
|
+ "shr $4, %1\n\t" /* inner_k /= 16 */
|
||
|
+ "pxor %%xmm6, %%xmm6\n\t" /* zero vs2_i */
|
||
|
+ ".p2align 4,,7\n"
|
||
|
+ ".p2align 3\n"
|
||
|
+ "1:\n\t"
|
||
|
+ "movdqa (%0), %%xmm0\n\t" /* fetch input data */
|
||
|
+ "prefetchnta 0x70(%0)\n\t"
|
||
|
+ "paddd %%xmm3, %%xmm7\n\t" /* vs1_round_sum += vs1 */
|
||
|
+ "add $16, %0\n\t" /* advance input data pointer */
|
||
|
+ "dec %1\n\t" /* decrement inner_k */
|
||
|
+ "movdqa %%xmm0, %%xmm1\n\t" /* make a copy of the input data */
|
||
|
+# if (HAVE_BINUTILS-0) >= 217
|
||
|
+ "pmaddubsw %%xmm5, %%xmm0\n\t" /* multiply all input bytes by vord_b bytes, add adjecent results to words */
|
||
|
+# else
|
||
|
+ ".byte 0x66, 0x0f, 0x38, 0x04, 0xc5\n\t" /* pmaddubsw %%xmm5, %%xmm0 */
|
||
|
+# endif
|
||
|
+ "psadbw %%xmm4, %%xmm1\n\t" /* subtract zero from every byte, add 8 bytes to a sum */
|
||
|
+ "paddw %%xmm0, %%xmm6\n\t" /* vs2_i += in * vorder_b */
|
||
|
+ "paddd %%xmm1, %%xmm3\n\t" /* vs1 += psadbw */
|
||
|
+ "jnz 1b\n\t" /* repeat if inner_k != 0 */
|
||
|
+ "movdqa %%xmm6, %%xmm0\n\t" /* copy vs2_i */
|
||
|
+ "punpckhwd %%xmm4, %%xmm0\n\t" /* zero extent vs2_i upper words to dwords */
|
||
|
+ "punpcklwd %%xmm4, %%xmm6\n\t" /* zero extent vs2_i lower words to dwords */
|
||
|
+ "paddd %%xmm0, %%xmm2\n\t" /* vs2 += vs2_i.upper */
|
||
|
+ "paddd %%xmm6, %%xmm2\n\t" /* vs2 += vs2_i.lower */
|
||
|
+ "cmp $15, %3\n\t"
|
||
|
+ "jg 2b\n\t" /* if(k > 15) repeat */
|
||
|
+ "movdqa %%xmm7, %%xmm0\n\t" /* move vs1_round_sum */
|
||
|
+ "call sse2_reduce\n\t" /* reduce vs1_round_sum */
|
||
|
+ "pslld $4, %%xmm0\n\t" /* vs1_round_sum *= 16 */
|
||
|
+ "paddd %%xmm2, %%xmm0\n\t" /* vs2 += vs1_round_sum */
|
||
|
+ "call sse2_reduce\n\t" /* reduce again */
|
||
|
+ "movdqa %%xmm0, %%xmm2\n\t" /* move vs2 back in place */
|
||
|
+ "movdqa %%xmm3, %%xmm0\n\t" /* move vs1 */
|
||
|
+ "call sse2_reduce\n\t" /* reduce */
|
||
|
+ "movdqa %%xmm0, %%xmm3\n\t" /* move vs1 back in place */
|
||
|
+ "add %3, %4\n\t" /* len += k */
|
||
|
+ "mov %6, %3\n\t" /* get max. byte count VNMAX till v1_round_sum overflows */
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "cmovb %4, %3\n\t" /* k = len >= VNMAX ? k : len */
|
||
|
+ "sub %3, %4\n\t" /* len -= k */
|
||
|
+ "cmp $15, %3\n\t"
|
||
|
+ "jg 3b\n\t" /* if(k > 15) repeat */
|
||
|
+ "pshufd $0xEE, %%xmm3, %%xmm1\n\t" /* collect vs1 & vs2 in lowest vector member */
|
||
|
+ "pshufd $0xEE, %%xmm2, %%xmm0\n\t"
|
||
|
+ "paddd %%xmm3, %%xmm1\n\t"
|
||
|
+ "paddd %%xmm2, %%xmm0\n\t"
|
||
|
+ "pshufd $0xE5, %%xmm0, %%xmm2\n\t"
|
||
|
+ "paddd %%xmm0, %%xmm2\n\t"
|
||
|
+ "movd %%xmm1, %1\n\t" /* mov vs1 to s1 */
|
||
|
+ "movd %%xmm2, %2\n" /* mov vs2 to s2 */
|
||
|
+ "88:"
|
||
|
+ : /* %0 */ "=r" (buf),
|
||
|
+ /* %1 */ "=r" (s1),
|
||
|
+ /* %2 */ "=r" (s2),
|
||
|
+ /* %3 */ "=r" (k),
|
||
|
+ /* %4 */ "=r" (len)
|
||
|
+ : /* %5 */ "m" (vord_b),
|
||
|
+ /*
|
||
|
+ * somewhere between 5 & 6, psadbw 64 bit sums ruin the party
|
||
|
+ * spreading the sums with palignr only brings it to 7 (?),
|
||
|
+ * while introducing an op into the main loop (2800 ms -> 3200 ms)
|
||
|
+ */
|
||
|
+ /* %6 */ "i" (5*NMAX),
|
||
|
+ /* */ "0" (buf),
|
||
|
+ /* */ "1" (s1),
|
||
|
+ /* */ "2" (s2),
|
||
|
+ /* */ "4" (len)
|
||
|
+ : "cc", "memory"
|
||
|
+# ifdef __SSE__
|
||
|
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||
|
+# endif
|
||
|
+ );
|
||
|
+
|
||
|
+ if (unlikely(k))
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+ reduce(s1);
|
||
|
+ reduce(s2);
|
||
|
+ return (s2 << 16) | s1;
|
||
|
+}
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local uLong adler32_SSE2(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ unsigned int s1 = adler & 0xffff;
|
||
|
+ unsigned int s2 = (adler >> 16) & 0xffff;
|
||
|
+ unsigned int k;
|
||
|
+
|
||
|
+ k = ALIGN_DIFF(buf, 16);
|
||
|
+ len -= k;
|
||
|
+ if (k)
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+
|
||
|
+ __asm__ __volatile__ (
|
||
|
+ "mov %6, %3\n\t"
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "cmovb %4, %3\n\t"
|
||
|
+ "sub %3, %4\n\t"
|
||
|
+ "cmp $16, %3\n\t"
|
||
|
+ "jb 88f\n\t"
|
||
|
+ "prefetchnta 0x70(%0)\n\t"
|
||
|
+ "movd %1, %%xmm4\n\t"
|
||
|
+ "movd %2, %%xmm3\n\t"
|
||
|
+ "pxor %%xmm2, %%xmm2\n\t"
|
||
|
+ "pxor %%xmm5, %%xmm5\n\t"
|
||
|
+ ".p2align 2\n"
|
||
|
+ "3:\n\t"
|
||
|
+ "pxor %%xmm6, %%xmm6\n\t"
|
||
|
+ "pxor %%xmm7, %%xmm7\n\t"
|
||
|
+ "mov $2048, %1\n\t" /* get byte count till vs2_{l|h}_word overflows */
|
||
|
+ "cmp %1, %3\n\t"
|
||
|
+ "cmovb %3, %1\n"
|
||
|
+ "and $-16, %1\n\t"
|
||
|
+ "sub %1, %3\n\t"
|
||
|
+ "shr $4, %1\n\t"
|
||
|
+ ".p2align 4,,7\n"
|
||
|
+ ".p2align 3\n"
|
||
|
+ "1:\n\t"
|
||
|
+ "prefetchnta 0x70(%0)\n\t"
|
||
|
+ "movdqa (%0), %%xmm0\n\t" /* fetch input data */
|
||
|
+ "paddd %%xmm4, %%xmm5\n\t" /* vs1_round_sum += vs1 */
|
||
|
+ "add $16, %0\n\t"
|
||
|
+ "dec %1\n\t"
|
||
|
+ "movdqa %%xmm0, %%xmm1\n\t" /* copy input data */
|
||
|
+ "psadbw %%xmm2, %%xmm0\n\t" /* add all bytes horiz. */
|
||
|
+ "paddd %%xmm0, %%xmm4\n\t" /* add that to vs1 */
|
||
|
+ "movdqa %%xmm1, %%xmm0\n\t" /* copy input data */
|
||
|
+ "punpckhbw %%xmm2, %%xmm1\n\t" /* zero extent input upper bytes to words */
|
||
|
+ "punpcklbw %%xmm2, %%xmm0\n\t" /* zero extent input lower bytes to words */
|
||
|
+ "paddw %%xmm1, %%xmm7\n\t" /* vs2_h_words += in_high_words */
|
||
|
+ "paddw %%xmm0, %%xmm6\n\t" /* vs2_l_words += in_low_words */
|
||
|
+ "jnz 1b\n\t"
|
||
|
+ "cmp $15, %3\n\t"
|
||
|
+ "pmaddwd 32+%5, %%xmm7\n\t" /* multiply vs2_h_words with order, add adjecend results */
|
||
|
+ "pmaddwd 16+%5, %%xmm6\n\t" /* multiply vs2_l_words with order, add adjecend results */
|
||
|
+ "paddd %%xmm7, %%xmm3\n\t" /* add to vs2 */
|
||
|
+ "paddd %%xmm6, %%xmm3\n\t" /* add to vs2 */
|
||
|
+ "jg 3b\n\t"
|
||
|
+ "movdqa %%xmm5, %%xmm0\n\t"
|
||
|
+ "pxor %%xmm5, %%xmm5\n\t"
|
||
|
+ "call sse2_reduce\n\t"
|
||
|
+ "pslld $4, %%xmm0\n\t"
|
||
|
+ "paddd %%xmm3, %%xmm0\n\t"
|
||
|
+ "call sse2_reduce\n\t"
|
||
|
+ "movdqa %%xmm0, %%xmm3\n\t"
|
||
|
+ "movdqa %%xmm4, %%xmm0\n\t"
|
||
|
+ "call sse2_reduce\n\t"
|
||
|
+ "movdqa %%xmm0, %%xmm4\n\t"
|
||
|
+ "add %3, %4\n\t"
|
||
|
+ "mov %6, %3\n\t"
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "cmovb %4, %3\n"
|
||
|
+ "sub %3, %4\n\t"
|
||
|
+ "cmp $15, %3\n\t"
|
||
|
+ "jg 3b\n\t"
|
||
|
+ "pshufd $0xEE, %%xmm4, %%xmm1\n\t"
|
||
|
+ "pshufd $0xEE, %%xmm3, %%xmm0\n\t"
|
||
|
+ "paddd %%xmm4, %%xmm1\n\t"
|
||
|
+ "paddd %%xmm3, %%xmm0\n\t"
|
||
|
+ "pshufd $0xE5, %%xmm0, %%xmm3\n\t"
|
||
|
+ "paddd %%xmm0, %%xmm3\n\t"
|
||
|
+ "movd %%xmm1, %1\n\t"
|
||
|
+ "movd %%xmm3, %2\n"
|
||
|
+ "88:\n\t"
|
||
|
+ : /* %0 */ "=r" (buf),
|
||
|
+ /* %1 */ "=r" (s1),
|
||
|
+ /* %2 */ "=r" (s2),
|
||
|
+ /* %3 */ "=r" (k),
|
||
|
+ /* %4 */ "=r" (len)
|
||
|
+ : /* %5 */ "m" (vord),
|
||
|
+ /* %6 */ "i" (5*NMAX),
|
||
|
+ /* */ "0" (buf),
|
||
|
+ /* */ "1" (s1),
|
||
|
+ /* */ "2" (s2),
|
||
|
+ /* */ "3" (k),
|
||
|
+ /* */ "4" (len)
|
||
|
+ : "cc", "memory"
|
||
|
+# ifdef __SSE__
|
||
|
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||
|
+# endif
|
||
|
+ );
|
||
|
+
|
||
|
+ if (unlikely(k))
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+ reduce(s1);
|
||
|
+ reduce(s2);
|
||
|
+ return (s2 << 16) | s1;
|
||
|
+}
|
||
|
+
|
||
|
+# if 0
|
||
|
+/* ========================================================================= */
|
||
|
+/*
|
||
|
+ * The SSE2 version above is faster on my CPUs (Athlon64, Core2,
|
||
|
+ * P4 Xeon, K10 Sempron), but has instruction stalls only a
|
||
|
+ * Out-Of-Order-Execution CPU can solve.
|
||
|
+ * So this Version _may_ be better for the new old thing, Atom.
|
||
|
+ */
|
||
|
+local noinline uLong adler32_SSE2_no_oooe(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ unsigned int s1 = adler & 0xffff;
|
||
|
+ unsigned int s2 = (adler >> 16) & 0xffff;
|
||
|
+ unsigned int k;
|
||
|
+
|
||
|
+ k = ALIGN_DIFF(buf, 16);
|
||
|
+ len -= k;
|
||
|
+ if (k)
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+
|
||
|
+ __asm__ __volatile__ (
|
||
|
+ "mov %6, %3\n\t"
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "cmovb %4, %3\n\t"
|
||
|
+ "sub %3, %4\n\t"
|
||
|
+ "cmp $16, %3\n\t"
|
||
|
+ "jb 88f\n\t"
|
||
|
+ "movdqa 16+%5, %%xmm6\n\t"
|
||
|
+ "movdqa 32+%5, %%xmm5\n\t"
|
||
|
+ "prefetchnta 16(%0)\n\t"
|
||
|
+ "pxor %%xmm7, %%xmm7\n\t"
|
||
|
+ "movd %1, %%xmm4\n\t"
|
||
|
+ "movd %2, %%xmm3\n\t"
|
||
|
+ ".p2align 3,,3\n\t"
|
||
|
+ ".p2align 2\n"
|
||
|
+ "1:\n\t"
|
||
|
+ "prefetchnta 32(%0)\n\t"
|
||
|
+ "movdqa (%0), %%xmm1\n\t"
|
||
|
+ "sub $16, %3\n\t"
|
||
|
+ "movdqa %%xmm4, %%xmm2\n\t"
|
||
|
+ "add $16, %0\n\t"
|
||
|
+ "movdqa %%xmm1, %%xmm0\n\t"
|
||
|
+ "cmp $15, %3\n\t"
|
||
|
+ "pslld $4, %%xmm2\n\t"
|
||
|
+ "paddd %%xmm3, %%xmm2\n\t"
|
||
|
+ "psadbw %%xmm7, %%xmm0\n\t"
|
||
|
+ "paddd %%xmm0, %%xmm4\n\t"
|
||
|
+ "movdqa %%xmm1, %%xmm0\n\t"
|
||
|
+ "punpckhbw %%xmm7, %%xmm1\n\t"
|
||
|
+ "punpcklbw %%xmm7, %%xmm0\n\t"
|
||
|
+ "movdqa %%xmm1, %%xmm3\n\t"
|
||
|
+ "pmaddwd %%xmm6, %%xmm0\n\t"
|
||
|
+ "paddd %%xmm2, %%xmm0\n\t"
|
||
|
+ "pmaddwd %%xmm5, %%xmm3\n\t"
|
||
|
+ "paddd %%xmm0, %%xmm3\n\t"
|
||
|
+ "jg 1b\n\t"
|
||
|
+ "movdqa %%xmm3, %%xmm0\n\t"
|
||
|
+ "call sse2_reduce\n\t"
|
||
|
+ "call sse2_reduce\n\t"
|
||
|
+ "movdqa %%xmm0, %%xmm3\n\t"
|
||
|
+ "movdqa %%xmm4, %%xmm0\n\t"
|
||
|
+ "call sse2_reduce\n\t"
|
||
|
+ "movdqa %%xmm0, %%xmm4\n\t"
|
||
|
+ "add %3, %4\n\t"
|
||
|
+ "mov %6, %3\n\t"
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "cmovb %4, %3\n\t"
|
||
|
+ "sub %3, %4\n\t"
|
||
|
+ "cmp $15, %3\n\t"
|
||
|
+ "jg 1b\n\t"
|
||
|
+ "pshufd $0xEE, %%xmm3, %%xmm0\n\t"
|
||
|
+ "pshufd $0xEE, %%xmm4, %%xmm1\n\t"
|
||
|
+ "paddd %%xmm3, %%xmm0\n\t"
|
||
|
+ "pshufd $0xE5, %%xmm0, %%xmm2\n\t"
|
||
|
+ "paddd %%xmm4, %%xmm1\n\t"
|
||
|
+ "movd %%xmm1, %1\n\t"
|
||
|
+ "paddd %%xmm0, %%xmm2\n\t"
|
||
|
+ "movd %%xmm2, %2\n"
|
||
|
+ "88:"
|
||
|
+ : /* %0 */ "=r" (buf),
|
||
|
+ /* %1 */ "=r" (s1),
|
||
|
+ /* %2 */ "=r" (s2),
|
||
|
+ /* %3 */ "=r" (k),
|
||
|
+ /* %4 */ "=r" (len)
|
||
|
+ : /* %5 */ "m" (vord),
|
||
|
+ /* %6 */ "i" (NMAX + NMAX/3),
|
||
|
+ /* */ "0" (buf),
|
||
|
+ /* */ "1" (s1),
|
||
|
+ /* */ "2" (s2),
|
||
|
+ /* */ "4" (len)
|
||
|
+ : "cc", "memory"
|
||
|
+# ifdef __SSE__
|
||
|
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||
|
+# endif
|
||
|
+ );
|
||
|
+
|
||
|
+ if (unlikely(k))
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+ reduce(s1);
|
||
|
+ reduce(s2);
|
||
|
+ return (s2 << 16) | s1;
|
||
|
+}
|
||
|
+# endif
|
||
|
+
|
||
|
+# ifndef __x86_64__
|
||
|
+/* ========================================================================= */
|
||
|
+/*
|
||
|
+ * SSE version to help VIA-C3_2, P2 & P3
|
||
|
+ */
|
||
|
+local uLong adler32_SSE(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ unsigned int s1 = adler & 0xffff;
|
||
|
+ unsigned int s2 = (adler >> 16) & 0xffff;
|
||
|
+ unsigned int k;
|
||
|
+
|
||
|
+ k = ALIGN_DIFF(buf, 8);
|
||
|
+ len -= k;
|
||
|
+ if (k)
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+
|
||
|
+ __asm__ __volatile__ (
|
||
|
+ "mov %6, %3\n\t"
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "cmovb %4, %3\n"
|
||
|
+ "sub %3, %4\n\t"
|
||
|
+ "cmp $8, %3\n\t"
|
||
|
+ "jb 88f\n\t"
|
||
|
+ "movd %1, %%mm4\n\t"
|
||
|
+ "movd %2, %%mm3\n\t"
|
||
|
+ "pxor %%mm2, %%mm2\n\t"
|
||
|
+ "pxor %%mm5, %%mm5\n\t"
|
||
|
+# ifdef __ELF__
|
||
|
+ ".subsection 2\n\t"
|
||
|
+# else
|
||
|
+ "jmp 77f\n\t"
|
||
|
+# endif
|
||
|
+ ".p2align 2\n"
|
||
|
+ "mmx_reduce:\n\t"
|
||
|
+ "movq %%mm0, %%mm1\n\t"
|
||
|
+ "pslld $16, %%mm1\n\t"
|
||
|
+ "psrld $16, %%mm0\n\t"
|
||
|
+ "psrld $16, %%mm1\n\t"
|
||
|
+ "psubd %%mm0, %%mm1\n\t"
|
||
|
+ "pslld $4, %%mm0\n\t"
|
||
|
+ "paddd %%mm1, %%mm0\n\t"
|
||
|
+ "ret\n\t"
|
||
|
+# ifdef __ELF__
|
||
|
+ ".previous\n\t"
|
||
|
+# else
|
||
|
+ "77:\n\t"
|
||
|
+# endif
|
||
|
+ ".p2align 2\n"
|
||
|
+ "3:\n\t"
|
||
|
+ "pxor %%mm6, %%mm6\n\t"
|
||
|
+ "pxor %%mm7, %%mm7\n\t"
|
||
|
+ "mov $1024, %1\n\t"
|
||
|
+ "cmp %1, %3\n\t"
|
||
|
+ "cmovb %3, %1\n"
|
||
|
+ "and $-8, %1\n\t"
|
||
|
+ "sub %1, %3\n\t"
|
||
|
+ "shr $3, %1\n\t"
|
||
|
+ ".p2align 4,,7\n"
|
||
|
+ ".p2align 3\n"
|
||
|
+ "1:\n\t"
|
||
|
+ "movq (%0), %%mm0\n\t"
|
||
|
+ "paddd %%mm4, %%mm5\n\t"
|
||
|
+ "add $8, %0\n\t"
|
||
|
+ "dec %1\n\t"
|
||
|
+ "movq %%mm0, %%mm1\n\t"
|
||
|
+ "psadbw %%mm2, %%mm0\n\t"
|
||
|
+ "paddd %%mm0, %%mm4\n\t"
|
||
|
+ "movq %%mm1, %%mm0\n\t"
|
||
|
+ "punpckhbw %%mm2, %%mm1\n\t"
|
||
|
+ "punpcklbw %%mm2, %%mm0\n\t"
|
||
|
+ "paddw %%mm1, %%mm7\n\t"
|
||
|
+ "paddw %%mm0, %%mm6\n\t"
|
||
|
+ "jnz 1b\n\t"
|
||
|
+ "cmp $7, %3\n\t"
|
||
|
+ "pmaddwd 40+%5, %%mm7\n\t"
|
||
|
+ "pmaddwd 32+%5, %%mm6\n\t"
|
||
|
+ "paddd %%mm7, %%mm3\n\t"
|
||
|
+ "paddd %%mm6, %%mm3\n\t"
|
||
|
+ "jg 3b\n\t"
|
||
|
+ "movq %%mm5, %%mm0\n\t"
|
||
|
+ "pxor %%mm5, %%mm5\n\t"
|
||
|
+ "call mmx_reduce\n\t"
|
||
|
+ "pslld $3, %%mm0\n\t"
|
||
|
+ "paddd %%mm3, %%mm0\n\t"
|
||
|
+ "call mmx_reduce\n\t"
|
||
|
+ "movq %%mm0, %%mm3\n\t"
|
||
|
+ "movq %%mm4, %%mm0\n\t"
|
||
|
+ "call mmx_reduce\n\t"
|
||
|
+ "movq %%mm0, %%mm4\n\t"
|
||
|
+ "add %3, %4\n\t"
|
||
|
+ "mov %6, %3\n\t"
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "cmovb %4, %3\n"
|
||
|
+ "sub %3, %4\n\t"
|
||
|
+ "cmp $7, %3\n\t"
|
||
|
+ "jg 3b\n\t"
|
||
|
+ "movd %%mm4, %1\n\t"
|
||
|
+ "psrlq $32, %%mm4\n\t"
|
||
|
+ "movd %%mm3, %2\n\t"
|
||
|
+ "psrlq $32, %%mm3\n\t"
|
||
|
+ "movd %%mm4, %4\n\t"
|
||
|
+ "add %4, %1\n\t"
|
||
|
+ "movd %%mm3, %4\n\t"
|
||
|
+ "add %4, %2\n"
|
||
|
+ "emms\n\t"
|
||
|
+ "88:\n\t"
|
||
|
+ : /* %0 */ "=r" (buf),
|
||
|
+ /* %1 */ "=r" (s1),
|
||
|
+ /* %2 */ "=r" (s2),
|
||
|
+ /* %3 */ "=r" (k),
|
||
|
+ /* %4 */ "=r" (len)
|
||
|
+ : /* %5 */ "m" (vord),
|
||
|
+ /* %6 */ "i" ((5*NMAX)/2),
|
||
|
+ /* */ "0" (buf),
|
||
|
+ /* */ "1" (s1),
|
||
|
+ /* */ "2" (s2),
|
||
|
+ /* */ "3" (k),
|
||
|
+ /* */ "4" (len)
|
||
|
+ : "cc", "memory"
|
||
|
+# ifdef __MMX__
|
||
|
+ , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
|
||
|
+# endif
|
||
|
+ );
|
||
|
+
|
||
|
+ if (unlikely(k))
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+ reduce(s1);
|
||
|
+ reduce(s2);
|
||
|
+ return (s2 << 16) | s1;
|
||
|
+}
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+/*
|
||
|
+ * Processors which only have MMX will prop. not like this
|
||
|
+ * code, they are so old, they are not Out-Of-Order
|
||
|
+ * (maybe except AMD K6, Cyrix, Winchip/VIA).
|
||
|
+ * I did my best to get at least 1 instruction between result -> use
|
||
|
+ */
|
||
|
+local uLong adler32_MMX(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ unsigned int s1 = adler & 0xffff;
|
||
|
+ unsigned int s2 = (adler >> 16) & 0xffff;
|
||
|
+ unsigned int k;
|
||
|
+
|
||
|
+ k = ALIGN_DIFF(buf, 8);
|
||
|
+ len -= k;
|
||
|
+ if (k)
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+
|
||
|
+ __asm__ __volatile__ (
|
||
|
+ "mov %6, %3\n\t"
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "jae 11f\n\t"
|
||
|
+ "mov %4, %3\n"
|
||
|
+ "11:\n\t"
|
||
|
+ "sub %3, %4\n\t"
|
||
|
+ "cmp $8, %3\n\t"
|
||
|
+ "jb 88f\n\t"
|
||
|
+ "sub $8, %%esp\n\t"
|
||
|
+ "movd %1, %%mm4\n\t"
|
||
|
+ "movd %2, %%mm2\n\t"
|
||
|
+ "movq %5, %%mm3\n"
|
||
|
+ "33:\n\t"
|
||
|
+ "movq %%mm2, %%mm0\n\t"
|
||
|
+ "pxor %%mm2, %%mm2\n\t"
|
||
|
+ "pxor %%mm5, %%mm5\n\t"
|
||
|
+ ".p2align 2\n"
|
||
|
+ "3:\n\t"
|
||
|
+ "movq %%mm0, (%%esp)\n\t"
|
||
|
+ "pxor %%mm6, %%mm6\n\t"
|
||
|
+ "pxor %%mm7, %%mm7\n\t"
|
||
|
+ "mov $1024, %1\n\t"
|
||
|
+ "cmp %1, %3\n\t"
|
||
|
+ "jae 44f\n\t"
|
||
|
+ "mov %3, %1\n"
|
||
|
+ "44:\n\t"
|
||
|
+ "and $-8, %1\n\t"
|
||
|
+ "sub %1, %3\n\t"
|
||
|
+ "shr $3, %1\n\t"
|
||
|
+ ".p2align 4,,7\n"
|
||
|
+ ".p2align 3\n"
|
||
|
+ "1:\n\t"
|
||
|
+ "movq (%0), %%mm0\n\t"
|
||
|
+ "paddd %%mm4, %%mm5\n\t"
|
||
|
+ "add $8, %0\n\t"
|
||
|
+ "dec %1\n\t"
|
||
|
+ "movq %%mm0, %%mm1\n\t"
|
||
|
+ "punpcklbw %%mm2, %%mm0\n\t"
|
||
|
+ "punpckhbw %%mm2, %%mm1\n\t"
|
||
|
+ "paddw %%mm0, %%mm6\n\t"
|
||
|
+ "paddw %%mm1, %%mm0\n\t"
|
||
|
+ "paddw %%mm1, %%mm7\n\t"
|
||
|
+ "pmaddwd %%mm3, %%mm0\n\t"
|
||
|
+ "paddd %%mm0, %%mm4\n\t"
|
||
|
+ "jnz 1b\n\t"
|
||
|
+ "movq (%%esp), %%mm0\n\t"
|
||
|
+ "cmp $7, %3\n\t"
|
||
|
+ "pmaddwd 32+%5, %%mm6\n\t"
|
||
|
+ "pmaddwd 40+%5, %%mm7\n\t"
|
||
|
+ "paddd %%mm6, %%mm0\n\t"
|
||
|
+ "paddd %%mm7, %%mm0\n\t"
|
||
|
+ "jg 3b\n\t"
|
||
|
+ "movq %%mm0, %%mm2\n\t"
|
||
|
+ "movq %%mm5, %%mm0\n\t"
|
||
|
+ "call mmx_reduce\n\t"
|
||
|
+ "pslld $3, %%mm0\n\t"
|
||
|
+ "paddd %%mm2, %%mm0\n\t"
|
||
|
+ "call mmx_reduce\n\t"
|
||
|
+ "movq %%mm0, %%mm2\n\t"
|
||
|
+ "movq %%mm4, %%mm0\n\t"
|
||
|
+ "call mmx_reduce\n\t"
|
||
|
+ "movq %%mm0, %%mm4\n\t"
|
||
|
+ "add %3, %4\n\t"
|
||
|
+ "mov %6, %3\n\t"
|
||
|
+ "cmp %3, %4\n\t"
|
||
|
+ "jae 22f\n\t"
|
||
|
+ "mov %4, %3\n"
|
||
|
+ "22:\n\t"
|
||
|
+ "sub %3, %4\n\t"
|
||
|
+ "cmp $7, %3\n\t"
|
||
|
+ "jg 33b\n\t"
|
||
|
+ "add $8, %%esp\n\t"
|
||
|
+ "movd %%mm4, %1\n\t"
|
||
|
+ "psrlq $32, %%mm4\n\t"
|
||
|
+ "movd %%mm2, %2\n\t"
|
||
|
+ "psrlq $32, %%mm2\n\t"
|
||
|
+ "movd %%mm4, %4\n\t"
|
||
|
+ "add %4, %1\n\t"
|
||
|
+ "movd %%mm2, %4\n\t"
|
||
|
+ "add %4, %2\n"
|
||
|
+ "emms\n\t"
|
||
|
+ "88:\n\t"
|
||
|
+ : /* %0 */ "=r" (buf),
|
||
|
+ /* %1 */ "=r" (s1),
|
||
|
+ /* %2 */ "=r" (s2),
|
||
|
+ /* %3 */ "=r" (k),
|
||
|
+ /* %4 */ "=r" (len)
|
||
|
+ : /* %5 */ "m" (vord),
|
||
|
+ /* %6 */ "i" (4*NMAX),
|
||
|
+ /* */ "0" (buf),
|
||
|
+ /* */ "1" (s1),
|
||
|
+ /* */ "2" (s2),
|
||
|
+ /* */ "3" (k),
|
||
|
+ /* */ "4" (len)
|
||
|
+ : "cc", "memory"
|
||
|
+# ifdef __MMX__
|
||
|
+ , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
|
||
|
+# endif
|
||
|
+ );
|
||
|
+
|
||
|
+ if (unlikely(k))
|
||
|
+ buf = adler32_jumped(buf, &s1, &s2, k);
|
||
|
+ reduce(s1);
|
||
|
+ reduce(s2);
|
||
|
+ return (s2 << 16) | s1;
|
||
|
+}
|
||
|
+# endif
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local uLong adler32_x86(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ unsigned int s1 = adler & 0xffff;
|
||
|
+ unsigned int s2 = (adler >> 16) & 0xffff;
|
||
|
+ unsigned int n;
|
||
|
+
|
||
|
+ while (likely(len)) {
|
||
|
+# ifndef __x86_64__
|
||
|
+# define LOOP_COUNT 4
|
||
|
+# else
|
||
|
+# define LOOP_COUNT 8
|
||
|
+# endif
|
||
|
+ unsigned int k;
|
||
|
+ n = len < NMAX ? len : NMAX;
|
||
|
+ len -= n;
|
||
|
+ k = n / LOOP_COUNT;
|
||
|
+ n %= LOOP_COUNT;
|
||
|
+
|
||
|
+ if (likely(k)) do {
|
||
|
+ /*
|
||
|
+ * Modern compiler can do "wonders".
|
||
|
+ * Only if they would not "trick them self" sometime.
|
||
|
+ * This was unrolled 16 times not because someone
|
||
|
+ * anticipated autovectorizing compiler, but the
|
||
|
+ * classical "avoid loop overhead".
|
||
|
+ *
|
||
|
+ * But things get tricky if the compiler starts to see:
|
||
|
+ * "hey lets disambiguate one sum step from the other",
|
||
|
+ * the classical prevent-pipeline-stalls-thing.
|
||
|
+ *
|
||
|
+ * Suddenly we have 16 temporary sums, which unfortunatly
|
||
|
+ * blows x86 limited register set...
|
||
|
+ *
|
||
|
+ * Loopunrolling is also a little bad for the I-cache.
|
||
|
+ *
|
||
|
+ * So tune this down for x86.
|
||
|
+ * Instead we try to keep it in the register set. 4 sums fits
|
||
|
+ * into i386 register set with no framepointer.
|
||
|
+ * x86_64 is a little more splendit, but still we can not
|
||
|
+ * take 16, so take 8 sums.
|
||
|
+ */
|
||
|
+ s1 += buf[0]; s2 += s1;
|
||
|
+ s1 += buf[1]; s2 += s1;
|
||
|
+ s1 += buf[2]; s2 += s1;
|
||
|
+ s1 += buf[3]; s2 += s1;
|
||
|
+# ifdef __x86_64__
|
||
|
+ s1 += buf[4]; s2 += s1;
|
||
|
+ s1 += buf[5]; s2 += s1;
|
||
|
+ s1 += buf[6]; s2 += s1;
|
||
|
+ s1 += buf[7]; s2 += s1;
|
||
|
+# endif
|
||
|
+ buf += LOOP_COUNT;
|
||
|
+ } while(likely(--k));
|
||
|
+ if (n) do {
|
||
|
+ s1 += *buf++;
|
||
|
+ s2 += s1;
|
||
|
+ } while (--n);
|
||
|
+ reduce_full(s1);
|
||
|
+ reduce_full(s2);
|
||
|
+ }
|
||
|
+ return (s2 << 16) | s1;
|
||
|
+}
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+/*
|
||
|
+ * Knot it all together with a runtime switch
|
||
|
+ */
|
||
|
+
|
||
|
+/* Flags */
|
||
|
+# define CFF_DEFAULT (1 << 0)
|
||
|
+/* Processor features */
|
||
|
+# define CFEATURE_CMOV (15 + 0)
|
||
|
+# define CFEATURE_MMX (23 + 0)
|
||
|
+# define CFEATURE_SSE (25 + 0)
|
||
|
+# define CFEATURE_SSE2 (26 + 0)
|
||
|
+# define CFEATURE_SSSE3 ( 9 + 32)
|
||
|
+
|
||
|
+# define CFB(x) (1 << ((x)%32))
|
||
|
+
|
||
|
+# define FEATURE_WORDS 2
|
||
|
+
|
||
|
+/* data structure */
|
||
|
+struct test_cpu_feature
|
||
|
+{
|
||
|
+ void (*func)(void);
|
||
|
+ int flags;
|
||
|
+ unsigned int features[FEATURE_WORDS];
|
||
|
+};
|
||
|
+
|
||
|
/* ========================================================================= */
/*
 * Decision table
 *
 * Scanned front to back by test_cpu_feature(): the first entry whose
 * required feature bits are all present (or which carries CFF_DEFAULT)
 * wins.  So the best implementations come first and the plain x86
 * fallback must stay last.
 */
local const struct test_cpu_feature tfeat_adler32_vec[] =
{
    /* func flags features */
    {(void (*)(void))adler32_SSSE3, 0, {CFB(CFEATURE_CMOV), CFB(CFEATURE_SSSE3)}},
    {(void (*)(void))adler32_SSE2, 0, {CFB(CFEATURE_SSE2)|CFB(CFEATURE_CMOV), 0}},
# ifndef __x86_64__
    /* the SSE/MMX kernels are only compiled in for 32 bit builds */
    {(void (*)(void))adler32_SSE, 0, {CFB(CFEATURE_SSE)|CFB(CFEATURE_CMOV), 0}},
    {(void (*)(void))adler32_MMX, 0, {CFB(CFEATURE_MMX), 0}},
# endif
    {(void (*)(void))adler32_x86, CFF_DEFAULT, { 0, 0}},
};
|
||
|
+
|
||
|
/* ========================================================================= */
/* Prototypes */
/* scan table t (l entries) and return the best matching function, or NULL */
local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l);
/* dispatcher used until the constructor has patched adler32_vec_ptr */
local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len);
|
||
|
+
|
||
|
/* ========================================================================= */
/*
 * Runtime Function pointer
 *
 * Starts out pointing at the runtime switcher so dispatch still works
 * even when the constructor below did not run (or has not run yet).
 */
local uLong (*adler32_vec_ptr)(uLong adler, const Bytef *buf, uInt len) = adler32_vec_runtimesw;
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+/*
|
||
|
+ * Constructor to init the pointer early
|
||
|
+ */
|
||
|
+local GCC_ATTR_CONSTRUCTOR void adler32_vec_select(void)
|
||
|
+{
|
||
|
+ adler32_vec_ptr = test_cpu_feature(tfeat_adler32_vec, sizeof (tfeat_adler32_vec)/sizeof (tfeat_adler32_vec[0]));
|
||
|
+}
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+/*
|
||
|
+ * Jump function
|
||
|
+ */
|
||
|
+local noinline uLong adler32_vec(adler, buf, len)
|
||
|
+ uLong adler;
|
||
|
+ const Bytef *buf;
|
||
|
+ uInt len;
|
||
|
+{
|
||
|
+ return adler32_vec_ptr(adler, buf, len);
|
||
|
+}
|
||
|
+
|
||
|
/* ========================================================================= */
/*
 * The runtime switcher is a little racy; it should normally not run at
 * all if the constructor works.  It redoes the CPU detection and then
 * retries the dispatch, so the worst case is duplicated detection work
 * when several threads arrive here at once.
 */
local uLong adler32_vec_runtimesw(uLong adler, const Bytef *buf, uInt len)
{
    adler32_vec_select();
    return adler32_vec(adler, buf, len);
}
|
||
|
+
|
||
|
+
|
||
|
/* ========================================================================= */
/* Internal data types */
/*
 * Register file for one cpuid invocation.  unsigned long (not a fixed
 * 32 bit type) so the members can be used directly as asm register
 * operands on both i386 and x86_64; cpuid only produces 32 bit values.
 */
struct cpuid_regs
{
    unsigned long eax, ebx, ecx, edx;
};
|
||
|
+
|
||
|
/* Cached result of the CPU identification, filled once by identify_cpu(). */
local struct
{
    unsigned int max_basic;               /* highest supported basic cpuid leaf */
    unsigned int features[FEATURE_WORDS]; /* leaf 1 feature bits: [0] = EDX, [1] = ECX */
    int init_done;                        /* nonzero once identification has run */
} our_cpu;
|
||
|
+
|
||
|
/* ========================================================================= */
/*
 * Read the EFLAGS register via pushf/pop.
 * NOTE(review): push/pop inside inline asm temporarily writes below the
 * stack pointer, which can clash with the x86_64 red zone — TODO confirm
 * this is safe with the compilers/flags this file targets.
 */
local inline unsigned long read_flags(void)
{
    unsigned long f;
    __asm__ __volatile__ (
        "pushf\n\t"
        "pop %0\n\t"
        : "=r" (f)
    );
    return f;
}
|
||
|
+
|
||
|
/* ========================================================================= */
/* Write f into the EFLAGS register via push/popf ("cc" clobber: popf
 * changes the condition codes). */
local inline void write_flags(unsigned long f)
{
    __asm__ __volatile__ (
        "push %0\n\t"
        "popf\n\t"
        : : "ri" (f) : "cc"
    );
}
|
||
|
+
|
||
|
/* ========================================================================= */
/*
 * Execute cpuid leaf func.  The caller must preset regs->ecx (the
 * subleaf, see cpuids() for the common zeroed case); results are
 * returned in regs->eax/ebx/ecx/edx.
 * (Operand-number comments below corrected: outputs are %0-%3 and
 * inputs %4-%5, not %4/%5/%6 as the original comments claimed.)
 */
local inline void cpuid(struct cpuid_regs *regs, unsigned long func)
{
    /* save ebx around cpuid call, PIC code needs it */
    __asm__ __volatile__ (
        "xchg %1, " PICREG "\n\t"
        "cpuid\n\t"
        "xchg %1, " PICREG "\n"
        : /* %0 */ "=a" (regs->eax),
          /* %1 */ "=r" (regs->ebx),
          /* %2 */ "=c" (regs->ecx),
          /* %3 */ "=d" (regs->edx)
        : /* %4 */ "0" (func),
          /* %5 */ "2" (regs->ecx)
        : "cc"
    );
}
|
||
|
+
|
||
|
/* ========================================================================= */
/* cpuid with the subleaf (ecx) zeroed — the common case. */
local inline void cpuids(struct cpuid_regs *regs, unsigned long func)
{
    regs->ecx = 0;
    cpuid(regs, func);
}
|
||
|
+
|
||
|
/* ========================================================================= */
/*
 * Test whether the EFLAGS bit(s) in mask can be toggled: flip them,
 * write the flags back, and check whether the flip stuck.
 * Returns nonzero when the bit(s) are toggleable.
 */
local inline int toggle_eflags_test(const unsigned long mask)
{
    unsigned long f;
    int result;

    f = read_flags();
    write_flags(f ^ mask);
    result = !!((f ^ read_flags()) & mask);
    /*
     * restore the old flags, the test for i486 tests the alignment
     * check bit, and left set will confuse the x86 software world.
     */
    write_flags(f);
    return result;
}
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local inline int is_486(void)
|
||
|
+{
|
||
|
+ return toggle_eflags_test(1 << 18);
|
||
|
+}
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local inline int has_cpuid(void)
|
||
|
+{
|
||
|
+ return toggle_eflags_test(1 << 21);
|
||
|
+}
|
||
|
+
|
||
|
/* ========================================================================= */
/*
 * Identify the CPU once and cache the result in our_cpu.
 * Fills our_cpu.features[] from cpuid leaf 1 (EDX/ECX).  On hardware
 * without cpuid the features stay zero, so only a CFF_DEFAULT table
 * entry can ever match there.
 */
local void identify_cpu(void)
{
    struct cpuid_regs a;

    if (our_cpu.init_done)
        return;

    /* mark identification as done up front (not thread safe; see the
     * runtime switcher note) */
    our_cpu.init_done = -1;
    /* force a write out to memory */
    __asm__ __volatile__ ("" : : "m" (our_cpu.init_done));

    /* pre-486 CPUs cannot toggle the AC flag and have no cpuid */
    if (!is_486())
        return;

    /* some 486 class CPUs still lack cpuid; probe the EFLAGS ID bit */
    if (!has_cpuid())
        return;

    /* get the maximum basic leaf number */
    cpuids(&a, 0x00000000);
    our_cpu.max_basic = (unsigned int)a.eax;
    /* we could get the vendor string from ebx, edx, ecx */

    /* get the first basic leaf, if it is avail. */
    if (our_cpu.max_basic >= 0x00000001)
        cpuids(&a, 0x00000001);
    else
        a.eax = a.ebx = a.ecx = a.edx = 0;

    /* we could extract family, model, stepping from eax */

    /* there is the first set of features */
    our_cpu.features[0] = a.edx;
    our_cpu.features[1] = a.ecx;

    /* now we could test the extended features, but is not needed, for now */
}
|
||
|
+
|
||
|
+/* ========================================================================= */
|
||
|
+local noinline void *test_cpu_feature(const struct test_cpu_feature *t, unsigned int l)
|
||
|
+{
|
||
|
+ unsigned int i, j, f;
|
||
|
+ identify_cpu();
|
||
|
+
|
||
|
+ for (i = 0; i < l; i++) {
|
||
|
+ if (t[i].flags & CFF_DEFAULT)
|
||
|
+ return t[i].func;
|
||
|
+ for (f = 0, j = 0; j < FEATURE_WORDS; j++)
|
||
|
+ f |= (our_cpu.features[j] & t[i].features[j]) ^ t[i].features[j];
|
||
|
+ if (f)
|
||
|
+ continue;
|
||
|
+ return t[i].func;
|
||
|
+ }
|
||
|
+ return NULL; /* die! */
|
||
|
+}
|
||
|
+
|
||
|
+#endif
|
||
|
|