commit 88fe7ac33eb4cb4dff76a5cc7fca50da5fb0ee3a
Author: Danny Tsen <dtsen@us.ibm.com>
Date:   Sun Jun 12 21:30:19 2022 +0300

    Chacha20/poly1305 - Optimized chacha20/poly1305 for P10 operation

    * configure.ac: Added chacha20 and poly1305 assembly implementations.
    * cipher/chacha20-p10le-8x.s: (New) - support 8 blocks (512 bytes)
      unrolling.
    * cipher/poly1305-p10le.s: (New) - support 4 blocks (128 bytes)
      unrolling.
    * cipher/Makefile.am: Added new chacha20 and poly1305 files.
    * cipher/chacha20.c: Added PPC p10 le support for 8x chacha20.
    * cipher/poly1305.c: Added PPC p10 le support for 4x poly1305.
    * cipher/poly1305-internal.h: Added PPC p10 le support for poly1305.
    ---

    GnuPG-bug-id: 6006
    Signed-off-by: Danny Tsen <dtsen@us.ibm.com>
    [jk: cosmetic changes to C code]
    [jk: fix building on ppc64be]
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
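For orientation before the per-file diffs: the patch adds two POWER10 little-endian
assembly entry points and wires them into the existing dispatch code. Their
prototypes, as declared in the cipher/chacha20.c and cipher/poly1305.c hunks
below, are:

    unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst,
                                         const byte *src, size_t len);
    unsigned int gcry_poly1305_p10le_4blocks(unsigned char *key,
                                             const byte *m, size_t len);

Per the assembly headers, the chacha20 routine expects at least 512 bytes
(eight 64-byte blocks) and the poly1305 routine at least 128 bytes of input.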
Index: libgcrypt-1.10.2/cipher/Makefile.am
===================================================================
--- libgcrypt-1.10.2.orig/cipher/Makefile.am
+++ libgcrypt-1.10.2/cipher/Makefile.am
@@ -83,6 +83,7 @@ EXTRA_libcipher_la_SOURCES = \
 	chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
 	chacha20-armv7-neon.S chacha20-aarch64.S \
 	chacha20-ppc.c chacha20-s390x.S \
+	chacha20-p10le-8x.s \
 	cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
 	cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
 	crc.c crc-intel-pclmul.c crc-armv8-ce.c \
@@ -99,6 +100,7 @@ EXTRA_libcipher_la_SOURCES = \
 	md4.c \
 	md5.c \
 	poly1305-s390x.S \
+	poly1305-p10le.s \
 	rijndael.c rijndael-internal.h rijndael-tables.h \
 	rijndael-aesni.c rijndael-padlock.c \
 	rijndael-amd64.S rijndael-arm.S \
Index: libgcrypt-1.10.2/cipher/chacha20-p10le-8x.s
===================================================================
--- /dev/null
+++ libgcrypt-1.10.2/cipher/chacha20-p10le-8x.s
@@ -0,0 +1,864 @@
+# Copyright 2021- IBM Inc. All rights reserved
+#
+# This file is part of Libgcrypt.
+#
+# Libgcrypt is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of
+# the License, or (at your option) any later version.
+#
+# Libgcrypt is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# This function handles multiple 64-byte block data length
+# and the length should be more than 512 bytes.
+#
+# unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len);
+#
+# r1 - top of the stack
+# r3 to r10 input parameters
+# r3 - out
+# r4 - inp
+# r5 - len
+# r6 - key[8]
+# r7 - counter[4]
+#
+# do rounds, 8 quarter rounds
+# 1.  a += b; d ^= a; d <<<= 16;
+# 2.  c += d; b ^= c; b <<<= 12;
+# 3.  a += b; d ^= a; d <<<= 8;
+# 4.  c += d; b ^= c; b <<<= 7
+#
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+#
+.text
+
+.macro QT_loop_8x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 4, 4, 25 #
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 20, 20, 25 #
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ xxlor 32+25, 0, 0
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+ vrlw 4, 4, 28 #
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 20, 20, 28 #
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ xxlor 32+28, 0, 0
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+
+ xxlor 32+25, 0, 0
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 4, 4, 25
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ vrlw 20, 20, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 4, 4, 28
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ vrlw 20, 20, 28
+ xxlor 32+28, 0, 0
+.endm
+
+.macro QT_loop_4x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 20
+ vpermxor 13, 13, 1, 20
+ vpermxor 14, 14, 2, 20
+ vpermxor 15, 15, 3, 20
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 21
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 22
+ vpermxor 13, 13, 1, 22
+ vpermxor 14, 14, 2, 22
+ vpermxor 15, 15, 3, 22
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 23
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 20
+ vpermxor 12, 12, 1, 20
+ vpermxor 13, 13, 2, 20
+ vpermxor 14, 14, 3, 20
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vrlw 4, 4, 21
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 22
+ vpermxor 12, 12, 1, 22
+ vpermxor 13, 13, 2, 22
+ vpermxor 14, 14, 3, 22
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+ vrlw 4, 4, 23
+.endm
+
+# Transpose
+.macro TP_4x a0 a1 a2 a3
+ xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1
+ xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3
+ xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1
+ xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3
+ xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3
+ xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3
+ xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3
+ xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3
+.endm
+
+# key stream = working state + state
+.macro Add_state S
+ vadduwm \S+0, \S+0, 16-\S
+ vadduwm \S+4, \S+4, 17-\S
+ vadduwm \S+8, \S+8, 18-\S
+ vadduwm \S+12, \S+12, 19-\S
+
+ vadduwm \S+1, \S+1, 16-\S
+ vadduwm \S+5, \S+5, 17-\S
+ vadduwm \S+9, \S+9, 18-\S
+ vadduwm \S+13, \S+13, 19-\S
+
+ vadduwm \S+2, \S+2, 16-\S
+ vadduwm \S+6, \S+6, 17-\S
+ vadduwm \S+10, \S+10, 18-\S
+ vadduwm \S+14, \S+14, 19-\S
+
+ vadduwm \S+3, \S+3, 16-\S
+ vadduwm \S+7, \S+7, 17-\S
+ vadduwm \S+11, \S+11, 18-\S
+ vadduwm \S+15, \S+15, 19-\S
+.endm
+
+#
+# write 256 bytes
+#
+.macro Write_256 S
+ add 9, 14, 5
+ add 16, 14, 4
+ lxvw4x 0, 0, 9
+ lxvw4x 1, 17, 9
+ lxvw4x 2, 18, 9
+ lxvw4x 3, 19, 9
+ lxvw4x 4, 20, 9
+ lxvw4x 5, 21, 9
+ lxvw4x 6, 22, 9
+ lxvw4x 7, 23, 9
+ lxvw4x 8, 24, 9
+ lxvw4x 9, 25, 9
+ lxvw4x 10, 26, 9
+ lxvw4x 11, 27, 9
+ lxvw4x 12, 28, 9
+ lxvw4x 13, 29, 9
+ lxvw4x 14, 30, 9
+ lxvw4x 15, 31, 9
+
+ xxlxor \S+32, \S+32, 0
+ xxlxor \S+36, \S+36, 1
+ xxlxor \S+40, \S+40, 2
+ xxlxor \S+44, \S+44, 3
+ xxlxor \S+33, \S+33, 4
+ xxlxor \S+37, \S+37, 5
+ xxlxor \S+41, \S+41, 6
+ xxlxor \S+45, \S+45, 7
+ xxlxor \S+34, \S+34, 8
+ xxlxor \S+38, \S+38, 9
+ xxlxor \S+42, \S+42, 10
+ xxlxor \S+46, \S+46, 11
+ xxlxor \S+35, \S+35, 12
+ xxlxor \S+39, \S+39, 13
+ xxlxor \S+43, \S+43, 14
+ xxlxor \S+47, \S+47, 15
+
+ stxvw4x \S+32, 0, 16
+ stxvw4x \S+36, 17, 16
+ stxvw4x \S+40, 18, 16
+ stxvw4x \S+44, 19, 16
+
+ stxvw4x \S+33, 20, 16
+ stxvw4x \S+37, 21, 16
+ stxvw4x \S+41, 22, 16
+ stxvw4x \S+45, 23, 16
+
+ stxvw4x \S+34, 24, 16
+ stxvw4x \S+38, 25, 16
+ stxvw4x \S+42, 26, 16
+ stxvw4x \S+46, 27, 16
+
+ stxvw4x \S+35, 28, 16
+ stxvw4x \S+39, 29, 16
+ stxvw4x \S+43, 30, 16
+ stxvw4x \S+47, 31, 16
+
+.endm
+
+#
+# unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len);
+#
+.global _gcry_chacha20_p10le_8x
+.align 5
+_gcry_chacha20_p10le_8x:
+ cmpdi 6, 512
+ blt Out_no_chacha
+
+ stdu 1,-1024(1)
+ mflr 0
+
+ std 14,112(1)
+ std 15,120(1)
+ std 16,128(1)
+ std 17,136(1)
+ std 18,144(1)
+ std 19,152(1)
+ std 20,160(1)
+ std 21,168(1)
+ std 22,176(1)
+ std 23,184(1)
+ std 24,192(1)
+ std 25,200(1)
+ std 26,208(1)
+ std 27,216(1)
+ std 28,224(1)
+ std 29,232(1)
+ std 30,240(1)
+ std 31,248(1)
+ std 0, 1040(1)
+
+ li 17, 16
+ li 18, 32
+ li 19, 48
+ li 20, 64
+ li 21, 80
+ li 22, 96
+ li 23, 112
+ li 24, 128
+ li 25, 144
+ li 26, 160
+ li 27, 176
+ li 28, 192
+ li 29, 208
+ li 30, 224
+ li 31, 240
+ addi 9, 1, 256
+ stvx 20, 0, 9
+ stvx 21, 17, 9
+ stvx 22, 18, 9
+ stvx 23, 19, 9
+ stvx 24, 20, 9
+ stvx 25, 21, 9
+ stvx 26, 22, 9
+ stvx 27, 23, 9
+ stvx 28, 24, 9
+ stvx 29, 25, 9
+ stvx 30, 26, 9
+ stvx 31, 27, 9
+
+ add 9, 9, 27
+ addi 14, 17, 16
+ stxvx 14, 14, 9
+ addi 14, 14, 16
+ stxvx 15, 14, 9
+ addi 14, 14, 16
+ stxvx 16, 14, 9
+ addi 14, 14, 16
+ stxvx 17, 14, 9
+ addi 14, 14, 16
+ stxvx 18, 14, 9
+ addi 14, 14, 16
+ stxvx 19, 14, 9
+ addi 14, 14, 16
+ stxvx 20, 14, 9
+ addi 14, 14, 16
+ stxvx 21, 14, 9
+ addi 14, 14, 16
+ stxvx 22, 14, 9
+ addi 14, 14, 16
+ stxvx 23, 14, 9
+ addi 14, 14, 16
+ stxvx 24, 14, 9
+ addi 14, 14, 16
+ stxvx 25, 14, 9
+ addi 14, 14, 16
+ stxvx 26, 14, 9
+ addi 14, 14, 16
+ stxvx 27, 14, 9
+ addi 14, 14, 16
+ stxvx 28, 14, 9
+ addi 14, 14, 16
+ stxvx 29, 14, 9
+ addi 14, 14, 16
+ stxvx 30, 14, 9
+ addi 14, 14, 16
+ stxvx 31, 14, 9
+
+ mr 15, 6 # len
+ li 14, 0 # offset to inp and outp
+
+ ld 10, sigma@got(2)
+
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ lxvw4x 62, 19, 10 # vr30, 4
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+
+ ld 11, permx@got(2)
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ li 8, 10
+ mtctr 8
+
+ xxlor 16, 48, 48
+ xxlor 17, 49, 49
+ xxlor 18, 50, 50
+ xxlor 19, 51, 51
+
+ vspltisw 25, 4
+ vspltisw 26, 8
+
+ xxlor 16, 48, 48
+ xxlor 17, 49, 49
+ xxlor 18, 50, 50
+ xxlor 19, 51, 51
+
+ xxlor 25, 32+26, 32+26
+ xxlor 24, 32+25, 32+25
+
+ vadduwm 31, 30, 25 # (0, 1, 2, 3) + (4, 4, 4, 4)
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ xxlor 20, 32+20, 32+20
+ xxlor 21, 32+21, 32+21
+ xxlor 22, 32+22, 32+22
+ xxlor 23, 32+23, 32+23
+
+Loop_8x:
+ lvx 0, 20, 10
+ lvx 1, 21, 10
+ lvx 2, 22, 10
+ lvx 3, 23, 10
+ xxspltw 32+4, 17, 0
+ xxspltw 32+5, 17, 1
+ xxspltw 32+6, 17, 2
+ xxspltw 32+7, 17, 3
+ xxspltw 32+8, 18, 0
+ xxspltw 32+9, 18, 1
+ xxspltw 32+10, 18, 2
+ xxspltw 32+11, 18, 3
+ xxspltw 32+12, 19, 0
+ xxspltw 32+13, 19, 1
+ xxspltw 32+14, 19, 2
+ xxspltw 32+15, 19, 3
+ vadduwm 12, 12, 30 # increase counter
+
+ lvx 16, 20, 10
+ lvx 17, 21, 10
+ lvx 18, 22, 10
+ lvx 19, 23, 10
+ xxspltw 32+20, 17, 0
+ xxspltw 32+21, 17, 1
+ xxspltw 32+22, 17, 2
+ xxspltw 32+23, 17, 3
+ xxspltw 32+24, 18, 0
+ xxspltw 32+25, 18, 1
+ xxspltw 32+26, 18, 2
+ xxspltw 32+27, 18, 3
+ xxspltw 32+28, 19, 0
+ xxspltw 32+29, 19, 1
+ vadduwm 28, 28, 31 # increase counter
+ xxspltw 32+30, 19, 2
+ xxspltw 32+31, 19, 3
+
+.align 5
+quarter_loop_8x:
+ QT_loop_8x
+
+ bdnz quarter_loop_8x
+
+ xxlor 0, 32+30, 32+30
+ xxlor 32+30, 30, 30
+ vadduwm 12, 12, 30
+ xxlor 32+30, 0, 0
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ xxlor 0, 48, 48
+ xxlor 1, 49, 49
+ xxlor 2, 50, 50
+ xxlor 3, 51, 51
+ xxlor 48, 16, 16
+ xxlor 49, 17, 17
+ xxlor 50, 18, 18
+ xxlor 51, 19, 19
+ Add_state 0
+ xxlor 48, 0, 0
+ xxlor 49, 1, 1
+ xxlor 50, 2, 2
+ xxlor 51, 3, 3
+ Write_256 0
+ addi 14, 14, 256
+ addi 15, 15, -256
+
+ xxlor 5, 32+31, 32+31
+ xxlor 32+31, 31, 31
+ vadduwm 28, 28, 31
+ xxlor 32+31, 5, 5
+ TP_4x 16+0, 16+1, 16+2, 16+3
+ TP_4x 16+4, 16+5, 16+6, 16+7
+ TP_4x 16+8, 16+9, 16+10, 16+11
+ TP_4x 16+12, 16+13, 16+14, 16+15
+
+ xxlor 32, 16, 16
+ xxlor 33, 17, 17
+ xxlor 34, 18, 18
+ xxlor 35, 19, 19
+ Add_state 16
+ Write_256 16
+ addi 14, 14, 256
+ addi 15, 15, -256
+
+ # should update counter before out?
+ xxlor 32+24, 24, 24
+ xxlor 32+25, 25, 25
+ xxlor 32+30, 30, 30
+ vadduwm 30, 30, 25
+ vadduwm 31, 30, 24
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ cmpdi 15, 0
+ beq Out_loop
+
+ cmpdi 15, 512
+ blt Loop_last
+
+ mtctr 8
+ b Loop_8x
+
+Loop_last:
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ li 8, 10
+ mtctr 8
+
+Loop_4x:
+ lvx 0, 20, 10
+ lvx 1, 21, 10
+ lvx 2, 22, 10
+ lvx 3, 23, 10
+ vspltw 4, 17, 0
+ vspltw 5, 17, 1
+ vspltw 6, 17, 2
+ vspltw 7, 17, 3
+ vspltw 8, 18, 0
+ vspltw 9, 18, 1
+ vspltw 10, 18, 2
+ vspltw 11, 18, 3
+ vspltw 12, 19, 0
+ vadduwm 12, 12, 30 # increase counter
+ vspltw 13, 19, 1
+ vspltw 14, 19, 2
+ vspltw 15, 19, 3
+
+.align 5
+quarter_loop:
+ QT_loop_4x
+
+ bdnz quarter_loop
+
+ vadduwm 12, 12, 30
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ Add_state 0
+ Write_256 0
+ addi 14, 14, 256
+ addi 15, 15, -256
+
+ # Update state counter
+ vspltisw 25, 4
+ vadduwm 30, 30, 25
+
+ cmpdi 15, 0
+ beq Out_loop
+
+ mtctr 8
+ b Loop_4x
+
+Out_loop:
+ #
+ # Update state counter
+ #
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
+ vsldoi 18, 16, 17, 12
+ vand 18, 18, 30
+ xxlor 32+19, 19, 19
+ vadduwm 18, 19, 18
+ stxvw4x 32+18, 19, 3
+ li 3, 0
+
+ addi 9, 1, 256
+ lvx 20, 0, 9
+ lvx 21, 17, 9
+ lvx 22, 18, 9
+ lvx 23, 19, 9
+ lvx 24, 20, 9
+ lvx 25, 21, 9
+ lvx 26, 22, 9
+ lvx 27, 23, 9
+ lvx 28, 24, 9
+ lvx 29, 25, 9
+ lvx 30, 26, 9
+ lvx 31, 27, 9
+
+ add 9, 9, 27
+ addi 14, 17, 16
+ lxvx 14, 14, 9
+ addi 14, 14, 16
+ lxvx 15, 14, 9
+ addi 14, 14, 16
+ lxvx 16, 14, 9
+ addi 14, 14, 16
+ lxvx 17, 14, 9
+ addi 14, 14, 16
+ lxvx 18, 14, 9
+ addi 14, 14, 16
+ lxvx 19, 14, 9
+ addi 14, 14, 16
+ lxvx 20, 14, 9
+ addi 14, 14, 16
+ lxvx 21, 14, 9
+ addi 14, 14, 16
+ lxvx 22, 14, 9
+ addi 14, 14, 16
+ lxvx 23, 14, 9
+ addi 14, 14, 16
+ lxvx 24, 14, 9
+ addi 14, 14, 16
+ lxvx 25, 14, 9
+ addi 14, 14, 16
+ lxvx 26, 14, 9
+ addi 14, 14, 16
+ lxvx 27, 14, 9
+ addi 14, 14, 16
+ lxvx 28, 14, 9
+ addi 14, 14, 16
+ lxvx 29, 14, 9
+ addi 14, 14, 16
+ lxvx 30, 14, 9
+ addi 14, 14, 16
+ lxvx 31, 14, 9
+
+ ld 0, 1040(1)
+ ld 14,112(1)
+ ld 15,120(1)
+ ld 16,128(1)
+ ld 17,136(1)
+ ld 18,144(1)
+ ld 19,152(1)
+ ld 20,160(1)
+ ld 21,168(1)
+ ld 22,176(1)
+ ld 23,184(1)
+ ld 24,192(1)
+ ld 25,200(1)
+ ld 26,208(1)
+ ld 27,216(1)
+ ld 28,224(1)
+ ld 29,232(1)
+ ld 30,240(1)
+ ld 31,248(1)
+
+ mtlr 0
+ addi 1, 1, 1024
+ blr
+
+Out_no_chacha:
+ li 3, 0
+ blr
+
+.data
+.align 4
+sigma:
+.long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+.long 0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203
+.long 1, 0, 0, 0
+.long 0, 1, 2, 3
+.long 0x61707865, 0x61707865, 0x61707865, 0x61707865
+.long 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e
+.long 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32
+.long 0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574
+permx:
+.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
+.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
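For readers not fluent in VSX, here is a plain C sketch (not part of the patch)
of the quarter round and the column/diagonal double round that the QT_loop_8x
and QT_loop_4x macros above vectorize across eight and four blocks at a time;
ROTL32 and the uint32_t x[16] state layout follow the standard ChaCha20
definition. The `li 8, 10` / `mtctr 8` sequence runs this double round ten
times, giving the usual 20 rounds.

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* One quarter round: the a/b/c/d steps 1-4 listed in the header
       comment of chacha20-p10le-8x.s.  */
    #define QR(a, b, c, d)                          \
      do {                                          \
        a += b; d ^= a; d = ROTL32(d, 16);          \
        c += d; b ^= c; b = ROTL32(b, 12);          \
        a += b; d ^= a; d = ROTL32(d, 8);           \
        c += d; b ^= c; b = ROTL32(b, 7);           \
      } while (0)

    /* One double round on a 4x4-word state: four column rounds,
       then four diagonal rounds (same index pattern as the two
       QR(...) comments in the macros above).  */
    static void chacha20_double_round(uint32_t x[16])
    {
      QR(x[0], x[4], x[8],  x[12]);   /* columns   */
      QR(x[1], x[5], x[9],  x[13]);
      QR(x[2], x[6], x[10], x[14]);
      QR(x[3], x[7], x[11], x[15]);
      QR(x[0], x[5], x[10], x[15]);   /* diagonals */
      QR(x[1], x[6], x[11], x[12]);
      QR(x[2], x[7], x[8],  x[13]);
      QR(x[3], x[4], x[9],  x[14]);
    }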
Index: libgcrypt-1.10.2/cipher/chacha20.c
===================================================================
--- libgcrypt-1.10.2.orig/cipher/chacha20.c
+++ libgcrypt-1.10.2/cipher/chacha20.c
@@ -125,6 +125,7 @@ typedef struct CHACHA20_context_s
   unsigned int use_avx2:1;
   unsigned int use_neon:1;
   unsigned int use_ppc:1;
+  unsigned int use_p10:1;
   unsigned int use_s390x:1;
 } CHACHA20_context_t;
 
@@ -163,6 +164,12 @@ unsigned int _gcry_chacha20_poly1305_amd
 
 #ifdef USE_PPC_VEC
 
+#ifndef WORDS_BIGENDIAN
+unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst,
+                                     const byte *src,
+                                     size_t len);
+#endif
+
 unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst,
                                          const byte *src,
                                          size_t nblks);
@@ -475,6 +482,9 @@ chacha20_do_setkey (CHACHA20_context_t *
 #endif
 #ifdef USE_PPC_VEC
   ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
+# ifndef WORDS_BIGENDIAN
+  ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0;
+# endif
 #endif
 #ifdef USE_S390X_VX
   ctx->use_s390x = (features & HWF_S390X_VX) != 0;
@@ -571,7 +581,22 @@ do_chacha20_encrypt_stream_tail (CHACHA2
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
-      nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks);
+#ifndef WORDS_BIGENDIAN
+      /*
+       * A workaround to skip counter overflow. This is rare.
+       */
+      if (ctx->use_p10 && nblocks >= 8
+          && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU)
+        {
+          size_t len = nblocks * CHACHA20_BLOCK_SIZE;
+          nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len);
+        }
+      else
+#endif
+        {
+          nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf,
+                                              nblocks);
+        }
       burn = nburn > burn ? nburn : burn;
       length -= nblocks * CHACHA20_BLOCK_SIZE;
       outbuf += nblocks * CHACHA20_BLOCK_SIZE;
@@ -760,6 +785,11 @@ _gcry_chacha20_poly1305_encrypt(gcry_cip
     }
 #endif
 #ifdef USE_PPC_VEC_POLY1305
+  else if (ctx->use_ppc && ctx->use_p10)
+    {
+      /* Skip stitched chacha20-poly1305 for P10. */
+      authptr = NULL;
+    }
   else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
     {
       nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
@@ -998,6 +1028,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cip
 {
   CHACHA20_context_t *ctx = (void *) &c->context.c;
   unsigned int nburn, burn = 0;
+  int skip_stitched = 0;
 
   if (!length)
     return 0;
@@ -1049,6 +1080,13 @@ _gcry_chacha20_poly1305_decrypt(gcry_cip
       inbuf += nblocks * CHACHA20_BLOCK_SIZE;
     }
 #endif
+#ifdef USE_PPC_VEC_POLY1305
+  if (ctx->use_ppc && ctx->use_p10)
+    {
+      /* Skip stitched chacha20-poly1305 for P10. */
+      skip_stitched = 1;
+    }
+#endif
 
 #ifdef USE_SSSE3
   if (ctx->use_ssse3)
@@ -1102,7 +1140,8 @@ _gcry_chacha20_poly1305_decrypt(gcry_cip
 #endif
 
 #ifdef USE_PPC_VEC_POLY1305
-  if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
+  /* skip stitch for p10 */
+  if (!skip_stitched && ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
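A note on the overflow guard in the do_chacha20_encrypt_stream_tail hunk above:
ctx->input[12] is ChaCha20's 32-bit block counter, and the P10 path advances it
by up to nblocks at once, so the check widens the sum to 64 bits before
comparing. A minimal C sketch with a hypothetical helper name:

    #include <stddef.h>
    #include <stdint.h>

    /* Returns nonzero when the 32-bit block counter can absorb nblocks
       without wrapping; e.g. counter = 0xfffffff0 with nblocks = 0x20
       fails, because 0xfffffff0 + 0x20 > 0xffffffff would truncate in
       32-bit arithmetic.  */
    static int p10_counter_fits(uint32_t counter, size_t nblocks)
    {
      return ((uint64_t) counter + nblocks) <= 0xffffffffU;
    }

When the guard fails, the code simply falls back to the four-block ppc8
routine, which handles the counter per block.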
Index: libgcrypt-1.10.2/cipher/poly1305-internal.h
===================================================================
--- libgcrypt-1.10.2.orig/cipher/poly1305-internal.h
+++ libgcrypt-1.10.2/cipher/poly1305-internal.h
@@ -33,6 +33,17 @@
 #define POLY1305_KEYLEN 32
 #define POLY1305_BLOCKSIZE 16
 
+/* POLY1305_USE_PPC_VEC indicates whether to enable PowerPC vector code. */
+#undef POLY1305_USE_PPC_VEC
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+     defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+     !defined(WORDS_BIGENDIAN)
+#  if __GNUC__ >= 4
+#   define POLY1305_USE_PPC_VEC 1
+#  endif
+# endif
+#endif
 
 typedef struct
 {
@@ -46,6 +57,9 @@ typedef struct poly1305_context_s
   POLY1305_STATE state;
   byte buffer[POLY1305_BLOCKSIZE];
   unsigned int leftover;
+#ifdef POLY1305_USE_PPC_VEC
+  unsigned int use_p10:1;
+#endif
 } poly1305_context_t;
 
 
Index: libgcrypt-1.10.2/cipher/poly1305-p10le.s
===================================================================
--- /dev/null
+++ libgcrypt-1.10.2/cipher/poly1305-p10le.s
@@ -0,0 +1,841 @@
+# Copyright 2021- IBM Inc. All rights reserved
+#
+# This file is part of Libgcrypt.
+#
+# Libgcrypt is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of
+# the License, or (at your option) any later version.
+#
+# Libgcrypt is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# Poly1305 - this version mainly using vector/VSX/Scalar
+#  - 26 bits limbs
+#  - Handle multiple 64 byte blocks but need at least 2 64 bytes block
+#
+# Improve performance by breaking down polynomial to the sum of products with
+#     h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+#
+# 07/22/21 - this revision based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0
+#            to 9 vectors for multiplications.
+#
+# setup r^4, r^3, r^2, r vectors
+#    vs    [r^1, r^3, r^2, r^4]
+#    vs0 = [r0,.....]
+#    vs1 = [r1,.....]
+#    vs2 = [r2,.....]
+#    vs3 = [r3,.....]
+#    vs4 = [r4,.....]
+#    vs5 = [r1*5,...]
+#    vs6 = [r2*5,...]
+#    vs7 = [r2*5,...]
+#    vs8 = [r4*5,...]
+#
+# Each word in a vector consists a member of a "r/s" in [a * r/s].
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0,   r4*5, r3*5, r2*5;
+# r2, r1,   r0,   r4*5, r3*5;
+# r3, r2,   r1,   r0,   r4*5;
+# r4, r3,   r2,   r1,   r0  ;
+#
+#
+# gcry_poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+#  k = 32 bytes key
+#  r3 = k (r, s)
+#  r4 = mlen
+#  r5 = m
+#
+.text
+
+# Block size 16 bytes
+# key = (r, s)
+# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
+# p = 2^130 - 5
+# a += m
+# a = (r + a) % p
+# a += s
+# 16 bytes (a)
+#
+# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
+# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
+# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
+# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
+# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
+#
+#    [r^2, r^3, r^1, r^4]
+#    [m3,  m2,  m4,  m1]
+#
+# multiply odd and even words
+.macro mul_odd
+ vmulouw 14, 4, 26
+ vmulouw 10, 5, 3
+ vmulouw 11, 6, 2
+ vmulouw 12, 7, 1
+ vmulouw 13, 8, 0
+ vmulouw 15, 4, 27
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vmulouw 10, 5, 26
+ vmulouw 11, 6, 3
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vmulouw 12, 7, 2
+ vmulouw 13, 8, 1
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+ vmulouw 16, 4, 28
+ vmulouw 10, 5, 27
+ vmulouw 11, 6, 26
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vmulouw 12, 7, 3
+ vmulouw 13, 8, 2
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+ vmulouw 17, 4, 29
+ vmulouw 10, 5, 28
+ vmulouw 11, 6, 27
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vmulouw 12, 7, 26
+ vmulouw 13, 8, 3
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+ vmulouw 18, 4, 30
+ vmulouw 10, 5, 29
+ vmulouw 11, 6, 28
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vmulouw 12, 7, 27
+ vmulouw 13, 8, 26
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+.macro mul_even
+ vmuleuw 9, 4, 26
+ vmuleuw 10, 5, 3
+ vmuleuw 11, 6, 2
+ vmuleuw 12, 7, 1
+ vmuleuw 13, 8, 0
+ vaddudm 14, 14, 9
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+
+ vmuleuw 9, 4, 27
+ vmuleuw 10, 5, 26
+ vmuleuw 11, 6, 3
+ vmuleuw 12, 7, 2
+ vmuleuw 13, 8, 1
+ vaddudm 15, 15, 9
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+
+ vmuleuw 9, 4, 28
+ vmuleuw 10, 5, 27
+ vmuleuw 11, 6, 26
+ vmuleuw 12, 7, 3
+ vmuleuw 13, 8, 2
+ vaddudm 16, 16, 9
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+
+ vmuleuw 9, 4, 29
+ vmuleuw 10, 5, 28
+ vmuleuw 11, 6, 27
+ vmuleuw 12, 7, 26
+ vmuleuw 13, 8, 3
+ vaddudm 17, 17, 9
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+
+ vmuleuw 9, 4, 30
+ vmuleuw 10, 5, 29
+ vmuleuw 11, 6, 28
+ vmuleuw 12, 7, 27
+ vmuleuw 13, 8, 26
+ vaddudm 18, 18, 9
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+# setup r^4, r^3, r^2, r vectors
+#    [r, r^3, r^2, r^4]
+#    vs0 = [r0,...]
+#    vs1 = [r1,...]
+#    vs2 = [r2,...]
+#    vs3 = [r3,...]
+#    vs4 = [r4,...]
+#    vs5 = [r4*5,...]
+#    vs6 = [r3*5,...]
+#    vs7 = [r2*5,...]
+#    vs8 = [r1*5,...]
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0,   r4*5, r3*5, r2*5;
+# r2, r1,   r0,   r4*5, r3*5;
+# r3, r2,   r1,   r0,   r4*5;
+# r4, r3,   r2,   r1,   r0  ;
+#
+.macro poly1305_setup_r
+
+ # save r
+ xxlor 26, 58, 58
+ xxlor 27, 59, 59
+ xxlor 28, 60, 60
+ xxlor 29, 61, 61
+ xxlor 30, 62, 62
+
+ xxlxor 31, 31, 31
+
+# [r, r^3, r^2, r^4]
+ # compute r^2
+ vmr 4, 26
+ vmr 5, 27
+ vmr 6, 28
+ vmr 7, 29
+ vmr 8, 30
+ bl do_mul # r^2 r^1
+ xxpermdi 58, 58, 36, 0x3 # r0
+ xxpermdi 59, 59, 37, 0x3 # r1
+ xxpermdi 60, 60, 38, 0x3 # r2
+ xxpermdi 61, 61, 39, 0x3 # r3
+ xxpermdi 62, 62, 40, 0x3 # r4
+ xxpermdi 36, 36, 36, 0x3
+ xxpermdi 37, 37, 37, 0x3
+ xxpermdi 38, 38, 38, 0x3
+ xxpermdi 39, 39, 39, 0x3
+ xxpermdi 40, 40, 40, 0x3
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ bl do_mul # r^4 r^3
+ vmrgow 26, 26, 4
+ vmrgow 27, 27, 5
+ vmrgow 28, 28, 6
+ vmrgow 29, 29, 7
+ vmrgow 30, 30, 8
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ # r^2 r^4
+ xxlor 0, 58, 58
+ xxlor 1, 59, 59
+ xxlor 2, 60, 60
+ xxlor 3, 61, 61
+ xxlor 4, 62, 62
+ xxlor 5, 32, 32
+ xxlor 6, 33, 33
+ xxlor 7, 34, 34
+ xxlor 8, 35, 35
+
+ vspltw 9, 26, 3
+ vspltw 10, 26, 2
+ vmrgow 26, 10, 9
+ vspltw 9, 27, 3
+ vspltw 10, 27, 2
+ vmrgow 27, 10, 9
+ vspltw 9, 28, 3
+ vspltw 10, 28, 2
+ vmrgow 28, 10, 9
+ vspltw 9, 29, 3
+ vspltw 10, 29, 2
+ vmrgow 29, 10, 9
+ vspltw 9, 30, 3
+ vspltw 10, 30, 2
+ vmrgow 30, 10, 9
+
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+.endm
+
+do_mul:
+ mul_odd
+
+ # do reduction ( h %= p )
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+ blr
+
+#
+# init key
+#
+do_poly1305_init:
+ ld 10, rmask@got(2)
+ ld 11, 0(10)
+ ld 12, 8(10)
+
+ li 14, 16
+ li 15, 32
+ ld 10, cnum@got(2)
+ lvx 25, 0, 10 # v25 - mask
+ lvx 31, 14, 10 # v31 = 1a
+ lvx 19, 15, 10 # v19 = 1 << 24
+ lxv 24, 48(10) # vs24
+ lxv 25, 64(10) # vs25
+
+ # initialize
+ # load key from r3 to vectors
+ ld 9, 16(3)
+ ld 10, 24(3)
+ ld 11, 0(3)
+ ld 12, 8(3)
+
+ # break 26 bits
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 58, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 59, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 60, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 61, 0, 17
+ mtvsrdd 62, 0, 18
+
+ # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
+ li 9, 5
+ mtvsrdd 36, 0, 9
+ vmulouw 0, 27, 4 # v0 = rr0
+ vmulouw 1, 28, 4 # v1 = rr1
+ vmulouw 2, 29, 4 # v2 = rr2
+ vmulouw 3, 30, 4 # v3 = rr3
+ blr
+
+#
+# gcry_poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+#  k = 32 bytes key
+#  r3 = k (r, s)
+#  r4 = mlen
+#  r5 = m
+#
+.global gcry_poly1305_p10le_4blocks
+.align 5
+gcry_poly1305_p10le_4blocks:
+_gcry_poly1305_p10le_4blocks:
+ cmpdi 5, 128
+ blt Out_no_poly1305
+
+ stdu 1,-1024(1)
+ mflr 0
+
+ std 14,112(1)
+ std 15,120(1)
+ std 16,128(1)
+ std 17,136(1)
+ std 18,144(1)
+ std 19,152(1)
+ std 20,160(1)
+ std 21,168(1)
+ std 31,248(1)
+ li 14, 256
+ stvx 20, 14, 1
+ addi 14, 14, 16
+ stvx 21, 14, 1
+ addi 14, 14, 16
+ stvx 22, 14, 1
+ addi 14, 14, 16
+ stvx 23, 14, 1
+ addi 14, 14, 16
+ stvx 24, 14, 1
+ addi 14, 14, 16
+ stvx 25, 14, 1
+ addi 14, 14, 16
+ stvx 26, 14, 1
+ addi 14, 14, 16
+ stvx 27, 14, 1
+ addi 14, 14, 16
+ stvx 28, 14, 1
+ addi 14, 14, 16
+ stvx 29, 14, 1
+ addi 14, 14, 16
+ stvx 30, 14, 1
+ addi 14, 14, 16
+ stvx 31, 14, 1
+
+ addi 14, 14, 16
+ stxvx 14, 14, 1
+ addi 14, 14, 16
+ stxvx 15, 14, 1
+ addi 14, 14, 16
+ stxvx 16, 14, 1
+ addi 14, 14, 16
+ stxvx 17, 14, 1
+ addi 14, 14, 16
+ stxvx 18, 14, 1
+ addi 14, 14, 16
+ stxvx 19, 14, 1
+ addi 14, 14, 16
+ stxvx 20, 14, 1
+ addi 14, 14, 16
+ stxvx 21, 14, 1
+ addi 14, 14, 16
+ stxvx 22, 14, 1
+ addi 14, 14, 16
+ stxvx 23, 14, 1
+ addi 14, 14, 16
+ stxvx 24, 14, 1
+ addi 14, 14, 16
+ stxvx 25, 14, 1
+ addi 14, 14, 16
+ stxvx 26, 14, 1
+ addi 14, 14, 16
+ stxvx 27, 14, 1
+ addi 14, 14, 16
+ stxvx 28, 14, 1
+ addi 14, 14, 16
+ stxvx 29, 14, 1
+ addi 14, 14, 16
+ stxvx 30, 14, 1
+ addi 14, 14, 16
+ stxvx 31, 14, 1
+ std 0, 1040(1)
+
+ bl do_poly1305_init
+
+ li 21, 0 # counter to message
+
+ poly1305_setup_r
+
+ # load previous state
+ # break/convert r6 to 26 bits
+ ld 9, 32(3)
+ ld 10, 40(3)
+ lwz 19, 48(3)
+ sldi 19, 19, 24
+ mtvsrdd 41, 0, 19
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 36, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 37, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 38, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 39, 0, 17
+ mtvsrdd 40, 0, 18
+ vor 8, 8, 9
+
+ # input m1 m2
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 20, 4, 9
+ vaddudm 21, 5, 10
+ vaddudm 22, 6, 11
+ vaddudm 23, 7, 12
+ vaddudm 24, 8, 13
+
+ # m3 m4
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vspltisb 13, 14
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 20
+ vmrgow 5, 10, 21
+ vmrgow 6, 11, 22
+ vmrgow 7, 12, 23
+ vmrgow 8, 13, 24
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64
+ addi 21, 21, 64
+
+ li 9, 64
+ divdu 31, 5, 9
+
+ mtctr 31
+
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+# Rewrite the polynomial sum of product as follows,
+# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
+# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2
+# .... Repeat
+# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
+# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
+#
+loop_4blocks:
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+
+ # input m1 m2 m3 m4
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 17, 11, 12, 17
+ vperm 18, 11, 12, 18
+
+ vand 20, 14, 25 # a0
+ vand 9, 17, 25 # a0
+ vsrd 21, 14, 31 # >> 26
+ vsrd 22, 21, 31 # 12 bits left
+ vsrd 10, 17, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+
+ vand 21, 21, 25 # a1
+ vand 10, 10, 25 # a1
+
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 23, 16, 13
+ vor 22, 22, 23
+ vand 22, 22, 25 # a2
+ vand 16, 18, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 23, 15, 13 # >> 14
+ vsrd 24, 23, 31 # >> 26, a4
+ vand 23, 23, 25 # a3
+ vsrd 12, 18, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 4, 4, 20
+ vaddudm 5, 5, 21
+ vaddudm 6, 6, 22
+ vaddudm 7, 7, 23
+ vaddudm 8, 8, 24
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 4
+ vmrgow 5, 10, 5
+ vmrgow 6, 11, 6
+ vmrgow 7, 12, 7
+ vmrgow 8, 13, 8
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64
+ addi 21, 21, 64
+
+ bdnz loop_4blocks
+
+ xxlor 58, 0, 0
+ xxlor 59, 1, 1
+ xxlor 60, 2, 2
+ xxlor 61, 3, 3
+ xxlor 62, 4, 4
+ xxlor 32, 5, 5
+ xxlor 33, 6, 6
+ xxlor 34, 7, 7
+ xxlor 35, 8, 8
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+
+ # Sum the products.
+ xxpermdi 41, 31, 46, 0
+ xxpermdi 42, 31, 47, 0
+ vaddudm 4, 14, 9
+ xxpermdi 36, 31, 36, 3
+ vaddudm 5, 15, 10
+ xxpermdi 37, 31, 37, 3
+ xxpermdi 43, 31, 48, 0
+ vaddudm 6, 16, 11
+ xxpermdi 38, 31, 38, 3
+ xxpermdi 44, 31, 49, 0
+ vaddudm 7, 17, 12
+ xxpermdi 39, 31, 39, 3
+ xxpermdi 45, 31, 50, 0
+ vaddudm 8, 18, 13
+ xxpermdi 40, 31, 40, 3
+
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 4, 31
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 8, 8, 11
+ vsrd 12, 8, 31
+ vaddudm 5, 5, 10
+
+ vsrd 11, 5, 31
+ vand 8, 8, 25
+ vand 5, 5, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 6, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+
+ b do_final_update
+
+do_final_update:
+ # v4, v5, v6, v7 and v8 are 26 bit vectors
+ vsld 5, 5, 31
+ vor 20, 4, 5
+ vspltisb 11, 12
+ vsrd 12, 6, 11
+ vsld 6, 6, 31
+ vsld 6, 6, 31
+ vor 20, 20, 6
+ vspltisb 11, 14
+ vsld 7, 7, 11
+ vor 21, 7, 12
+ mfvsrld 16, 40 # save last 2 bytes
+ vsld 8, 8, 11
+ vsld 8, 8, 31
+ vor 21, 21, 8
+ mfvsrld 17, 52
+ mfvsrld 19, 53
+ srdi 16, 16, 24
+
+ std 17, 32(3)
+ std 19, 40(3)
+ stw 16, 48(3)
+
+Out_loop:
+ li 3, 0
+
+ li 14, 256
+ lvx 20, 14, 1
+ addi 14, 14, 16
+ lvx 21, 14, 1
+ addi 14, 14, 16
+ lvx 22, 14, 1
+ addi 14, 14, 16
+ lvx 23, 14, 1
+ addi 14, 14, 16
+ lvx 24, 14, 1
+ addi 14, 14, 16
+ lvx 25, 14, 1
+ addi 14, 14, 16
+ lvx 26, 14, 1
+ addi 14, 14, 16
+ lvx 27, 14, 1
+ addi 14, 14, 16
+ lvx 28, 14, 1
+ addi 14, 14, 16
+ lvx 29, 14, 1
+ addi 14, 14, 16
+ lvx 30, 14, 1
+ addi 14, 14, 16
+ lvx 31, 14, 1
+
+ addi 14, 14, 16
+ lxvx 14, 14, 1
+ addi 14, 14, 16
+ lxvx 15, 14, 1
+ addi 14, 14, 16
+ lxvx 16, 14, 1
+ addi 14, 14, 16
+ lxvx 17, 14, 1
+ addi 14, 14, 16
+ lxvx 18, 14, 1
+ addi 14, 14, 16
+ lxvx 19, 14, 1
+ addi 14, 14, 16
+ lxvx 20, 14, 1
+ addi 14, 14, 16
+ lxvx 21, 14, 1
+ addi 14, 14, 16
+ lxvx 22, 14, 1
+ addi 14, 14, 16
+ lxvx 23, 14, 1
+ addi 14, 14, 16
+ lxvx 24, 14, 1
+ addi 14, 14, 16
+ lxvx 25, 14, 1
+ addi 14, 14, 16
+ lxvx 26, 14, 1
+ addi 14, 14, 16
+ lxvx 27, 14, 1
+ addi 14, 14, 16
+ lxvx 28, 14, 1
+ addi 14, 14, 16
+ lxvx 29, 14, 1
+ addi 14, 14, 16
+ lxvx 30, 14, 1
+ addi 14, 14, 16
+ lxvx 31, 14, 1
+
+ ld 0, 1040(1)
+ ld 14,112(1)
+ ld 15,120(1)
+ ld 16,128(1)
+ ld 17,136(1)
+ ld 18,144(1)
+ ld 19,152(1)
+ ld 20,160(1)
+ ld 21,168(1)
+ ld 31,248(1)
+
+ mtlr 0
+ addi 1, 1, 1024
+ blr
+
+Out_no_poly1305:
+ li 3, 0
+ blr
+
+.data
+.align 5
+rmask:
+.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
+cnum:
+.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
+.long 0x1a, 0x00, 0x1a, 0x00
+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000
+.long 0x00010203, 0x04050607, 0x10111213, 0x14151617
+.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
+.long 0x05, 0x00, 0x00, 0x00
+.long 0x02020202, 0x02020202, 0x02020202, 0x02020202
+.long 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
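The limb arithmetic sketched in the header comments of the file above can be
written out in plain C as follows (illustrative only, not code from this
patch): a[0..4] and r[0..4] hold the 130-bit accumulator and clamped key in
26-bit limbs, and the r*5 factors implement the reduction identity
2^130 = 5 (mod 2^130 - 5). Each product of 26-bit limbs fits a 64-bit word
with ample headroom for the four additions per output limb, which is what
lets mul_odd/mul_even defer carry propagation to a single pass afterwards.

    #include <stdint.h>

    /* C version of the p[0]..p[4] formulas in the header comment.  */
    static void poly1305_mul_limbs(uint64_t p[5], const uint32_t a[5],
                                   const uint32_t r[5])
    {
      /* Precompute 5*r[i] for the wrap-around terms.  */
      uint64_t r5[5];
      int i;
      for (i = 0; i < 5; i++)
        r5[i] = (uint64_t) r[i] * 5;

      p[0] = (uint64_t) a[0]*r[0] + a[1]*r5[4] + a[2]*r5[3]
             + a[3]*r5[2] + a[4]*r5[1];
      p[1] = (uint64_t) a[0]*r[1] + (uint64_t) a[1]*r[0] + a[2]*r5[4]
             + a[3]*r5[3] + a[4]*r5[2];
      p[2] = (uint64_t) a[0]*r[2] + (uint64_t) a[1]*r[1]
             + (uint64_t) a[2]*r[0] + a[3]*r5[4] + a[4]*r5[3];
      p[3] = (uint64_t) a[0]*r[3] + (uint64_t) a[1]*r[2]
             + (uint64_t) a[2]*r[1] + (uint64_t) a[3]*r[0] + a[4]*r5[4];
      p[4] = (uint64_t) a[0]*r[4] + (uint64_t) a[1]*r[3]
             + (uint64_t) a[2]*r[2] + (uint64_t) a[3]*r[1]
             + (uint64_t) a[4]*r[0];
    }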
Index: libgcrypt-1.10.2/cipher/poly1305.c
===================================================================
--- libgcrypt-1.10.2.orig/cipher/poly1305.c
+++ libgcrypt-1.10.2/cipher/poly1305.c
@@ -78,11 +78,23 @@ poly1305_blocks (poly1305_context_t *ctx
 #endif /* USE_S390X_ASM */
 
 
+#ifdef POLY1305_USE_PPC_VEC
+
+extern unsigned int
+gcry_poly1305_p10le_4blocks(unsigned char *key, const byte *m, size_t len);
+
+#endif /* POLY1305_USE_PPC_VEC */
+
+
 static void poly1305_init (poly1305_context_t *ctx,
                            const byte key[POLY1305_KEYLEN])
 {
   POLY1305_STATE *st = &ctx->state;
 
+#ifdef POLY1305_USE_PPC_VEC
+  ctx->use_p10 = (_gcry_get_hw_features () & HWF_PPC_ARCH_3_10) != 0;
+#endif
+
   ctx->leftover = 0;
 
   st->h[0] = 0;
@@ -533,6 +545,7 @@ _gcry_poly1305_update_burn (poly1305_con
                             size_t bytes)
 {
   unsigned int burn = 0;
+  unsigned int nburn;
 
   /* handle leftover */
   if (ctx->leftover)
@@ -546,15 +559,31 @@ _gcry_poly1305_update_burn (poly1305_con
       ctx->leftover += want;
       if (ctx->leftover < POLY1305_BLOCKSIZE)
        return 0;
-      burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
+      nburn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
+      burn = nburn > burn ? nburn : burn;
       ctx->leftover = 0;
     }
 
+#ifdef POLY1305_USE_PPC_VEC
+  /* PPC-P10/little-endian: bulk process multiples of eight blocks */
+  if (ctx->use_p10 && bytes >= POLY1305_BLOCKSIZE * 8)
+    {
+      size_t nblks = bytes / (POLY1305_BLOCKSIZE * 8);
+      size_t len = nblks * (POLY1305_BLOCKSIZE * 8);
+      POLY1305_STATE *st = &ctx->state;
+      nburn = gcry_poly1305_p10le_4blocks ((unsigned char *) st, m, len);
+      burn = nburn > burn ? nburn : burn;
+      m += len;
+      bytes -= len;
+    }
+#endif /* POLY1305_USE_PPC_VEC */
+
   /* process full blocks */
   if (bytes >= POLY1305_BLOCKSIZE)
     {
       size_t nblks = bytes / POLY1305_BLOCKSIZE;
-      burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
+      nburn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
+      burn = nburn > burn ? nburn : burn;
       m += nblks * POLY1305_BLOCKSIZE;
       bytes -= nblks * POLY1305_BLOCKSIZE;
     }
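Two small idioms in the poly1305.c hunk above are worth spelling out. First,
because an update can now make several block calls in sequence, the stack-burn
depth is accumulated as a running maximum (`burn = nburn > burn ? nburn : burn;`)
rather than overwritten by the last call. Second, the P10 bulk path rounds the
input down to a multiple of POLY1305_BLOCKSIZE * 8 = 128 bytes and leaves the
tail to the generic code; a hypothetical helper showing the same arithmetic:

    #include <stddef.h>

    /* Largest multiple of 128 bytes the P10 routine will consume;
       equals nblks * (POLY1305_BLOCKSIZE * 8) in the hunk above.  */
    static size_t poly1305_p10_bulk_len(size_t bytes)
    {
      return (bytes / 128) * 128;   /* same as bytes & ~(size_t) 127 */
    }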
Index: libgcrypt-1.10.2/configure.ac
===================================================================
--- libgcrypt-1.10.2.orig/configure.ac
+++ libgcrypt-1.10.2/configure.ac
@@ -2779,6 +2779,11 @@ if test "$found" = "1" ; then
       powerpc64le-*-*)
          # Build with the ppc8 vector implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo"
+         # Build with the assembly implementation
+         if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" &&
+            test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
+            GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-p10le-8x.lo"
+         fi
       ;;
       powerpc64-*-*)
          # Build with the ppc8 vector implementation
@@ -3117,6 +3122,13 @@ case "${host}" in
   s390x-*-*)
     GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-s390x.lo"
   ;;
+  powerpc64le-*-*)
+    # Build with the assembly implementation
+    if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" &&
+       test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
+       GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-p10le.lo"
+    fi
+  ;;
 esac
 
 LIST_MEMBER(scrypt, $enabled_kdfs)