From 30ec5cbd476fdf2bb3f79b85f64b64fda3726b3033b1fbee421e39cdf860fd76 Mon Sep 17 00:00:00 2001 From: Pedro Monreal Gonzalez Date: Wed, 11 Oct 2023 07:38:30 +0000 Subject: [PATCH] Accepting request 1116818 from home:pmonrealgonzalez:branches:devel:libraries:c_c++ - POWER: performance enhancements for cryptography [jsc#PED-5088] * Optimize Chacha20 and Poly1305 for PPC P10 LE: [T6006] - Chacha20/poly1305: Optimized chacha20/poly1305 for P10 operation [rC88fe7ac33eb4] - ppc: enable P10 assembly with ENABLE_FORCE_SOFT_HWFEATURES on arch-3.00 [rC2c5e5ab6843d] * Add patches: - libgcrypt-Chacha20-poly1305-Optimized-chacha20-poly1305.patch - libgcrypt-ppc-enable-P10-assembly-with-ENABLE_FORCE_SOF.patch OBS-URL: https://build.opensuse.org/request/show/1116818 OBS-URL: https://build.opensuse.org/package/show/devel:libraries:c_c++/libgcrypt?expand=0&rev=170 --- ...poly1305-Optimized-chacha20-poly1305.patch | 1993 +++++++++++++++++ ...e-P10-assembly-with-ENABLE_FORCE_SOF.patch | 76 + libgcrypt.changes | 13 + libgcrypt.spec | 3 + 4 files changed, 2085 insertions(+) create mode 100644 libgcrypt-Chacha20-poly1305-Optimized-chacha20-poly1305.patch create mode 100644 libgcrypt-ppc-enable-P10-assembly-with-ENABLE_FORCE_SOF.patch diff --git a/libgcrypt-Chacha20-poly1305-Optimized-chacha20-poly1305.patch b/libgcrypt-Chacha20-poly1305-Optimized-chacha20-poly1305.patch new file mode 100644 index 0000000..5877fee --- /dev/null +++ b/libgcrypt-Chacha20-poly1305-Optimized-chacha20-poly1305.patch @@ -0,0 +1,1993 @@ +commit 88fe7ac33eb4cb4dff76a5cc7fca50da5fb0ee3a +Author: Danny Tsen +Date: Sun Jun 12 21:30:19 2022 +0300 + + Chacha20 poly1305 Optimized chacha20 poly1305 for P10 operation + + * configure.ac: Added chacha20 and poly1305 assembly implementations. + * cipher/chacha20-p10le-8x.s: (New) - support 8 blocks (512 bytes) + unrolling. + * cipher/poly1305-p10le.s: (New) - support 4 blocks (128 bytes) + unrolling. + * cipher/Makefile.am: Added new chacha20 and poly1305 files. + * cipher/chacha20.c: Added PPC p10 le support for 8x chacha20. + * cipher/poly1305.c: Added PPC p10 le support for 4x poly1305. + * cipher/poly1305-internal.h: Added PPC p10 le support for poly1305. + --- + + GnuPG-bug-id: 6006 + Signed-off-by: Danny Tsen + [jk: cosmetic changes to C code] + [jk: fix building on ppc64be] + Signed-off-by: Jussi Kivilinna + +Index: libgcrypt-1.10.2/cipher/Makefile.am +=================================================================== +--- libgcrypt-1.10.2.orig/cipher/Makefile.am ++++ libgcrypt-1.10.2/cipher/Makefile.am +@@ -83,6 +83,7 @@ EXTRA_libcipher_la_SOURCES = \ + chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ + chacha20-armv7-neon.S chacha20-aarch64.S \ + chacha20-ppc.c chacha20-s390x.S \ ++ chacha20-p10le-8x.s \ + cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \ + cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ + crc.c crc-intel-pclmul.c crc-armv8-ce.c \ +@@ -99,6 +100,7 @@ EXTRA_libcipher_la_SOURCES = \ + md4.c \ + md5.c \ + poly1305-s390x.S \ ++ poly1305-p10le.s \ + rijndael.c rijndael-internal.h rijndael-tables.h \ + rijndael-aesni.c rijndael-padlock.c \ + rijndael-amd64.S rijndael-arm.S \ +Index: libgcrypt-1.10.2/cipher/chacha20-p10le-8x.s +=================================================================== +--- /dev/null ++++ libgcrypt-1.10.2/cipher/chacha20-p10le-8x.s +@@ -0,0 +1,864 @@ ++# Copyright 2021- IBM Inc. All rights reserved ++# ++# This file is part of Libgcrypt. 
++# ++# Libgcrypt is free software; you can redistribute it and/or modify ++# it under the terms of the GNU Lesser General Public License as ++# published by the Free Software Foundation; either version 2.1 of ++# the License, or (at your option) any later version. ++# ++# Libgcrypt is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU Lesser General Public License for more details. ++# ++# You should have received a copy of the GNU Lesser General Public ++# License along with this program; if not, see . ++# ++#=================================================================================== ++# Written by Danny Tsen ++# ++# This function handles multiple 64-byte block data length ++# and the length should be more than 512 bytes. ++# ++# unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len); ++# ++# r1 - top of the stack ++# r3 to r10 input parameters ++# r3 - out ++# r4 - inp ++# r5 - len ++# r6 - key[8] ++# r7 - counter[4] ++# ++# do rounds, 8 quarter rounds ++# 1. a += b; d ^= a; d <<<= 16; ++# 2. c += d; b ^= c; b <<<= 12; ++# 3. a += b; d ^= a; d <<<= 8; ++# 4. c += d; b ^= c; b <<<= 7 ++# ++# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 ++# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 ++# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 ++# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 ++# ++# 4 blocks (a b c d) ++# ++# a0 b0 c0 d0 ++# a1 b1 c1 d1 ++# ... ++# a4 b4 c4 d4 ++# ... ++# a8 b8 c8 d8 ++# ... ++# a12 b12 c12 d12 ++# a13 ... ++# a14 ... ++# a15 b15 c15 d15 ++# ++# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) ++# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) ++# ++.text ++ ++.macro QT_loop_8x ++ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) ++ xxlor 0, 32+25, 32+25 ++ xxlor 32+25, 20, 20 ++ vadduwm 0, 0, 4 ++ vadduwm 1, 1, 5 ++ vadduwm 2, 2, 6 ++ vadduwm 3, 3, 7 ++ vadduwm 16, 16, 20 ++ vadduwm 17, 17, 21 ++ vadduwm 18, 18, 22 ++ vadduwm 19, 19, 23 ++ ++ vpermxor 12, 12, 0, 25 ++ vpermxor 13, 13, 1, 25 ++ vpermxor 14, 14, 2, 25 ++ vpermxor 15, 15, 3, 25 ++ vpermxor 28, 28, 16, 25 ++ vpermxor 29, 29, 17, 25 ++ vpermxor 30, 30, 18, 25 ++ vpermxor 31, 31, 19, 25 ++ xxlor 32+25, 0, 0 ++ vadduwm 8, 8, 12 ++ vadduwm 9, 9, 13 ++ vadduwm 10, 10, 14 ++ vadduwm 11, 11, 15 ++ vadduwm 24, 24, 28 ++ vadduwm 25, 25, 29 ++ vadduwm 26, 26, 30 ++ vadduwm 27, 27, 31 ++ vxor 4, 4, 8 ++ vxor 5, 5, 9 ++ vxor 6, 6, 10 ++ vxor 7, 7, 11 ++ vxor 20, 20, 24 ++ vxor 21, 21, 25 ++ vxor 22, 22, 26 ++ vxor 23, 23, 27 ++ ++ xxlor 0, 32+25, 32+25 ++ xxlor 32+25, 21, 21 ++ vrlw 4, 4, 25 # ++ vrlw 5, 5, 25 ++ vrlw 6, 6, 25 ++ vrlw 7, 7, 25 ++ vrlw 20, 20, 25 # ++ vrlw 21, 21, 25 ++ vrlw 22, 22, 25 ++ vrlw 23, 23, 25 ++ xxlor 32+25, 0, 0 ++ vadduwm 0, 0, 4 ++ vadduwm 1, 1, 5 ++ vadduwm 2, 2, 6 ++ vadduwm 3, 3, 7 ++ vadduwm 16, 16, 20 ++ vadduwm 17, 17, 21 ++ vadduwm 18, 18, 22 ++ vadduwm 19, 19, 23 ++ ++ xxlor 0, 32+25, 32+25 ++ xxlor 32+25, 22, 22 ++ vpermxor 12, 12, 0, 25 ++ vpermxor 13, 13, 1, 25 ++ vpermxor 14, 14, 2, 25 ++ vpermxor 15, 15, 3, 25 ++ vpermxor 28, 28, 16, 25 ++ vpermxor 29, 29, 17, 25 ++ vpermxor 30, 30, 18, 25 ++ vpermxor 31, 31, 19, 25 ++ xxlor 32+25, 0, 0 ++ vadduwm 8, 8, 12 ++ vadduwm 9, 9, 13 ++ vadduwm 10, 10, 
14 ++ vadduwm 11, 11, 15 ++ vadduwm 24, 24, 28 ++ vadduwm 25, 25, 29 ++ vadduwm 26, 26, 30 ++ vadduwm 27, 27, 31 ++ xxlor 0, 32+28, 32+28 ++ xxlor 32+28, 23, 23 ++ vxor 4, 4, 8 ++ vxor 5, 5, 9 ++ vxor 6, 6, 10 ++ vxor 7, 7, 11 ++ vxor 20, 20, 24 ++ vxor 21, 21, 25 ++ vxor 22, 22, 26 ++ vxor 23, 23, 27 ++ vrlw 4, 4, 28 # ++ vrlw 5, 5, 28 ++ vrlw 6, 6, 28 ++ vrlw 7, 7, 28 ++ vrlw 20, 20, 28 # ++ vrlw 21, 21, 28 ++ vrlw 22, 22, 28 ++ vrlw 23, 23, 28 ++ xxlor 32+28, 0, 0 ++ ++ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) ++ xxlor 0, 32+25, 32+25 ++ xxlor 32+25, 20, 20 ++ vadduwm 0, 0, 5 ++ vadduwm 1, 1, 6 ++ vadduwm 2, 2, 7 ++ vadduwm 3, 3, 4 ++ vadduwm 16, 16, 21 ++ vadduwm 17, 17, 22 ++ vadduwm 18, 18, 23 ++ vadduwm 19, 19, 20 ++ ++ vpermxor 15, 15, 0, 25 ++ vpermxor 12, 12, 1, 25 ++ vpermxor 13, 13, 2, 25 ++ vpermxor 14, 14, 3, 25 ++ vpermxor 31, 31, 16, 25 ++ vpermxor 28, 28, 17, 25 ++ vpermxor 29, 29, 18, 25 ++ vpermxor 30, 30, 19, 25 ++ ++ xxlor 32+25, 0, 0 ++ vadduwm 10, 10, 15 ++ vadduwm 11, 11, 12 ++ vadduwm 8, 8, 13 ++ vadduwm 9, 9, 14 ++ vadduwm 26, 26, 31 ++ vadduwm 27, 27, 28 ++ vadduwm 24, 24, 29 ++ vadduwm 25, 25, 30 ++ vxor 5, 5, 10 ++ vxor 6, 6, 11 ++ vxor 7, 7, 8 ++ vxor 4, 4, 9 ++ vxor 21, 21, 26 ++ vxor 22, 22, 27 ++ vxor 23, 23, 24 ++ vxor 20, 20, 25 ++ ++ xxlor 0, 32+25, 32+25 ++ xxlor 32+25, 21, 21 ++ vrlw 5, 5, 25 ++ vrlw 6, 6, 25 ++ vrlw 7, 7, 25 ++ vrlw 4, 4, 25 ++ vrlw 21, 21, 25 ++ vrlw 22, 22, 25 ++ vrlw 23, 23, 25 ++ vrlw 20, 20, 25 ++ xxlor 32+25, 0, 0 ++ ++ vadduwm 0, 0, 5 ++ vadduwm 1, 1, 6 ++ vadduwm 2, 2, 7 ++ vadduwm 3, 3, 4 ++ vadduwm 16, 16, 21 ++ vadduwm 17, 17, 22 ++ vadduwm 18, 18, 23 ++ vadduwm 19, 19, 20 ++ ++ xxlor 0, 32+25, 32+25 ++ xxlor 32+25, 22, 22 ++ vpermxor 15, 15, 0, 25 ++ vpermxor 12, 12, 1, 25 ++ vpermxor 13, 13, 2, 25 ++ vpermxor 14, 14, 3, 25 ++ vpermxor 31, 31, 16, 25 ++ vpermxor 28, 28, 17, 25 ++ vpermxor 29, 29, 18, 25 ++ vpermxor 30, 30, 19, 25 ++ xxlor 32+25, 0, 0 ++ ++ vadduwm 10, 10, 15 ++ vadduwm 11, 11, 12 ++ vadduwm 8, 8, 13 ++ vadduwm 9, 9, 14 ++ vadduwm 26, 26, 31 ++ vadduwm 27, 27, 28 ++ vadduwm 24, 24, 29 ++ vadduwm 25, 25, 30 ++ ++ xxlor 0, 32+28, 32+28 ++ xxlor 32+28, 23, 23 ++ vxor 5, 5, 10 ++ vxor 6, 6, 11 ++ vxor 7, 7, 8 ++ vxor 4, 4, 9 ++ vxor 21, 21, 26 ++ vxor 22, 22, 27 ++ vxor 23, 23, 24 ++ vxor 20, 20, 25 ++ vrlw 5, 5, 28 ++ vrlw 6, 6, 28 ++ vrlw 7, 7, 28 ++ vrlw 4, 4, 28 ++ vrlw 21, 21, 28 ++ vrlw 22, 22, 28 ++ vrlw 23, 23, 28 ++ vrlw 20, 20, 28 ++ xxlor 32+28, 0, 0 ++.endm ++ ++.macro QT_loop_4x ++ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) ++ vadduwm 0, 0, 4 ++ vadduwm 1, 1, 5 ++ vadduwm 2, 2, 6 ++ vadduwm 3, 3, 7 ++ vpermxor 12, 12, 0, 20 ++ vpermxor 13, 13, 1, 20 ++ vpermxor 14, 14, 2, 20 ++ vpermxor 15, 15, 3, 20 ++ vadduwm 8, 8, 12 ++ vadduwm 9, 9, 13 ++ vadduwm 10, 10, 14 ++ vadduwm 11, 11, 15 ++ vxor 4, 4, 8 ++ vxor 5, 5, 9 ++ vxor 6, 6, 10 ++ vxor 7, 7, 11 ++ vrlw 4, 4, 21 ++ vrlw 5, 5, 21 ++ vrlw 6, 6, 21 ++ vrlw 7, 7, 21 ++ vadduwm 0, 0, 4 ++ vadduwm 1, 1, 5 ++ vadduwm 2, 2, 6 ++ vadduwm 3, 3, 7 ++ vpermxor 12, 12, 0, 22 ++ vpermxor 13, 13, 1, 22 ++ vpermxor 14, 14, 2, 22 ++ vpermxor 15, 15, 3, 22 ++ vadduwm 8, 8, 12 ++ vadduwm 9, 9, 13 ++ vadduwm 10, 10, 14 ++ vadduwm 11, 11, 15 ++ vxor 4, 4, 8 ++ vxor 5, 5, 9 ++ vxor 6, 6, 10 ++ vxor 7, 7, 11 ++ vrlw 4, 4, 23 ++ vrlw 5, 5, 23 ++ vrlw 6, 6, 23 ++ vrlw 7, 7, 23 ++ ++ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) ++ vadduwm 0, 0, 5 ++ vadduwm 1, 1, 6 ++ vadduwm 
2, 2, 7 ++ vadduwm 3, 3, 4 ++ vpermxor 15, 15, 0, 20 ++ vpermxor 12, 12, 1, 20 ++ vpermxor 13, 13, 2, 20 ++ vpermxor 14, 14, 3, 20 ++ vadduwm 10, 10, 15 ++ vadduwm 11, 11, 12 ++ vadduwm 8, 8, 13 ++ vadduwm 9, 9, 14 ++ vxor 5, 5, 10 ++ vxor 6, 6, 11 ++ vxor 7, 7, 8 ++ vxor 4, 4, 9 ++ vrlw 5, 5, 21 ++ vrlw 6, 6, 21 ++ vrlw 7, 7, 21 ++ vrlw 4, 4, 21 ++ vadduwm 0, 0, 5 ++ vadduwm 1, 1, 6 ++ vadduwm 2, 2, 7 ++ vadduwm 3, 3, 4 ++ vpermxor 15, 15, 0, 22 ++ vpermxor 12, 12, 1, 22 ++ vpermxor 13, 13, 2, 22 ++ vpermxor 14, 14, 3, 22 ++ vadduwm 10, 10, 15 ++ vadduwm 11, 11, 12 ++ vadduwm 8, 8, 13 ++ vadduwm 9, 9, 14 ++ vxor 5, 5, 10 ++ vxor 6, 6, 11 ++ vxor 7, 7, 8 ++ vxor 4, 4, 9 ++ vrlw 5, 5, 23 ++ vrlw 6, 6, 23 ++ vrlw 7, 7, 23 ++ vrlw 4, 4, 23 ++.endm ++ ++# Transpose ++.macro TP_4x a0 a1 a2 a3 ++ xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1 ++ xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3 ++ xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1 ++ xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3 ++ xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3 ++ xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3 ++ xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3 ++ xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3 ++.endm ++ ++# key stream = working state + state ++.macro Add_state S ++ vadduwm \S+0, \S+0, 16-\S ++ vadduwm \S+4, \S+4, 17-\S ++ vadduwm \S+8, \S+8, 18-\S ++ vadduwm \S+12, \S+12, 19-\S ++ ++ vadduwm \S+1, \S+1, 16-\S ++ vadduwm \S+5, \S+5, 17-\S ++ vadduwm \S+9, \S+9, 18-\S ++ vadduwm \S+13, \S+13, 19-\S ++ ++ vadduwm \S+2, \S+2, 16-\S ++ vadduwm \S+6, \S+6, 17-\S ++ vadduwm \S+10, \S+10, 18-\S ++ vadduwm \S+14, \S+14, 19-\S ++ ++ vadduwm \S+3, \S+3, 16-\S ++ vadduwm \S+7, \S+7, 17-\S ++ vadduwm \S+11, \S+11, 18-\S ++ vadduwm \S+15, \S+15, 19-\S ++.endm ++ ++# ++# write 256 bytes ++# ++.macro Write_256 S ++ add 9, 14, 5 ++ add 16, 14, 4 ++ lxvw4x 0, 0, 9 ++ lxvw4x 1, 17, 9 ++ lxvw4x 2, 18, 9 ++ lxvw4x 3, 19, 9 ++ lxvw4x 4, 20, 9 ++ lxvw4x 5, 21, 9 ++ lxvw4x 6, 22, 9 ++ lxvw4x 7, 23, 9 ++ lxvw4x 8, 24, 9 ++ lxvw4x 9, 25, 9 ++ lxvw4x 10, 26, 9 ++ lxvw4x 11, 27, 9 ++ lxvw4x 12, 28, 9 ++ lxvw4x 13, 29, 9 ++ lxvw4x 14, 30, 9 ++ lxvw4x 15, 31, 9 ++ ++ xxlxor \S+32, \S+32, 0 ++ xxlxor \S+36, \S+36, 1 ++ xxlxor \S+40, \S+40, 2 ++ xxlxor \S+44, \S+44, 3 ++ xxlxor \S+33, \S+33, 4 ++ xxlxor \S+37, \S+37, 5 ++ xxlxor \S+41, \S+41, 6 ++ xxlxor \S+45, \S+45, 7 ++ xxlxor \S+34, \S+34, 8 ++ xxlxor \S+38, \S+38, 9 ++ xxlxor \S+42, \S+42, 10 ++ xxlxor \S+46, \S+46, 11 ++ xxlxor \S+35, \S+35, 12 ++ xxlxor \S+39, \S+39, 13 ++ xxlxor \S+43, \S+43, 14 ++ xxlxor \S+47, \S+47, 15 ++ ++ stxvw4x \S+32, 0, 16 ++ stxvw4x \S+36, 17, 16 ++ stxvw4x \S+40, 18, 16 ++ stxvw4x \S+44, 19, 16 ++ ++ stxvw4x \S+33, 20, 16 ++ stxvw4x \S+37, 21, 16 ++ stxvw4x \S+41, 22, 16 ++ stxvw4x \S+45, 23, 16 ++ ++ stxvw4x \S+34, 24, 16 ++ stxvw4x \S+38, 25, 16 ++ stxvw4x \S+42, 26, 16 ++ stxvw4x \S+46, 27, 16 ++ ++ stxvw4x \S+35, 28, 16 ++ stxvw4x \S+39, 29, 16 ++ stxvw4x \S+43, 30, 16 ++ stxvw4x \S+47, 31, 16 ++ ++.endm ++ ++# ++# unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len); ++# ++.global _gcry_chacha20_p10le_8x ++.align 5 ++_gcry_chacha20_p10le_8x: ++ cmpdi 6, 512 ++ blt Out_no_chacha ++ ++ stdu 1,-1024(1) ++ mflr 0 ++ ++ std 14,112(1) ++ std 15,120(1) ++ std 16,128(1) ++ std 17,136(1) ++ std 18,144(1) ++ std 19,152(1) ++ std 20,160(1) ++ std 21,168(1) ++ std 22,176(1) ++ std 23,184(1) ++ std 24,192(1) ++ std 25,200(1) ++ std 26,208(1) ++ std 27,216(1) ++ std 28,224(1) ++ std 29,232(1) ++ std 30,240(1) ++ std 31,248(1) ++ std 0, 1040(1) 
++ ++ li 17, 16 ++ li 18, 32 ++ li 19, 48 ++ li 20, 64 ++ li 21, 80 ++ li 22, 96 ++ li 23, 112 ++ li 24, 128 ++ li 25, 144 ++ li 26, 160 ++ li 27, 176 ++ li 28, 192 ++ li 29, 208 ++ li 30, 224 ++ li 31, 240 ++ addi 9, 1, 256 ++ stvx 20, 0, 9 ++ stvx 21, 17, 9 ++ stvx 22, 18, 9 ++ stvx 23, 19, 9 ++ stvx 24, 20, 9 ++ stvx 25, 21, 9 ++ stvx 26, 22, 9 ++ stvx 27, 23, 9 ++ stvx 28, 24, 9 ++ stvx 29, 25, 9 ++ stvx 30, 26, 9 ++ stvx 31, 27, 9 ++ ++ add 9, 9, 27 ++ addi 14, 17, 16 ++ stxvx 14, 14, 9 ++ addi 14, 14, 16 ++ stxvx 15, 14, 9 ++ addi 14, 14, 16 ++ stxvx 16, 14, 9 ++ addi 14, 14, 16 ++ stxvx 17, 14, 9 ++ addi 14, 14, 16 ++ stxvx 18, 14, 9 ++ addi 14, 14, 16 ++ stxvx 19, 14, 9 ++ addi 14, 14, 16 ++ stxvx 20, 14, 9 ++ addi 14, 14, 16 ++ stxvx 21, 14, 9 ++ addi 14, 14, 16 ++ stxvx 22, 14, 9 ++ addi 14, 14, 16 ++ stxvx 23, 14, 9 ++ addi 14, 14, 16 ++ stxvx 24, 14, 9 ++ addi 14, 14, 16 ++ stxvx 25, 14, 9 ++ addi 14, 14, 16 ++ stxvx 26, 14, 9 ++ addi 14, 14, 16 ++ stxvx 27, 14, 9 ++ addi 14, 14, 16 ++ stxvx 28, 14, 9 ++ addi 14, 14, 16 ++ stxvx 29, 14, 9 ++ addi 14, 14, 16 ++ stxvx 30, 14, 9 ++ addi 14, 14, 16 ++ stxvx 31, 14, 9 ++ ++ mr 15, 6 # len ++ li 14, 0 # offset to inp and outp ++ ++ ld 10, sigma@got(2) ++ ++ lxvw4x 48, 0, 3 # vr16, constants ++ lxvw4x 49, 17, 3 # vr17, key 1 ++ lxvw4x 50, 18, 3 # vr18, key 2 ++ lxvw4x 51, 19, 3 # vr19, counter, nonce ++ ++ lxvw4x 62, 19, 10 # vr30, 4 ++ ++ vspltisw 21, 12 ++ vspltisw 23, 7 ++ ++ ld 11, permx@got(2) ++ lxvw4x 32+20, 0, 11 ++ lxvw4x 32+22, 17, 11 ++ ++ li 8, 10 ++ mtctr 8 ++ ++ xxlor 16, 48, 48 ++ xxlor 17, 49, 49 ++ xxlor 18, 50, 50 ++ xxlor 19, 51, 51 ++ ++ vspltisw 25, 4 ++ vspltisw 26, 8 ++ ++ xxlor 16, 48, 48 ++ xxlor 17, 49, 49 ++ xxlor 18, 50, 50 ++ xxlor 19, 51, 51 ++ ++ xxlor 25, 32+26, 32+26 ++ xxlor 24, 32+25, 32+25 ++ ++ vadduwm 31, 30, 25 # (0, 1, 2, 3) + (4, 4, 4, 4) ++ xxlor 30, 32+30, 32+30 ++ xxlor 31, 32+31, 32+31 ++ ++ xxlor 20, 32+20, 32+20 ++ xxlor 21, 32+21, 32+21 ++ xxlor 22, 32+22, 32+22 ++ xxlor 23, 32+23, 32+23 ++ ++Loop_8x: ++ lvx 0, 20, 10 ++ lvx 1, 21, 10 ++ lvx 2, 22, 10 ++ lvx 3, 23, 10 ++ xxspltw 32+4, 17, 0 ++ xxspltw 32+5, 17, 1 ++ xxspltw 32+6, 17, 2 ++ xxspltw 32+7, 17, 3 ++ xxspltw 32+8, 18, 0 ++ xxspltw 32+9, 18, 1 ++ xxspltw 32+10, 18, 2 ++ xxspltw 32+11, 18, 3 ++ xxspltw 32+12, 19, 0 ++ xxspltw 32+13, 19, 1 ++ xxspltw 32+14, 19, 2 ++ xxspltw 32+15, 19, 3 ++ vadduwm 12, 12, 30 # increase counter ++ ++ lvx 16, 20, 10 ++ lvx 17, 21, 10 ++ lvx 18, 22, 10 ++ lvx 19, 23, 10 ++ xxspltw 32+20, 17, 0 ++ xxspltw 32+21, 17, 1 ++ xxspltw 32+22, 17, 2 ++ xxspltw 32+23, 17, 3 ++ xxspltw 32+24, 18, 0 ++ xxspltw 32+25, 18, 1 ++ xxspltw 32+26, 18, 2 ++ xxspltw 32+27, 18, 3 ++ xxspltw 32+28, 19, 0 ++ xxspltw 32+29, 19, 1 ++ vadduwm 28, 28, 31 # increase counter ++ xxspltw 32+30, 19, 2 ++ xxspltw 32+31, 19, 3 ++ ++.align 5 ++quarter_loop_8x: ++ QT_loop_8x ++ ++ bdnz quarter_loop_8x ++ ++ xxlor 0, 32+30, 32+30 ++ xxlor 32+30, 30, 30 ++ vadduwm 12, 12, 30 ++ xxlor 32+30, 0, 0 ++ TP_4x 0, 1, 2, 3 ++ TP_4x 4, 5, 6, 7 ++ TP_4x 8, 9, 10, 11 ++ TP_4x 12, 13, 14, 15 ++ ++ xxlor 0, 48, 48 ++ xxlor 1, 49, 49 ++ xxlor 2, 50, 50 ++ xxlor 3, 51, 51 ++ xxlor 48, 16, 16 ++ xxlor 49, 17, 17 ++ xxlor 50, 18, 18 ++ xxlor 51, 19, 19 ++ Add_state 0 ++ xxlor 48, 0, 0 ++ xxlor 49, 1, 1 ++ xxlor 50, 2, 2 ++ xxlor 51, 3, 3 ++ Write_256 0 ++ addi 14, 14, 256 ++ addi 15, 15, -256 ++ ++ xxlor 5, 32+31, 32+31 ++ xxlor 32+31, 31, 31 ++ vadduwm 28, 28, 31 ++ xxlor 32+31, 5, 5 ++ TP_4x 16+0, 16+1, 16+2, 16+3 ++ TP_4x 16+4, 16+5, 16+6, 16+7 ++ 
TP_4x 16+8, 16+9, 16+10, 16+11 ++ TP_4x 16+12, 16+13, 16+14, 16+15 ++ ++ xxlor 32, 16, 16 ++ xxlor 33, 17, 17 ++ xxlor 34, 18, 18 ++ xxlor 35, 19, 19 ++ Add_state 16 ++ Write_256 16 ++ addi 14, 14, 256 ++ addi 15, 15, -256 ++ ++ # should update counter before out? ++ xxlor 32+24, 24, 24 ++ xxlor 32+25, 25, 25 ++ xxlor 32+30, 30, 30 ++ vadduwm 30, 30, 25 ++ vadduwm 31, 30, 24 ++ xxlor 30, 32+30, 32+30 ++ xxlor 31, 32+31, 32+31 ++ ++ cmpdi 15, 0 ++ beq Out_loop ++ ++ cmpdi 15, 512 ++ blt Loop_last ++ ++ mtctr 8 ++ b Loop_8x ++ ++Loop_last: ++ lxvw4x 48, 0, 3 # vr16, constants ++ lxvw4x 49, 17, 3 # vr17, key 1 ++ lxvw4x 50, 18, 3 # vr18, key 2 ++ lxvw4x 51, 19, 3 # vr19, counter, nonce ++ ++ vspltisw 21, 12 ++ vspltisw 23, 7 ++ lxvw4x 32+20, 0, 11 ++ lxvw4x 32+22, 17, 11 ++ ++ li 8, 10 ++ mtctr 8 ++ ++Loop_4x: ++ lvx 0, 20, 10 ++ lvx 1, 21, 10 ++ lvx 2, 22, 10 ++ lvx 3, 23, 10 ++ vspltw 4, 17, 0 ++ vspltw 5, 17, 1 ++ vspltw 6, 17, 2 ++ vspltw 7, 17, 3 ++ vspltw 8, 18, 0 ++ vspltw 9, 18, 1 ++ vspltw 10, 18, 2 ++ vspltw 11, 18, 3 ++ vspltw 12, 19, 0 ++ vadduwm 12, 12, 30 # increase counter ++ vspltw 13, 19, 1 ++ vspltw 14, 19, 2 ++ vspltw 15, 19, 3 ++ ++.align 5 ++quarter_loop: ++ QT_loop_4x ++ ++ bdnz quarter_loop ++ ++ vadduwm 12, 12, 30 ++ TP_4x 0, 1, 2, 3 ++ TP_4x 4, 5, 6, 7 ++ TP_4x 8, 9, 10, 11 ++ TP_4x 12, 13, 14, 15 ++ ++ Add_state 0 ++ Write_256 0 ++ addi 14, 14, 256 ++ addi 15, 15, -256 ++ ++ # Update state counter ++ vspltisw 25, 4 ++ vadduwm 30, 30, 25 ++ ++ cmpdi 15, 0 ++ beq Out_loop ++ ++ mtctr 8 ++ b Loop_4x ++ ++Out_loop: ++ # ++ # Update state counter ++ # ++ vspltisb 16, -1 # first 16 bytes - 0xffff...ff ++ vspltisb 17, 0 # second 16 bytes - 0x0000...00 ++ vsldoi 18, 16, 17, 12 ++ vand 18, 18, 30 ++ xxlor 32+19, 19, 19 ++ vadduwm 18, 19, 18 ++ stxvw4x 32+18, 19, 3 ++ li 3, 0 ++ ++ addi 9, 1, 256 ++ lvx 20, 0, 9 ++ lvx 21, 17, 9 ++ lvx 22, 18, 9 ++ lvx 23, 19, 9 ++ lvx 24, 20, 9 ++ lvx 25, 21, 9 ++ lvx 26, 22, 9 ++ lvx 27, 23, 9 ++ lvx 28, 24, 9 ++ lvx 29, 25, 9 ++ lvx 30, 26, 9 ++ lvx 31, 27, 9 ++ ++ add 9, 9, 27 ++ addi 14, 17, 16 ++ lxvx 14, 14, 9 ++ addi 14, 14, 16 ++ lxvx 15, 14, 9 ++ addi 14, 14, 16 ++ lxvx 16, 14, 9 ++ addi 14, 14, 16 ++ lxvx 17, 14, 9 ++ addi 14, 14, 16 ++ lxvx 18, 14, 9 ++ addi 14, 14, 16 ++ lxvx 19, 14, 9 ++ addi 14, 14, 16 ++ lxvx 20, 14, 9 ++ addi 14, 14, 16 ++ lxvx 21, 14, 9 ++ addi 14, 14, 16 ++ lxvx 22, 14, 9 ++ addi 14, 14, 16 ++ lxvx 23, 14, 9 ++ addi 14, 14, 16 ++ lxvx 24, 14, 9 ++ addi 14, 14, 16 ++ lxvx 25, 14, 9 ++ addi 14, 14, 16 ++ lxvx 26, 14, 9 ++ addi 14, 14, 16 ++ lxvx 27, 14, 9 ++ addi 14, 14, 16 ++ lxvx 28, 14, 9 ++ addi 14, 14, 16 ++ lxvx 29, 14, 9 ++ addi 14, 14, 16 ++ lxvx 30, 14, 9 ++ addi 14, 14, 16 ++ lxvx 31, 14, 9 ++ ++ ld 0, 1040(1) ++ ld 14,112(1) ++ ld 15,120(1) ++ ld 16,128(1) ++ ld 17,136(1) ++ ld 18,144(1) ++ ld 19,152(1) ++ ld 20,160(1) ++ ld 21,168(1) ++ ld 22,176(1) ++ ld 23,184(1) ++ ld 24,192(1) ++ ld 25,200(1) ++ ld 26,208(1) ++ ld 27,216(1) ++ ld 28,224(1) ++ ld 29,232(1) ++ ld 30,240(1) ++ ld 31,248(1) ++ ++ mtlr 0 ++ addi 1, 1, 1024 ++ blr ++ ++Out_no_chacha: ++ li 3, 0 ++ blr ++ ++.data ++.align 4 ++sigma: ++.long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 ++.long 0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203 ++.long 1, 0, 0, 0 ++.long 0, 1, 2, 3 ++.long 0x61707865, 0x61707865, 0x61707865, 0x61707865 ++.long 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e ++.long 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32 ++.long 0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574 ++permx: ++.long 0x22330011, 0x66774455, 
0xaabb8899, 0xeeffccdd ++.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc +Index: libgcrypt-1.10.2/cipher/chacha20.c +=================================================================== +--- libgcrypt-1.10.2.orig/cipher/chacha20.c ++++ libgcrypt-1.10.2/cipher/chacha20.c +@@ -125,6 +125,7 @@ typedef struct CHACHA20_context_s + unsigned int use_avx2:1; + unsigned int use_neon:1; + unsigned int use_ppc:1; ++ unsigned int use_p10:1; + unsigned int use_s390x:1; + } CHACHA20_context_t; + +@@ -163,6 +164,12 @@ unsigned int _gcry_chacha20_poly1305_amd + + #ifdef USE_PPC_VEC + ++#ifndef WORDS_BIGENDIAN ++unsigned int _gcry_chacha20_p10le_8x(u32 *state, byte *dst, ++ const byte *src, ++ size_t len); ++#endif ++ + unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, + const byte *src, + size_t nblks); +@@ -475,6 +482,9 @@ chacha20_do_setkey (CHACHA20_context_t * + #endif + #ifdef USE_PPC_VEC + ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; ++# ifndef WORDS_BIGENDIAN ++ ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0; ++# endif + #endif + #ifdef USE_S390X_VX + ctx->use_s390x = (features & HWF_S390X_VX) != 0; +@@ -571,7 +581,22 @@ do_chacha20_encrypt_stream_tail (CHACHA2 + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; +- nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks); ++#ifndef WORDS_BIGENDIAN ++ /* ++ * A workaround to skip counter overflow. This is rare. ++ */ ++ if (ctx->use_p10 && nblocks >= 8 ++ && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU) ++ { ++ size_t len = nblocks * CHACHA20_BLOCK_SIZE; ++ nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len); ++ } ++ else ++#endif ++ { ++ nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, ++ nblocks); ++ } + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; +@@ -760,6 +785,11 @@ _gcry_chacha20_poly1305_encrypt(gcry_cip + } + #endif + #ifdef USE_PPC_VEC_POLY1305 ++ else if (ctx->use_ppc && ctx->use_p10) ++ { ++ /* Skip stitched chacha20-poly1305 for P10. */ ++ authptr = NULL; ++ } + else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) + { + nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4); +@@ -998,6 +1028,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cip + { + CHACHA20_context_t *ctx = (void *) &c->context.c; + unsigned int nburn, burn = 0; ++ int skip_stitched = 0; + + if (!length) + return 0; +@@ -1049,6 +1080,13 @@ _gcry_chacha20_poly1305_decrypt(gcry_cip + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } + #endif ++#ifdef USE_PPC_VEC_POLY1305 ++ if (ctx->use_ppc && ctx->use_p10) ++ { ++ /* Skip stitched chacha20-poly1305 for P10. */ ++ skip_stitched = 1; ++ } ++#endif + + #ifdef USE_SSSE3 + if (ctx->use_ssse3) +@@ -1102,7 +1140,8 @@ _gcry_chacha20_poly1305_decrypt(gcry_cip + #endif + + #ifdef USE_PPC_VEC_POLY1305 +- if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE) ++ /* skip stitch for p10 */ ++ if (!skip_stitched && ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 4; +Index: libgcrypt-1.10.2/cipher/poly1305-internal.h +=================================================================== +--- libgcrypt-1.10.2.orig/cipher/poly1305-internal.h ++++ libgcrypt-1.10.2/cipher/poly1305-internal.h +@@ -33,6 +33,17 @@ + #define POLY1305_KEYLEN 32 + #define POLY1305_BLOCKSIZE 16 + ++/* POLY1305_USE_PPC_VEC indicates whether to enable PowerPC vector code. 
*/ ++#undef POLY1305_USE_PPC_VEC ++#ifdef ENABLE_PPC_CRYPTO_SUPPORT ++# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ ++ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \ ++ !defined(WORDS_BIGENDIAN) ++# if __GNUC__ >= 4 ++# define POLY1305_USE_PPC_VEC 1 ++# endif ++# endif ++#endif + + typedef struct + { +@@ -46,6 +57,9 @@ typedef struct poly1305_context_s + POLY1305_STATE state; + byte buffer[POLY1305_BLOCKSIZE]; + unsigned int leftover; ++#ifdef POLY1305_USE_PPC_VEC ++ unsigned int use_p10:1; ++#endif + } poly1305_context_t; + + +Index: libgcrypt-1.10.2/cipher/poly1305-p10le.s +=================================================================== +--- /dev/null ++++ libgcrypt-1.10.2/cipher/poly1305-p10le.s +@@ -0,0 +1,841 @@ ++# Copyright 2021- IBM Inc. All rights reserved ++# ++# This file is part of Libgcrypt. ++# ++# Libgcrypt is free software; you can redistribute it and/or modify ++# it under the terms of the GNU Lesser General Public License as ++# published by the Free Software Foundation; either version 2.1 of ++# the License, or (at your option) any later version. ++# ++# Libgcrypt is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU Lesser General Public License for more details. ++# ++# You should have received a copy of the GNU Lesser General Public ++# License along with this program; if not, see . ++# ++#=================================================================================== ++# Written by Danny Tsen ++# ++# Poly1305 - this version mainly using vector/VSX/Scalar ++# - 26 bits limbs ++# - Handle multiple 64 byte blcoks but need at least 2 64 bytes block ++# ++# Improve performance by breaking down polynominal to the sum of products with ++# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r ++# ++# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0 ++# to 9 vectors for multiplications. ++# ++# setup r^4, r^3, r^2, r vectors ++# vs [r^1, r^3, r^2, r^4] ++# vs0 = [r0,.....] ++# vs1 = [r1,.....] ++# vs2 = [r2,.....] ++# vs3 = [r3,.....] ++# vs4 = [r4,.....] ++# vs5 = [r1*5,...] ++# vs6 = [r2*5,...] ++# vs7 = [r2*5,...] ++# vs8 = [r4*5,...] ++# ++# Each word in a vector consists a member of a "r/s" in [a * r/s]. 
++# ++# r0, r4*5, r3*5, r2*5, r1*5; ++# r1, r0, r4*5, r3*5, r2*5; ++# r2, r1, r0, r4*5, r3*5; ++# r3, r2, r1, r0, r4*5; ++# r4, r3, r2, r1, r0 ; ++# ++# ++# gcry_poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) ++# k = 32 bytes key ++# r3 = k (r, s) ++# r4 = mlen ++# r5 = m ++# ++.text ++ ++# Block size 16 bytes ++# key = (r, s) ++# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF ++# p = 2^130 - 5 ++# a += m ++# a = (r + a) % p ++# a += s ++# 16 bytes (a) ++# ++# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5; ++# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5; ++# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5; ++# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5; ++# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ; ++# ++# [r^2, r^3, r^1, r^4] ++# [m3, m2, m4, m1] ++# ++# multiply odd and even words ++.macro mul_odd ++ vmulouw 14, 4, 26 ++ vmulouw 10, 5, 3 ++ vmulouw 11, 6, 2 ++ vmulouw 12, 7, 1 ++ vmulouw 13, 8, 0 ++ vmulouw 15, 4, 27 ++ vaddudm 14, 14, 10 ++ vaddudm 14, 14, 11 ++ vmulouw 10, 5, 26 ++ vmulouw 11, 6, 3 ++ vaddudm 14, 14, 12 ++ vaddudm 14, 14, 13 # x0 ++ vaddudm 15, 15, 10 ++ vaddudm 15, 15, 11 ++ vmulouw 12, 7, 2 ++ vmulouw 13, 8, 1 ++ vaddudm 15, 15, 12 ++ vaddudm 15, 15, 13 # x1 ++ vmulouw 16, 4, 28 ++ vmulouw 10, 5, 27 ++ vmulouw 11, 6, 26 ++ vaddudm 16, 16, 10 ++ vaddudm 16, 16, 11 ++ vmulouw 12, 7, 3 ++ vmulouw 13, 8, 2 ++ vaddudm 16, 16, 12 ++ vaddudm 16, 16, 13 # x2 ++ vmulouw 17, 4, 29 ++ vmulouw 10, 5, 28 ++ vmulouw 11, 6, 27 ++ vaddudm 17, 17, 10 ++ vaddudm 17, 17, 11 ++ vmulouw 12, 7, 26 ++ vmulouw 13, 8, 3 ++ vaddudm 17, 17, 12 ++ vaddudm 17, 17, 13 # x3 ++ vmulouw 18, 4, 30 ++ vmulouw 10, 5, 29 ++ vmulouw 11, 6, 28 ++ vaddudm 18, 18, 10 ++ vaddudm 18, 18, 11 ++ vmulouw 12, 7, 27 ++ vmulouw 13, 8, 26 ++ vaddudm 18, 18, 12 ++ vaddudm 18, 18, 13 # x4 ++.endm ++ ++.macro mul_even ++ vmuleuw 9, 4, 26 ++ vmuleuw 10, 5, 3 ++ vmuleuw 11, 6, 2 ++ vmuleuw 12, 7, 1 ++ vmuleuw 13, 8, 0 ++ vaddudm 14, 14, 9 ++ vaddudm 14, 14, 10 ++ vaddudm 14, 14, 11 ++ vaddudm 14, 14, 12 ++ vaddudm 14, 14, 13 # x0 ++ ++ vmuleuw 9, 4, 27 ++ vmuleuw 10, 5, 26 ++ vmuleuw 11, 6, 3 ++ vmuleuw 12, 7, 2 ++ vmuleuw 13, 8, 1 ++ vaddudm 15, 15, 9 ++ vaddudm 15, 15, 10 ++ vaddudm 15, 15, 11 ++ vaddudm 15, 15, 12 ++ vaddudm 15, 15, 13 # x1 ++ ++ vmuleuw 9, 4, 28 ++ vmuleuw 10, 5, 27 ++ vmuleuw 11, 6, 26 ++ vmuleuw 12, 7, 3 ++ vmuleuw 13, 8, 2 ++ vaddudm 16, 16, 9 ++ vaddudm 16, 16, 10 ++ vaddudm 16, 16, 11 ++ vaddudm 16, 16, 12 ++ vaddudm 16, 16, 13 # x2 ++ ++ vmuleuw 9, 4, 29 ++ vmuleuw 10, 5, 28 ++ vmuleuw 11, 6, 27 ++ vmuleuw 12, 7, 26 ++ vmuleuw 13, 8, 3 ++ vaddudm 17, 17, 9 ++ vaddudm 17, 17, 10 ++ vaddudm 17, 17, 11 ++ vaddudm 17, 17, 12 ++ vaddudm 17, 17, 13 # x3 ++ ++ vmuleuw 9, 4, 30 ++ vmuleuw 10, 5, 29 ++ vmuleuw 11, 6, 28 ++ vmuleuw 12, 7, 27 ++ vmuleuw 13, 8, 26 ++ vaddudm 18, 18, 9 ++ vaddudm 18, 18, 10 ++ vaddudm 18, 18, 11 ++ vaddudm 18, 18, 12 ++ vaddudm 18, 18, 13 # x4 ++.endm ++ ++# setup r^4, r^3, r^2, r vectors ++# [r, r^3, r^2, r^4] ++# vs0 = [r0,...] ++# vs1 = [r1,...] ++# vs2 = [r2,...] ++# vs3 = [r3,...] ++# vs4 = [r4,...] ++# vs5 = [r4*5,...] ++# vs6 = [r3*5,...] ++# vs7 = [r2*5,...] ++# vs8 = [r1*5,...] 
++# ++# r0, r4*5, r3*5, r2*5, r1*5; ++# r1, r0, r4*5, r3*5, r2*5; ++# r2, r1, r0, r4*5, r3*5; ++# r3, r2, r1, r0, r4*5; ++# r4, r3, r2, r1, r0 ; ++# ++.macro poly1305_setup_r ++ ++ # save r ++ xxlor 26, 58, 58 ++ xxlor 27, 59, 59 ++ xxlor 28, 60, 60 ++ xxlor 29, 61, 61 ++ xxlor 30, 62, 62 ++ ++ xxlxor 31, 31, 31 ++ ++# [r, r^3, r^2, r^4] ++ # compute r^2 ++ vmr 4, 26 ++ vmr 5, 27 ++ vmr 6, 28 ++ vmr 7, 29 ++ vmr 8, 30 ++ bl do_mul # r^2 r^1 ++ xxpermdi 58, 58, 36, 0x3 # r0 ++ xxpermdi 59, 59, 37, 0x3 # r1 ++ xxpermdi 60, 60, 38, 0x3 # r2 ++ xxpermdi 61, 61, 39, 0x3 # r3 ++ xxpermdi 62, 62, 40, 0x3 # r4 ++ xxpermdi 36, 36, 36, 0x3 ++ xxpermdi 37, 37, 37, 0x3 ++ xxpermdi 38, 38, 38, 0x3 ++ xxpermdi 39, 39, 39, 0x3 ++ xxpermdi 40, 40, 40, 0x3 ++ vspltisb 13, 2 ++ vsld 9, 27, 13 ++ vsld 10, 28, 13 ++ vsld 11, 29, 13 ++ vsld 12, 30, 13 ++ vaddudm 0, 9, 27 ++ vaddudm 1, 10, 28 ++ vaddudm 2, 11, 29 ++ vaddudm 3, 12, 30 ++ ++ bl do_mul # r^4 r^3 ++ vmrgow 26, 26, 4 ++ vmrgow 27, 27, 5 ++ vmrgow 28, 28, 6 ++ vmrgow 29, 29, 7 ++ vmrgow 30, 30, 8 ++ vspltisb 13, 2 ++ vsld 9, 27, 13 ++ vsld 10, 28, 13 ++ vsld 11, 29, 13 ++ vsld 12, 30, 13 ++ vaddudm 0, 9, 27 ++ vaddudm 1, 10, 28 ++ vaddudm 2, 11, 29 ++ vaddudm 3, 12, 30 ++ ++ # r^2 r^4 ++ xxlor 0, 58, 58 ++ xxlor 1, 59, 59 ++ xxlor 2, 60, 60 ++ xxlor 3, 61, 61 ++ xxlor 4, 62, 62 ++ xxlor 5, 32, 32 ++ xxlor 6, 33, 33 ++ xxlor 7, 34, 34 ++ xxlor 8, 35, 35 ++ ++ vspltw 9, 26, 3 ++ vspltw 10, 26, 2 ++ vmrgow 26, 10, 9 ++ vspltw 9, 27, 3 ++ vspltw 10, 27, 2 ++ vmrgow 27, 10, 9 ++ vspltw 9, 28, 3 ++ vspltw 10, 28, 2 ++ vmrgow 28, 10, 9 ++ vspltw 9, 29, 3 ++ vspltw 10, 29, 2 ++ vmrgow 29, 10, 9 ++ vspltw 9, 30, 3 ++ vspltw 10, 30, 2 ++ vmrgow 30, 10, 9 ++ ++ vsld 9, 27, 13 ++ vsld 10, 28, 13 ++ vsld 11, 29, 13 ++ vsld 12, 30, 13 ++ vaddudm 0, 9, 27 ++ vaddudm 1, 10, 28 ++ vaddudm 2, 11, 29 ++ vaddudm 3, 12, 30 ++.endm ++ ++do_mul: ++ mul_odd ++ ++ # do reduction ( h %= p ) ++ # carry reduction ++ vspltisb 9, 2 ++ vsrd 10, 14, 31 ++ vsrd 11, 17, 31 ++ vand 7, 17, 25 ++ vand 4, 14, 25 ++ vaddudm 18, 18, 11 ++ vsrd 12, 18, 31 ++ vaddudm 15, 15, 10 ++ ++ vsrd 11, 15, 31 ++ vand 8, 18, 25 ++ vand 5, 15, 25 ++ vaddudm 4, 4, 12 ++ vsld 10, 12, 9 ++ vaddudm 6, 16, 11 ++ ++ vsrd 13, 6, 31 ++ vand 6, 6, 25 ++ vaddudm 4, 4, 10 ++ vsrd 10, 4, 31 ++ vaddudm 7, 7, 13 ++ ++ vsrd 11, 7, 31 ++ vand 7, 7, 25 ++ vand 4, 4, 25 ++ vaddudm 5, 5, 10 ++ vaddudm 8, 8, 11 ++ blr ++ ++# ++# init key ++# ++do_poly1305_init: ++ ld 10, rmask@got(2) ++ ld 11, 0(10) ++ ld 12, 8(10) ++ ++ li 14, 16 ++ li 15, 32 ++ ld 10, cnum@got(2) ++ lvx 25, 0, 10 # v25 - mask ++ lvx 31, 14, 10 # v31 = 1a ++ lvx 19, 15, 10 # v19 = 1 << 24 ++ lxv 24, 48(10) # vs24 ++ lxv 25, 64(10) # vs25 ++ ++ # initialize ++ # load key from r3 to vectors ++ ld 9, 16(3) ++ ld 10, 24(3) ++ ld 11, 0(3) ++ ld 12, 8(3) ++ ++ # break 26 bits ++ extrdi 14, 9, 26, 38 ++ extrdi 15, 9, 26, 12 ++ extrdi 16, 9, 12, 0 ++ mtvsrdd 58, 0, 14 ++ insrdi 16, 10, 14, 38 ++ mtvsrdd 59, 0, 15 ++ extrdi 17, 10, 26, 24 ++ mtvsrdd 60, 0, 16 ++ extrdi 18, 10, 24, 0 ++ mtvsrdd 61, 0, 17 ++ mtvsrdd 62, 0, 18 ++ ++ # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5 ++ li 9, 5 ++ mtvsrdd 36, 0, 9 ++ vmulouw 0, 27, 4 # v0 = rr0 ++ vmulouw 1, 28, 4 # v1 = rr1 ++ vmulouw 2, 29, 4 # v2 = rr2 ++ vmulouw 3, 30, 4 # v3 = rr3 ++ blr ++ ++# ++# gcry_poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) ++# k = 32 bytes key ++# r3 = k (r, s) ++# r4 = mlen ++# r5 = m ++# ++.global gcry_poly1305_p10le_4blocks ++.align 5 ++gcry_poly1305_p10le_4blocks: 
++_gcry_poly1305_p10le_4blocks: ++ cmpdi 5, 128 ++ blt Out_no_poly1305 ++ ++ stdu 1,-1024(1) ++ mflr 0 ++ ++ std 14,112(1) ++ std 15,120(1) ++ std 16,128(1) ++ std 17,136(1) ++ std 18,144(1) ++ std 19,152(1) ++ std 20,160(1) ++ std 21,168(1) ++ std 31,248(1) ++ li 14, 256 ++ stvx 20, 14, 1 ++ addi 14, 14, 16 ++ stvx 21, 14, 1 ++ addi 14, 14, 16 ++ stvx 22, 14, 1 ++ addi 14, 14, 16 ++ stvx 23, 14, 1 ++ addi 14, 14, 16 ++ stvx 24, 14, 1 ++ addi 14, 14, 16 ++ stvx 25, 14, 1 ++ addi 14, 14, 16 ++ stvx 26, 14, 1 ++ addi 14, 14, 16 ++ stvx 27, 14, 1 ++ addi 14, 14, 16 ++ stvx 28, 14, 1 ++ addi 14, 14, 16 ++ stvx 29, 14, 1 ++ addi 14, 14, 16 ++ stvx 30, 14, 1 ++ addi 14, 14, 16 ++ stvx 31, 14, 1 ++ ++ addi 14, 14, 16 ++ stxvx 14, 14, 1 ++ addi 14, 14, 16 ++ stxvx 15, 14, 1 ++ addi 14, 14, 16 ++ stxvx 16, 14, 1 ++ addi 14, 14, 16 ++ stxvx 17, 14, 1 ++ addi 14, 14, 16 ++ stxvx 18, 14, 1 ++ addi 14, 14, 16 ++ stxvx 19, 14, 1 ++ addi 14, 14, 16 ++ stxvx 20, 14, 1 ++ addi 14, 14, 16 ++ stxvx 21, 14, 1 ++ addi 14, 14, 16 ++ stxvx 22, 14, 1 ++ addi 14, 14, 16 ++ stxvx 23, 14, 1 ++ addi 14, 14, 16 ++ stxvx 24, 14, 1 ++ addi 14, 14, 16 ++ stxvx 25, 14, 1 ++ addi 14, 14, 16 ++ stxvx 26, 14, 1 ++ addi 14, 14, 16 ++ stxvx 27, 14, 1 ++ addi 14, 14, 16 ++ stxvx 28, 14, 1 ++ addi 14, 14, 16 ++ stxvx 29, 14, 1 ++ addi 14, 14, 16 ++ stxvx 30, 14, 1 ++ addi 14, 14, 16 ++ stxvx 31, 14, 1 ++ std 0, 1040(1) ++ ++ bl do_poly1305_init ++ ++ li 21, 0 # counter to message ++ ++ poly1305_setup_r ++ ++ # load previous state ++ # break/convert r6 to 26 bits ++ ld 9, 32(3) ++ ld 10, 40(3) ++ lwz 19, 48(3) ++ sldi 19, 19, 24 ++ mtvsrdd 41, 0, 19 ++ extrdi 14, 9, 26, 38 ++ extrdi 15, 9, 26, 12 ++ extrdi 16, 9, 12, 0 ++ mtvsrdd 36, 0, 14 ++ insrdi 16, 10, 14, 38 ++ mtvsrdd 37, 0, 15 ++ extrdi 17, 10, 26, 24 ++ mtvsrdd 38, 0, 16 ++ extrdi 18, 10, 24, 0 ++ mtvsrdd 39, 0, 17 ++ mtvsrdd 40, 0, 18 ++ vor 8, 8, 9 ++ ++ # input m1 m2 ++ add 20, 4, 21 ++ xxlor 49, 24, 24 ++ xxlor 50, 25, 25 ++ lxvw4x 43, 0, 20 ++ addi 17, 20, 16 ++ lxvw4x 44, 0, 17 ++ vperm 14, 11, 12, 17 ++ vperm 15, 11, 12, 18 ++ vand 9, 14, 25 # a0 ++ vsrd 10, 14, 31 # >> 26 ++ vsrd 11, 10, 31 # 12 bits left ++ vand 10, 10, 25 # a1 ++ vspltisb 13, 12 ++ vand 16, 15, 25 ++ vsld 12, 16, 13 ++ vor 11, 11, 12 ++ vand 11, 11, 25 # a2 ++ vspltisb 13, 14 ++ vsrd 12, 15, 13 # >> 14 ++ vsrd 13, 12, 31 # >> 26, a4 ++ vand 12, 12, 25 # a3 ++ ++ vaddudm 20, 4, 9 ++ vaddudm 21, 5, 10 ++ vaddudm 22, 6, 11 ++ vaddudm 23, 7, 12 ++ vaddudm 24, 8, 13 ++ ++ # m3 m4 ++ addi 17, 17, 16 ++ lxvw4x 43, 0, 17 ++ addi 17, 17, 16 ++ lxvw4x 44, 0, 17 ++ vperm 14, 11, 12, 17 ++ vperm 15, 11, 12, 18 ++ vand 9, 14, 25 # a0 ++ vsrd 10, 14, 31 # >> 26 ++ vsrd 11, 10, 31 # 12 bits left ++ vand 10, 10, 25 # a1 ++ vspltisb 13, 12 ++ vand 16, 15, 25 ++ vsld 12, 16, 13 ++ vspltisb 13, 14 ++ vor 11, 11, 12 ++ vand 11, 11, 25 # a2 ++ vsrd 12, 15, 13 # >> 14 ++ vsrd 13, 12, 31 # >> 26, a4 ++ vand 12, 12, 25 # a3 ++ ++ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] ++ vmrgow 4, 9, 20 ++ vmrgow 5, 10, 21 ++ vmrgow 6, 11, 22 ++ vmrgow 7, 12, 23 ++ vmrgow 8, 13, 24 ++ vaddudm 8, 8, 19 ++ ++ addi 5, 5, -64 ++ addi 21, 21, 64 ++ ++ li 9, 64 ++ divdu 31, 5, 9 ++ ++ mtctr 31 ++ ++# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r ++# Rewrite the polynominal sum of product as follows, ++# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2 ++# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2 ++# .... 
Repeat ++# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 --> ++# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r ++# ++loop_4blocks: ++ ++ # Multiply odd words and even words ++ mul_odd ++ mul_even ++ # carry reduction ++ vspltisb 9, 2 ++ vsrd 10, 14, 31 ++ vsrd 11, 17, 31 ++ vand 7, 17, 25 ++ vand 4, 14, 25 ++ vaddudm 18, 18, 11 ++ vsrd 12, 18, 31 ++ vaddudm 15, 15, 10 ++ ++ vsrd 11, 15, 31 ++ vand 8, 18, 25 ++ vand 5, 15, 25 ++ vaddudm 4, 4, 12 ++ vsld 10, 12, 9 ++ vaddudm 6, 16, 11 ++ ++ vsrd 13, 6, 31 ++ vand 6, 6, 25 ++ vaddudm 4, 4, 10 ++ vsrd 10, 4, 31 ++ vaddudm 7, 7, 13 ++ ++ vsrd 11, 7, 31 ++ vand 7, 7, 25 ++ vand 4, 4, 25 ++ vaddudm 5, 5, 10 ++ vaddudm 8, 8, 11 ++ ++ # input m1 m2 m3 m4 ++ add 20, 4, 21 ++ xxlor 49, 24, 24 ++ xxlor 50, 25, 25 ++ lxvw4x 43, 0, 20 ++ addi 17, 20, 16 ++ lxvw4x 44, 0, 17 ++ vperm 14, 11, 12, 17 ++ vperm 15, 11, 12, 18 ++ addi 17, 17, 16 ++ lxvw4x 43, 0, 17 ++ addi 17, 17, 16 ++ lxvw4x 44, 0, 17 ++ vperm 17, 11, 12, 17 ++ vperm 18, 11, 12, 18 ++ ++ vand 20, 14, 25 # a0 ++ vand 9, 17, 25 # a0 ++ vsrd 21, 14, 31 # >> 26 ++ vsrd 22, 21, 31 # 12 bits left ++ vsrd 10, 17, 31 # >> 26 ++ vsrd 11, 10, 31 # 12 bits left ++ ++ vand 21, 21, 25 # a1 ++ vand 10, 10, 25 # a1 ++ ++ vspltisb 13, 12 ++ vand 16, 15, 25 ++ vsld 23, 16, 13 ++ vor 22, 22, 23 ++ vand 22, 22, 25 # a2 ++ vand 16, 18, 25 ++ vsld 12, 16, 13 ++ vor 11, 11, 12 ++ vand 11, 11, 25 # a2 ++ vspltisb 13, 14 ++ vsrd 23, 15, 13 # >> 14 ++ vsrd 24, 23, 31 # >> 26, a4 ++ vand 23, 23, 25 # a3 ++ vsrd 12, 18, 13 # >> 14 ++ vsrd 13, 12, 31 # >> 26, a4 ++ vand 12, 12, 25 # a3 ++ ++ vaddudm 4, 4, 20 ++ vaddudm 5, 5, 21 ++ vaddudm 6, 6, 22 ++ vaddudm 7, 7, 23 ++ vaddudm 8, 8, 24 ++ ++ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] ++ vmrgow 4, 9, 4 ++ vmrgow 5, 10, 5 ++ vmrgow 6, 11, 6 ++ vmrgow 7, 12, 7 ++ vmrgow 8, 13, 8 ++ vaddudm 8, 8, 19 ++ ++ addi 5, 5, -64 ++ addi 21, 21, 64 ++ ++ bdnz loop_4blocks ++ ++ xxlor 58, 0, 0 ++ xxlor 59, 1, 1 ++ xxlor 60, 2, 2 ++ xxlor 61, 3, 3 ++ xxlor 62, 4, 4 ++ xxlor 32, 5, 5 ++ xxlor 33, 6, 6 ++ xxlor 34, 7, 7 ++ xxlor 35, 8, 8 ++ ++ # Multiply odd words and even words ++ mul_odd ++ mul_even ++ ++ # Sum the products. 
++ xxpermdi 41, 31, 46, 0 ++ xxpermdi 42, 31, 47, 0 ++ vaddudm 4, 14, 9 ++ xxpermdi 36, 31, 36, 3 ++ vaddudm 5, 15, 10 ++ xxpermdi 37, 31, 37, 3 ++ xxpermdi 43, 31, 48, 0 ++ vaddudm 6, 16, 11 ++ xxpermdi 38, 31, 38, 3 ++ xxpermdi 44, 31, 49, 0 ++ vaddudm 7, 17, 12 ++ xxpermdi 39, 31, 39, 3 ++ xxpermdi 45, 31, 50, 0 ++ vaddudm 8, 18, 13 ++ xxpermdi 40, 31, 40, 3 ++ ++ # carry reduction ++ vspltisb 9, 2 ++ vsrd 10, 4, 31 ++ vsrd 11, 7, 31 ++ vand 7, 7, 25 ++ vand 4, 4, 25 ++ vaddudm 8, 8, 11 ++ vsrd 12, 8, 31 ++ vaddudm 5, 5, 10 ++ ++ vsrd 11, 5, 31 ++ vand 8, 8, 25 ++ vand 5, 5, 25 ++ vaddudm 4, 4, 12 ++ vsld 10, 12, 9 ++ vaddudm 6, 6, 11 ++ ++ vsrd 13, 6, 31 ++ vand 6, 6, 25 ++ vaddudm 4, 4, 10 ++ vsrd 10, 4, 31 ++ vaddudm 7, 7, 13 ++ ++ vsrd 11, 7, 31 ++ vand 7, 7, 25 ++ vand 4, 4, 25 ++ vaddudm 5, 5, 10 ++ vaddudm 8, 8, 11 ++ ++ b do_final_update ++ ++do_final_update: ++ # v4, v5, v6, v7 and v8 are 26 bit vectors ++ vsld 5, 5, 31 ++ vor 20, 4, 5 ++ vspltisb 11, 12 ++ vsrd 12, 6, 11 ++ vsld 6, 6, 31 ++ vsld 6, 6, 31 ++ vor 20, 20, 6 ++ vspltisb 11, 14 ++ vsld 7, 7, 11 ++ vor 21, 7, 12 ++ mfvsrld 16, 40 # save last 2 bytes ++ vsld 8, 8, 11 ++ vsld 8, 8, 31 ++ vor 21, 21, 8 ++ mfvsrld 17, 52 ++ mfvsrld 19, 53 ++ srdi 16, 16, 24 ++ ++ std 17, 32(3) ++ std 19, 40(3) ++ stw 16, 48(3) ++ ++Out_loop: ++ li 3, 0 ++ ++ li 14, 256 ++ lvx 20, 14, 1 ++ addi 14, 14, 16 ++ lvx 21, 14, 1 ++ addi 14, 14, 16 ++ lvx 22, 14, 1 ++ addi 14, 14, 16 ++ lvx 23, 14, 1 ++ addi 14, 14, 16 ++ lvx 24, 14, 1 ++ addi 14, 14, 16 ++ lvx 25, 14, 1 ++ addi 14, 14, 16 ++ lvx 26, 14, 1 ++ addi 14, 14, 16 ++ lvx 27, 14, 1 ++ addi 14, 14, 16 ++ lvx 28, 14, 1 ++ addi 14, 14, 16 ++ lvx 29, 14, 1 ++ addi 14, 14, 16 ++ lvx 30, 14, 1 ++ addi 14, 14, 16 ++ lvx 31, 14, 1 ++ ++ addi 14, 14, 16 ++ lxvx 14, 14, 1 ++ addi 14, 14, 16 ++ lxvx 15, 14, 1 ++ addi 14, 14, 16 ++ lxvx 16, 14, 1 ++ addi 14, 14, 16 ++ lxvx 17, 14, 1 ++ addi 14, 14, 16 ++ lxvx 18, 14, 1 ++ addi 14, 14, 16 ++ lxvx 19, 14, 1 ++ addi 14, 14, 16 ++ lxvx 20, 14, 1 ++ addi 14, 14, 16 ++ lxvx 21, 14, 1 ++ addi 14, 14, 16 ++ lxvx 22, 14, 1 ++ addi 14, 14, 16 ++ lxvx 23, 14, 1 ++ addi 14, 14, 16 ++ lxvx 24, 14, 1 ++ addi 14, 14, 16 ++ lxvx 25, 14, 1 ++ addi 14, 14, 16 ++ lxvx 26, 14, 1 ++ addi 14, 14, 16 ++ lxvx 27, 14, 1 ++ addi 14, 14, 16 ++ lxvx 28, 14, 1 ++ addi 14, 14, 16 ++ lxvx 29, 14, 1 ++ addi 14, 14, 16 ++ lxvx 30, 14, 1 ++ addi 14, 14, 16 ++ lxvx 31, 14, 1 ++ ++ ld 0, 1040(1) ++ ld 14,112(1) ++ ld 15,120(1) ++ ld 16,128(1) ++ ld 17,136(1) ++ ld 18,144(1) ++ ld 19,152(1) ++ ld 20,160(1) ++ ld 21,168(1) ++ ld 31,248(1) ++ ++ mtlr 0 ++ addi 1, 1, 1024 ++ blr ++ ++Out_no_poly1305: ++ li 3, 0 ++ blr ++ ++.data ++.align 5 ++rmask: ++.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f ++cnum: ++.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 ++.long 0x1a, 0x00, 0x1a, 0x00 ++.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ++.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 ++.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f ++.long 0x05, 0x00, 0x00, 0x00 ++.long 0x02020202, 0x02020202, 0x02020202, 0x02020202 ++.long 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 +Index: libgcrypt-1.10.2/cipher/poly1305.c +=================================================================== +--- libgcrypt-1.10.2.orig/cipher/poly1305.c ++++ libgcrypt-1.10.2/cipher/poly1305.c +@@ -78,11 +78,23 @@ poly1305_blocks (poly1305_context_t *ctx + #endif /* USE_S390X_ASM */ + + ++#ifdef POLY1305_USE_PPC_VEC ++ ++extern unsigned int 
++gcry_poly1305_p10le_4blocks(unsigned char *key, const byte *m, size_t len); ++ ++#endif /* POLY1305_USE_PPC_VEC */ ++ ++ + static void poly1305_init (poly1305_context_t *ctx, + const byte key[POLY1305_KEYLEN]) + { + POLY1305_STATE *st = &ctx->state; + ++#ifdef POLY1305_USE_PPC_VEC ++ ctx->use_p10 = (_gcry_get_hw_features () & HWF_PPC_ARCH_3_10) != 0; ++#endif ++ + ctx->leftover = 0; + + st->h[0] = 0; +@@ -533,6 +545,7 @@ _gcry_poly1305_update_burn (poly1305_con + size_t bytes) + { + unsigned int burn = 0; ++ unsigned int nburn; + + /* handle leftover */ + if (ctx->leftover) +@@ -546,15 +559,31 @@ _gcry_poly1305_update_burn (poly1305_con + ctx->leftover += want; + if (ctx->leftover < POLY1305_BLOCKSIZE) + return 0; +- burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1); ++ nburn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1); ++ burn = nburn > burn ? nburn : burn; + ctx->leftover = 0; + } + ++#ifdef POLY1305_USE_PPC_VEC ++ /* PPC-P10/little-endian: bulk process multiples of eight blocks */ ++ if (ctx->use_p10 && bytes >= POLY1305_BLOCKSIZE * 8) ++ { ++ size_t nblks = bytes / (POLY1305_BLOCKSIZE * 8); ++ size_t len = nblks * (POLY1305_BLOCKSIZE * 8); ++ POLY1305_STATE *st = &ctx->state; ++ nburn = gcry_poly1305_p10le_4blocks ((unsigned char *) st, m, len); ++ burn = nburn > burn ? nburn : burn; ++ m += len; ++ bytes -= len; ++ } ++#endif /* POLY1305_USE_PPC_VEC */ ++ + /* process full blocks */ + if (bytes >= POLY1305_BLOCKSIZE) + { + size_t nblks = bytes / POLY1305_BLOCKSIZE; +- burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1); ++ nburn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1); ++ burn = nburn > burn ? nburn : burn; + m += nblks * POLY1305_BLOCKSIZE; + bytes -= nblks * POLY1305_BLOCKSIZE; + } +Index: libgcrypt-1.10.2/configure.ac +=================================================================== +--- libgcrypt-1.10.2.orig/configure.ac ++++ libgcrypt-1.10.2/configure.ac +@@ -2779,6 +2779,11 @@ if test "$found" = "1" ; then + powerpc64le-*-*) + # Build with the ppc8 vector implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-ppc.lo" ++ # Build with the assembly implementation ++ if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" && ++ test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then ++ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-p10le-8x.lo" ++ fi + ;; + powerpc64-*-*) + # Build with the ppc8 vector implementation +@@ -3117,6 +3122,13 @@ case "${host}" in + s390x-*-*) + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-s390x.lo" + ;; ++ powerpc64le-*-*) ++ # Build with the assembly implementation ++ if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" && ++ test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then ++ GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-p10le.lo" ++ fi ++ ;; + esac + + LIST_MEMBER(scrypt, $enabled_kdfs) diff --git a/libgcrypt-ppc-enable-P10-assembly-with-ENABLE_FORCE_SOF.patch b/libgcrypt-ppc-enable-P10-assembly-with-ENABLE_FORCE_SOF.patch new file mode 100644 index 0000000..8ef3197 --- /dev/null +++ b/libgcrypt-ppc-enable-P10-assembly-with-ENABLE_FORCE_SOF.patch @@ -0,0 +1,76 @@ +commit 2c5e5ab6843d747c4b877d2c6f47226f61e9ff14 +Author: Jussi Kivilinna +Date: Sun Jun 12 21:51:34 2022 +0300 + + ppc enable P10 assembly with ENABLE_FORCE_SOFT_HWFEATURES on arch 3.00 + + * cipher/chacha20.c (chacha20_do_setkey) [USE_PPC_VEC]: Enable + P10 assembly for HWF_PPC_ARCH_3_00 if ENABLE_FORCE_SOFT_HWFEATURES is + defined. 
+ * cipher/poly1305.c (poly1305_init) [POLY1305_USE_PPC_VEC]: Likewise. + * cipher/rijndael.c (do_setkey) [USE_PPC_CRYPTO_WITH_PPC9LE]: Likewise. + --- + + This change allows testing P10 implementations with P9 and with QEMU-PPC. + + GnuPG-bug-id: 6006 + Signed-off-by: Jussi Kivilinna + +Index: libgcrypt-1.10.2/cipher/chacha20.c +=================================================================== +--- libgcrypt-1.10.2.orig/cipher/chacha20.c ++++ libgcrypt-1.10.2/cipher/chacha20.c +@@ -484,6 +484,11 @@ chacha20_do_setkey (CHACHA20_context_t * + ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; + # ifndef WORDS_BIGENDIAN + ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0; ++# ifdef ENABLE_FORCE_SOFT_HWFEATURES ++ /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10. ++ * Actual implementation works with HWF_PPC_ARCH_3_00 also. */ ++ ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0; ++# endif + # endif + #endif + #ifdef USE_S390X_VX +Index: libgcrypt-1.10.2/cipher/poly1305.c +=================================================================== +--- libgcrypt-1.10.2.orig/cipher/poly1305.c ++++ libgcrypt-1.10.2/cipher/poly1305.c +@@ -90,11 +90,19 @@ static void poly1305_init (poly1305_cont + const byte key[POLY1305_KEYLEN]) + { + POLY1305_STATE *st = &ctx->state; ++ unsigned int features = _gcry_get_hw_features (); + + #ifdef POLY1305_USE_PPC_VEC +- ctx->use_p10 = (_gcry_get_hw_features () & HWF_PPC_ARCH_3_10) != 0; ++ ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0; ++# ifdef ENABLE_FORCE_SOFT_HWFEATURES ++ /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10. ++ * Actual implementation works with HWF_PPC_ARCH_3_00 also. */ ++ ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0; ++# endif + #endif + ++ (void)features; ++ + ctx->leftover = 0; + + st->h[0] = 0; +Index: libgcrypt-1.10.2/cipher/rijndael.c +=================================================================== +--- libgcrypt-1.10.2.orig/cipher/rijndael.c ++++ libgcrypt-1.10.2/cipher/rijndael.c +@@ -605,6 +605,12 @@ do_setkey (RIJNDAEL_context *ctx, const + bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt; + if (hwfeatures & HWF_PPC_ARCH_3_10) /* for P10 */ + bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt; ++# ifdef ENABLE_FORCE_SOFT_HWFEATURES ++ /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10. ++ * Actual implementation works with HWF_PPC_ARCH_3_00 also. 
*/ ++ if (hwfeatures & HWF_PPC_ARCH_3_00) ++ bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt; ++# endif + } + #endif + #ifdef USE_PPC_CRYPTO diff --git a/libgcrypt.changes b/libgcrypt.changes index 80dbdd7..9d552b4 100644 --- a/libgcrypt.changes +++ b/libgcrypt.changes @@ -1,3 +1,16 @@ +------------------------------------------------------------------- +Tue Oct 3 12:58:41 UTC 2023 - Pedro Monreal + +- POWER: performance enhancements for cryptography [jsc#PED-5088] + * Optimize Chacha20 and Poly1305 for PPC P10 LE: [T6006] + - Chacha20/poly1305: Optimized chacha20/poly1305 for + P10 operation [rC88fe7ac33eb4] + - ppc: enable P10 assembly with ENABLE_FORCE_SOFT_HWFEATURES + on arch-3.00 [rC2c5e5ab6843d] + * Add patches: + - libgcrypt-Chacha20-poly1305-Optimized-chacha20-poly1305.patch + - libgcrypt-ppc-enable-P10-assembly-with-ENABLE_FORCE_SOF.patch + ------------------------------------------------------------------- Mon May 22 11:32:53 UTC 2023 - Pedro Monreal diff --git a/libgcrypt.spec b/libgcrypt.spec index 3d7ea32..2b8c52b 100644 --- a/libgcrypt.spec +++ b/libgcrypt.spec @@ -48,6 +48,9 @@ Patch102: libgcrypt-FIPS-SLI-hash-mac.patch Patch103: libgcrypt-jitterentropy-3.4.0.patch #PATCH-FIX-SUSE bsc#1202117 FIPS: Get most of the entropy from rndjent_poll Patch104: libgcrypt-FIPS-rndjent_poll.patch +# POWER patches [jsc#PED-5088] POWER performance enhancements for cryptography +Patch200: libgcrypt-Chacha20-poly1305-Optimized-chacha20-poly1305.patch +Patch201: libgcrypt-ppc-enable-P10-assembly-with-ENABLE_FORCE_SOF.patch BuildRequires: automake >= 1.14 BuildRequires: libgpg-error-devel >= 1.27 BuildRequires: libtool
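
Editor's note (not part of the applied patches): the header comments in cipher/chacha20-p10le-8x.s describe the standard ChaCha20 quarter round (rotations 16, 12, 8, 7) applied first column-wise and then diagonal-wise across the 4x4 word state. As a plain-C reference for what each vectorized round in the P10 code computes per 64-byte block, a minimal sketch might look like the following; the helper names rotl32, quarter_round and double_round are illustrative only and do not exist in libgcrypt.

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t v, int n)
    {
      return (v << n) | (v >> (32 - n));
    }

    /* One ChaCha20 quarter round: the four steps listed in the
     * chacha20-p10le-8x.s header (rotate by 16, 12, 8, 7). */
    static void quarter_round(uint32_t x[16], int a, int b, int c, int d)
    {
      x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
      x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
      x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 8);
      x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 7);
    }

    /* One double round: a column round followed by a diagonal round,
     * matching the QR(...) index patterns in QT_loop_8x/QT_loop_4x.
     * ChaCha20 performs 10 such double rounds (20 rounds total). */
    static void double_round(uint32_t x[16])
    {
      /* column round */
      quarter_round(x, 0, 4,  8, 12);
      quarter_round(x, 1, 5,  9, 13);
      quarter_round(x, 2, 6, 10, 14);
      quarter_round(x, 3, 7, 11, 15);
      /* diagonal round */
      quarter_round(x, 0, 5, 10, 15);
      quarter_round(x, 1, 6, 11, 12);
      quarter_round(x, 2, 7,  8, 13);
      quarter_round(x, 3, 4,  9, 14);
    }

The P10 routine evaluates eight such 64-byte blocks per loop iteration in VSX registers (hence the 512-byte unrolling); inputs shorter than 512 bytes fall back to the existing ppc8 blocks4 path, as enforced both by the early length check in the assembly and by the nblocks >= 8 condition in the chacha20.c glue. The poly1305-p10le.s file follows the analogous idea for Poly1305, regrouping four message blocks into the single sum described in its header, h = m1*r^4 + m2*r^3 + m3*r^2 + m4*r (mod 2^130 - 5), so that the 26-bit limb multiplications for four blocks can be carried out in parallel.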