diff --git a/libnettle-powerpc64-remove-m4_unquote-sha256.patch b/libnettle-powerpc64-remove-m4_unquote-sha256.patch
new file mode 100644
index 0000000..e23bb14
--- /dev/null
+++ b/libnettle-powerpc64-remove-m4_unquote-sha256.patch
@@ -0,0 +1,75 @@
+From 95d7ebbafaea628751e35d2ce1c4c5d2617ed5de Mon Sep 17 00:00:00 2001
+From: Eric Richter
+Date: Thu, 20 Jun 2024 13:43:57 -0500
+Subject: [PATCH] powerpc64: remove use of m4_unquote in the load step for
+ sha256
+
+By passing the constant offset value into the LOAD macro, the use of
+m4_unquote to calculate the correct constant GPR can be avoided,
+improving readability.
+
+Signed-off-by: Eric Richter
+---
+ powerpc64/p8/sha256-compress-n.asm | 36 ++++++++++++++++++------------------
+ 1 file changed, 18 insertions(+), 18 deletions(-)
+
+diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
+index 4848461e..309db1fa 100644
+--- a/powerpc64/p8/sha256-compress-n.asm
++++ b/powerpc64/p8/sha256-compress-n.asm
+@@ -177,34 +177,34 @@ define(`EXTENDROUNDS', `
+ ')
+ 
+ define(`LOAD', `
+-	IF_BE(`lxvw4x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT')
++	IF_BE(`lxvw4x VSR(IV($1)), $2, INPUT')
+ 	IF_LE(`
+-		lxvd2x VSR(IV($1)), m4_unquote(TC`'eval(($1 % 4) * 4)), INPUT
++		lxvd2x VSR(IV($1)), $2, INPUT
+ 		vperm IV($1), IV($1), IV($1), VT0
+ 	')
+ ')
+ 
+ define(`DOLOADS', `
+ 	IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
+-	LOAD(0)
+-	LOAD(1)
+-	LOAD(2)
+-	LOAD(3)
++	LOAD(0, TC0)
++	LOAD(1, TC4)
++	LOAD(2, TC8)
++	LOAD(3, TC12)
+ 	addi	INPUT, INPUT, 16
+-	LOAD(4)
+-	LOAD(5)
+-	LOAD(6)
+-	LOAD(7)
++	LOAD(4, TC0)
++	LOAD(5, TC4)
++	LOAD(6, TC8)
++	LOAD(7, TC12)
+ 	addi	INPUT, INPUT, 16
+-	LOAD(8)
+-	LOAD(9)
+-	LOAD(10)
+-	LOAD(11)
++	LOAD(8, TC0)
++	LOAD(9, TC4)
++	LOAD(10, TC8)
++	LOAD(11, TC12)
+ 	addi	INPUT, INPUT, 16
+-	LOAD(12)
+-	LOAD(13)
+-	LOAD(14)
+-	LOAD(15)
++	LOAD(12, TC0)
++	LOAD(13, TC4)
++	LOAD(14, TC8)
++	LOAD(15, TC12)
+ 	addi	INPUT, INPUT, 16
+ ')
+ 
+-- 
+GitLab
+
diff --git a/libnettle-powerpc64-sha256-adjust-stack-offset-for-non-volatile-registers.patch b/libnettle-powerpc64-sha256-adjust-stack-offset-for-non-volatile-registers.patch
new file mode 100644
index 0000000..bd0e34e
--- /dev/null
+++ b/libnettle-powerpc64-sha256-adjust-stack-offset-for-non-volatile-registers.patch
@@ -0,0 +1,45 @@
+From 9d8b3e93bbfea1da668a28760540a2b25fae4a50 Mon Sep 17 00:00:00 2001
+From: Eric Richter
+Date: Thu, 29 Aug 2024 09:44:25 -0500
+Subject: [PATCH] powerpc64/sha256: adjust stack offset for storing
+ non-volatile registers
+
+According to the ABI, the stack pointer is quadword aligned, so starting
+the stack storage at offset -8 may cause the return address to be
+stepped on. Adjust to use -16 as the starting point, which also
+matches other POWER assembly code.
+
+Signed-off-by: Eric Richter
+---
+ powerpc64/p8/sha256-compress-n.asm | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
+index 309db1fa..e08ae132 100644
+--- a/powerpc64/p8/sha256-compress-n.asm
++++ b/powerpc64/p8/sha256-compress-n.asm
+@@ -216,8 +216,8 @@ PROLOGUE(_nettle_sha256_compress_n)
+ 
+ 	C Store non-volatile registers
+ 
+-	li	T0, -8
+-	li	T1, -24
++	li	T0, -16
++	li	T1, -32
+ 	stvx	v20, T0, SP
+ 	stvx	v21, T1, SP
+ 	subi	T0, T0, 32
+@@ -321,8 +321,8 @@ PROLOGUE(_nettle_sha256_compress_n)
+ 
+ 
+ 	C Restore nonvolatile registers
+-	li	T0, -8
+-	li	T1, -24
++	li	T0, -16
++	li	T1, -32
+ 	lvx	v20, T0, SP
+ 	lvx	v21, T1, SP
+ 	subi	T0, T0, 32
+-- 
+GitLab
+
diff --git a/libnettle-powerpc64-sha256-fix-loading-overreads.patch b/libnettle-powerpc64-sha256-fix-loading-overreads.patch
new file mode 100644
index 0000000..f57ec95
--- /dev/null
+++ b/libnettle-powerpc64-sha256-fix-loading-overreads.patch
@@ -0,0 +1,112 @@
+From 89ae5b24c6d052aa4d9b14c9a50b3c62b5636d81 Mon Sep 17 00:00:00 2001
+From: Eric Richter
+Date: Wed, 11 Sep 2024 13:53:48 -0500
+Subject: [PATCH] powerpc64/sha256: fix loading overreads by loading less and
+ shifting
+
+Originally, the 16 input words were loaded with 16 individual vector load
+instructions. This has a side effect where the last three loads would
+overread 1/2/3 extra words.
+
+Fix the overread by replacing unnecessary overlapped reads with shifts.
+As a consequence, the constant registers for 4,8,12 can be removed, which
+also gains about 1-2% in performance.
+
+Signed-off-by: Eric Richter
+---
+ powerpc64/p8/sha256-compress-n.asm | 44 ++++++++++++++----------------------------
+ 1 file changed, 16 insertions(+), 28 deletions(-)
+
+diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
+index e08ae132..75666deb 100644
+--- a/powerpc64/p8/sha256-compress-n.asm
++++ b/powerpc64/p8/sha256-compress-n.asm
+@@ -44,10 +44,7 @@ define(`T1', `r8')
+ define(`TK', `r9')
+ define(`COUNT', `r10')
+ define(`TC0', `0')	C Index instructions allow literal 0 instead of a GPR
+-define(`TC4', `r11')
+-define(`TC8', `r12')
+-define(`TC12', `r14')
+-define(`TC16', `r15')
++define(`TC16', `r11')
+ 
+ C State registers
+ define(`VSA', `v0')
+@@ -187,24 +184,24 @@ define(`LOAD', `
+ define(`DOLOADS', `
+ 	IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
+ 	LOAD(0, TC0)
+-	LOAD(1, TC4)
+-	LOAD(2, TC8)
+-	LOAD(3, TC12)
++	vsldoi	IV(1), IV(0), IV(0), 4
++	vsldoi	IV(2), IV(0), IV(0), 8
++	vsldoi	IV(3), IV(0), IV(0), 12
+ 	addi	INPUT, INPUT, 16
+ 	LOAD(4, TC0)
+-	LOAD(5, TC4)
+-	LOAD(6, TC8)
+-	LOAD(7, TC12)
++	vsldoi	IV(5), IV(4), IV(4), 4
++	vsldoi	IV(6), IV(4), IV(4), 8
++	vsldoi	IV(7), IV(4), IV(4), 12
+ 	addi	INPUT, INPUT, 16
+ 	LOAD(8, TC0)
+-	LOAD(9, TC4)
+-	LOAD(10, TC8)
+-	LOAD(11, TC12)
++	vsldoi	IV(9), IV(8), IV(8), 4
++	vsldoi	IV(10), IV(8), IV(8), 8
++	vsldoi	IV(11), IV(8), IV(8), 12
+ 	addi	INPUT, INPUT, 16
+ 	LOAD(12, TC0)
+-	LOAD(13, TC4)
+-	LOAD(14, TC8)
+-	LOAD(15, TC12)
++	vsldoi	IV(13), IV(12), IV(12), 4
++	vsldoi	IV(14), IV(12), IV(12), 8
++	vsldoi	IV(15), IV(12), IV(12), 12
+ 	addi	INPUT, INPUT, 16
+ ')
+ 
+@@ -216,6 +213,8 @@ PROLOGUE(_nettle_sha256_compress_n)
+ 
+ 	C Store non-volatile registers
+ 
++	ALIGN(16) C Appears necessary for optimal stores
++	li	TC16, 16
+ 	li	T0, -16
+ 	li	T1, -32
+ 	stvx	v20, T0, SP
+@@ -240,15 +239,8 @@ PROLOGUE(_nettle_sha256_compress_n)
+ 	subi	T1, T1, 32
+ 	stvx	v30, T0, SP
+ 	stvx	v31, T1, SP
+-	subi	T0, T0, 32
+-	subi	T1, T1, 32
+-	stdx	r14, T0, SP
+-	stdx	r15, T1, SP
+ 
+-	li	TC4, 4
+-	li	TC8, 8
+-	li	TC12, 12
+-	li	TC16, 16
++	ALIGN(16) C Appears necessary for optimal loads
+ 
+ 	C Load state values
+ 	lxvw4x	VSR(VSA), 0, STATE	C VSA contains A,B,C,D
+@@ -345,10 +337,6 @@ PROLOGUE(_nettle_sha256_compress_n)
+ 	subi	T1, T1, 32
+ 	lvx	v30, T0, SP
+ 	lvx	v31, T1, SP
+-	subi	T0, T0, 32
+-	subi	T1, T1, 32
+-	ldx	r14, T0, SP
+-	ldx	r15, T1, SP
+ 
+ .done:
+ 	mr	r3, INPUT
+-- 
+GitLab
+
diff --git a/libnettle-powerpc64-skip-AES-GCM-test.patch b/libnettle-powerpc64-skip-AES-GCM-test.patch
new file mode 100644
index 0000000..e1694da
--- /dev/null
+++ b/libnettle-powerpc64-skip-AES-GCM-test.patch
@@ -0,0 +1,13 @@
+Index: nettle-3.10/testsuite/Makefile.in
+===================================================================
+--- nettle-3.10.orig/testsuite/Makefile.in
++++ nettle-3.10/testsuite/Makefile.in
+@@ -28,7 +28,7 @@ TS_NETTLE_SOURCES = aes-test.c aes-keywr
+ 		    streebog-test.c sm3-test.c sm4-test.c \
+ 		    serpent-test.c twofish-test.c version-test.c \
+ 		    knuth-lfib-test.c \
+-		    cbc-test.c cfb-test.c ctr-test.c gcm-test.c eax-test.c ccm-test.c \
++		    cbc-test.c cfb-test.c ctr-test.c eax-test.c ccm-test.c \
+ 		    cmac-test.c ocb-test.c siv-cmac-test.c siv-gcm-test.c \
+ 		    poly1305-test.c chacha-poly1305-test.c \
+ 		    hmac-test.c umac-test.c \
diff --git a/libnettle.changes b/libnettle.changes
index d1d4b4c..cd730a7 100644
--- a/libnettle.changes
+++ b/libnettle.changes
@@ -1,3 +1,16 @@
+-------------------------------------------------------------------
+Tue Dec  3 08:07:16 UTC 2024 - Pedro Monreal
+
+- ppc64le: POWER10 performance enhancements for cryptography [jsc#PED-9904]
+  * powerpc64/sha256: fix loading overreads by loading less and shifting
+  * powerpc64/sha256: adjust stack offset for storing non-volatile registers
+  * powerpc64: remove use of m4_unquote in the load step for sha256
+  * Temporarily skip the gcm test: libnettle-powerpc64-skip-AES-GCM-test.patch
+  * Add patches:
+    - libnettle-powerpc64-sha256-fix-loading-overreads.patch
+    - libnettle-powerpc64-sha256-adjust-stack-offset-for-non-volatile-registers.patch
+    - libnettle-powerpc64-remove-m4_unquote-sha256.patch
+
 -------------------------------------------------------------------
 Mon Jun 17 06:22:31 UTC 2024 - Pedro Monreal
 
diff --git a/libnettle.spec b/libnettle.spec
index 97646e6..3f72188 100644
--- a/libnettle.spec
+++ b/libnettle.spec
@@ -30,6 +30,11 @@ Source1:        https://ftp.gnu.org/gnu/nettle/nettle-%{version}.tar.gz.sig
 Source2:        %{name}.keyring
 Source3:        baselibs.conf
 Source4:        %{name}-rpmlintrc
+# PATCH-FIX-UPSTREAM [jsc#PED-9904] ppc64le: POWER10 performance enhancements for cryptography
+Patch1:         libnettle-powerpc64-remove-m4_unquote-sha256.patch
+Patch2:         libnettle-powerpc64-sha256-adjust-stack-offset-for-non-volatile-registers.patch
+Patch3:         libnettle-powerpc64-sha256-fix-loading-overreads.patch
+Patch4:         libnettle-powerpc64-skip-AES-GCM-test.patch
 BuildRequires:  autoconf
 BuildRequires:  fipscheck
 BuildRequires:  gmp-devel >= 6.1.0