From 89ae5b24c6d052aa4d9b14c9a50b3c62b5636d81 Mon Sep 17 00:00:00 2001 From: Eric Richter Date: Wed, 11 Sep 2024 13:53:48 -0500 Subject: [PATCH] powerpc64/sha256: fix loading overreads by loading less and shifting Originally, the 16 input words were loaded with 16 individual vector load instructions. This had the side effect that the last three loads would overread by 1, 2, and 3 extra words, respectively. Fix the overread by replacing the unnecessary overlapped reads with shifts. As a consequence, the registers holding the index constants 4, 8, and 12 can be removed, which also yields a performance gain of about 1-2%. Signed-off-by: Eric Richter --- powerpc64/p8/sha256-compress-n.asm | 44 +++++++++++------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm index e08ae132..75666deb 100644 --- a/powerpc64/p8/sha256-compress-n.asm +++ b/powerpc64/p8/sha256-compress-n.asm @@ -44,10 +44,7 @@ define(`T1', `r8') define(`TK', `r9') define(`COUNT', `r10') define(`TC0', `0') C Index instructions allow literal 0 instead of a GPR -define(`TC4', `r11') -define(`TC8', `r12') -define(`TC12', `r14') -define(`TC16', `r15') +define(`TC16', `r11') C State registers define(`VSA', `v0') @@ -187,24 +184,24 @@ define(`LOAD', ` define(`DOLOADS', ` IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)') LOAD(0, TC0) - LOAD(1, TC4) - LOAD(2, TC8) - LOAD(3, TC12) + vsldoi IV(1), IV(0), IV(0), 4 + vsldoi IV(2), IV(0), IV(0), 8 + vsldoi IV(3), IV(0), IV(0), 12 addi INPUT, INPUT, 16 LOAD(4, TC0) - LOAD(5, TC4) - LOAD(6, TC8) - LOAD(7, TC12) + vsldoi IV(5), IV(4), IV(4), 4 + vsldoi IV(6), IV(4), IV(4), 8 + vsldoi IV(7), IV(4), IV(4), 12 addi INPUT, INPUT, 16 LOAD(8, TC0) - LOAD(9, TC4) - LOAD(10, TC8) - LOAD(11, TC12) + vsldoi IV(9), IV(8), IV(8), 4 + vsldoi IV(10), IV(8), IV(8), 8 + vsldoi IV(11), IV(8), IV(8), 12 addi INPUT, INPUT, 16 LOAD(12, TC0) - LOAD(13, TC4) - LOAD(14, TC8) - LOAD(15, TC12) + vsldoi IV(13), IV(12), IV(12), 4 + vsldoi IV(14), IV(12), IV(12), 8 + 
vsldoi IV(15), IV(12), IV(12), 12 addi INPUT, INPUT, 16 ') @@ -216,6 +213,8 @@ PROLOGUE(_nettle_sha256_compress_n) C Store non-volatile registers + ALIGN(16) C Appears necessary for optimal stores + li TC16, 16 li T0, -16 li T1, -32 stvx v20, T0, SP @@ -240,15 +239,8 @@ PROLOGUE(_nettle_sha256_compress_n) subi T1, T1, 32 stvx v30, T0, SP stvx v31, T1, SP - subi T0, T0, 32 - subi T1, T1, 32 - stdx r14, T0, SP - stdx r15, T1, SP - li TC4, 4 - li TC8, 8 - li TC12, 12 - li TC16, 16 + ALIGN(16) C Appears necessary for optimal loads C Load state values lxvw4x VSR(VSA), 0, STATE C VSA contains A,B,C,D @@ -345,10 +337,6 @@ PROLOGUE(_nettle_sha256_compress_n) subi T1, T1, 32 lvx v30, T0, SP lvx v31, T1, SP - subi T0, T0, 32 - subi T1, T1, 32 - ldx r14, T0, SP - ldx r15, T1, SP .done: mr r3, INPUT -- GitLab