113 lines
2.9 KiB
Diff
113 lines
2.9 KiB
Diff
From 89ae5b24c6d052aa4d9b14c9a50b3c62b5636d81 Mon Sep 17 00:00:00 2001
|
|
From: Eric Richter <erichte@linux.ibm.com>
|
|
Date: Wed, 11 Sep 2024 13:53:48 -0500
|
|
Subject: [PATCH] powerpc64/sha256: fix loading overreads by loading less and
|
|
shifting
|
|
|
|
Originally, the 16 input words were loaded with 16 individual vector load
|
|
instructions. This had the side effect that the last three loads would
|
|
overread 1, 2, and 3 extra words, respectively.
|
|
|
|
Fix the overread by replacing unnecessary overlapped reads with shifts.
|
|
As a consequence, the constant registers for 4, 8, and 12 can be removed,
|
|
which also yields a performance gain of about 1-2%.
|
|
|
|
Signed-off-by: Eric Richter <erichte@linux.ibm.com>
|
|
---
|
|
powerpc64/p8/sha256-compress-n.asm | 44 +++++++++++-------------------
|
|
1 file changed, 16 insertions(+), 28 deletions(-)
|
|
|
|
diff --git a/powerpc64/p8/sha256-compress-n.asm b/powerpc64/p8/sha256-compress-n.asm
|
|
index e08ae132..75666deb 100644
|
|
--- a/powerpc64/p8/sha256-compress-n.asm
|
|
+++ b/powerpc64/p8/sha256-compress-n.asm
|
|
@@ -44,10 +44,7 @@ define(`T1', `r8')
|
|
define(`TK', `r9')
|
|
define(`COUNT', `r10')
|
|
define(`TC0', `0') C Index instructions allow literal 0 instead of a GPR
|
|
-define(`TC4', `r11')
|
|
-define(`TC8', `r12')
|
|
-define(`TC12', `r14')
|
|
-define(`TC16', `r15')
|
|
+define(`TC16', `r11')
|
|
|
|
C State registers
|
|
define(`VSA', `v0')
|
|
@@ -187,24 +184,24 @@ define(`LOAD', `
|
|
define(`DOLOADS', `
|
|
IF_LE(`DATA_LOAD_VEC(VT0, .load_swap, T1)')
|
|
LOAD(0, TC0)
|
|
- LOAD(1, TC4)
|
|
- LOAD(2, TC8)
|
|
- LOAD(3, TC12)
|
|
+ vsldoi IV(1), IV(0), IV(0), 4
|
|
+ vsldoi IV(2), IV(0), IV(0), 8
|
|
+ vsldoi IV(3), IV(0), IV(0), 12
|
|
addi INPUT, INPUT, 16
|
|
LOAD(4, TC0)
|
|
- LOAD(5, TC4)
|
|
- LOAD(6, TC8)
|
|
- LOAD(7, TC12)
|
|
+ vsldoi IV(5), IV(4), IV(4), 4
|
|
+ vsldoi IV(6), IV(4), IV(4), 8
|
|
+ vsldoi IV(7), IV(4), IV(4), 12
|
|
addi INPUT, INPUT, 16
|
|
LOAD(8, TC0)
|
|
- LOAD(9, TC4)
|
|
- LOAD(10, TC8)
|
|
- LOAD(11, TC12)
|
|
+ vsldoi IV(9), IV(8), IV(8), 4
|
|
+ vsldoi IV(10), IV(8), IV(8), 8
|
|
+ vsldoi IV(11), IV(8), IV(8), 12
|
|
addi INPUT, INPUT, 16
|
|
LOAD(12, TC0)
|
|
- LOAD(13, TC4)
|
|
- LOAD(14, TC8)
|
|
- LOAD(15, TC12)
|
|
+ vsldoi IV(13), IV(12), IV(12), 4
|
|
+ vsldoi IV(14), IV(12), IV(12), 8
|
|
+ vsldoi IV(15), IV(12), IV(12), 12
|
|
addi INPUT, INPUT, 16
|
|
')
|
|
|
|
@@ -216,6 +213,8 @@ PROLOGUE(_nettle_sha256_compress_n)
|
|
|
|
C Store non-volatile registers
|
|
|
|
+ ALIGN(16) C Appears necessary for optimal stores
|
|
+ li TC16, 16
|
|
li T0, -16
|
|
li T1, -32
|
|
stvx v20, T0, SP
|
|
@@ -240,15 +239,8 @@ PROLOGUE(_nettle_sha256_compress_n)
|
|
subi T1, T1, 32
|
|
stvx v30, T0, SP
|
|
stvx v31, T1, SP
|
|
- subi T0, T0, 32
|
|
- subi T1, T1, 32
|
|
- stdx r14, T0, SP
|
|
- stdx r15, T1, SP
|
|
|
|
- li TC4, 4
|
|
- li TC8, 8
|
|
- li TC12, 12
|
|
- li TC16, 16
|
|
+ ALIGN(16) C Appears necessary for optimal loads
|
|
|
|
C Load state values
|
|
lxvw4x VSR(VSA), 0, STATE C VSA contains A,B,C,D
|
|
@@ -345,10 +337,6 @@ PROLOGUE(_nettle_sha256_compress_n)
|
|
subi T1, T1, 32
|
|
lvx v30, T0, SP
|
|
lvx v31, T1, SP
|
|
- subi T0, T0, 32
|
|
- subi T1, T1, 32
|
|
- ldx r14, T0, SP
|
|
- ldx r15, T1, SP
|
|
|
|
.done:
|
|
mr r3, INPUT
|
|
--
|
|
GitLab
|
|
|