496 lines
14 KiB
Diff
496 lines
14 KiB
Diff
|
From 3d3a7ecd1ae5ab08d22041f7b3b035c34f12fa02 Mon Sep 17 00:00:00 2001
|
||
|
From: Danny Tsen <dtsen@linux.ibm.com>
|
||
|
Date: Tue, 22 Aug 2023 15:58:53 -0400
|
||
|
Subject: [PATCH] Improve performance for 6x unrolling with vpermxor
|
||
|
instruction
|
||
|
|
||
|
Reviewed-by: Paul Dale <pauli@openssl.org>
|
||
|
Reviewed-by: Tomas Mraz <tomas@openssl.org>
|
||
|
(Merged from https://github.com/openssl/openssl/pull/21812)
|
||
|
---
|
||
|
crypto/aes/asm/aesp8-ppc.pl | 145 +++++++++++++++++++++++-------------
|
||
|
1 file changed, 95 insertions(+), 50 deletions(-)
|
||
|
|
||
|
diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl
|
||
|
index 60cf86f52aed2..38b9405a283b7 100755
|
||
|
--- a/crypto/aes/asm/aesp8-ppc.pl
|
||
|
+++ b/crypto/aes/asm/aesp8-ppc.pl
|
||
|
@@ -99,11 +99,12 @@
|
||
|
.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
|
||
|
.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
|
||
|
.long 0,0,0,0 ?asis
|
||
|
+.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
|
||
|
Lconsts:
|
||
|
mflr r0
|
||
|
bcl 20,31,\$+4
|
||
|
mflr $ptr #vvvvv "distance between . and rcon
|
||
|
- addi $ptr,$ptr,-0x48
|
||
|
+ addi $ptr,$ptr,-0x58
|
||
|
mtlr r0
|
||
|
blr
|
||
|
.long 0
|
||
|
@@ -2405,7 +2406,7 @@ ()
|
||
|
my $key_=$key2;
|
||
|
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
|
||
|
$x00=0 if ($flavour =~ /osx/);
|
||
|
-my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5));
|
||
|
+my ($in0, $in1, $in2, $in3, $in4, $in5)=map("v$_",(0..5));
|
||
|
my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
|
||
|
my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
|
||
|
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
|
||
|
@@ -2460,6 +2461,18 @@ ()
|
||
|
li $x70,0x70
|
||
|
mtspr 256,r0
|
||
|
|
||
|
+ # Reverse eighty7 to 0x010101..87 (vs2 keeps the original, vs1 the reversed copy)
|
||
|
+ xxlor 2, 32+$eighty7, 32+$eighty7
|
||
|
+ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
|
||
|
+ xxlor 1, 32+$eighty7, 32+$eighty7
|
||
|
+
|
||
|
+ # Load the vpermxor control vector 0x0f102132435465768798a9bacbdcedfe from the constant table
|
||
|
+ mr $x70, r6
|
||
|
+ bl Lconsts
|
||
|
+ lxvw4x 0, $x40, r6 # load XOR contents
|
||
|
+ mr r6, $x70
|
||
|
+ li $x70,0x70
|
||
|
+
|
||
|
subi $rounds,$rounds,3 # -4 in total
|
||
|
|
||
|
lvx $rndkey0,$x00,$key1 # load key schedule
|
||
|
@@ -2502,69 +2515,77 @@ ()
|
||
|
?vperm v31,v31,$twk5,$keyperm
|
||
|
lvx v25,$x10,$key_ # pre-load round[2]
|
||
|
|
||
|
+ # Use the following sequence, with eighty7 = 0x010101..87, to generate the next tweak.
|
||
|
+ # eighty7 = 0x010101..87
|
||
|
+ # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits
|
||
|
+ # vand tmp, tmp, eighty7 # last byte with carry
|
||
|
+ # vaddubm tweak, tweak, tweak # left shift 1 bit (x2)
|
||
|
+ # xxlor vsx, 0, 0
|
||
|
+ # vpermxor tweak, tweak, tmp, vsx
|
||
|
+
|
||
|
vperm $in0,$inout,$inptail,$inpperm
|
||
|
subi $inp,$inp,31 # undo "caller"
|
||
|
vxor $twk0,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out0,$in0,$twk0
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in1, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in1
|
||
|
|
||
|
lvx_u $in1,$x10,$inp
|
||
|
vxor $twk1,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in1,$in1,$in1,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out1,$in1,$twk1
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in2, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in2
|
||
|
|
||
|
lvx_u $in2,$x20,$inp
|
||
|
andi. $taillen,$len,15
|
||
|
vxor $twk2,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in2,$in2,$in2,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out2,$in2,$twk2
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in3, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in3
|
||
|
|
||
|
lvx_u $in3,$x30,$inp
|
||
|
sub $len,$len,$taillen
|
||
|
vxor $twk3,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in3,$in3,$in3,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out3,$in3,$twk3
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in4, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in4
|
||
|
|
||
|
lvx_u $in4,$x40,$inp
|
||
|
subi $len,$len,0x60
|
||
|
vxor $twk4,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in4,$in4,$in4,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out4,$in4,$twk4
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in5, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in5
|
||
|
|
||
|
lvx_u $in5,$x50,$inp
|
||
|
addi $inp,$inp,0x60
|
||
|
vxor $twk5,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in5,$in5,$in5,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out5,$in5,$twk5
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in0, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in0
|
||
|
|
||
|
vxor v31,v31,$rndkey0
|
||
|
mtctr $rounds
|
||
|
@@ -2590,6 +2611,8 @@ ()
|
||
|
lvx v25,$x10,$key_ # round[4]
|
||
|
bdnz Loop_xts_enc6x
|
||
|
|
||
|
+ xxlor 32+$eighty7, 1, 1 # 0x010101..87
|
||
|
+
|
||
|
subic $len,$len,96 # $len-=96
|
||
|
vxor $in0,$twk0,v31 # xor with last round key
|
||
|
vcipher $out0,$out0,v24
|
||
|
@@ -2599,7 +2622,6 @@ ()
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
vcipher $out2,$out2,v24
|
||
|
vcipher $out3,$out3,v24
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vcipher $out4,$out4,v24
|
||
|
vcipher $out5,$out5,v24
|
||
|
|
||
|
@@ -2607,7 +2629,8 @@ ()
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vcipher $out0,$out0,v25
|
||
|
vcipher $out1,$out1,v25
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in1, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in1
|
||
|
vcipher $out2,$out2,v25
|
||
|
vcipher $out3,$out3,v25
|
||
|
vxor $in1,$twk1,v31
|
||
|
@@ -2618,13 +2641,13 @@ ()
|
||
|
|
||
|
and r0,r0,$len
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vcipher $out0,$out0,v26
|
||
|
vcipher $out1,$out1,v26
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vcipher $out2,$out2,v26
|
||
|
vcipher $out3,$out3,v26
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in2, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in2
|
||
|
vcipher $out4,$out4,v26
|
||
|
vcipher $out5,$out5,v26
|
||
|
|
||
|
@@ -2638,7 +2661,6 @@ ()
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
vcipher $out0,$out0,v27
|
||
|
vcipher $out1,$out1,v27
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vcipher $out2,$out2,v27
|
||
|
vcipher $out3,$out3,v27
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
@@ -2646,7 +2668,8 @@ ()
|
||
|
vcipher $out5,$out5,v27
|
||
|
|
||
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in3, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in3
|
||
|
vcipher $out0,$out0,v28
|
||
|
vcipher $out1,$out1,v28
|
||
|
vxor $in3,$twk3,v31
|
||
|
@@ -2655,7 +2678,6 @@ ()
|
||
|
vcipher $out2,$out2,v28
|
||
|
vcipher $out3,$out3,v28
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vcipher $out4,$out4,v28
|
||
|
vcipher $out5,$out5,v28
|
||
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
||
|
@@ -2663,7 +2685,8 @@ ()
|
||
|
|
||
|
vcipher $out0,$out0,v29
|
||
|
vcipher $out1,$out1,v29
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in4, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in4
|
||
|
vcipher $out2,$out2,v29
|
||
|
vcipher $out3,$out3,v29
|
||
|
vxor $in4,$twk4,v31
|
||
|
@@ -2673,14 +2696,14 @@ ()
|
||
|
vcipher $out5,$out5,v29
|
||
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
|
||
|
vcipher $out0,$out0,v30
|
||
|
vcipher $out1,$out1,v30
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vcipher $out2,$out2,v30
|
||
|
vcipher $out3,$out3,v30
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in5, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in5
|
||
|
vcipher $out4,$out4,v30
|
||
|
vcipher $out5,$out5,v30
|
||
|
vxor $in5,$twk5,v31
|
||
|
@@ -2690,7 +2713,6 @@ ()
|
||
|
vcipherlast $out0,$out0,$in0
|
||
|
lvx_u $in0,$x00,$inp # load next input block
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vcipherlast $out1,$out1,$in1
|
||
|
lvx_u $in1,$x10,$inp
|
||
|
vcipherlast $out2,$out2,$in2
|
||
|
@@ -2703,7 +2725,10 @@ ()
|
||
|
vcipherlast $out4,$out4,$in4
|
||
|
le?vperm $in2,$in2,$in2,$leperm
|
||
|
lvx_u $in4,$x40,$inp
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 10, 32+$in0, 32+$in0
|
||
|
+ xxlor 32+$in0, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in0
|
||
|
+ xxlor 32+$in0, 10, 10
|
||
|
vcipherlast $tmp,$out5,$in5 # last block might be needed
|
||
|
# in stealing mode
|
||
|
le?vperm $in3,$in3,$in3,$leperm
|
||
|
@@ -2736,6 +2761,8 @@ ()
|
||
|
mtctr $rounds
|
||
|
beq Loop_xts_enc6x # did $len-=96 borrow?
|
||
|
|
||
|
+ xxlor 32+$eighty7, 2, 2 # 0x870101..01
|
||
|
+
|
||
|
addic. $len,$len,0x60
|
||
|
beq Lxts_enc6x_zero
|
||
|
cmpwi $len,0x20
|
||
|
@@ -3112,6 +3139,18 @@ ()
|
||
|
li $x70,0x70
|
||
|
mtspr 256,r0
|
||
|
|
||
|
+ # Reverse eighty7 to 0x010101..87 (vs2 keeps the original, vs1 the reversed copy)
|
||
|
+ xxlor 2, 32+$eighty7, 32+$eighty7
|
||
|
+ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
|
||
|
+ xxlor 1, 32+$eighty7, 32+$eighty7
|
||
|
+
|
||
|
+ # Load the vpermxor control vector 0x0f102132435465768798a9bacbdcedfe from the constant table
|
||
|
+ mr $x70, r6
|
||
|
+ bl Lconsts
|
||
|
+ lxvw4x 0, $x40, r6 # load XOR contents
|
||
|
+ mr r6, $x70
|
||
|
+ li $x70,0x70
|
||
|
+
|
||
|
subi $rounds,$rounds,3 # -4 in total
|
||
|
|
||
|
lvx $rndkey0,$x00,$key1 # load key schedule
|
||
|
@@ -3159,64 +3198,64 @@ ()
|
||
|
vxor $twk0,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out0,$in0,$twk0
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in1, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in1
|
||
|
|
||
|
lvx_u $in1,$x10,$inp
|
||
|
vxor $twk1,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in1,$in1,$in1,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out1,$in1,$twk1
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in2, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in2
|
||
|
|
||
|
lvx_u $in2,$x20,$inp
|
||
|
andi. $taillen,$len,15
|
||
|
vxor $twk2,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in2,$in2,$in2,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out2,$in2,$twk2
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in3, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in3
|
||
|
|
||
|
lvx_u $in3,$x30,$inp
|
||
|
sub $len,$len,$taillen
|
||
|
vxor $twk3,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in3,$in3,$in3,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out3,$in3,$twk3
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in4, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in4
|
||
|
|
||
|
lvx_u $in4,$x40,$inp
|
||
|
subi $len,$len,0x60
|
||
|
vxor $twk4,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in4,$in4,$in4,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out4,$in4,$twk4
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in5, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in5
|
||
|
|
||
|
lvx_u $in5,$x50,$inp
|
||
|
addi $inp,$inp,0x60
|
||
|
vxor $twk5,$tweak,$rndkey0
|
||
|
vsrab $tmp,$tweak,$seven # next tweak value
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
le?vperm $in5,$in5,$in5,$leperm
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vxor $out5,$in5,$twk5
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in0, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in0
|
||
|
|
||
|
vxor v31,v31,$rndkey0
|
||
|
mtctr $rounds
|
||
|
@@ -3242,6 +3281,8 @@ ()
|
||
|
lvx v25,$x10,$key_ # round[4]
|
||
|
bdnz Loop_xts_dec6x
|
||
|
|
||
|
+ xxlor 32+$eighty7, 1, 1
|
||
|
+
|
||
|
subic $len,$len,96 # $len-=96
|
||
|
vxor $in0,$twk0,v31 # xor with last round key
|
||
|
vncipher $out0,$out0,v24
|
||
|
@@ -3251,7 +3292,6 @@ ()
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
vncipher $out2,$out2,v24
|
||
|
vncipher $out3,$out3,v24
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vncipher $out4,$out4,v24
|
||
|
vncipher $out5,$out5,v24
|
||
|
|
||
|
@@ -3259,7 +3299,8 @@ ()
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vncipher $out0,$out0,v25
|
||
|
vncipher $out1,$out1,v25
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in1, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in1
|
||
|
vncipher $out2,$out2,v25
|
||
|
vncipher $out3,$out3,v25
|
||
|
vxor $in1,$twk1,v31
|
||
|
@@ -3270,13 +3311,13 @@ ()
|
||
|
|
||
|
and r0,r0,$len
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vncipher $out0,$out0,v26
|
||
|
vncipher $out1,$out1,v26
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vncipher $out2,$out2,v26
|
||
|
vncipher $out3,$out3,v26
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in2, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in2
|
||
|
vncipher $out4,$out4,v26
|
||
|
vncipher $out5,$out5,v26
|
||
|
|
||
|
@@ -3290,7 +3331,6 @@ ()
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
vncipher $out0,$out0,v27
|
||
|
vncipher $out1,$out1,v27
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vncipher $out2,$out2,v27
|
||
|
vncipher $out3,$out3,v27
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
@@ -3298,7 +3338,8 @@ ()
|
||
|
vncipher $out5,$out5,v27
|
||
|
|
||
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in3, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in3
|
||
|
vncipher $out0,$out0,v28
|
||
|
vncipher $out1,$out1,v28
|
||
|
vxor $in3,$twk3,v31
|
||
|
@@ -3307,7 +3348,6 @@ ()
|
||
|
vncipher $out2,$out2,v28
|
||
|
vncipher $out3,$out3,v28
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vncipher $out4,$out4,v28
|
||
|
vncipher $out5,$out5,v28
|
||
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
||
|
@@ -3315,7 +3355,8 @@ ()
|
||
|
|
||
|
vncipher $out0,$out0,v29
|
||
|
vncipher $out1,$out1,v29
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in4, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in4
|
||
|
vncipher $out2,$out2,v29
|
||
|
vncipher $out3,$out3,v29
|
||
|
vxor $in4,$twk4,v31
|
||
|
@@ -3325,14 +3366,14 @@ ()
|
||
|
vncipher $out5,$out5,v29
|
||
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
|
||
|
vncipher $out0,$out0,v30
|
||
|
vncipher $out1,$out1,v30
|
||
|
vand $tmp,$tmp,$eighty7
|
||
|
vncipher $out2,$out2,v30
|
||
|
vncipher $out3,$out3,v30
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 32+$in5, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in5
|
||
|
vncipher $out4,$out4,v30
|
||
|
vncipher $out5,$out5,v30
|
||
|
vxor $in5,$twk5,v31
|
||
|
@@ -3342,7 +3383,6 @@ ()
|
||
|
vncipherlast $out0,$out0,$in0
|
||
|
lvx_u $in0,$x00,$inp # load next input block
|
||
|
vaddubm $tweak,$tweak,$tweak
|
||
|
- vsldoi $tmp,$tmp,$tmp,15
|
||
|
vncipherlast $out1,$out1,$in1
|
||
|
lvx_u $in1,$x10,$inp
|
||
|
vncipherlast $out2,$out2,$in2
|
||
|
@@ -3355,7 +3395,10 @@ ()
|
||
|
vncipherlast $out4,$out4,$in4
|
||
|
le?vperm $in2,$in2,$in2,$leperm
|
||
|
lvx_u $in4,$x40,$inp
|
||
|
- vxor $tweak,$tweak,$tmp
|
||
|
+ xxlor 10, 32+$in0, 32+$in0
|
||
|
+ xxlor 32+$in0, 0, 0
|
||
|
+ vpermxor $tweak, $tweak, $tmp, $in0
|
||
|
+ xxlor 32+$in0, 10, 10
|
||
|
vncipherlast $out5,$out5,$in5
|
||
|
le?vperm $in3,$in3,$in3,$leperm
|
||
|
lvx_u $in5,$x50,$inp
|
||
|
@@ -3386,6 +3429,8 @@ ()
|
||
|
mtctr $rounds
|
||
|
beq Loop_xts_dec6x # did $len-=96 borrow?
|
||
|
|
||
|
+ xxlor 32+$eighty7, 2, 2
|
||
|
+
|
||
|
addic. $len,$len,0x60
|
||
|
beq Lxts_dec6x_zero
|
||
|
cmpwi $len,0x20
|