From 3d3a7ecd1ae5ab08d22041f7b3b035c34f12fa02 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 22 Aug 2023 15:58:53 -0400 Subject: [PATCH] Improve performance for 6x unrolling with vpermxor instruction Reviewed-by: Paul Dale Reviewed-by: Tomas Mraz (Merged from https://github.com/openssl/openssl/pull/21812) --- crypto/aes/asm/aesp8-ppc.pl | 145 +++++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 50 deletions(-) diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl index 60cf86f52aed2..38b9405a283b7 100755 --- a/crypto/aes/asm/aesp8-ppc.pl +++ b/crypto/aes/asm/aesp8-ppc.pl @@ -99,11 +99,12 @@ .long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev .long 0,0,0,0 ?asis +.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe Lconsts: mflr r0 bcl 20,31,\$+4 mflr $ptr #vvvvv "distance between . and rcon - addi $ptr,$ptr,-0x48 + addi $ptr,$ptr,-0x58 mtlr r0 blr .long 0 @@ -2405,7 +2406,7 @@ () my $key_=$key2; my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31)); $x00=0 if ($flavour =~ /osx/); -my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); +my ($in0, $in1, $in2, $in3, $in4, $in5)=map("v$_",(0..5)); my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys @@ -2460,6 +2461,18 @@ () li $x70,0x70 mtspr 256,r0 + # Reverse eighty7 to 0x010101..87 + xxlor 2, 32+$eighty7, 32+$eighty7 + vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87 + xxlor 1, 32+$eighty7, 32+$eighty7 + + # Load XOR contents. 0xf102132435465768798a9bacbdcedfe + mr $x70, r6 + bl Lconsts + lxvw4x 0, $x40, r6 # load XOR contents + mr r6, $x70 + li $x70,0x70 + subi $rounds,$rounds,3 # -4 in total lvx $rndkey0,$x00,$key1 # load key schedule @@ -2502,69 +2515,77 @@ () ?vperm v31,v31,$twk5,$keyperm lvx v25,$x10,$key_ # pre-load round[2] + # Switch to use the following codes with 0x010101..87 to generate tweak. + # eighty7 = 0x010101..87 + # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits + # vand tmp, tmp, eighty7 # last byte with carry + # vaddubm tweak, tweak, tweak # left shift 1 bit (x2) + # xxlor vsx, 0, 0 + # vpermxor tweak, tweak, tmp, vsx + vperm $in0,$inout,$inptail,$inpperm subi $inp,$inp,31 # undo "caller" vxor $twk0,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $out0,$in0,$twk0 - vxor $tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in1 lvx_u $in1,$x10,$inp vxor $twk1,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in1,$in1,$in1,$leperm vand $tmp,$tmp,$eighty7 vxor $out1,$in1,$twk1 - vxor $tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in2 lvx_u $in2,$x20,$inp andi. 
$taillen,$len,15 vxor $twk2,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in2,$in2,$in2,$leperm vand $tmp,$tmp,$eighty7 vxor $out2,$in2,$twk2 - vxor $tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in3 lvx_u $in3,$x30,$inp sub $len,$len,$taillen vxor $twk3,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in3,$in3,$in3,$leperm vand $tmp,$tmp,$eighty7 vxor $out3,$in3,$twk3 - vxor $tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in4 lvx_u $in4,$x40,$inp subi $len,$len,0x60 vxor $twk4,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in4,$in4,$in4,$leperm vand $tmp,$tmp,$eighty7 vxor $out4,$in4,$twk4 - vxor $tweak,$tweak,$tmp + xxlor 32+$in5, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in5 lvx_u $in5,$x50,$inp addi $inp,$inp,0x60 vxor $twk5,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in5,$in5,$in5,$leperm vand $tmp,$tmp,$eighty7 vxor $out5,$in5,$twk5 - vxor $tweak,$tweak,$tmp + xxlor 32+$in0, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in0 vxor v31,v31,$rndkey0 mtctr $rounds @@ -2590,6 +2611,8 @@ () lvx v25,$x10,$key_ # round[4] bdnz Loop_xts_enc6x + xxlor 32+$eighty7, 1, 1 # 0x010101..87 + subic $len,$len,96 # $len-=96 vxor $in0,$twk0,v31 # xor with last round key vcipher $out0,$out0,v24 @@ -2599,7 +2622,6 @@ () vaddubm $tweak,$tweak,$tweak vcipher $out2,$out2,v24 vcipher $out3,$out3,v24 - vsldoi $tmp,$tmp,$tmp,15 vcipher $out4,$out4,v24 vcipher $out5,$out5,v24 @@ -2607,7 +2629,8 @@ () vand $tmp,$tmp,$eighty7 vcipher $out0,$out0,v25 vcipher $out1,$out1,v25 - vxor $tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in1 vcipher $out2,$out2,v25 vcipher $out3,$out3,v25 vxor $in1,$twk1,v31 @@ -2618,13 +2641,13 @@ () and r0,r0,$len vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vcipher $out0,$out0,v26 vcipher $out1,$out1,v26 vand $tmp,$tmp,$eighty7 vcipher $out2,$out2,v26 vcipher $out3,$out3,v26 - vxor $tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in2 vcipher $out4,$out4,v26 vcipher $out5,$out5,v26 @@ -2638,7 +2661,6 @@ () vaddubm $tweak,$tweak,$tweak vcipher $out0,$out0,v27 vcipher $out1,$out1,v27 - vsldoi $tmp,$tmp,$tmp,15 vcipher $out2,$out2,v27 vcipher $out3,$out3,v27 vand $tmp,$tmp,$eighty7 @@ -2646,7 +2668,8 @@ () vcipher $out5,$out5,v27 addi $key_,$sp,$FRAME+15 # rewind $key_ - vxor $tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in3 vcipher $out0,$out0,v28 vcipher $out1,$out1,v28 vxor $in3,$twk3,v31 @@ -2655,7 +2678,6 @@ () vcipher $out2,$out2,v28 vcipher $out3,$out3,v28 vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vcipher $out4,$out4,v28 vcipher $out5,$out5,v28 lvx v24,$x00,$key_ # re-pre-load round[1] @@ -2663,7 +2685,8 @@ () vcipher $out0,$out0,v29 vcipher $out1,$out1,v29 - vxor $tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in4 vcipher $out2,$out2,v29 vcipher $out3,$out3,v29 vxor $in4,$twk4,v31 @@ -2673,14 +2696,14 @@ () vcipher $out5,$out5,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vcipher $out0,$out0,v30 vcipher $out1,$out1,v30 vand $tmp,$tmp,$eighty7 vcipher $out2,$out2,v30 vcipher $out3,$out3,v30 - vxor $tweak,$tweak,$tmp + xxlor 32+$in5, 0, 0 + 
vpermxor $tweak, $tweak, $tmp, $in5 vcipher $out4,$out4,v30 vcipher $out5,$out5,v30 vxor $in5,$twk5,v31 @@ -2690,7 +2713,6 @@ () vcipherlast $out0,$out0,$in0 lvx_u $in0,$x00,$inp # load next input block vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vcipherlast $out1,$out1,$in1 lvx_u $in1,$x10,$inp vcipherlast $out2,$out2,$in2 @@ -2703,7 +2725,10 @@ () vcipherlast $out4,$out4,$in4 le?vperm $in2,$in2,$in2,$leperm lvx_u $in4,$x40,$inp - vxor $tweak,$tweak,$tmp + xxlor 10, 32+$in0, 32+$in0 + xxlor 32+$in0, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in0 + xxlor 32+$in0, 10, 10 vcipherlast $tmp,$out5,$in5 # last block might be needed # in stealing mode le?vperm $in3,$in3,$in3,$leperm @@ -2736,6 +2761,8 @@ () mtctr $rounds beq Loop_xts_enc6x # did $len-=96 borrow? + xxlor 32+$eighty7, 2, 2 # 0x870101..01 + addic. $len,$len,0x60 beq Lxts_enc6x_zero cmpwi $len,0x20 @@ -3112,6 +3139,18 @@ () li $x70,0x70 mtspr 256,r0 + # Reverse eighty7 to 0x010101..87 + xxlor 2, 32+$eighty7, 32+$eighty7 + vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87 + xxlor 1, 32+$eighty7, 32+$eighty7 + + # Load XOR contents. 0xf102132435465768798a9bacbdcedfe + mr $x70, r6 + bl Lconsts + lxvw4x 0, $x40, r6 # load XOR contents + mr r6, $x70 + li $x70,0x70 + subi $rounds,$rounds,3 # -4 in total lvx $rndkey0,$x00,$key1 # load key schedule @@ -3159,64 +3198,64 @@ () vxor $twk0,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $out0,$in0,$twk0 - vxor $tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in1 lvx_u $in1,$x10,$inp vxor $twk1,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in1,$in1,$in1,$leperm vand $tmp,$tmp,$eighty7 vxor $out1,$in1,$twk1 - vxor $tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in2 lvx_u $in2,$x20,$inp andi. 
$taillen,$len,15 vxor $twk2,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in2,$in2,$in2,$leperm vand $tmp,$tmp,$eighty7 vxor $out2,$in2,$twk2 - vxor $tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in3 lvx_u $in3,$x30,$inp sub $len,$len,$taillen vxor $twk3,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in3,$in3,$in3,$leperm vand $tmp,$tmp,$eighty7 vxor $out3,$in3,$twk3 - vxor $tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in4 lvx_u $in4,$x40,$inp subi $len,$len,0x60 vxor $twk4,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in4,$in4,$in4,$leperm vand $tmp,$tmp,$eighty7 vxor $out4,$in4,$twk4 - vxor $tweak,$tweak,$tmp + xxlor 32+$in5, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in5 lvx_u $in5,$x50,$inp addi $inp,$inp,0x60 vxor $twk5,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in5,$in5,$in5,$leperm vand $tmp,$tmp,$eighty7 vxor $out5,$in5,$twk5 - vxor $tweak,$tweak,$tmp + xxlor 32+$in0, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in0 vxor v31,v31,$rndkey0 mtctr $rounds @@ -3242,6 +3281,8 @@ () lvx v25,$x10,$key_ # round[4] bdnz Loop_xts_dec6x + xxlor 32+$eighty7, 1, 1 + subic $len,$len,96 # $len-=96 vxor $in0,$twk0,v31 # xor with last round key vncipher $out0,$out0,v24 @@ -3251,7 +3292,6 @@ () vaddubm $tweak,$tweak,$tweak vncipher $out2,$out2,v24 vncipher $out3,$out3,v24 - vsldoi $tmp,$tmp,$tmp,15 vncipher $out4,$out4,v24 vncipher $out5,$out5,v24 @@ -3259,7 +3299,8 @@ () vand $tmp,$tmp,$eighty7 vncipher $out0,$out0,v25 vncipher $out1,$out1,v25 - vxor $tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in1 vncipher $out2,$out2,v25 vncipher $out3,$out3,v25 vxor $in1,$twk1,v31 @@ -3270,13 +3311,13 @@ () and r0,r0,$len vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vncipher $out0,$out0,v26 vncipher $out1,$out1,v26 vand $tmp,$tmp,$eighty7 vncipher $out2,$out2,v26 vncipher $out3,$out3,v26 - vxor $tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in2 vncipher $out4,$out4,v26 vncipher $out5,$out5,v26 @@ -3290,7 +3331,6 @@ () vaddubm $tweak,$tweak,$tweak vncipher $out0,$out0,v27 vncipher $out1,$out1,v27 - vsldoi $tmp,$tmp,$tmp,15 vncipher $out2,$out2,v27 vncipher $out3,$out3,v27 vand $tmp,$tmp,$eighty7 @@ -3298,7 +3338,8 @@ () vncipher $out5,$out5,v27 addi $key_,$sp,$FRAME+15 # rewind $key_ - vxor $tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in3 vncipher $out0,$out0,v28 vncipher $out1,$out1,v28 vxor $in3,$twk3,v31 @@ -3307,7 +3348,6 @@ () vncipher $out2,$out2,v28 vncipher $out3,$out3,v28 vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vncipher $out4,$out4,v28 vncipher $out5,$out5,v28 lvx v24,$x00,$key_ # re-pre-load round[1] @@ -3315,7 +3355,8 @@ () vncipher $out0,$out0,v29 vncipher $out1,$out1,v29 - vxor $tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in4 vncipher $out2,$out2,v29 vncipher $out3,$out3,v29 vxor $in4,$twk4,v31 @@ -3325,14 +3366,14 @@ () vncipher $out5,$out5,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vncipher $out0,$out0,v30 vncipher $out1,$out1,v30 vand $tmp,$tmp,$eighty7 vncipher $out2,$out2,v30 vncipher $out3,$out3,v30 - vxor $tweak,$tweak,$tmp + xxlor 
32+$in5, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in5 vncipher $out4,$out4,v30 vncipher $out5,$out5,v30 vxor $in5,$twk5,v31 @@ -3342,7 +3383,6 @@ () vncipherlast $out0,$out0,$in0 lvx_u $in0,$x00,$inp # load next input block vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vncipherlast $out1,$out1,$in1 lvx_u $in1,$x10,$inp vncipherlast $out2,$out2,$in2 @@ -3355,7 +3395,10 @@ () vncipherlast $out4,$out4,$in4 le?vperm $in2,$in2,$in2,$leperm lvx_u $in4,$x40,$inp - vxor $tweak,$tweak,$tmp + xxlor 10, 32+$in0, 32+$in0 + xxlor 32+$in0, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in0 + xxlor 32+$in0, 10, 10 vncipherlast $out5,$out5,$in5 le?vperm $in3,$in3,$in3,$leperm lvx_u $in5,$x50,$inp @@ -3386,6 +3429,8 @@ () mtctr $rounds beq Loop_xts_dec6x # did $len-=96 borrow? + xxlor 32+$eighty7, 2, 2 + addic. $len,$len,0x60 beq Lxts_dec6x_zero cmpwi $len,0x20
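
For reference, the tweak update this commit rewrites is the standard XTS multiplication of the tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1: shift the 128-bit tweak left by one bit and, if the top bit fell out, fold 0x87 back into the low byte. The old sequence built the per-byte carry mask with vsrab, rotated it with vsldoi ...,15, masked it against 0x870101..01 and XORed it in; the new sequence masks against the reversed 0x010101..87 and lets vpermxor, driven by the 0x0f102132..cbdcedfe control vector loaded via Lconsts, do the one-byte rotation and the XOR in a single instruction. The C sketch below is not part of the commit; it is an illustrative scalar model under that reading (byte indices follow the order in which the constants are written in the patch, and the function names and test values are invented for the example). It cross-checks a plain XTS doubling against a byte-level model of the new vsrab/vand/vaddubm/vpermxor sequence.

#include <stdio.h>
#include <string.h>

/* Reference XTS tweak update: multiply the tweak by x in GF(2^128),
 * with T[0] as the least significant byte. */
static void xts_tweak_double(unsigned char T[16])
{
    unsigned int carry = 0;
    int i;

    for (i = 0; i < 16; i++) {
        unsigned int msb = T[i] >> 7;           /* bit that carries into the next byte */
        T[i] = (unsigned char)((T[i] << 1) | carry);
        carry = msb;
    }
    if (carry)                                  /* overflow out of bit 127: reduce by x^7+x^2+x+1 */
        T[0] ^= 0x87;
}

/* Byte-level model of the patched per-block update. */
static void xts_tweak_double_vpermxor_model(unsigned char T[16])
{
    unsigned char mask[16], shifted[16];
    int i;

    for (i = 0; i < 16; i++) {
        /* vsrab tmp,tweak,seven: 0xff if the byte's top bit is set, else 0x00 */
        unsigned char m = (unsigned char)((T[i] & 0x80) ? 0xff : 0x00);
        /* vand tmp,tmp,eighty7 with eighty7 reversed to 0x010101..87 */
        mask[i] = (unsigned char)(m & (i == 15 ? 0x87 : 0x01));
        /* vaddubm tweak,tweak,tweak: per-byte left shift by one */
        shifted[i] = (unsigned char)(T[i] << 1);
    }
    /* vpermxor tweak,tweak,tmp,ctrl with ctrl = 0x0f102132..cbdcedfe:
     * XOR each shifted byte with the mask byte one position below it,
     * wrapping the top byte's 0x87 carry around to byte 0; this is the
     * job previously done by vsldoi ...,15 followed by vxor. */
    for (i = 0; i < 16; i++)
        T[i] = (unsigned char)(shifted[i] ^ mask[(i + 15) % 16]);
}

int main(void)
{
    unsigned char a[16], b[16];
    int i, step;

    for (i = 0; i < 16; i++)                    /* arbitrary starting tweak */
        a[i] = b[i] = (unsigned char)(17 * i + 3);

    for (step = 0; step < 1000; step++) {
        xts_tweak_double(a);
        xts_tweak_double_vpermxor_model(b);
        if (memcmp(a, b, 16) != 0) {
            printf("mismatch at step %d\n", step);
            return 1;
        }
    }
    printf("scalar doubling and vpermxor model agree\n");
    return 0;
}

Running this prints the agreement message, which only confirms the algebra behind dropping the vsldoi/vxor pair; the performance gain claimed in the subject comes from shortening the dependent chain in the tweak computation of the 6x-unrolled XTS loops, not from this scalar model.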