From b51c004cd8e095f310fc4b14cfef053d2aef40f512101e7726e6b10ddb4e7b98 Mon Sep 17 00:00:00 2001 From: Otto Hollmann Date: Wed, 25 Oct 2023 07:52:22 +0000 Subject: [PATCH] Accepting request 1119558 from home:ohollmann:branches:security:tls - Performance enhancements for cryptography from OpenSSL 3.x [jsc#PED-5086, jsc#PED-3514] * Add patches: - openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch - openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch - openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch - openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch - openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch - openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch OBS-URL: https://build.opensuse.org/request/show/1119558 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=148 --- openssl-1_1.changes | 13 + openssl-1_1.spec | 8 + ...nce-for-6x-unrolling-with-vpermxor-i.patch | 495 ++++ ...-Limb-Solinas-Strategy-for-secp384r1.patch | 2197 +++++++++++++++++ ...nkage-on-nistp521-felem_-square-mul-.patch | 65 + ...dd-asm-implementation-of-felem_-squa.patch | 410 +++ ...-extraneous-parentheses-in-secp384r1.patch | 76 + ...c-Fix-stack-allocation-secp384r1-asm.patch | 96 + 8 files changed, 3360 insertions(+) create mode 100644 openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch create mode 100644 openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch create mode 100644 openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch create mode 100644 openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch create mode 100644 openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch create mode 100644 openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch diff --git a/openssl-1_1.changes b/openssl-1_1.changes index eec4e62..a34c4d7 100644 --- a/openssl-1_1.changes +++ b/openssl-1_1.changes @@ -1,3 +1,16 @@ +------------------------------------------------------------------- +Thu Oct 19 15:03:14 UTC 2023 - Otto Hollmann + +- Performance enhancements for cryptography from OpenSSL 3.x + [jsc#PED-5086, jsc#PED-3514] + * Add patches: + - openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch + - openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch + - openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch + - openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch + - openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch + - openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch + ------------------------------------------------------------------- Wed Oct 4 07:15:29 UTC 2023 - Otto Hollmann diff --git a/openssl-1_1.spec b/openssl-1_1.spec index a26a4d9..fb0a989 100644 --- a/openssl-1_1.spec +++ b/openssl-1_1.spec @@ -177,6 +177,14 @@ Patch106: openssl-s_client-check-ocsp-status.patch Patch107: openssl-dont-pass-zero-length-input-to-EVP_Cipher.patch #PATCH-FIX-SUSE bsc#1215215 FIPS: Add "fips" to version string Patch108: openssl-1_1-fips-bsc1215215_fips_in_version_string.patch +# PATCH-FIX-UPSTREAM jsc#PED-5086, jsc#PED-3514 +# POWER10 performance enhancements for cryptography +Patch109: openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch +Patch110: openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch +Patch111: openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch +Patch112: openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch +Patch113: 
openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch +Patch114: openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch BuildRequires: jitterentropy-devel >= 3.4.0 BuildRequires: pkgconfig BuildRequires: pkgconfig(zlib) diff --git a/openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch b/openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch new file mode 100644 index 0000000..7c57d6b --- /dev/null +++ b/openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch @@ -0,0 +1,495 @@ +From 3d3a7ecd1ae5ab08d22041f7b3b035c34f12fa02 Mon Sep 17 00:00:00 2001 +From: Danny Tsen +Date: Tue, 22 Aug 2023 15:58:53 -0400 +Subject: [PATCH] Improve performance for 6x unrolling with vpermxor + instruction + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/21812) +--- + crypto/aes/asm/aesp8-ppc.pl | 145 +++++++++++++++++++++++------------- + 1 file changed, 95 insertions(+), 50 deletions(-) + +diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl +index 60cf86f52aed2..38b9405a283b7 100755 +--- a/crypto/aes/asm/aesp8-ppc.pl ++++ b/crypto/aes/asm/aesp8-ppc.pl +@@ -99,11 +99,12 @@ + .long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev + .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev + .long 0,0,0,0 ?asis ++.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe + Lconsts: + mflr r0 + bcl 20,31,\$+4 + mflr $ptr #vvvvv "distance between . and rcon +- addi $ptr,$ptr,-0x48 ++ addi $ptr,$ptr,-0x58 + mtlr r0 + blr + .long 0 +@@ -2405,7 +2406,7 @@ () + my $key_=$key2; + my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31)); + $x00=0 if ($flavour =~ /osx/); +-my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); ++my ($in0, $in1, $in2, $in3, $in4, $in5)=map("v$_",(0..5)); + my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); + my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); + my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys +@@ -2460,6 +2461,18 @@ () + li $x70,0x70 + mtspr 256,r0 + ++ # Reverse eighty7 to 0x010101..87 ++ xxlor 2, 32+$eighty7, 32+$eighty7 ++ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87 ++ xxlor 1, 32+$eighty7, 32+$eighty7 ++ ++ # Load XOR contents. 0xf102132435465768798a9bacbdcedfe ++ mr $x70, r6 ++ bl Lconsts ++ lxvw4x 0, $x40, r6 # load XOR contents ++ mr r6, $x70 ++ li $x70,0x70 ++ + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule +@@ -2502,69 +2515,77 @@ () + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + ++ # Switch to use the following codes with 0x010101..87 to generate tweak. 
++ # eighty7 = 0x010101..87 ++ # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits ++ # vand tmp, tmp, eighty7 # last byte with carry ++ # vaddubm tweak, tweak, tweak # left shift 1 bit (x2) ++ # xxlor vsx, 0, 0 ++ # vpermxor tweak, tweak, tmp, vsx ++ + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in1, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in1 + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in2, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in2 + + lvx_u $in2,$x20,$inp + andi. $taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in3, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in3 + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in4, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in4 + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in5, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in5 + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in0, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in0 + + vxor v31,v31,$rndkey0 + mtctr $rounds +@@ -2590,6 +2611,8 @@ () + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc6x + ++ xxlor 32+$eighty7, 1, 1 # 0x010101..87 ++ + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vcipher $out0,$out0,v24 +@@ -2599,7 +2622,6 @@ () + vaddubm $tweak,$tweak,$tweak + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 +- vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + +@@ -2607,7 +2629,8 @@ () + vand $tmp,$tmp,$eighty7 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in1, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in1 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vxor $in1,$twk1,v31 +@@ -2618,13 +2641,13 @@ () + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in2, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in2 + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + +@@ -2638,7 +2661,6 @@ () + vaddubm $tweak,$tweak,$tweak + vcipher 
$out0,$out0,v27 + vcipher $out1,$out1,v27 +- vsldoi $tmp,$tmp,$tmp,15 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 +@@ -2646,7 +2668,8 @@ () + vcipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in3, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in3 + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vxor $in3,$twk3,v31 +@@ -2655,7 +2678,6 @@ () + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v28 + vcipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] +@@ -2663,7 +2685,8 @@ () + + vcipher $out0,$out0,v29 + vcipher $out1,$out1,v29 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in4, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in4 + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vxor $in4,$twk4,v31 +@@ -2673,14 +2696,14 @@ () + vcipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + + vcipher $out0,$out0,v30 + vcipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in5, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in5 + vcipher $out4,$out4,v30 + vcipher $out5,$out5,v30 + vxor $in5,$twk5,v31 +@@ -2690,7 +2713,6 @@ () + vcipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vcipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vcipherlast $out2,$out2,$in2 +@@ -2703,7 +2725,10 @@ () + vcipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp +- vxor $tweak,$tweak,$tmp ++ xxlor 10, 32+$in0, 32+$in0 ++ xxlor 32+$in0, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in0 ++ xxlor 32+$in0, 10, 10 + vcipherlast $tmp,$out5,$in5 # last block might be needed + # in stealing mode + le?vperm $in3,$in3,$in3,$leperm +@@ -2736,6 +2761,8 @@ () + mtctr $rounds + beq Loop_xts_enc6x # did $len-=96 borrow? + ++ xxlor 32+$eighty7, 2, 2 # 0x870101..01 ++ + addic. $len,$len,0x60 + beq Lxts_enc6x_zero + cmpwi $len,0x20 +@@ -3112,6 +3139,18 @@ () + li $x70,0x70 + mtspr 256,r0 + ++ # Reverse eighty7 to 0x010101..87 ++ xxlor 2, 32+$eighty7, 32+$eighty7 ++ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87 ++ xxlor 1, 32+$eighty7, 32+$eighty7 ++ ++ # Load XOR contents. 0xf102132435465768798a9bacbdcedfe ++ mr $x70, r6 ++ bl Lconsts ++ lxvw4x 0, $x40, r6 # load XOR contents ++ mr r6, $x70 ++ li $x70,0x70 ++ + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule +@@ -3159,64 +3198,64 @@ () + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in1, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in1 + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in2, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in2 + + lvx_u $in2,$x20,$inp + andi. 
$taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in3, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in3 + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in4, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in4 + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in5, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in5 + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in0, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in0 + + vxor v31,v31,$rndkey0 + mtctr $rounds +@@ -3242,6 +3281,8 @@ () + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec6x + ++ xxlor 32+$eighty7, 1, 1 ++ + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vncipher $out0,$out0,v24 +@@ -3251,7 +3292,6 @@ () + vaddubm $tweak,$tweak,$tweak + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 +- vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + +@@ -3259,7 +3299,8 @@ () + vand $tmp,$tmp,$eighty7 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in1, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in1 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vxor $in1,$twk1,v31 +@@ -3270,13 +3311,13 @@ () + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in2, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in2 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + +@@ -3290,7 +3331,6 @@ () + vaddubm $tweak,$tweak,$tweak + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 +- vsldoi $tmp,$tmp,$tmp,15 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 +@@ -3298,7 +3338,8 @@ () + vncipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in3, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in3 + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vxor $in3,$twk3,v31 +@@ -3307,7 +3348,6 @@ () + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] +@@ -3315,7 +3355,8 @@ () + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in4, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in4 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vxor $in4,$twk4,v31 +@@ -3325,14 +3366,14 @@ () + vncipher $out5,$out5,v29 + lvx v25,$x10,$key_ # 
re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in5, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in5 + vncipher $out4,$out4,v30 + vncipher $out5,$out5,v30 + vxor $in5,$twk5,v31 +@@ -3342,7 +3383,6 @@ () + vncipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vncipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out2,$out2,$in2 +@@ -3355,7 +3395,10 @@ () + vncipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp +- vxor $tweak,$tweak,$tmp ++ xxlor 10, 32+$in0, 32+$in0 ++ xxlor 32+$in0, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in0 ++ xxlor 32+$in0, 10, 10 + vncipherlast $out5,$out5,$in5 + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp +@@ -3386,6 +3429,8 @@ () + mtctr $rounds + beq Loop_xts_dec6x # did $len-=96 borrow? + ++ xxlor 32+$eighty7, 2, 2 ++ + addic. $len,$len,0x60 + beq Lxts_dec6x_zero + cmpwi $len,0x20 diff --git a/openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch b/openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch new file mode 100644 index 0000000..498db7f --- /dev/null +++ b/openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch @@ -0,0 +1,2197 @@ +From 01d901e470d9e035a3bd78e77b9438a4cc0da785 Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Wed, 12 Jul 2023 12:25:22 +1000 +Subject: [PATCH] ec: 56-bit Limb Solinas' Strategy for secp384r1 + +Adopt a 56-bit redundant-limb Solinas' reduction approach for efficient +modular multiplication in P384. This has the effect of accelerating +digital signing by 446% and verification by 106%. The implementation +strategy and names of methods are the same as that provided in +ecp_nistp224 and ecp_nistp521. + +As in Commit 1036749883cc ("ec: Add run time code selection for p521 +field operations"), allow for run time selection of implementation for +felem_{square,mul}, where an assembly implementation is proclaimed to +be present when ECP_NISTP384_ASM is present.
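
As a rough standalone illustration of the limb arithmetic described above (an illustrative sketch only, not code from this patch: toy_widemul(), the operand values and the printout are invented for the example, and it assumes a GCC/Clang-style unsigned __int128, the same 128-bit integer requirement the patch itself enforces via its INT128_MAX check):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Toy sketch of the 56-bit redundant-limb layout: a field element is
 * spread over NLIMBS = 7 limbs spaced 56 bits apart, so a schoolbook
 * multiplication accumulates at most 7 partial products (each < 2^112
 * for reduced inputs) into one 128-bit wide limb, leaving enough
 * headroom to defer all carries to a single reduction step.
 */
#define NLIMBS 7
typedef uint64_t limb;
typedef unsigned __int128 widelimb;

static void toy_widemul(widelimb out[2 * NLIMBS - 1],
                        const limb a[NLIMBS], const limb b[NLIMBS])
{
    memset(out, 0, sizeof(widelimb) * (2 * NLIMBS - 1));
    for (int i = 0; i < NLIMBS; i++)
        for (int j = 0; j < NLIMBS; j++)
            out[i + j] += (widelimb) a[i] * b[j]; /* no carries needed yet */
}

int main(void)
{
    /* Two small operands, each split into 56-bit limbs (least significant limb first). */
    limb a[NLIMBS] = { 0x123456789abcdef0ULL & 0xffffffffffffffULL,
                       0x123456789abcdef0ULL >> 56, 0, 0, 0, 0, 0 };
    limb b[NLIMBS] = { 0x0fedcba987654321ULL & 0xffffffffffffffULL,
                       0x0fedcba987654321ULL >> 56, 0, 0, 0, 0, 0 };
    widelimb wide[2 * NLIMBS - 1];

    toy_widemul(wide, a, b);

    /* Recombine the low wide limbs (safe for inputs this small) and
     * compare against a direct 128-bit product. */
    widelimb got = wide[0] + (wide[1] << 56) + (wide[2] << 112);
    widelimb want = (widelimb) 0x123456789abcdef0ULL * 0x0fedcba987654321ULL;
    printf("limbed product matches direct product: %s\n",
           got == want ? "yes" : "no");
    return 0;
}

The felem_mul()/felem_square() added below follow the same pattern over the full seven limbs, and felem_reduce() then folds the thirteen wide limbs back into seven 56-bit limbs modulo p = 2^384 - 2^128 - 2^96 + 2^32 - 1.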
+ +Signed-off-by: Rohan McLure + +Reviewed-by: Paul Dale +Reviewed-by: Shane Lontis +Reviewed-by: Dmitry Belyavskiy +Reviewed-by: Todd Short +(Merged from https://github.com/openssl/openssl/pull/21471) +--- + crypto/ec/build.info | 3 +- + crypto/ec/ec_curve.c | 4 + + crypto/ec/ec_lib.c | 8 + + crypto/ec/ec_local.h | 27 +- + crypto/ec/ecp_nistp384.c | 1988 ++++++++++++++++++++++++++++++++++++++ + crypto/err/openssl.txt | 4 + include/openssl/ecerr.h | 3 + 7 files changed, 2035 insertions(+), 2 deletions(-) + create mode 100644 crypto/ec/ecp_nistp384.c + +Index: openssl-1.1.1w/crypto/ec/ec_curve.c +=================================================================== +--- openssl-1.1.1w.orig/crypto/ec/ec_curve.c ++++ openssl-1.1.1w/crypto/ec/ec_curve.c +@@ -2833,6 +2833,8 @@ static const ec_list_element curve_list[ + {NID_secp384r1, &_EC_NIST_PRIME_384.h, + # if defined(S390X_EC_ASM) + EC_GFp_s390x_nistp384_method, ++# elif !defined(OPENSSL_NO_EC_NISTP_64_GCC_128) ++ ossl_ec_GFp_nistp384_method, + # else + 0, + # endif +Index: openssl-1.1.1w/crypto/ec/ec_lib.c +=================================================================== +--- openssl-1.1.1w.orig/crypto/ec/ec_lib.c ++++ openssl-1.1.1w/crypto/ec/ec_lib.c +@@ -75,12 +75,16 @@ void EC_pre_comp_free(EC_GROUP *group) + case PCT_nistp256: + EC_nistp256_pre_comp_free(group->pre_comp.nistp256); + break; ++ case PCT_nistp384: ++ ossl_ec_nistp384_pre_comp_free(group->pre_comp.nistp384); ++ break; + case PCT_nistp521: + EC_nistp521_pre_comp_free(group->pre_comp.nistp521); + break; + #else + case PCT_nistp224: + case PCT_nistp256: ++ case PCT_nistp384: + case PCT_nistp521: + break; + #endif +@@ -160,12 +164,16 @@ int EC_GROUP_copy(EC_GROUP *dest, const + case PCT_nistp256: + dest->pre_comp.nistp256 = EC_nistp256_pre_comp_dup(src->pre_comp.nistp256); + break; ++ case PCT_nistp384: ++ dest->pre_comp.nistp384 = ossl_ec_nistp384_pre_comp_dup(src->pre_comp.nistp384); ++ break; + case PCT_nistp521: + dest->pre_comp.nistp521 = EC_nistp521_pre_comp_dup(src->pre_comp.nistp521); + break; + #else + case PCT_nistp224: + case PCT_nistp256: ++ case PCT_nistp384: + case PCT_nistp521: + break; + #endif +Index: openssl-1.1.1w/crypto/ec/ec_local.h +=================================================================== +--- openssl-1.1.1w.orig/crypto/ec/ec_local.h ++++ openssl-1.1.1w/crypto/ec/ec_local.h +@@ -207,6 +207,7 @@ struct ec_method_st { + */ + typedef struct nistp224_pre_comp_st NISTP224_PRE_COMP; + typedef struct nistp256_pre_comp_st NISTP256_PRE_COMP; ++typedef struct nistp384_pre_comp_st NISTP384_PRE_COMP; + typedef struct nistp521_pre_comp_st NISTP521_PRE_COMP; + typedef struct nistz256_pre_comp_st NISTZ256_PRE_COMP; + typedef struct ec_pre_comp_st EC_PRE_COMP; +@@ -268,12 +269,13 @@ struct ec_group_st { + */ + enum { + PCT_none, +- PCT_nistp224, PCT_nistp256, PCT_nistp521, PCT_nistz256, ++ PCT_nistp224, PCT_nistp256, PCT_nistp384, PCT_nistp521, PCT_nistz256, + PCT_ec + } pre_comp_type; + union { + NISTP224_PRE_COMP *nistp224; + NISTP256_PRE_COMP *nistp256; ++ NISTP384_PRE_COMP *nistp384; + NISTP521_PRE_COMP *nistp521; + NISTZ256_PRE_COMP *nistz256; + EC_PRE_COMP *ec; +@@ -330,6 +332,7 @@ static ossl_inline int ec_point_is_compa + + NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *); + NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *); ++NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *); + NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *); + NISTZ256_PRE_COMP 
*EC_nistz256_pre_comp_dup(NISTZ256_PRE_COMP *); + NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *); +@@ -338,6 +341,7 @@ EC_PRE_COMP *EC_ec_pre_comp_dup(EC_PRE_C + void EC_pre_comp_free(EC_GROUP *group); + void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *); + void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *); ++void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *); + void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *); + void EC_nistz256_pre_comp_free(NISTZ256_PRE_COMP *); + void EC_ec_pre_comp_free(EC_PRE_COMP *); +@@ -543,6 +547,27 @@ int ec_GFp_nistp256_points_mul(const EC_ + int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx); + int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group); + ++/* method functions in ecp_nistp384.c */ ++int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group); ++int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p, ++ const BIGNUM *a, const BIGNUM *n, ++ BN_CTX *); ++int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group, ++ const EC_POINT *point, ++ BIGNUM *x, BIGNUM *y, ++ BN_CTX *ctx); ++int ossl_ec_GFp_nistp384_mul(const EC_GROUP *group, EC_POINT *r, ++ const BIGNUM *scalar, size_t num, ++ const EC_POINT *points[], const BIGNUM *scalars[], ++ BN_CTX *); ++int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r, ++ const BIGNUM *scalar, size_t num, ++ const EC_POINT *points[], ++ const BIGNUM *scalars[], BN_CTX *ctx); ++int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx); ++int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group); ++const EC_METHOD *ossl_ec_GFp_nistp384_method(void); ++ + /* method functions in ecp_nistp521.c */ + int ec_GFp_nistp521_group_init(EC_GROUP *group); + int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, +Index: openssl-1.1.1w/crypto/ec/ecp_nistp384.c +=================================================================== +--- /dev/null ++++ openssl-1.1.1w/crypto/ec/ecp_nistp384.c +@@ -0,0 +1,1993 @@ ++/* ++ * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++/* Copyright 2023 IBM Corp. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++/* ++ * Designed for 56-bit limbs by Rohan McLure . ++ * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm ++ * acceleration of felem_{square,mul} as supported in these files. 
++ */ ++ ++#include ++#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128 ++NON_EMPTY_TRANSLATION_UNIT ++#else ++#include ++#include ++#include "ec_local.h" ++ ++#include "internal/numbers.h" ++ ++#ifndef INT128_MAX ++# error "Your compiler doesn't appear to support 128-bit integer types" ++#endif ++ ++typedef uint8_t u8; ++typedef uint64_t u64; ++ ++/* ++ * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We ++ * can serialize an element of this field into 48 bytes. We call this an ++ * felem_bytearray. ++ */ ++ ++typedef u8 felem_bytearray[48]; ++ ++/* ++ * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4. ++ * These values are big-endian. ++ */ ++static const felem_bytearray nistp384_curve_params[5] = { ++ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */ ++ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, ++ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, ++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}, ++ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */ ++ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, ++ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, ++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC}, ++ {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */ ++ 0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12, ++ 0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D, ++ 0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF}, ++ {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */ ++ 0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98, ++ 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D, ++ 0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7}, ++ {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */ ++ 0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C, ++ 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE, ++ 0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F}, ++}; ++ ++/*- ++ * The representation of field elements. ++ * ------------------------------------ ++ * ++ * We represent field elements with seven values. These values are either 64 or ++ * 128 bits and the field element represented is: ++ * v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336 (mod p) ++ * Each of the seven values is called a 'limb'. Since the limbs are spaced only ++ * 56 bits apart, but are greater than 56 bits in length, the most significant ++ * bits of each limb overlap with the least significant bits of the next ++ * ++ * This representation is considered to be 'redundant' in the sense that ++ * intermediate values can each contain more than a 56-bit value in each limb. ++ * Reduction causes all but the final limb to be reduced to contain a value less ++ * than 2^56, with the final value represented allowed to be larger than 2^384, ++ * inasmuch as we can be sure that arithmetic overflow remains impossible. The ++ * reduced value must of course be congruent to the unreduced value. ++ * ++ * A field element with 64-bit limbs is an 'felem'. 
One with 128-bit limbs is a ++ * 'widefelem', featuring enough bits to store the result of a multiplication ++ * and even some further arithmetic without need for immediate reduction. ++ */ ++ ++#define NLIMBS 7 ++ ++typedef uint64_t limb; ++typedef uint128_t widelimb; ++typedef limb limb_aX __attribute((__aligned__(1))); ++typedef limb felem[NLIMBS]; ++typedef widelimb widefelem[2*NLIMBS-1]; ++ ++static const limb bottom56bits = 0xffffffffffffff; ++ ++/* Helper functions (de)serialising reduced field elements in little endian */ ++static void bin48_to_felem(felem out, const u8 in[48]) ++{ ++ memset(out, 0, 56); ++ out[0] = (*((limb *) & in[0])) & bottom56bits; ++ out[1] = (*((limb_aX *) & in[7])) & bottom56bits; ++ out[2] = (*((limb_aX *) & in[14])) & bottom56bits; ++ out[3] = (*((limb_aX *) & in[21])) & bottom56bits; ++ out[4] = (*((limb_aX *) & in[28])) & bottom56bits; ++ out[5] = (*((limb_aX *) & in[35])) & bottom56bits; ++ memmove(&out[6], &in[42], 6); ++} ++ ++static void felem_to_bin48(u8 out[48], const felem in) ++{ ++ memset(out, 0, 48); ++ (*((limb *) & out[0])) |= (in[0] & bottom56bits); ++ (*((limb_aX *) & out[7])) |= (in[1] & bottom56bits); ++ (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits); ++ (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits); ++ (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits); ++ (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits); ++ memmove(&out[42], &in[6], 6); ++} ++ ++/* BN_to_felem converts an OpenSSL BIGNUM into an felem */ ++static int BN_to_felem(felem out, const BIGNUM *bn) ++{ ++ felem_bytearray b_out; ++ int num_bytes; ++ ++ if (BN_is_negative(bn)) { ++ ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); ++ return 0; ++ } ++ num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out)); ++ if (num_bytes < 0) { ++ ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); ++ return 0; ++ } ++ bin48_to_felem(out, b_out); ++ return 1; ++} ++ ++/* felem_to_BN converts an felem into an OpenSSL BIGNUM */ ++static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) ++{ ++ felem_bytearray b_out; ++ ++ felem_to_bin48(b_out, in); ++ return BN_lebin2bn(b_out, sizeof(b_out), out); ++} ++ ++/*- ++ * Field operations ++ * ---------------- ++ */ ++ ++static void felem_one(felem out) ++{ ++ out[0] = 1; ++ memset(&out[1], 0, sizeof(limb) * (NLIMBS-1)); ++} ++ ++static void felem_assign(felem out, const felem in) ++{ ++ memcpy(out, in, sizeof(felem)); ++} ++ ++/* felem_sum64 sets out = out + in. */ ++static void felem_sum64(felem out, const felem in) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] += in[i]; ++} ++ ++/* felem_scalar sets out = in * scalar */ ++static void felem_scalar(felem out, const felem in, limb scalar) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] = in[i] * scalar; ++} ++ ++/* felem_scalar64 sets out = out * scalar */ ++static void felem_scalar64(felem out, limb scalar) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] *= scalar; ++} ++ ++/* felem_scalar128 sets out = out * scalar */ ++static void felem_scalar128(widefelem out, limb scalar) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < 2*NLIMBS-1; i++) ++ out[i] *= scalar; ++} ++ ++/*- ++ * felem_neg sets |out| to |-in| ++ * On entry: ++ * in[i] < 2^60 - 2^29 ++ * On exit: ++ * out[i] < 2^60 ++ */ ++static void felem_neg(felem out, const felem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^12 * p redundantly with each limb ++ * of the form 2^60 + ... 
++ */ ++ static const limb two60m52m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 52) ++ - (((limb) 1) << 4); ++ static const limb two60p44m12 = (((limb) 1) << 60) ++ + (((limb) 1) << 44) ++ - (((limb) 1) << 12); ++ static const limb two60m28m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 28) ++ - (((limb) 1) << 4); ++ static const limb two60m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 4); ++ ++ out[0] = two60p44m12 - in[0]; ++ out[1] = two60m52m4 - in[1]; ++ out[2] = two60m28m4 - in[2]; ++ out[3] = two60m4 - in[3]; ++ out[4] = two60m4 - in[4]; ++ out[5] = two60m4 - in[5]; ++ out[6] = two60m4 - in[6]; ++} ++ ++/*- ++ * felem_diff64 subtracts |in| from |out| ++ * On entry: ++ * in[i] < 2^60 - 2^52 - 2^4 ++ * On exit: ++ * out[i] < out_orig[i] + 2^60 + 2^44 ++ */ ++static void felem_diff64(felem out, const felem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^12 * p redundantly with each limb ++ * of the form 2^60 + ... ++ */ ++ ++ static const limb two60m52m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 52) ++ - (((limb) 1) << 4); ++ static const limb two60p44m12 = (((limb) 1) << 60) ++ + (((limb) 1) << 44) ++ - (((limb) 1) << 12); ++ static const limb two60m28m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 28) ++ - (((limb) 1) << 4); ++ static const limb two60m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 4); ++ ++ out[0] += two60p44m12 - in[0]; ++ out[1] += two60m52m4 - in[1]; ++ out[2] += two60m28m4 - in[2]; ++ out[3] += two60m4 - in[3]; ++ out[4] += two60m4 - in[4]; ++ out[5] += two60m4 - in[5]; ++ out[6] += two60m4 - in[6]; ++} ++ ++/* ++ * in[i] < 2^63 ++ * out[i] < out_orig[i] + 2^64 + 2^48 ++ */ ++static void felem_diff_128_64(widefelem out, const felem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^16 * p redundantly with each limb ++ * of the form 2^64 + ... ++ */ ++ ++ static const widelimb two64m56m8 = (((widelimb) 1) << 64) ++ - (((widelimb) 1) << 56) ++ - (((widelimb) 1) << 8); ++ static const widelimb two64m32m8 = (((widelimb) 1) << 64) ++ - (((widelimb) 1) << 32) ++ - (((widelimb) 1) << 8); ++ static const widelimb two64m8 = (((widelimb) 1) << 64) ++ - (((widelimb) 1) << 8); ++ static const widelimb two64p48m16 = (((widelimb) 1) << 64) ++ + (((widelimb) 1) << 48) ++ - (((widelimb) 1) << 16); ++ unsigned int i; ++ ++ out[0] += two64p48m16; ++ out[1] += two64m56m8; ++ out[2] += two64m32m8; ++ out[3] += two64m8; ++ out[4] += two64m8; ++ out[5] += two64m8; ++ out[6] += two64m8; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] -= in[i]; ++} ++ ++/* ++ * in[i] < 2^127 - 2^119 - 2^71 ++ * out[i] < out_orig[i] + 2^127 + 2^111 ++ */ ++static void felem_diff128(widefelem out, const widefelem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^415 * p redundantly with each limb ++ * of the form 2^127 + ... 
++ */ ++ ++ static const widelimb two127 = ((widelimb) 1) << 127; ++ static const widelimb two127m71 = (((widelimb) 1) << 127) ++ - (((widelimb) 1) << 71); ++ static const widelimb two127p111m79m71 = (((widelimb) 1) << 127) ++ + (((widelimb) 1) << 111) ++ - (((widelimb) 1) << 79) ++ - (((widelimb) 1) << 71); ++ static const widelimb two127m119m71 = (((widelimb) 1) << 127) ++ - (((widelimb) 1) << 119) ++ - (((widelimb) 1) << 71); ++ static const widelimb two127m95m71 = (((widelimb) 1) << 127) ++ - (((widelimb) 1) << 95) ++ - (((widelimb) 1) << 71); ++ unsigned int i; ++ ++ out[0] += two127; ++ out[1] += two127m71; ++ out[2] += two127m71; ++ out[3] += two127m71; ++ out[4] += two127m71; ++ out[5] += two127m71; ++ out[6] += two127p111m79m71; ++ out[7] += two127m119m71; ++ out[8] += two127m95m71; ++ out[9] += two127m71; ++ out[10] += two127m71; ++ out[11] += two127m71; ++ out[12] += two127m71; ++ ++ for (i = 0; i < 2*NLIMBS-1; i++) ++ out[i] -= in[i]; ++} ++ ++static void felem_square_ref(widefelem out, const felem in) ++{ ++ felem inx2; ++ felem_scalar(inx2, in, 2); ++ ++ out[0] = ((uint128_t) in[0]) * in[0]; ++ ++ out[1] = ((uint128_t) in[0]) * inx2[1]; ++ ++ out[2] = ((uint128_t) in[0]) * inx2[2] ++ + ((uint128_t) in[1]) * in[1]; ++ ++ out[3] = ((uint128_t) in[0]) * inx2[3] ++ + ((uint128_t) in[1]) * inx2[2]; ++ ++ out[4] = ((uint128_t) in[0]) * inx2[4] ++ + ((uint128_t) in[1]) * inx2[3] ++ + ((uint128_t) in[2]) * in[2]; ++ ++ out[5] = ((uint128_t) in[0]) * inx2[5] ++ + ((uint128_t) in[1]) * inx2[4] ++ + ((uint128_t) in[2]) * inx2[3]; ++ ++ out[6] = ((uint128_t) in[0]) * inx2[6] ++ + ((uint128_t) in[1]) * inx2[5] ++ + ((uint128_t) in[2]) * inx2[4] ++ + ((uint128_t) in[3]) * in[3]; ++ ++ out[7] = ((uint128_t) in[1]) * inx2[6] ++ + ((uint128_t) in[2]) * inx2[5] ++ + ((uint128_t) in[3]) * inx2[4]; ++ ++ out[8] = ((uint128_t) in[2]) * inx2[6] ++ + ((uint128_t) in[3]) * inx2[5] ++ + ((uint128_t) in[4]) * in[4]; ++ ++ out[9] = ((uint128_t) in[3]) * inx2[6] ++ + ((uint128_t) in[4]) * inx2[5]; ++ ++ out[10] = ((uint128_t) in[4]) * inx2[6] ++ + ((uint128_t) in[5]) * in[5]; ++ ++ out[11] = ((uint128_t) in[5]) * inx2[6]; ++ ++ out[12] = ((uint128_t) in[6]) * in[6]; ++} ++ ++static void felem_mul_ref(widefelem out, const felem in1, const felem in2) ++{ ++ out[0] = ((uint128_t) in1[0]) * in2[0]; ++ ++ out[1] = ((uint128_t) in1[0]) * in2[1] ++ + ((uint128_t) in1[1]) * in2[0]; ++ ++ out[2] = ((uint128_t) in1[0]) * in2[2] ++ + ((uint128_t) in1[1]) * in2[1] ++ + ((uint128_t) in1[2]) * in2[0]; ++ ++ out[3] = ((uint128_t) in1[0]) * in2[3] ++ + ((uint128_t) in1[1]) * in2[2] ++ + ((uint128_t) in1[2]) * in2[1] ++ + ((uint128_t) in1[3]) * in2[0]; ++ ++ out[4] = ((uint128_t) in1[0]) * in2[4] ++ + ((uint128_t) in1[1]) * in2[3] ++ + ((uint128_t) in1[2]) * in2[2] ++ + ((uint128_t) in1[3]) * in2[1] ++ + ((uint128_t) in1[4]) * in2[0]; ++ ++ out[5] = ((uint128_t) in1[0]) * in2[5] ++ + ((uint128_t) in1[1]) * in2[4] ++ + ((uint128_t) in1[2]) * in2[3] ++ + ((uint128_t) in1[3]) * in2[2] ++ + ((uint128_t) in1[4]) * in2[1] ++ + ((uint128_t) in1[5]) * in2[0]; ++ ++ out[6] = ((uint128_t) in1[0]) * in2[6] ++ + ((uint128_t) in1[1]) * in2[5] ++ + ((uint128_t) in1[2]) * in2[4] ++ + ((uint128_t) in1[3]) * in2[3] ++ + ((uint128_t) in1[4]) * in2[2] ++ + ((uint128_t) in1[5]) * in2[1] ++ + ((uint128_t) in1[6]) * in2[0]; ++ ++ out[7] = ((uint128_t) in1[1]) * in2[6] ++ + ((uint128_t) in1[2]) * in2[5] ++ + ((uint128_t) in1[3]) * in2[4] ++ + ((uint128_t) in1[4]) * in2[3] ++ + ((uint128_t) in1[5]) * in2[2] ++ + ((uint128_t) in1[6]) * in2[1]; 
++ ++ out[8] = ((uint128_t) in1[2]) * in2[6] ++ + ((uint128_t) in1[3]) * in2[5] ++ + ((uint128_t) in1[4]) * in2[4] ++ + ((uint128_t) in1[5]) * in2[3] ++ + ((uint128_t) in1[6]) * in2[2]; ++ ++ out[9] = ((uint128_t) in1[3]) * in2[6] ++ + ((uint128_t) in1[4]) * in2[5] ++ + ((uint128_t) in1[5]) * in2[4] ++ + ((uint128_t) in1[6]) * in2[3]; ++ ++ out[10] = ((uint128_t) in1[4]) * in2[6] ++ + ((uint128_t) in1[5]) * in2[5] ++ + ((uint128_t) in1[6]) * in2[4]; ++ ++ out[11] = ((uint128_t) in1[5]) * in2[6] ++ + ((uint128_t) in1[6]) * in2[5]; ++ ++ out[12] = ((uint128_t) in1[6]) * in2[6]; ++} ++ ++/*- ++ * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients. ++ * in[i] < 2^128 - 2^125 ++ * out[i] < 2^56 for i < 6, ++ * out[6] <= 2^48 ++ * ++ * The technique in use here stems from the format of the prime modulus: ++ * P384 = 2^384 - delta ++ * ++ * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting ++ * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These ++ * coefficients are still quite large, and so we repeatedly apply this ++ * technique on high-order bits in order to guarantee the desired bounds on ++ * the size of our output. ++ * ++ * The three phases of elimination are as follows: ++ * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9]) ++ * [2]: Y = 2^8 (acc[8] | acc[7]) ++ * [3]: Y = 2^48 (acc[6] >> 48) ++ * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d) ++ */ ++static void felem_reduce(felem out, const widefelem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^76 * p redundantly with each limb ++ * of the form 2^124 + ... ++ */ ++ static const widelimb two124m68 = (((widelimb) 1) << 124) ++ - (((widelimb) 1) << 68); ++ static const widelimb two124m116m68 = (((widelimb) 1) << 124) ++ - (((widelimb) 1) << 116) ++ - (((widelimb) 1) << 68); ++ static const widelimb two124p108m76 = (((widelimb) 1) << 124) ++ + (((widelimb) 1) << 108) ++ - (((widelimb) 1) << 76); ++ static const widelimb two124m92m68 = (((widelimb) 1) << 124) ++ - (((widelimb) 1) << 92) ++ - (((widelimb) 1) << 68); ++ widelimb temp, acc[9]; ++ unsigned int i; ++ ++ memcpy(acc, in, sizeof(widelimb) * 9); ++ ++ acc[0] += two124p108m76; ++ acc[1] += two124m116m68; ++ acc[2] += two124m92m68; ++ acc[3] += two124m68; ++ acc[4] += two124m68; ++ acc[5] += two124m68; ++ acc[6] += two124m68; ++ ++ /* [1]: Eliminate in[9], ..., in[12] */ ++ acc[8] += in[12] >> 32; ++ acc[7] += (in[12] & 0xffffffff) << 24; ++ acc[7] += in[12] >> 8; ++ acc[6] += (in[12] & 0xff) << 48; ++ acc[6] -= in[12] >> 16; ++ acc[5] -= ((in[12] & 0xffff) << 40); ++ acc[6] += in[12] >> 48; ++ acc[5] += (in[12] & 0xffffffffffff) << 8; ++ ++ acc[7] += in[11] >> 32; ++ acc[6] += (in[11] & 0xffffffff) << 24; ++ acc[6] += in[11] >> 8; ++ acc[5] += (in[11] & 0xff) << 48; ++ acc[5] -= in[11] >> 16; ++ acc[4] -= ((in[11] & 0xffff) << 40); ++ acc[5] += in[11] >> 48; ++ acc[4] += (in[11] & 0xffffffffffff) << 8; ++ ++ acc[6] += in[10] >> 32; ++ acc[5] += (in[10] & 0xffffffff) << 24; ++ acc[5] += in[10] >> 8; ++ acc[4] += (in[10] & 0xff) << 48; ++ acc[4] -= in[10] >> 16; ++ acc[3] -= ((in[10] & 0xffff) << 40); ++ acc[4] += in[10] >> 48; ++ acc[3] += (in[10] & 0xffffffffffff) << 8; ++ ++ acc[5] += in[9] >> 32; ++ acc[4] += (in[9] & 0xffffffff) << 24; ++ acc[4] += in[9] >> 8; ++ acc[3] += (in[9] & 0xff) << 48; ++ acc[3] -= in[9] >> 16; ++ acc[2] -= ((in[9] & 0xffff) << 40); ++ acc[3] += in[9] >> 48; ++ acc[2] += (in[9] & 0xffffffffffff) << 
8; ++ ++ /* ++ * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as ++ * well as the contributions made from eliminating higher limbs. ++ * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121 ++ * acc[8] < in[8] + 2^96 ++ */ ++ acc[4] += acc[8] >> 32; ++ acc[3] += (acc[8] & 0xffffffff) << 24; ++ acc[3] += acc[8] >> 8; ++ acc[2] += (acc[8] & 0xff) << 48; ++ acc[2] -= acc[8] >> 16; ++ acc[1] -= ((acc[8] & 0xffff) << 40); ++ acc[2] += acc[8] >> 48; ++ acc[1] += (acc[8] & 0xffffffffffff) << 8; ++ ++ acc[3] += acc[7] >> 32; ++ acc[2] += (acc[7] & 0xffffffff) << 24; ++ acc[2] += acc[7] >> 8; ++ acc[1] += (acc[7] & 0xff) << 48; ++ acc[1] -= acc[7] >> 16; ++ acc[0] -= ((acc[7] & 0xffff) << 40); ++ acc[1] += acc[7] >> 48; ++ acc[0] += (acc[7] & 0xffffffffffff) << 8; ++ ++ /*- ++ * acc[k] < in[k] + 2^124 + 2^121 ++ * < in[k] + 2^125 ++ * < 2^128, for k <= 6 ++ */ ++ ++ /* ++ * Carry 4 -> 5 -> 6 ++ * This has the effect of ensuring that these more significant limbs ++ * will be small in value after eliminating high bits from acc[6]. ++ */ ++ acc[5] += acc[4] >> 56; ++ acc[4] &= 0x00ffffffffffffff; ++ ++ acc[6] += acc[5] >> 56; ++ acc[5] &= 0x00ffffffffffffff; ++ ++ /*- ++ * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16 ++ * < in[6] + 2^125 ++ * < 2^128 ++ */ ++ ++ /* [3]: Eliminate high bits of acc[6] */ ++ temp = acc[6] >> 48; ++ acc[6] &= 0x0000ffffffffffff; ++ ++ /* temp < 2^80 */ ++ ++ acc[3] += temp >> 40; ++ acc[2] += (temp & 0xffffffffff) << 16; ++ acc[2] += temp >> 16; ++ acc[1] += (temp & 0xffff) << 40; ++ acc[1] -= temp >> 24; ++ acc[0] -= (temp & 0xffffff) << 32; ++ acc[0] += temp; ++ ++ /*- ++ * acc[k] < acc_old[k] + 2^64 + 2^56 ++ * < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4 ++ */ ++ ++ /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ ++ acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */ ++ acc[0] &= 0x00ffffffffffffff; ++ ++ acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */ ++ acc[1] &= 0x00ffffffffffffff; ++ ++ acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */ ++ acc[2] &= 0x00ffffffffffffff; ++ ++ /*- ++ * acc[k] < acc_old[k] + 2^72 + 2^16 ++ * < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17 ++ * < in[k] + 2^125 ++ * < 2^128 , k < 4 ++ */ ++ ++ acc[4] += acc[3] >> 56; /*- ++ * acc[4] < acc_old[4] + 2^72 + 2^16 ++ * < 2^72 + 2^56 + 2^16 ++ */ ++ acc[3] &= 0x00ffffffffffffff; ++ ++ acc[5] += acc[4] >> 56; /*- ++ * acc[5] < acc_old[5] + 2^16 + 1 ++ * < 2^56 + 2^16 + 1 ++ */ ++ acc[4] &= 0x00ffffffffffffff; ++ ++ acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */ ++ acc[5] &= 0x00ffffffffffffff; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] = acc[i]; ++} ++ ++#if defined(ECP_NISTP384_ASM) ++static void felem_square_wrapper(widefelem out, const felem in); ++static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2); ++ ++static void (*felem_square_p)(widefelem out, const felem in) = ++ felem_square_wrapper; ++static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) = ++ felem_mul_wrapper; ++ ++void p384_felem_square(widefelem out, const felem in); ++void p384_felem_mul(widefelem out, const felem in1, const felem in2); ++ ++# if defined(_ARCH_PPC64) ++# include "crypto/ppc_arch.h" ++# endif ++ ++static void felem_select(void) ++{ ++ /* Default */ ++ felem_square_p = felem_square_ref; ++ felem_mul_p = felem_mul_ref; ++} ++ ++static void felem_square_wrapper(widefelem out, const felem in) ++{ ++ felem_select(); ++ felem_square_p(out, in); ++} ++ ++static void felem_mul_wrapper(widefelem out, 
const felem in1, const felem in2) ++{ ++ felem_select(); ++ felem_mul_p(out, in1, in2); ++} ++ ++# define felem_square felem_square_p ++# define felem_mul felem_mul_p ++#else ++# define felem_square felem_square_ref ++# define felem_mul felem_mul_ref ++#endif ++ ++static ossl_inline void felem_square_reduce(felem out, const felem in) ++{ ++ widefelem tmp; ++ ++ felem_square(tmp, in); ++ felem_reduce(out, tmp); ++} ++ ++static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2) ++{ ++ widefelem tmp; ++ ++ felem_mul(tmp, in1, in2); ++ felem_reduce(out, tmp); ++} ++ ++/*- ++ * felem_inv calculates |out| = |in|^{-1} ++ * ++ * Based on Fermat's Little Theorem: ++ * a^p = a (mod p) ++ * a^{p-1} = 1 (mod p) ++ * a^{p-2} = a^{-1} (mod p) ++ */ ++static void felem_inv(felem out, const felem in) ++{ ++ felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6; ++ unsigned int i = 0; ++ ++ felem_square_reduce(ftmp, in); /* 2^1 */ ++ felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */ ++ felem_assign(ftmp2, ftmp); ++ ++ felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */ ++ felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 * 2^0 */ ++ felem_assign(ftmp3, ftmp); ++ ++ for (i = 0; i < 3; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */ ++ felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */ ++ felem_assign(ftmp4, ftmp); ++ ++ for (i = 0; i < 6; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */ ++ felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */ ++ ++ for (i = 0; i < 3; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */ ++ felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */ ++ felem_assign(ftmp5, ftmp); ++ ++ for (i = 0; i < 15; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */ ++ felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */ ++ felem_assign(ftmp6, ftmp); ++ ++ for (i = 0; i < 30; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */ ++ felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */ ++ felem_assign(ftmp4, ftmp); ++ ++ for (i = 0; i < 60; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */ ++ felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */ ++ felem_assign(ftmp4, ftmp); ++ ++ for (i = 0; i < 120; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^239 + ... + 2^120 */ ++ felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */ ++ ++ for (i = 0; i < 15; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */ ++ felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */ ++ ++ for (i = 0; i < 31; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */ ++ felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */ ++ ++ for (i = 0; i < 2; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */ ++ felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */ ++ ++ for (i = 0; i < 94; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */ ++ felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */ ++ ++ for (i = 0; i < 2; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */ ++ felem_mul_reduce(ftmp, in, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */ ++ ++ memcpy(out, ftmp, sizeof(felem)); ++} ++ ++/* ++ * Zero-check: returns a limb with all bits set if |in| == 0 (mod p) ++ * and 0 otherwise. 
We know that field elements are reduced to ++ * 0 < in < 2p, so we only need to check two cases: ++ * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1 ++ * in[k] < 2^56, k < 6 ++ * in[6] <= 2^48 ++ */ ++static limb felem_is_zero(const felem in) ++{ ++ limb zero, p384; ++ ++ zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6]; ++ zero = ((int64_t) (zero) - 1) >> 63; ++ p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000) ++ | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff) ++ | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff) ++ | (in[6] ^ 0xffffffffffff); ++ p384 = ((int64_t) (p384) - 1) >> 63; ++ ++ return (zero | p384); ++} ++ ++static int felem_is_zero_int(const void *in) ++{ ++ return (int)(felem_is_zero(in) & ((limb) 1)); ++} ++ ++/*- ++ * felem_contract converts |in| to its unique, minimal representation. ++ * Assume we've removed all redundant bits. ++ * On entry: ++ * in[k] < 2^56, k < 6 ++ * in[6] <= 2^48 ++ */ ++static void felem_contract(felem out, const felem in) ++{ ++ static const int64_t two56 = ((limb) 1) << 56; ++ ++ /* ++ * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1 ++ * Perform two successive, idempotent subtractions to reduce if |in| >= p. ++ */ ++ ++ int64_t tmp[NLIMBS], cond[5], a; ++ unsigned int i; ++ ++ memcpy(tmp, in, sizeof(felem)); ++ ++ /* Case 1: a = 1 iff |in| >= 2^384 */ ++ a = (in[6] >> 48); ++ tmp[0] += a; ++ tmp[0] -= a << 32; ++ tmp[1] += a << 40; ++ tmp[2] += a << 16; ++ tmp[6] &= 0x0000ffffffffffff; ++ ++ /* ++ * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be ++ * non-zero, so we only need one step ++ */ ++ ++ a = tmp[0] >> 63; ++ tmp[0] += a & two56; ++ tmp[1] -= a & 1; ++ ++ /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ ++ tmp[2] += tmp[1] >> 56; ++ tmp[1] &= 0x00ffffffffffffff; ++ ++ tmp[3] += tmp[2] >> 56; ++ tmp[2] &= 0x00ffffffffffffff; ++ ++ tmp[4] += tmp[3] >> 56; ++ tmp[3] &= 0x00ffffffffffffff; ++ ++ tmp[5] += tmp[4] >> 56; ++ tmp[4] &= 0x00ffffffffffffff; ++ ++ tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */ ++ tmp[5] &= 0x00ffffffffffffff; ++ ++ /* ++ * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise ++ */ ++ ++ /* 0 iff (2^129..2^383) are all one */ ++ cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1; ++ /* 0 iff 2^128 bit is one */ ++ cond[1] = (tmp[2] | ~0x00000000010000) + 1; ++ /* 0 iff (2^96..2^127) bits are all one */ ++ cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1; ++ /* 0 iff (2^32..2^95) bits are all zero */ ++ cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff)); ++ /* 0 iff (2^0..2^31) bits are all one */ ++ cond[4] = (tmp[0] | 0xffffff00000000) + 1; ++ ++ /* ++ * In effect, invert our conditions, so that 0 values become all 1's, ++ * any non-zero value in the low-order 56 bits becomes all 0's ++ */ ++ for (i = 0; i < 5; i++) ++ cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63; ++ ++ /* ++ * The condition for determining whether in is greater than our ++ * prime is given by the following condition. ++ */ ++ ++ /* First subtract 2^384 - 2^129 cheaply */ ++ a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4]))); ++ tmp[6] &= ~a; ++ tmp[5] &= ~a; ++ tmp[4] &= ~a; ++ tmp[3] &= ~a; ++ tmp[2] &= ~a | 0x0000000001ffff; ++ ++ /* ++ * Subtract 2^128 - 2^96 by ++ * means of disjoint cases. 
++ */ ++ ++ /* subtract 2^128 if that bit is present, and add 2^96 */ ++ a = cond[0] & cond[1]; ++ tmp[2] &= ~a | 0xfffffffffeffff; ++ tmp[1] += a & ((int64_t) 1 << 40); ++ ++ /* otherwise, clear bits 2^127 .. 2^96 */ ++ a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4])); ++ tmp[2] &= ~a | 0xffffffffff0000; ++ tmp[1] &= ~a | 0x0000ffffffffff; ++ ++ /* finally, subtract the last 2^32 - 1 */ ++ a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4]))); ++ tmp[0] += a & (-((int64_t) 1 << 32) + 1); ++ ++ /* ++ * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be ++ * non-zero, so we only need one step ++ */ ++ a = tmp[0] >> 63; ++ tmp[0] += a & two56; ++ tmp[1] -= a & 1; ++ ++ /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ ++ tmp[2] += tmp[1] >> 56; ++ tmp[1] &= 0x00ffffffffffffff; ++ ++ tmp[3] += tmp[2] >> 56; ++ tmp[2] &= 0x00ffffffffffffff; ++ ++ tmp[4] += tmp[3] >> 56; ++ tmp[3] &= 0x00ffffffffffffff; ++ ++ tmp[5] += tmp[4] >> 56; ++ tmp[4] &= 0x00ffffffffffffff; ++ ++ tmp[6] += tmp[5] >> 56; ++ tmp[5] &= 0x00ffffffffffffff; ++ ++ memcpy(out, tmp, sizeof(felem)); ++} ++ ++/*- ++ * Group operations ++ * ---------------- ++ * ++ * Building on top of the field operations we have the operations on the ++ * elliptic curve group itself. Points on the curve are represented in Jacobian ++ * coordinates ++ */ ++ ++/*- ++ * point_double calculates 2*(x_in, y_in, z_in) ++ * ++ * The method is taken from: ++ * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b ++ * ++ * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. ++ * while x_out == y_in is not (maybe this works, but it's not tested). ++ */ ++static void ++point_double(felem x_out, felem y_out, felem z_out, ++ const felem x_in, const felem y_in, const felem z_in) ++{ ++ widefelem tmp, tmp2; ++ felem delta, gamma, beta, alpha, ftmp, ftmp2; ++ ++ felem_assign(ftmp, x_in); ++ felem_assign(ftmp2, x_in); ++ ++ /* delta = z^2 */ ++ felem_square_reduce(delta, z_in); /* delta[i] < 2^56 */ ++ ++ /* gamma = y^2 */ ++ felem_square_reduce(gamma, y_in); /* gamma[i] < 2^56 */ ++ ++ /* beta = x*gamma */ ++ felem_mul_reduce(beta, x_in, gamma); /* beta[i] < 2^56 */ ++ ++ /* alpha = 3*(x-delta)*(x+delta) */ ++ felem_diff64(ftmp, delta); /* ftmp[i] < 2^60 + 2^58 + 2^44 */ ++ felem_sum64(ftmp2, delta); /* ftmp2[i] < 2^59 */ ++ felem_scalar64(ftmp2, 3); /* ftmp2[i] < 2^61 */ ++ felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */ ++ ++ /* x' = alpha^2 - 8*beta */ ++ felem_square(tmp, alpha); /* tmp[i] < 2^115 */ ++ felem_assign(ftmp, beta); /* ftmp[i] < 2^56 */ ++ felem_scalar64(ftmp, 8); /* ftmp[i] < 2^59 */ ++ felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^115 + 2^64 + 2^48 */ ++ felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */ ++ ++ /* z' = (y + z)^2 - gamma - delta */ ++ felem_sum64(delta, gamma); /* delta[i] < 2^57 */ ++ felem_assign(ftmp, y_in); /* ftmp[i] < 2^56 */ ++ felem_sum64(ftmp, z_in); /* ftmp[i] < 2^56 */ ++ felem_square(tmp, ftmp); /* tmp[i] < 2^115 */ ++ felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */ ++ felem_reduce(z_out, tmp); /* z_out[i] < 2^56 */ ++ ++ /* y' = alpha*(4*beta - x') - 8*gamma^2 */ ++ felem_scalar64(beta, 4); /* beta[i] < 2^58 */ ++ felem_diff64(beta, x_out); /* beta[i] < 2^60 + 2^58 + 2^44 */ ++ felem_mul(tmp, alpha, beta); /* tmp[i] < 2^119 */ ++ felem_square(tmp2, gamma); /* tmp2[i] < 2^115 */ ++ felem_scalar128(tmp2, 8); /* tmp2[i] < 2^118 */ ++ felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^119 + 2^111 */ ++ felem_reduce(y_out, tmp); 
/* tmp[i] < 2^56 */ ++} ++ ++/* copy_conditional copies in to out iff mask is all ones. */ ++static void copy_conditional(felem out, const felem in, limb mask) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] ^= mask & (in[i] ^ out[i]); ++} ++ ++/*- ++ * point_add calculates (x1, y1, z1) + (x2, y2, z2) ++ * ++ * The method is taken from ++ * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, ++ * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). ++ * ++ * This function includes a branch for checking whether the two input points ++ * are equal (while not equal to the point at infinity). See comment below ++ * on constant-time. ++ */ ++static void point_add(felem x3, felem y3, felem z3, ++ const felem x1, const felem y1, const felem z1, ++ const int mixed, const felem x2, const felem y2, ++ const felem z2) ++{ ++ felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out; ++ widefelem tmp, tmp2; ++ limb x_equal, y_equal, z1_is_zero, z2_is_zero; ++ limb points_equal; ++ ++ z1_is_zero = felem_is_zero(z1); ++ z2_is_zero = felem_is_zero(z2); ++ ++ /* ftmp = z1z1 = z1**2 */ ++ felem_square_reduce(ftmp, z1); /* ftmp[i] < 2^56 */ ++ ++ if (!mixed) { ++ /* ftmp2 = z2z2 = z2**2 */ ++ felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */ ++ ++ /* u1 = ftmp3 = x1*z2z2 */ ++ felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */ ++ ++ /* ftmp5 = z1 + z2 */ ++ felem_assign(ftmp5, z1); /* ftmp5[i] < 2^56 */ ++ felem_sum64(ftmp5, z2); /* ftmp5[i] < 2^57 */ ++ ++ /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */ ++ felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */ ++ felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^117 + 2^64 + 2^48 */ ++ felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */ ++ felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */ ++ ++ /* ftmp2 = z2 * z2z2 */ ++ felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */ ++ ++ /* s1 = ftmp6 = y1 * z2**3 */ ++ felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */ ++ } else { ++ /* ++ * We'll assume z2 = 1 (special case z2 = 0 is handled later) ++ */ ++ ++ /* u1 = ftmp3 = x1*z2z2 */ ++ felem_assign(ftmp3, x1); /* ftmp3[i] < 2^56 */ ++ ++ /* ftmp5 = 2*z1z2 */ ++ felem_scalar(ftmp5, z1, 2); /* ftmp5[i] < 2^57 */ ++ ++ /* s1 = ftmp6 = y1 * z2**3 */ ++ felem_assign(ftmp6, y1); /* ftmp6[i] < 2^56 */ ++ } ++ /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */ ++ ++ /* u2 = x2*z1z1 */ ++ felem_mul(tmp, x2, ftmp); /* tmp[i] < 2^115 */ ++ ++ /* h = ftmp4 = u2 - u1 */ ++ felem_diff_128_64(tmp, ftmp3); /* tmp[i] < 2^115 + 2^64 + 2^48 */ ++ felem_reduce(ftmp4, tmp); /* ftmp[4] < 2^56 */ ++ ++ x_equal = felem_is_zero(ftmp4); ++ ++ /* z_out = ftmp5 * h */ ++ felem_mul_reduce(z_out, ftmp5, ftmp4); /* z_out[i] < 2^56 */ ++ ++ /* ftmp = z1 * z1z1 */ ++ felem_mul_reduce(ftmp, ftmp, z1); /* ftmp[i] < 2^56 */ ++ ++ /* s2 = tmp = y2 * z1**3 */ ++ felem_mul(tmp, y2, ftmp); /* tmp[i] < 2^115 */ ++ ++ /* r = ftmp5 = (s2 - s1)*2 */ ++ felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */ ++ felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */ ++ y_equal = felem_is_zero(ftmp5); ++ felem_scalar64(ftmp5, 2); /* ftmp5[i] < 2^57 */ ++ ++ /* ++ * The formulae are incorrect if the points are equal, in affine coordinates ++ * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this ++ * happens. ++ * ++ * We use bitwise operations to avoid potential side-channels introduced by ++ * the short-circuiting behaviour of boolean operators. 
++ * ++ * The special case of either point being the point at infinity (z1 and/or ++ * z2 are zero), is handled separately later on in this function, so we ++ * avoid jumping to point_double here in those special cases. ++ * ++ * Notice the comment below on the implications of this branching for timing ++ * leaks and why it is considered practically irrelevant. ++ */ ++ points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero)); ++ ++ if (points_equal) { ++ /* ++ * This is obviously not constant-time but it will almost-never happen ++ * for ECDH / ECDSA. ++ */ ++ point_double(x3, y3, z3, x1, y1, z1); ++ return; ++ } ++ ++ /* I = ftmp = (2h)**2 */ ++ felem_assign(ftmp, ftmp4); /* ftmp[i] < 2^56 */ ++ felem_scalar64(ftmp, 2); /* ftmp[i] < 2^57 */ ++ felem_square_reduce(ftmp, ftmp); /* ftmp[i] < 2^56 */ ++ ++ /* J = ftmp2 = h * I */ ++ felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */ ++ ++ /* V = ftmp4 = U1 * I */ ++ felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */ ++ ++ /* x_out = r**2 - J - 2V */ ++ felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */ ++ felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */ ++ felem_assign(ftmp3, ftmp4); /* ftmp3[i] < 2^56 */ ++ felem_scalar64(ftmp4, 2); /* ftmp4[i] < 2^57 */ ++ felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */ ++ felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */ ++ ++ /* y_out = r(V-x_out) - 2 * s1 * J */ ++ felem_diff64(ftmp3, x_out); /* ftmp3[i] < 2^60 + 2^56 + 2^44 */ ++ felem_mul(tmp, ftmp5, ftmp3); /* tmp[i] < 2^116 */ ++ felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */ ++ felem_scalar128(tmp2, 2); /* tmp2[i] < 2^116 */ ++ felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^116 + 2^111 */ ++ felem_reduce(y_out, tmp); /* y_out[i] < 2^56 */ ++ ++ copy_conditional(x_out, x2, z1_is_zero); ++ copy_conditional(x_out, x1, z2_is_zero); ++ copy_conditional(y_out, y2, z1_is_zero); ++ copy_conditional(y_out, y1, z2_is_zero); ++ copy_conditional(z_out, z2, z1_is_zero); ++ copy_conditional(z_out, z1, z2_is_zero); ++ felem_assign(x3, x_out); ++ felem_assign(y3, y_out); ++ felem_assign(z3, z_out); ++} ++ ++/*- ++ * Base point pre computation ++ * -------------------------- ++ * ++ * Two different sorts of precomputed tables are used in the following code. ++ * Each contain various points on the curve, where each point is three field ++ * elements (x, y, z). ++ * ++ * For the base point table, z is usually 1 (0 for the point at infinity). ++ * This table has 16 elements: ++ * index | bits | point ++ * ------+---------+------------------------------ ++ * 0 | 0 0 0 0 | 0G ++ * 1 | 0 0 0 1 | 1G ++ * 2 | 0 0 1 0 | 2^95G ++ * 3 | 0 0 1 1 | (2^95 + 1)G ++ * 4 | 0 1 0 0 | 2^190G ++ * 5 | 0 1 0 1 | (2^190 + 1)G ++ * 6 | 0 1 1 0 | (2^190 + 2^95)G ++ * 7 | 0 1 1 1 | (2^190 + 2^95 + 1)G ++ * 8 | 1 0 0 0 | 2^285G ++ * 9 | 1 0 0 1 | (2^285 + 1)G ++ * 10 | 1 0 1 0 | (2^285 + 2^95)G ++ * 11 | 1 0 1 1 | (2^285 + 2^95 + 1)G ++ * 12 | 1 1 0 0 | (2^285 + 2^190)G ++ * 13 | 1 1 0 1 | (2^285 + 2^190 + 1)G ++ * 14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G ++ * 15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G ++ * ++ * The reason for this is so that we can clock bits into four different ++ * locations when doing simple scalar multiplies against the base point. ++ * ++ * Tables for other points have table[i] = iG for i in 0 .. 16. 
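++ *
++ * For example, for i < 95 the generator part of batch_mul below combines
++ * b0 = bit i, b1 = bit i+95, b2 = bit i+190 and b3 = bit i+285 of the
++ * scalar into the index (8*b3 + 4*b2 + 2*b1 + b0); the selected entry
++ * (b3*2^285 + b2*2^190 + b1*2^95 + b0)*G is then multiplied by 2^i by
++ * the remaining doublings.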
++ */ ++ ++/* gmul is the table of precomputed base points */ ++static const felem gmul[16][3] = { ++{{0, 0, 0, 0, 0, 0, 0}, ++ {0, 0, 0, 0, 0, 0, 0}, ++ {0, 0, 0, 0, 0, 0, 0}}, ++{{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741, ++ 0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b}, ++ {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31, ++ 0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de, ++ 0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c}, ++ {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731, ++ 0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4, ++ 0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10}, ++ {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b, ++ 0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68, ++ 0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf}, ++ {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c, ++ 0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736, ++ 0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d}, ++ {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4, ++ 0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589, ++ 0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca}, ++ {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3, ++ 0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5, ++ 0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52}, ++ {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562, ++ 0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27, ++ 0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88}, ++ {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088, ++ 0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37, ++ 0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7}, ++ {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964, ++ 0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f, ++ 0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f}, ++ {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54, ++ 0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6, ++ 0x00e8e89eee56802a, 0x0049d806a774aaf2, 
0x0000147e2af0ae24}, ++ {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7, ++ 0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542, ++ 0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539}, ++ {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882, ++ 0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50, ++ 0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c}, ++ {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264, ++ 0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d, ++ 0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a}, ++ {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e, ++ 0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c, ++ 0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc}, ++ {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572, ++ 0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973}, ++ {1, 0, 0, 0, 0, 0, 0}} ++}; ++ ++/* ++ * select_point selects the |idx|th point from a precomputation table and ++ * copies it to out. ++ * ++ * pre_comp below is of the size provided in |size|. ++ */ ++static void select_point(const limb idx, unsigned int size, ++ const felem pre_comp[][3], felem out[3]) ++{ ++ unsigned int i, j; ++ limb *outlimbs = &out[0][0]; ++ ++ memset(out, 0, sizeof(*out) * 3); ++ ++ for (i = 0; i < size; i++) { ++ const limb *inlimbs = &pre_comp[i][0][0]; ++ limb mask = i ^ idx; ++ ++ mask |= mask >> 4; ++ mask |= mask >> 2; ++ mask |= mask >> 1; ++ mask &= 1; ++ mask--; ++ for (j = 0; j < NLIMBS * 3; j++) ++ outlimbs[j] |= inlimbs[j] & mask; ++ } ++} ++ ++/* get_bit returns the |i|th bit in |in| */ ++static char get_bit(const felem_bytearray in, int i) ++{ ++ if (i < 0 || i >= 384) ++ return 0; ++ return (in[i >> 3] >> (i & 7)) & 1; ++} ++ ++/* ++ * Interleaved point multiplication using precomputed point multiples: The ++ * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars ++ * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the ++ * generator, using certain (large) precomputed multiples in g_pre_comp. ++ * Output point (X, Y, Z) is stored in x_out, y_out, z_out ++ */ ++static void batch_mul(felem x_out, felem y_out, felem z_out, ++ const felem_bytearray scalars[], ++ const unsigned int num_points, const u8 *g_scalar, ++ const int mixed, const felem pre_comp[][17][3], ++ const felem g_pre_comp[16][3]) ++{ ++ int i, skip; ++ unsigned int num, gen_mul = (g_scalar != NULL); ++ felem nq[3], tmp[4]; ++ limb bits; ++ u8 sign, digit; ++ ++ /* set nq to the point at infinity */ ++ memset(nq, 0, sizeof(nq)); ++ ++ /* ++ * Loop over all scalars msb-to-lsb, interleaving additions of multiples ++ * of the generator (last quarter of rounds) and additions of other ++ * points multiples (every 5th round). ++ */ ++ skip = 1; /* save two point operations in the first ++ * round */ ++ for (i = (num_points ? 
380 : 98); i >= 0; --i) { ++ /* double */ ++ if (!skip) ++ point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); ++ ++ /* add multiples of the generator */ ++ if (gen_mul && (i <= 98)) { ++ bits = get_bit(g_scalar, i + 285) << 3; ++ if (i < 95) { ++ bits |= get_bit(g_scalar, i + 190) << 2; ++ bits |= get_bit(g_scalar, i + 95) << 1; ++ bits |= get_bit(g_scalar, i); ++ } ++ /* select the point to add, in constant time */ ++ select_point(bits, 16, g_pre_comp, tmp); ++ if (!skip) { ++ /* The 1 argument below is for "mixed" */ ++ point_add(nq[0], nq[1], nq[2], ++ nq[0], nq[1], nq[2], 1, ++ tmp[0], tmp[1], tmp[2]); ++ } else { ++ memcpy(nq, tmp, 3 * sizeof(felem)); ++ skip = 0; ++ } ++ } ++ ++ /* do other additions every 5 doublings */ ++ if (num_points && (i % 5 == 0)) { ++ /* loop over all scalars */ ++ for (num = 0; num < num_points; ++num) { ++ bits = get_bit(scalars[num], i + 4) << 5; ++ bits |= get_bit(scalars[num], i + 3) << 4; ++ bits |= get_bit(scalars[num], i + 2) << 3; ++ bits |= get_bit(scalars[num], i + 1) << 2; ++ bits |= get_bit(scalars[num], i) << 1; ++ bits |= get_bit(scalars[num], i - 1); ++ ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); ++ ++ /* ++ * select the point to add or subtract, in constant time ++ */ ++ select_point(digit, 17, pre_comp[num], tmp); ++ felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative ++ * point */ ++ copy_conditional(tmp[1], tmp[3], (-(limb) sign)); ++ ++ if (!skip) { ++ point_add(nq[0], nq[1], nq[2], ++ nq[0], nq[1], nq[2], mixed, ++ tmp[0], tmp[1], tmp[2]); ++ } else { ++ memcpy(nq, tmp, 3 * sizeof(felem)); ++ skip = 0; ++ } ++ } ++ } ++ } ++ felem_assign(x_out, nq[0]); ++ felem_assign(y_out, nq[1]); ++ felem_assign(z_out, nq[2]); ++} ++ ++/* Precomputation for the group generator. */ ++struct nistp384_pre_comp_st { ++ felem g_pre_comp[16][3]; ++ CRYPTO_REF_COUNT refcnt; ++ CRYPTO_RWLOCK *refcnt_lock; ++}; ++ ++const EC_METHOD *ossl_ec_GFp_nistp384_method(void) ++{ ++ static const EC_METHOD ret = { ++ EC_FLAGS_DEFAULT_OCT, ++ NID_X9_62_prime_field, ++ ossl_ec_GFp_nistp384_group_init, ++ ec_GFp_simple_group_finish, ++ ec_GFp_simple_group_clear_finish, ++ ec_GFp_nist_group_copy, ++ ossl_ec_GFp_nistp384_group_set_curve, ++ ec_GFp_simple_group_get_curve, ++ ec_GFp_simple_group_get_degree, ++ ec_group_simple_order_bits, ++ ec_GFp_simple_group_check_discriminant, ++ ec_GFp_simple_point_init, ++ ec_GFp_simple_point_finish, ++ ec_GFp_simple_point_clear_finish, ++ ec_GFp_simple_point_copy, ++ ec_GFp_simple_point_set_to_infinity, ++ ec_GFp_simple_set_Jprojective_coordinates_GFp, ++ ec_GFp_simple_get_Jprojective_coordinates_GFp, ++ ec_GFp_simple_point_set_affine_coordinates, ++ ossl_ec_GFp_nistp384_point_get_affine_coordinates, ++ 0, /* point_set_compressed_coordinates */ ++ 0, /* point2oct */ ++ 0, /* oct2point */ ++ ec_GFp_simple_add, ++ ec_GFp_simple_dbl, ++ ec_GFp_simple_invert, ++ ec_GFp_simple_is_at_infinity, ++ ec_GFp_simple_is_on_curve, ++ ec_GFp_simple_cmp, ++ ec_GFp_simple_make_affine, ++ ec_GFp_simple_points_make_affine, ++ ossl_ec_GFp_nistp384_points_mul, ++ ossl_ec_GFp_nistp384_precompute_mult, ++ ossl_ec_GFp_nistp384_have_precompute_mult, ++ ec_GFp_nist_field_mul, ++ ec_GFp_nist_field_sqr, ++ 0, /* field_div */ ++ ec_GFp_simple_field_inv, ++ 0, /* field_encode */ ++ 0, /* field_decode */ ++ 0, /* field_set_to_one */ ++ ec_key_simple_priv2oct, ++ ec_key_simple_oct2priv, ++ 0, /* set private */ ++ ec_key_simple_generate_key, ++ ec_key_simple_check_key, ++ ec_key_simple_generate_public_key, ++ 0, /* keycopy */ ++ 0, /* keyfinish */ ++ 
ecdh_simple_compute_key, ++ ecdsa_simple_sign_setup, ++ ecdsa_simple_sign_sig, ++ ecdsa_simple_verify_sig, ++ 0, /* field_inverse_mod_ord */ ++ 0, /* blind_coordinates */ ++ 0, /* ladder_pre */ ++ 0, /* ladder_step */ ++ 0 /* ladder_post */ ++ }; ++ ++ return &ret; ++} ++ ++/******************************************************************************/ ++/* ++ * FUNCTIONS TO MANAGE PRECOMPUTATION ++ */ ++ ++static NISTP384_PRE_COMP *nistp384_pre_comp_new(void) ++{ ++ NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret)); ++ ++ if (ret == NULL || (ret->refcnt_lock = CRYPTO_THREAD_lock_new()) == NULL) { ++ OPENSSL_free(ret); ++ return NULL; ++ } ++ ++ ret->refcnt = 1; ++ return ret; ++} ++ ++NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p) ++{ ++ int i; ++ ++ if (p != NULL) ++ CRYPTO_UP_REF(&p->refcnt, &i, p->refcnt_lock); ++ return p; ++} ++ ++void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p) ++{ ++ int i; ++ ++ if (p == NULL) ++ return; ++ ++ CRYPTO_DOWN_REF(&p->refcnt, &i, p->refcnt_lock); ++ REF_PRINT_COUNT("ec_nistp384", p); ++ if (i > 0) ++ return; ++ REF_ASSERT_ISNT(i < 0); ++ ++ CRYPTO_THREAD_lock_free(p->refcnt_lock); ++ OPENSSL_free(p); ++} ++ ++/******************************************************************************/ ++/* ++ * OPENSSL EC_METHOD FUNCTIONS ++ */ ++ ++int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group) ++{ ++ int ret; ++ ++ ret = ec_GFp_simple_group_init(group); ++ group->a_is_minus3 = 1; ++ return ret; ++} ++ ++int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p, ++ const BIGNUM *a, const BIGNUM *b, ++ BN_CTX *ctx) ++{ ++ int ret = 0; ++ BIGNUM *curve_p, *curve_a, *curve_b; ++#ifndef FIPS_MODULE ++ BN_CTX *new_ctx = NULL; ++ ++ if (ctx == NULL) ++ ctx = new_ctx = BN_CTX_new(); ++#endif ++ if (ctx == NULL) ++ return 0; ++ ++ BN_CTX_start(ctx); ++ curve_p = BN_CTX_get(ctx); ++ curve_a = BN_CTX_get(ctx); ++ curve_b = BN_CTX_get(ctx); ++ if (curve_b == NULL) ++ goto err; ++ BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p); ++ BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a); ++ BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b); ++ if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) { ++ ECerr(EC_F_EC_GFP_NISTP384_GROUP_SET_CURVE, EC_R_WRONG_CURVE_PARAMETERS); ++ goto err; ++ } ++ group->field_mod_func = BN_nist_mod_384; ++ ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); ++ err: ++ BN_CTX_end(ctx); ++#ifndef FIPS_MODULE ++ BN_CTX_free(new_ctx); ++#endif ++ return ret; ++} ++ ++/* ++ * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') = ++ * (X/Z^2, Y/Z^3) ++ */ ++int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group, ++ const EC_POINT *point, ++ BIGNUM *x, BIGNUM *y, ++ BN_CTX *ctx) ++{ ++ felem z1, z2, x_in, y_in, x_out, y_out; ++ widefelem tmp; ++ ++ if (EC_POINT_is_at_infinity(group, point)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES, EC_R_POINT_AT_INFINITY); ++ return 0; ++ } ++ if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) || ++ (!BN_to_felem(z1, point->Z))) ++ return 0; ++ felem_inv(z2, z1); ++ felem_square(tmp, z2); ++ felem_reduce(z1, tmp); ++ felem_mul(tmp, x_in, z1); ++ felem_reduce(x_in, tmp); ++ felem_contract(x_out, x_in); ++ if (x != NULL) { ++ if (!felem_to_BN(x, x_out)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB); ++ return 0; ++ } ++ } ++ felem_mul(tmp, z1, z2); ++ felem_reduce(z1, 
tmp); ++ felem_mul(tmp, y_in, z1); ++ felem_reduce(y_in, tmp); ++ felem_contract(y_out, y_in); ++ if (y != NULL) { ++ if (!felem_to_BN(y, y_out)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB); ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++/* points below is of size |num|, and tmp_felems is of size |num+1/ */ ++static void make_points_affine(size_t num, felem points[][3], ++ felem tmp_felems[]) ++{ ++ /* ++ * Runs in constant time, unless an input is the point at infinity (which ++ * normally shouldn't happen). ++ */ ++ ec_GFp_nistp_points_make_affine_internal(num, ++ points, ++ sizeof(felem), ++ tmp_felems, ++ (void (*)(void *))felem_one, ++ felem_is_zero_int, ++ (void (*)(void *, const void *)) ++ felem_assign, ++ (void (*)(void *, const void *)) ++ felem_square_reduce, ++ (void (*)(void *, const void *, const void*)) ++ felem_mul_reduce, ++ (void (*)(void *, const void *)) ++ felem_inv, ++ (void (*)(void *, const void *)) ++ felem_contract); ++} ++ ++/* ++ * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL ++ * values Result is stored in r (r can equal one of the inputs). ++ */ ++int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r, ++ const BIGNUM *scalar, size_t num, ++ const EC_POINT *points[], ++ const BIGNUM *scalars[], BN_CTX *ctx) ++{ ++ int ret = 0; ++ int j; ++ int mixed = 0; ++ BIGNUM *x, *y, *z, *tmp_scalar; ++ felem_bytearray g_secret; ++ felem_bytearray *secrets = NULL; ++ felem (*pre_comp)[17][3] = NULL; ++ felem *tmp_felems = NULL; ++ unsigned int i; ++ int num_bytes; ++ int have_pre_comp = 0; ++ size_t num_points = num; ++ felem x_in, y_in, z_in, x_out, y_out, z_out; ++ NISTP384_PRE_COMP *pre = NULL; ++ felem(*g_pre_comp)[3] = NULL; ++ EC_POINT *generator = NULL; ++ const EC_POINT *p = NULL; ++ const BIGNUM *p_scalar = NULL; ++ ++ BN_CTX_start(ctx); ++ x = BN_CTX_get(ctx); ++ y = BN_CTX_get(ctx); ++ z = BN_CTX_get(ctx); ++ tmp_scalar = BN_CTX_get(ctx); ++ if (tmp_scalar == NULL) ++ goto err; ++ ++ if (scalar != NULL) { ++ pre = group->pre_comp.nistp384; ++ if (pre) ++ /* we have precomputation, try to use it */ ++ g_pre_comp = &pre->g_pre_comp[0]; ++ else ++ /* try to use the standard precomputation */ ++ g_pre_comp = (felem(*)[3]) gmul; ++ generator = EC_POINT_new(group); ++ if (generator == NULL) ++ goto err; ++ /* get the generator from precomputation */ ++ if (!felem_to_BN(x, g_pre_comp[1][0]) || ++ !felem_to_BN(y, g_pre_comp[1][1]) || ++ !felem_to_BN(z, g_pre_comp[1][2])) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ if (!ec_GFp_simple_set_Jprojective_coordinates_GFp(group, ++ generator, ++ x, y, z, ctx)) ++ goto err; ++ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) ++ /* precomputation matches generator */ ++ have_pre_comp = 1; ++ else ++ /* ++ * we don't have valid precomputation: treat the generator as a ++ * random point ++ */ ++ num_points++; ++ } ++ ++ if (num_points > 0) { ++ if (num_points >= 2) { ++ /* ++ * unless we precompute multiples for just one point, converting ++ * those into affine form is time well spent ++ */ ++ mixed = 1; ++ } ++ secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points); ++ pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points); ++ if (mixed) ++ tmp_felems = ++ OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1)); ++ if ((secrets == NULL) || (pre_comp == NULL) ++ || (mixed && (tmp_felems == NULL))) ++ goto err; ++ ++ /* ++ * we treat NULL scalars as 0, and NULL points as points at infinity, ++ * i.e., 
they contribute nothing to the linear combination ++ */ ++ for (i = 0; i < num_points; ++i) { ++ if (i == num) { ++ /* ++ * we didn't have a valid precomputation, so we pick the ++ * generator ++ */ ++ p = EC_GROUP_get0_generator(group); ++ p_scalar = scalar; ++ } else { ++ /* the i^th point */ ++ p = points[i]; ++ p_scalar = scalars[i]; ++ } ++ if (p_scalar != NULL && p != NULL) { ++ /* reduce scalar to 0 <= scalar < 2^384 */ ++ if ((BN_num_bits(p_scalar) > 384) ++ || (BN_is_negative(p_scalar))) { ++ /* ++ * this is an unusual input, and we don't guarantee ++ * constant-timeness ++ */ ++ if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ num_bytes = BN_bn2lebinpad(tmp_scalar, ++ secrets[i], sizeof(secrets[i])); ++ } else { ++ num_bytes = BN_bn2lebinpad(p_scalar, ++ secrets[i], sizeof(secrets[i])); ++ } ++ if (num_bytes < 0) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ /* precompute multiples */ ++ if ((!BN_to_felem(x_out, p->X)) || ++ (!BN_to_felem(y_out, p->Y)) || ++ (!BN_to_felem(z_out, p->Z))) ++ goto err; ++ memcpy(pre_comp[i][1][0], x_out, sizeof(felem)); ++ memcpy(pre_comp[i][1][1], y_out, sizeof(felem)); ++ memcpy(pre_comp[i][1][2], z_out, sizeof(felem)); ++ for (j = 2; j <= 16; ++j) { ++ if (j & 1) { ++ point_add(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], ++ pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], 0, ++ pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]); ++ } else { ++ point_double(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], ++ pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]); ++ } ++ } ++ } ++ } ++ if (mixed) ++ make_points_affine(num_points * 17, pre_comp[0], tmp_felems); ++ } ++ ++ /* the scalar for the generator */ ++ if (scalar != NULL && have_pre_comp) { ++ memset(g_secret, 0, sizeof(g_secret)); ++ /* reduce scalar to 0 <= scalar < 2^384 */ ++ if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) { ++ /* ++ * this is an unusual input, and we don't guarantee ++ * constant-timeness ++ */ ++ if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret)); ++ } else { ++ num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret)); ++ } ++ /* do the multiplication with generator precomputation */ ++ batch_mul(x_out, y_out, z_out, ++ (const felem_bytearray(*))secrets, num_points, ++ g_secret, ++ mixed, (const felem(*)[17][3])pre_comp, ++ (const felem(*)[3])g_pre_comp); ++ } else { ++ /* do the multiplication without generator precomputation */ ++ batch_mul(x_out, y_out, z_out, ++ (const felem_bytearray(*))secrets, num_points, ++ NULL, mixed, (const felem(*)[17][3])pre_comp, NULL); ++ } ++ /* reduce the output to its unique minimal representation */ ++ felem_contract(x_in, x_out); ++ felem_contract(y_in, y_out); ++ felem_contract(z_in, z_out); ++ if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || ++ (!felem_to_BN(z, z_in))) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ ret = ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z, ++ ctx); ++ ++ err: ++ BN_CTX_end(ctx); ++ EC_POINT_free(generator); ++ OPENSSL_free(secrets); ++ OPENSSL_free(pre_comp); ++ OPENSSL_free(tmp_felems); ++ return ret; ++} ++ ++int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx) ++{ ++ int ret = 0; 
++ NISTP384_PRE_COMP *pre = NULL; ++ int i, j; ++ BIGNUM *x, *y; ++ EC_POINT *generator = NULL; ++ felem tmp_felems[16]; ++#ifndef FIPS_MODULE ++ BN_CTX *new_ctx = NULL; ++#endif ++ ++ /* throw away old precomputation */ ++ EC_pre_comp_free(group); ++ ++#ifndef FIPS_MODULE ++ if (ctx == NULL) ++ ctx = new_ctx = BN_CTX_new(); ++#endif ++ if (ctx == NULL) ++ return 0; ++ ++ BN_CTX_start(ctx); ++ x = BN_CTX_get(ctx); ++ y = BN_CTX_get(ctx); ++ if (y == NULL) ++ goto err; ++ /* get the generator */ ++ if (group->generator == NULL) ++ goto err; ++ generator = EC_POINT_new(group); ++ if (generator == NULL) ++ goto err; ++ BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x); ++ BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y); ++ if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx)) ++ goto err; ++ if ((pre = nistp384_pre_comp_new()) == NULL) ++ goto err; ++ /* ++ * if the generator is the standard one, use built-in precomputation ++ */ ++ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { ++ memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); ++ goto done; ++ } ++ if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) || ++ (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) || ++ (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z))) ++ goto err; ++ /* compute 2^95*G, 2^190*G, 2^285*G */ ++ for (i = 1; i <= 4; i <<= 1) { ++ point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], ++ pre->g_pre_comp[i][0], pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]); ++ for (j = 0; j < 94; ++j) { ++ point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], ++ pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]); ++ } ++ } ++ /* g_pre_comp[0] is the point at infinity */ ++ memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0])); ++ /* the remaining multiples */ ++ /* 2^95*G + 2^190*G */ ++ point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], pre->g_pre_comp[6][2], ++ pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], 0, ++ pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]); ++ /* 2^95*G + 2^285*G */ ++ point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2], ++ pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0, ++ pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]); ++ /* 2^190*G + 2^285*G */ ++ point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], ++ pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0, ++ pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2]); ++ /* 2^95*G + 2^190*G + 2^285*G */ ++ point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2], ++ pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0, ++ pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]); ++ for (i = 1; i < 8; ++i) { ++ /* odd multiples: add G */ ++ point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2], ++ pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0, ++ pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], pre->g_pre_comp[1][2]); ++ } ++ make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems); ++ ++ done: ++ SETPRECOMP(group, nistp384, pre); ++ ret = 1; ++ pre = NULL; ++ err: ++ BN_CTX_end(ctx); ++ EC_POINT_free(generator); ++#ifndef FIPS_MODULE ++ 
BN_CTX_free(new_ctx); ++#endif ++ ossl_ec_nistp384_pre_comp_free(pre); ++ return ret; ++} ++ ++int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group) ++{ ++ return HAVEPRECOMP(group, nistp384); ++} ++#endif /* OPENSSL_NO_EC_NISTP_64_GCC_128 */ +Index: openssl-1.1.1w/crypto/ec/build.info +=================================================================== +--- openssl-1.1.1w.orig/crypto/ec/build.info ++++ openssl-1.1.1w/crypto/ec/build.info +@@ -3,7 +3,8 @@ SOURCE[../../libcrypto]=\ + ec_lib.c ecp_smpl.c ecp_mont.c ecp_nist.c ec_cvt.c ec_mult.c \ + ec_err.c ec_curve.c ec_check.c ec_print.c ec_asn1.c ec_key.c \ + ec2_smpl.c ec_ameth.c ec_pmeth.c eck_prn.c \ +- ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \ ++ ecp_nistp224.c ecp_nistp256.c ecp_nistp384.c ecp_nistp521.c \ ++ ecp_nistputil.c \ + ecp_oct.c ec2_oct.c ec_oct.c ec_kmeth.c ecdh_ossl.c ecdh_kdf.c \ + ecdsa_ossl.c ecdsa_sign.c ecdsa_vrf.c curve25519.c ecx_meth.c \ + curve448/f_generic.c curve448/scalar.c \ +Index: openssl-1.1.1w/crypto/err/openssl.txt +=================================================================== +--- openssl-1.1.1w.orig/crypto/err/openssl.txt ++++ openssl-1.1.1w/crypto/err/openssl.txt +@@ -562,6 +562,10 @@ EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE:230 + EC_F_EC_GFP_NISTP256_POINTS_MUL:231:ec_GFp_nistp256_points_mul + EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES:232:\ + ec_GFp_nistp256_point_get_affine_coordinates ++EC_F_EC_GFP_NISTP384_GROUP_SET_CURVE:315:ec_GFp_nistp384_group_set_curve ++EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES:316:\ ++ ec_GFp_nistp385_point_get_affine_coordinates ++EC_F_EC_GFP_NISTP384_POINTS_MUL:317:ec_GFp_nistp384_points_mul + EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE:233:ec_GFp_nistp521_group_set_curve + EC_F_EC_GFP_NISTP521_POINTS_MUL:234:ec_GFp_nistp521_points_mul + EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES:235:\ +Index: openssl-1.1.1w/include/openssl/ecerr.h +=================================================================== +--- openssl-1.1.1w.orig/include/openssl/ecerr.h ++++ openssl-1.1.1w/include/openssl/ecerr.h +@@ -93,6 +93,9 @@ int ERR_load_EC_strings(void); + # define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE 230 + # define EC_F_EC_GFP_NISTP256_POINTS_MUL 231 + # define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232 ++# define EC_F_EC_GFP_NISTP384_GROUP_SET_CURVE 315 ++# define EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES 316 ++# define EC_F_EC_GFP_NISTP384_POINTS_MUL 317 + # define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE 233 + # define EC_F_EC_GFP_NISTP521_POINTS_MUL 234 + # define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235 diff --git a/openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch b/openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch new file mode 100644 index 0000000..85685cf --- /dev/null +++ b/openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch @@ -0,0 +1,65 @@ +From 3e47a286dc3274bda72a196c3a4030a1fc8302f1 Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Fri, 23 Jun 2023 16:41:48 +1000 +Subject: [PATCH] ec: Use static linkage on nistp521 felem_{square,mul} + wrappers + +Runtime selection of implementations for felem_{square,mul} depends on +felem_{square,mul}_wrapper functions, which overwrite function points in +a similar design to that of .plt.got sections used by program loaders +during dynamic linking. + +There's no reason why these functions need to have external linkage. +Mark static. 
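+
+For illustration only (abbreviated from the ecp_nistp521.c code changed in
+the hunks below), the dispatch pattern being made static looks like this:
+
+    static void felem_square_wrapper(largefelem out, const felem in);
+
+    /* Initially points at the wrapper; the first call runs felem_select(),
+     * which repoints felem_square_p at the chosen implementation, so all
+     * later calls bypass the wrapper entirely. */
+    static void (*felem_square_p)(largefelem out, const felem in) =
+        felem_square_wrapper;
+
+    static void felem_square_wrapper(largefelem out, const felem in)
+    {
+        felem_select();
+        felem_square_p(out, in);
+    }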
+ +Signed-off-by: Rohan McLure + +Reviewed-by: Paul Dale +Reviewed-by: Shane Lontis +Reviewed-by: Dmitry Belyavskiy +Reviewed-by: Todd Short +(Merged from https://github.com/openssl/openssl/pull/21471) +--- + crypto/ec/ecp_nistp521.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/crypto/ec/ecp_nistp521.c b/crypto/ec/ecp_nistp521.c +index 97815cac1f13..32a9268ecf17 100644 +--- a/crypto/ec/ecp_nistp521.c ++++ b/crypto/ec/ecp_nistp521.c +@@ -676,8 +676,8 @@ static void felem_reduce(felem out, const largefelem in) + } + + #if defined(ECP_NISTP521_ASM) +-void felem_square_wrapper(largefelem out, const felem in); +-void felem_mul_wrapper(largefelem out, const felem in1, const felem in2); ++static void felem_square_wrapper(largefelem out, const felem in); ++static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2); + + static void (*felem_square_p)(largefelem out, const felem in) = + felem_square_wrapper; +@@ -691,7 +691,7 @@ void p521_felem_mul(largefelem out, const felem in1, const felem in2); + # include "../ppc_arch.h" + # endif + +-void felem_select(void) ++static void felem_select(void) + { + # if defined(_ARCH_PPC64) + if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { +@@ -707,13 +707,13 @@ void felem_select(void) + felem_mul_p = felem_mul_ref; + } + +-void felem_square_wrapper(largefelem out, const felem in) ++static void felem_square_wrapper(largefelem out, const felem in) + { + felem_select(); + felem_square_p(out, in); + } + +-void felem_mul_wrapper(largefelem out, const felem in1, const felem in2) ++static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2) + { + felem_select(); + felem_mul_p(out, in1, in2); diff --git a/openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch b/openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch new file mode 100644 index 0000000..f120df2 --- /dev/null +++ b/openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch @@ -0,0 +1,410 @@ +From 966047ee13188e8634af25af348940acceb9316d Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Wed, 31 May 2023 14:32:26 +1000 +Subject: [PATCH] ec: powerpc64le: Add asm implementation of felem_{square,mul} + +Add an assembly implementation of felem_{square,mul}, which will be +implemented whenever Altivec support is present and the core implements +ISA 3.0 (Power 9) or greater. + +Signed-off-by: Rohan McLure + +Reviewed-by: Paul Dale +Reviewed-by: Shane Lontis +Reviewed-by: Dmitry Belyavskiy +Reviewed-by: Todd Short +(Merged from https://github.com/openssl/openssl/pull/21471) +--- + crypto/ec/asm/ecp_nistp384-ppc64.pl | 355 ++++++++++++++++++++++++++++++++++++ + crypto/ec/build.info | 2 + crypto/ec/ecp_nistp384.c | 9 + 3 files changed, 366 insertions(+) + create mode 100755 crypto/ec/asm/ecp_nistp384-ppc64.pl + +--- /dev/null ++++ b/crypto/ec/asm/ecp_nistp384-ppc64.pl +@@ -0,0 +1,355 @@ ++#! /usr/bin/env perl ++# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# ==================================================================== ++# Written by Rohan McLure for the OpenSSL ++# project. 
++# ==================================================================== ++# ++# p384 lower-level primitives for PPC64 using vector instructions. ++# ++ ++use strict; ++use warnings; ++ ++my $flavour = shift; ++my $output = ""; ++while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} ++if (!$output) { ++ $output = "-"; ++} ++ ++my ($xlate, $dir); ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour $output"; ++*STDOUT=*OUT; ++ ++my $code = ""; ++ ++my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12"); ++ ++my $vzero = "v32"; ++ ++sub startproc($) ++{ ++ my ($name) = @_; ++ ++ $code.=<<___; ++ .globl ${name} ++ .align 5 ++${name}: ++ ++___ ++} ++ ++sub endproc($) ++{ ++ my ($name) = @_; ++ ++ $code.=<<___; ++ blr ++ .size ${name},.-${name} ++ ++___ ++} ++ ++ ++sub push_vrs($$) ++{ ++ my ($min, $max) = @_; ++ ++ my $count = $max - $min + 1; ++ ++ $code.=<<___; ++ mr $savesp,$sp ++ stdu $sp,-16*`$count+1`($sp) ++ ++___ ++ for (my $i = $min; $i <= $max; $i++) { ++ my $mult = $max - $i + 1; ++ $code.=<<___; ++ stxv $i,-16*$mult($savesp) ++___ ++ ++ } ++ ++ $code.=<<___; ++ ++___ ++} ++ ++sub pop_vrs($$) ++{ ++ my ($min, $max) = @_; ++ ++ $code.=<<___; ++ ld $savesp,0($sp) ++___ ++ for (my $i = $min; $i <= $max; $i++) { ++ my $mult = $max - $i + 1; ++ $code.=<<___; ++ lxv $i,-16*$mult($savesp) ++___ ++ } ++ ++ $code.=<<___; ++ mr $sp,$savesp ++ ++___ ++} ++ ++sub load_vrs($$) ++{ ++ my ($pointer, $reg_list) = @_; ++ ++ for (my $i = 0; $i <= 6; $i++) { ++ my $offset = $i * 8; ++ $code.=<<___; ++ lxsd $reg_list->[$i],$offset($pointer) ++___ ++ } ++ ++ $code.=<<___; ++ ++___ ++} ++ ++sub store_vrs($$) ++{ ++ my ($pointer, $reg_list) = @_; ++ ++ for (my $i = 0; $i <= 12; $i++) { ++ my $offset = $i * 16; ++ $code.=<<___; ++ stxv $reg_list->[$i],$offset($pointer) ++___ ++ } ++ ++ $code.=<<___; ++ ++___ ++} ++ ++$code.=<<___; ++.machine "any" ++.text ++ ++___ ++ ++{ ++ # mul/square common ++ my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v42", "v43"); ++ my ($zero, $one) = ("r8", "r9"); ++ my $out = "v51"; ++ ++ { ++ # ++ # p384_felem_mul ++ # ++ ++ my ($in1p, $in2p) = ("r4", "r5"); ++ my @in1 = map("v$_",(44..50)); ++ my @in2 = map("v$_",(35..41)); ++ ++ startproc("p384_felem_mul"); ++ ++ push_vrs(52, 63); ++ ++ $code.=<<___; ++ vspltisw $vzero,0 ++ ++___ ++ ++ load_vrs($in1p, \@in1); ++ load_vrs($in2p, \@in2); ++ ++ $code.=<<___; ++ vmsumudm $out,$in1[0],$in2[0],$vzero ++ stxv $out,0($outp) ++ ++ xxpermdi $t1,$in1[0],$in1[1],0b00 ++ xxpermdi $t2,$in2[1],$in2[0],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ stxv $out,16($outp) ++ ++ xxpermdi $t2,$in2[2],$in2[1],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$in1[2],$in2[0],$out ++ stxv $out,32($outp) ++ ++ xxpermdi $t2,$in2[1],$in2[0],0b00 ++ xxpermdi $t3,$in1[2],$in1[3],0b00 ++ xxpermdi $t4,$in2[3],$in2[2],0b00 ++ vmsumudm $out,$t1,$t4,$vzero ++ vmsumudm $out,$t3,$t2,$out ++ stxv $out,48($outp) ++ ++ xxpermdi $t2,$in2[4],$in2[3],0b00 ++ xxpermdi $t4,$in2[2],$in2[1],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ vmsumudm $out,$in1[4],$in2[0],$out ++ stxv $out,64($outp) ++ ++ xxpermdi $t2,$in2[5],$in2[4],0b00 ++ xxpermdi $t4,$in2[3],$in2[2],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ xxpermdi $t4,$in2[1],$in2[0],0b00 ++ xxpermdi $t1,$in1[4],$in1[5],0b00 ++ vmsumudm $out,$t1,$t4,$out ++ stxv $out,80($outp) ++ ++ 
xxpermdi $t1,$in1[0],$in1[1],0b00 ++ xxpermdi $t2,$in2[6],$in2[5],0b00 ++ xxpermdi $t4,$in2[4],$in2[3],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ xxpermdi $t2,$in2[2],$in2[1],0b00 ++ xxpermdi $t1,$in1[4],$in1[5],0b00 ++ vmsumudm $out,$t1,$t2,$out ++ vmsumudm $out,$in1[6],$in2[0],$out ++ stxv $out,96($outp) ++ ++ xxpermdi $t1,$in1[1],$in1[2],0b00 ++ xxpermdi $t2,$in2[6],$in2[5],0b00 ++ xxpermdi $t3,$in1[3],$in1[4],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ xxpermdi $t3,$in2[2],$in2[1],0b00 ++ xxpermdi $t1,$in1[5],$in1[6],0b00 ++ vmsumudm $out,$t1,$t3,$out ++ stxv $out,112($outp) ++ ++ xxpermdi $t1,$in1[2],$in1[3],0b00 ++ xxpermdi $t3,$in1[4],$in1[5],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ vmsumudm $out,$in1[6],$in2[2],$out ++ stxv $out,128($outp) ++ ++ xxpermdi $t1,$in1[3],$in1[4],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ xxpermdi $t1,$in1[5],$in1[6],0b00 ++ vmsumudm $out,$t1,$t4,$out ++ stxv $out,144($outp) ++ ++ vmsumudm $out,$t3,$t2,$vzero ++ vmsumudm $out,$in1[6],$in2[4],$out ++ stxv $out,160($outp) ++ ++ vmsumudm $out,$t1,$t2,$vzero ++ stxv $out,176($outp) ++ ++ vmsumudm $out,$in1[6],$in2[6],$vzero ++ stxv $out,192($outp) ++___ ++ ++ endproc("p384_felem_mul"); ++ } ++ ++ { ++ # ++ # p384_felem_square ++ # ++ ++ my ($inp) = ("r4"); ++ my @in = map("v$_",(44..50)); ++ my @inx2 = map("v$_",(35..41)); ++ ++ startproc("p384_felem_square"); ++ ++ push_vrs(52, 63); ++ ++ $code.=<<___; ++ vspltisw $vzero,0 ++ ++___ ++ ++ load_vrs($inp, \@in); ++ ++ $code.=<<___; ++ li $zero,0 ++ li $one,1 ++ mtvsrdd $t1,$one,$zero ++___ ++ ++ for (my $i = 0; $i <= 6; $i++) { ++ $code.=<<___; ++ vsld $inx2[$i],$in[$i],$t1 ++___ ++ } ++ ++ $code.=<<___; ++ vmsumudm $out,$in[0],$in[0],$vzero ++ stxv $out,0($outp) ++ ++ vmsumudm $out,$in[0],$inx2[1],$vzero ++ stxv $out,16($outp) ++ ++ vmsumudm $out,$in[0],$inx2[2],$vzero ++ vmsumudm $out,$in[1],$in[1],$out ++ stxv $out,32($outp) ++ ++ xxpermdi $t1,$in[0],$in[1],0b00 ++ xxpermdi $t2,$inx2[3],$inx2[2],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ stxv $out,48($outp) ++ ++ xxpermdi $t4,$inx2[4],$inx2[3],0b00 ++ vmsumudm $out,$t1,$t4,$vzero ++ vmsumudm $out,$in[2],$in[2],$out ++ stxv $out,64($outp) ++ ++ xxpermdi $t2,$inx2[5],$inx2[4],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$in[2],$inx2[3],$out ++ stxv $out,80($outp) ++ ++ xxpermdi $t2,$inx2[6],$inx2[5],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$in[2],$inx2[4],$out ++ vmsumudm $out,$in[3],$in[3],$out ++ stxv $out,96($outp) ++ ++ xxpermdi $t3,$in[1],$in[2],0b00 ++ vmsumudm $out,$t3,$t2,$vzero ++ vmsumudm $out,$in[3],$inx2[4],$out ++ stxv $out,112($outp) ++ ++ xxpermdi $t1,$in[2],$in[3],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$in[4],$in[4],$out ++ stxv $out,128($outp) ++ ++ xxpermdi $t1,$in[3],$in[4],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ stxv $out,144($outp) ++ ++ vmsumudm $out,$in[4],$inx2[6],$vzero ++ vmsumudm $out,$in[5],$in[5],$out ++ stxv $out,160($outp) ++ ++ vmsumudm $out,$in[5],$inx2[6],$vzero ++ stxv $out,176($outp) ++ ++ vmsumudm $out,$in[6],$in[6],$vzero ++ stxv $out,192($outp) ++___ ++ ++ endproc("p384_felem_square"); ++ } ++} ++ ++$code =~ s/\`([^\`]*)\`/eval $1/gem; ++print $code; ++close STDOUT or die "error closing STDOUT: $!"; +--- a/crypto/ec/build.info ++++ b/crypto/ec/build.info +@@ -31,6 +31,8 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_n + INCLUDE[ecp_nistz256-armv8.o]=.. 
+ GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl $(PERLASM_SCHEME) + ++GENERATE[ecp_nistp384-ppc64.s]=asm/ecp_nistp384-ppc64.pl $(PERLASM_SCHEME) ++INCLUDE[ecp_nistp384.o]=.. + GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl $(PERLASM_SCHEME) + + GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl $(PERLASM_SCHEME) +--- a/crypto/ec/ecp_nistp384.c ++++ b/crypto/ec/ecp_nistp384.c +@@ -691,6 +691,15 @@ void p384_felem_mul(widefelem out, const + + static void felem_select(void) + { ++# if defined(_ARCH_PPC64) ++ if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { ++ felem_square_p = p384_felem_square; ++ felem_mul_p = p384_felem_mul; ++ ++ return; ++ } ++# endif ++ + /* Default */ + felem_square_p = felem_square_ref; + felem_mul_p = felem_mul_ref; diff --git a/openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch b/openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch new file mode 100644 index 0000000..a2918d9 --- /dev/null +++ b/openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch @@ -0,0 +1,76 @@ +From 670e73d9084465384b11ef24802ca4a313e1d2f4 Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Tue, 15 Aug 2023 15:20:20 +1000 +Subject: [PATCH] ecc: Remove extraneous parentheses in secp384r1 + +Substitutions in the felem_reduce() method feature unecessary +parentheses, remove them. + +Signed-off-by: Rohan McLure + +Reviewed-by: Tomas Mraz +Reviewed-by: Shane Lontis +Reviewed-by: Hugo Landau +(Merged from https://github.com/openssl/openssl/pull/21749) +--- + crypto/ec/ecp_nistp384.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/crypto/ec/ecp_nistp384.c b/crypto/ec/ecp_nistp384.c +index 14f9530d07c6..ff68f9cc7ad0 100644 +--- a/crypto/ec/ecp_nistp384.c ++++ b/crypto/ec/ecp_nistp384.c +@@ -540,7 +540,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[7] += in[12] >> 8; + acc[6] += (in[12] & 0xff) << 48; + acc[6] -= in[12] >> 16; +- acc[5] -= ((in[12] & 0xffff) << 40); ++ acc[5] -= (in[12] & 0xffff) << 40; + acc[6] += in[12] >> 48; + acc[5] += (in[12] & 0xffffffffffff) << 8; + +@@ -549,7 +549,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[6] += in[11] >> 8; + acc[5] += (in[11] & 0xff) << 48; + acc[5] -= in[11] >> 16; +- acc[4] -= ((in[11] & 0xffff) << 40); ++ acc[4] -= (in[11] & 0xffff) << 40; + acc[5] += in[11] >> 48; + acc[4] += (in[11] & 0xffffffffffff) << 8; + +@@ -558,7 +558,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[5] += in[10] >> 8; + acc[4] += (in[10] & 0xff) << 48; + acc[4] -= in[10] >> 16; +- acc[3] -= ((in[10] & 0xffff) << 40); ++ acc[3] -= (in[10] & 0xffff) << 40; + acc[4] += in[10] >> 48; + acc[3] += (in[10] & 0xffffffffffff) << 8; + +@@ -567,7 +567,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[4] += in[9] >> 8; + acc[3] += (in[9] & 0xff) << 48; + acc[3] -= in[9] >> 16; +- acc[2] -= ((in[9] & 0xffff) << 40); ++ acc[2] -= (in[9] & 0xffff) << 40; + acc[3] += in[9] >> 48; + acc[2] += (in[9] & 0xffffffffffff) << 8; + +@@ -582,7 +582,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[3] += acc[8] >> 8; + acc[2] += (acc[8] & 0xff) << 48; + acc[2] -= acc[8] >> 16; +- acc[1] -= ((acc[8] & 0xffff) << 40); ++ acc[1] -= (acc[8] & 0xffff) << 40; + acc[2] += acc[8] >> 48; + acc[1] += (acc[8] & 0xffffffffffff) << 8; + +@@ -591,7 +591,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[2] += acc[7] >> 8; + acc[1] += (acc[7] & 0xff) << 48; + acc[1] -= acc[7] >> 16; +- acc[0] -= ((acc[7] & 
0xffff) << 40); ++ acc[0] -= (acc[7] & 0xffff) << 40; + acc[1] += acc[7] >> 48; + acc[0] += (acc[7] & 0xffffffffffff) << 8; + diff --git a/openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch b/openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch new file mode 100644 index 0000000..ecfecb5 --- /dev/null +++ b/openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch @@ -0,0 +1,96 @@ +From 50f8b936b00dc18ce1f622a7a6aa46daf03da48b Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Wed, 16 Aug 2023 16:52:47 +1000 +Subject: [PATCH] powerpc: ecc: Fix stack allocation secp384r1 asm + +Assembly acceleration secp384r1 opts to not use any callee-save VSRs, as +VSX enabled systems make extensive use of renaming, and so writebacks in +felem_{mul,square}() can be reordered for best cache effects. + +Remove stack allocations. This in turn fixes unmatched push/pops in +felem_{mul,square}(). + +Signed-off-by: Rohan McLure + +Reviewed-by: Tomas Mraz +Reviewed-by: Shane Lontis +Reviewed-by: Hugo Landau +(Merged from https://github.com/openssl/openssl/pull/21749) +--- + crypto/ec/asm/ecp_nistp384-ppc64.pl | 49 ----------------------------- + 1 file changed, 49 deletions(-) + +diff --git a/crypto/ec/asm/ecp_nistp384-ppc64.pl b/crypto/ec/asm/ecp_nistp384-ppc64.pl +index 3f86b391af69..28f4168e5218 100755 +--- a/crypto/ec/asm/ecp_nistp384-ppc64.pl ++++ b/crypto/ec/asm/ecp_nistp384-ppc64.pl +@@ -62,51 +62,6 @@ ($) + ___ + } + +- +-sub push_vrs($$) +-{ +- my ($min, $max) = @_; +- +- my $count = $max - $min + 1; +- +- $code.=<<___; +- mr $savesp,$sp +- stdu $sp,-16*`$count+1`($sp) +- +-___ +- for (my $i = $min; $i <= $max; $i++) { +- my $mult = $max - $i + 1; +- $code.=<<___; +- stxv $i,-16*$mult($savesp) +-___ +- +- } +- +- $code.=<<___; +- +-___ +-} +- +-sub pop_vrs($$) +-{ +- my ($min, $max) = @_; +- +- $code.=<<___; +- ld $savesp,0($sp) +-___ +- for (my $i = $min; $i <= $max; $i++) { +- my $mult = $max - $i + 1; +- $code.=<<___; +- lxv $i,-16*$mult($savesp) +-___ +- } +- +- $code.=<<___; +- mr $sp,$savesp +- +-___ +-} +- + sub load_vrs($$) + { + my ($pointer, $reg_list) = @_; +@@ -162,8 +117,6 @@ ($$) + + startproc("p384_felem_mul"); + +- push_vrs(52, 63); +- + $code.=<<___; + vspltisw $vzero,0 + +@@ -268,8 +221,6 @@ ($$) + + startproc("p384_felem_square"); + +- push_vrs(52, 63); +- + $code.=<<___; + vspltisw $vzero,0 +