From b51c004cd8e095f310fc4b14cfef053d2aef40f512101e7726e6b10ddb4e7b98 Mon Sep 17 00:00:00 2001 From: Otto Hollmann Date: Wed, 25 Oct 2023 07:52:22 +0000 Subject: [PATCH] Accepting request 1119558 from home:ohollmann:branches:security:tls - Performance enhancements for cryptography from OpenSSL 3.x [jsc#PED-5086, jsc#PED-3514] * Add patches: - openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch - openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch - openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch - openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch - openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch - openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch OBS-URL: https://build.opensuse.org/request/show/1119558 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=148 --- openssl-1_1.changes | 13 + openssl-1_1.spec | 8 + ...nce-for-6x-unrolling-with-vpermxor-i.patch | 495 ++++ ...-Limb-Solinas-Strategy-for-secp384r1.patch | 2197 +++++++++++++++++ ...nkage-on-nistp521-felem_-square-mul-.patch | 65 + ...dd-asm-implementation-of-felem_-squa.patch | 410 +++ ...-extraneous-parentheses-in-secp384r1.patch | 76 + ...c-Fix-stack-allocation-secp384r1-asm.patch | 96 + 8 files changed, 3360 insertions(+) create mode 100644 openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch create mode 100644 openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch create mode 100644 openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch create mode 100644 openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch create mode 100644 openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch create mode 100644 openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch diff --git a/openssl-1_1.changes b/openssl-1_1.changes index eec4e62..a34c4d7 100644 --- a/openssl-1_1.changes +++ b/openssl-1_1.changes @@ -1,3 +1,16 @@ +------------------------------------------------------------------- +Thu Oct 19 15:03:14 UTC 2023 - Otto Hollmann + +- Performance enhancements for cryptography from OpenSSL 3.x + [jsc#PED-5086, jsc#PED-3514] + * Add patches: + - openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch + - openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch + - openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch + - openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch + - openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch + - openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch + ------------------------------------------------------------------- Wed Oct 4 07:15:29 UTC 2023 - Otto Hollmann diff --git a/openssl-1_1.spec b/openssl-1_1.spec index a26a4d9..fb0a989 100644 --- a/openssl-1_1.spec +++ b/openssl-1_1.spec @@ -177,6 +177,14 @@ Patch106: openssl-s_client-check-ocsp-status.patch Patch107: openssl-dont-pass-zero-length-input-to-EVP_Cipher.patch #PATCH-FIX-SUSE bsc#1215215 FIPS: Add "fips" to version string Patch108: openssl-1_1-fips-bsc1215215_fips_in_version_string.patch +# PATCH-FIX-UPSTREAM jsc#PED-5086, jsc#PED-3514 +# POWER10 performance enhancements for cryptography +Patch109: openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch +Patch110: openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch +Patch111: openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch +Patch112: openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch +Patch113: 
openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch +Patch114: openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch BuildRequires: jitterentropy-devel >= 3.4.0 BuildRequires: pkgconfig BuildRequires: pkgconfig(zlib) diff --git a/openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch b/openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch new file mode 100644 index 0000000..7c57d6b --- /dev/null +++ b/openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch @@ -0,0 +1,495 @@ +From 3d3a7ecd1ae5ab08d22041f7b3b035c34f12fa02 Mon Sep 17 00:00:00 2001 +From: Danny Tsen +Date: Tue, 22 Aug 2023 15:58:53 -0400 +Subject: [PATCH] Improve performance for 6x unrolling with vpermxor + instruction + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/21812) +--- + crypto/aes/asm/aesp8-ppc.pl | 145 +++++++++++++++++++++++------------- + 1 file changed, 95 insertions(+), 50 deletions(-) + +diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl +index 60cf86f52aed2..38b9405a283b7 100755 +--- a/crypto/aes/asm/aesp8-ppc.pl ++++ b/crypto/aes/asm/aesp8-ppc.pl +@@ -99,11 +99,12 @@ + .long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev + .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev + .long 0,0,0,0 ?asis ++.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe + Lconsts: + mflr r0 + bcl 20,31,\$+4 + mflr $ptr #vvvvv "distance between . and rcon +- addi $ptr,$ptr,-0x48 ++ addi $ptr,$ptr,-0x58 + mtlr r0 + blr + .long 0 +@@ -2405,7 +2406,7 @@ () + my $key_=$key2; + my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31)); + $x00=0 if ($flavour =~ /osx/); +-my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); ++my ($in0, $in1, $in2, $in3, $in4, $in5)=map("v$_",(0..5)); + my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); + my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); + my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys +@@ -2460,6 +2461,18 @@ () + li $x70,0x70 + mtspr 256,r0 + ++ # Reverse eighty7 to 0x010101..87 ++ xxlor 2, 32+$eighty7, 32+$eighty7 ++ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87 ++ xxlor 1, 32+$eighty7, 32+$eighty7 ++ ++ # Load XOR contents. 0xf102132435465768798a9bacbdcedfe ++ mr $x70, r6 ++ bl Lconsts ++ lxvw4x 0, $x40, r6 # load XOR contents ++ mr r6, $x70 ++ li $x70,0x70 ++ + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule +@@ -2502,69 +2515,77 @@ () + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + ++ # Switch to use the following codes with 0x010101..87 to generate tweak. 
++ # eighty7 = 0x010101..87 ++ # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits ++ # vand tmp, tmp, eighty7 # last byte with carry ++ # vaddubm tweak, tweak, tweak # left shift 1 bit (x2) ++ # xxlor vsx, 0, 0 ++ # vpermxor tweak, tweak, tmp, vsx ++ + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in1, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in1 + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in2, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in2 + + lvx_u $in2,$x20,$inp + andi. $taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in3, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in3 + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in4, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in4 + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in5, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in5 + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in0, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in0 + + vxor v31,v31,$rndkey0 + mtctr $rounds +@@ -2590,6 +2611,8 @@ () + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc6x + ++ xxlor 32+$eighty7, 1, 1 # 0x010101..87 ++ + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vcipher $out0,$out0,v24 +@@ -2599,7 +2622,6 @@ () + vaddubm $tweak,$tweak,$tweak + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 +- vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + +@@ -2607,7 +2629,8 @@ () + vand $tmp,$tmp,$eighty7 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in1, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in1 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vxor $in1,$twk1,v31 +@@ -2618,13 +2641,13 @@ () + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in2, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in2 + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + +@@ -2638,7 +2661,6 @@ () + vaddubm $tweak,$tweak,$tweak + vcipher 
$out0,$out0,v27 + vcipher $out1,$out1,v27 +- vsldoi $tmp,$tmp,$tmp,15 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 +@@ -2646,7 +2668,8 @@ () + vcipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in3, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in3 + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vxor $in3,$twk3,v31 +@@ -2655,7 +2678,6 @@ () + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v28 + vcipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] +@@ -2663,7 +2685,8 @@ () + + vcipher $out0,$out0,v29 + vcipher $out1,$out1,v29 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in4, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in4 + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vxor $in4,$twk4,v31 +@@ -2673,14 +2696,14 @@ () + vcipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + + vcipher $out0,$out0,v30 + vcipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in5, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in5 + vcipher $out4,$out4,v30 + vcipher $out5,$out5,v30 + vxor $in5,$twk5,v31 +@@ -2690,7 +2713,6 @@ () + vcipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vcipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vcipherlast $out2,$out2,$in2 +@@ -2703,7 +2725,10 @@ () + vcipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp +- vxor $tweak,$tweak,$tmp ++ xxlor 10, 32+$in0, 32+$in0 ++ xxlor 32+$in0, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in0 ++ xxlor 32+$in0, 10, 10 + vcipherlast $tmp,$out5,$in5 # last block might be needed + # in stealing mode + le?vperm $in3,$in3,$in3,$leperm +@@ -2736,6 +2761,8 @@ () + mtctr $rounds + beq Loop_xts_enc6x # did $len-=96 borrow? + ++ xxlor 32+$eighty7, 2, 2 # 0x870101..01 ++ + addic. $len,$len,0x60 + beq Lxts_enc6x_zero + cmpwi $len,0x20 +@@ -3112,6 +3139,18 @@ () + li $x70,0x70 + mtspr 256,r0 + ++ # Reverse eighty7 to 0x010101..87 ++ xxlor 2, 32+$eighty7, 32+$eighty7 ++ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87 ++ xxlor 1, 32+$eighty7, 32+$eighty7 ++ ++ # Load XOR contents. 0xf102132435465768798a9bacbdcedfe ++ mr $x70, r6 ++ bl Lconsts ++ lxvw4x 0, $x40, r6 # load XOR contents ++ mr r6, $x70 ++ li $x70,0x70 ++ + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule +@@ -3159,64 +3198,64 @@ () + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in1, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in1 + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in2, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in2 + + lvx_u $in2,$x20,$inp + andi. 
$taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in3, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in3 + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in4, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in4 + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in5, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in5 + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in0, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in0 + + vxor v31,v31,$rndkey0 + mtctr $rounds +@@ -3242,6 +3281,8 @@ () + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec6x + ++ xxlor 32+$eighty7, 1, 1 ++ + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vncipher $out0,$out0,v24 +@@ -3251,7 +3292,6 @@ () + vaddubm $tweak,$tweak,$tweak + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 +- vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + +@@ -3259,7 +3299,8 @@ () + vand $tmp,$tmp,$eighty7 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in1, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in1 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vxor $in1,$twk1,v31 +@@ -3270,13 +3311,13 @@ () + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in2, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in2 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + +@@ -3290,7 +3331,6 @@ () + vaddubm $tweak,$tweak,$tweak + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 +- vsldoi $tmp,$tmp,$tmp,15 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 +@@ -3298,7 +3338,8 @@ () + vncipher $out5,$out5,v27 + + addi $key_,$sp,$FRAME+15 # rewind $key_ +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in3, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in3 + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vxor $in3,$twk3,v31 +@@ -3307,7 +3348,6 @@ () + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] +@@ -3315,7 +3355,8 @@ () + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in4, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in4 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vxor $in4,$twk4,v31 +@@ -3325,14 +3366,14 @@ () + vncipher $out5,$out5,v29 + lvx v25,$x10,$key_ # 
re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 +- vxor $tweak,$tweak,$tmp ++ xxlor 32+$in5, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in5 + vncipher $out4,$out4,v30 + vncipher $out5,$out5,v30 + vxor $in5,$twk5,v31 +@@ -3342,7 +3383,6 @@ () + vncipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak +- vsldoi $tmp,$tmp,$tmp,15 + vncipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out2,$out2,$in2 +@@ -3355,7 +3395,10 @@ () + vncipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp +- vxor $tweak,$tweak,$tmp ++ xxlor 10, 32+$in0, 32+$in0 ++ xxlor 32+$in0, 0, 0 ++ vpermxor $tweak, $tweak, $tmp, $in0 ++ xxlor 32+$in0, 10, 10 + vncipherlast $out5,$out5,$in5 + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp +@@ -3386,6 +3429,8 @@ () + mtctr $rounds + beq Loop_xts_dec6x # did $len-=96 borrow? + ++ xxlor 32+$eighty7, 2, 2 ++ + addic. $len,$len,0x60 + beq Lxts_dec6x_zero + cmpwi $len,0x20 diff --git a/openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch b/openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch new file mode 100644 index 0000000..498db7f --- /dev/null +++ b/openssl-ec-56-bit-Limb-Solinas-Strategy-for-secp384r1.patch @@ -0,0 +1,2197 @@ +From 01d901e470d9e035a3bd78e77b9438a4cc0da785 Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Wed, 12 Jul 2023 12:25:22 +1000 +Subject: [PATCH] ec: 56-bit Limb Solinas' Strategy for secp384r1 + +Adopt a 56-bit redundant-limb Solinas' reduction approach for efficient +modular multiplication in P384. This has the effect of accelerating +digital signing by 446% and verification by 106%. The implementation +strategy and names of methods are the same as that provided in +ecp_nistp224 and ecp_nistp521. + +As in Commit 1036749883cc ("ec: Add run time code selection for p521 +field operations"), allow for run time selection of implementation for +felem_{square,mul}, where an assembly implementation is proclaimed to +be present when ECP_NISTP384_ASM is present.
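
As a rough standalone illustration of the limb arithmetic described above (an illustrative sketch only, not code from this patch: toy_widemul(), the operand values and the printout are invented for the example, and it assumes a GCC/Clang-style unsigned __int128, the same 128-bit integer requirement the patch itself enforces via its INT128_MAX check):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Toy sketch of the 56-bit redundant-limb layout: a field element is
 * spread over NLIMBS = 7 limbs spaced 56 bits apart, so a schoolbook
 * multiplication accumulates at most 7 partial products (each < 2^112
 * for reduced inputs) into one 128-bit wide limb, leaving enough
 * headroom to defer all carries to a single reduction step.
 */
#define NLIMBS 7
typedef uint64_t limb;
typedef unsigned __int128 widelimb;

static void toy_widemul(widelimb out[2 * NLIMBS - 1],
                        const limb a[NLIMBS], const limb b[NLIMBS])
{
    memset(out, 0, sizeof(widelimb) * (2 * NLIMBS - 1));
    for (int i = 0; i < NLIMBS; i++)
        for (int j = 0; j < NLIMBS; j++)
            out[i + j] += (widelimb) a[i] * b[j]; /* no carries needed yet */
}

int main(void)
{
    /* Two small operands, each split into 56-bit limbs (least significant limb first). */
    limb a[NLIMBS] = { 0x123456789abcdef0ULL & 0xffffffffffffffULL,
                       0x123456789abcdef0ULL >> 56, 0, 0, 0, 0, 0 };
    limb b[NLIMBS] = { 0x0fedcba987654321ULL & 0xffffffffffffffULL,
                       0x0fedcba987654321ULL >> 56, 0, 0, 0, 0, 0 };
    widelimb wide[2 * NLIMBS - 1];

    toy_widemul(wide, a, b);

    /* Recombine the low wide limbs (safe for inputs this small) and
     * compare against a direct 128-bit product. */
    widelimb got = wide[0] + (wide[1] << 56) + (wide[2] << 112);
    widelimb want = (widelimb) 0x123456789abcdef0ULL * 0x0fedcba987654321ULL;
    printf("limbed product matches direct product: %s\n",
           got == want ? "yes" : "no");
    return 0;
}

The felem_mul()/felem_square() added below follow the same pattern over the full seven limbs, and felem_reduce() then folds the thirteen wide limbs back into seven 56-bit limbs modulo p = 2^384 - 2^128 - 2^96 + 2^32 - 1.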
+ +Signed-off-by: Rohan McLure + +Reviewed-by: Paul Dale +Reviewed-by: Shane Lontis +Reviewed-by: Dmitry Belyavskiy +Reviewed-by: Todd Short +(Merged from https://github.com/openssl/openssl/pull/21471) +--- + crypto/ec/build.info | 3 +- + crypto/ec/ec_curve.c | 4 + + crypto/ec/ec_lib.c | 8 + + crypto/ec/ec_local.h | 27 +- + crypto/ec/ecp_nistp384.c | 1988 ++++++++++++++++++++++++++++++++++++++ + crypto/err/openssl.txt | 4 + include/openssl/ecerr.h | 3 + 7 files changed, 2035 insertions(+), 2 deletions(-) + create mode 100644 crypto/ec/ecp_nistp384.c + +Index: openssl-1.1.1w/crypto/ec/ec_curve.c +=================================================================== +--- openssl-1.1.1w.orig/crypto/ec/ec_curve.c ++++ openssl-1.1.1w/crypto/ec/ec_curve.c +@@ -2833,6 +2833,8 @@ static const ec_list_element curve_list[ + {NID_secp384r1, &_EC_NIST_PRIME_384.h, + # if defined(S390X_EC_ASM) + EC_GFp_s390x_nistp384_method, ++# elif !defined(OPENSSL_NO_EC_NISTP_64_GCC_128) ++ ossl_ec_GFp_nistp384_method, + # else + 0, + # endif +Index: openssl-1.1.1w/crypto/ec/ec_lib.c +=================================================================== +--- openssl-1.1.1w.orig/crypto/ec/ec_lib.c ++++ openssl-1.1.1w/crypto/ec/ec_lib.c +@@ -75,12 +75,16 @@ void EC_pre_comp_free(EC_GROUP *group) + case PCT_nistp256: + EC_nistp256_pre_comp_free(group->pre_comp.nistp256); + break; ++ case PCT_nistp384: ++ ossl_ec_nistp384_pre_comp_free(group->pre_comp.nistp384); ++ break; + case PCT_nistp521: + EC_nistp521_pre_comp_free(group->pre_comp.nistp521); + break; + #else + case PCT_nistp224: + case PCT_nistp256: ++ case PCT_nistp384: + case PCT_nistp521: + break; + #endif +@@ -160,12 +164,16 @@ int EC_GROUP_copy(EC_GROUP *dest, const + case PCT_nistp256: + dest->pre_comp.nistp256 = EC_nistp256_pre_comp_dup(src->pre_comp.nistp256); + break; ++ case PCT_nistp384: ++ dest->pre_comp.nistp384 = ossl_ec_nistp384_pre_comp_dup(src->pre_comp.nistp384); ++ break; + case PCT_nistp521: + dest->pre_comp.nistp521 = EC_nistp521_pre_comp_dup(src->pre_comp.nistp521); + break; + #else + case PCT_nistp224: + case PCT_nistp256: ++ case PCT_nistp384: + case PCT_nistp521: + break; + #endif +Index: openssl-1.1.1w/crypto/ec/ec_local.h +=================================================================== +--- openssl-1.1.1w.orig/crypto/ec/ec_local.h ++++ openssl-1.1.1w/crypto/ec/ec_local.h +@@ -207,6 +207,7 @@ struct ec_method_st { + */ + typedef struct nistp224_pre_comp_st NISTP224_PRE_COMP; + typedef struct nistp256_pre_comp_st NISTP256_PRE_COMP; ++typedef struct nistp384_pre_comp_st NISTP384_PRE_COMP; + typedef struct nistp521_pre_comp_st NISTP521_PRE_COMP; + typedef struct nistz256_pre_comp_st NISTZ256_PRE_COMP; + typedef struct ec_pre_comp_st EC_PRE_COMP; +@@ -268,12 +269,13 @@ struct ec_group_st { + */ + enum { + PCT_none, +- PCT_nistp224, PCT_nistp256, PCT_nistp521, PCT_nistz256, ++ PCT_nistp224, PCT_nistp256, PCT_nistp384, PCT_nistp521, PCT_nistz256, + PCT_ec + } pre_comp_type; + union { + NISTP224_PRE_COMP *nistp224; + NISTP256_PRE_COMP *nistp256; ++ NISTP384_PRE_COMP *nistp384; + NISTP521_PRE_COMP *nistp521; + NISTZ256_PRE_COMP *nistz256; + EC_PRE_COMP *ec; +@@ -330,6 +332,7 @@ static ossl_inline int ec_point_is_compa + + NISTP224_PRE_COMP *EC_nistp224_pre_comp_dup(NISTP224_PRE_COMP *); + NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *); ++NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *); + NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *); + NISTZ256_PRE_COMP 
*EC_nistz256_pre_comp_dup(NISTZ256_PRE_COMP *); + NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *); +@@ -338,6 +341,7 @@ EC_PRE_COMP *EC_ec_pre_comp_dup(EC_PRE_C + void EC_pre_comp_free(EC_GROUP *group); + void EC_nistp224_pre_comp_free(NISTP224_PRE_COMP *); + void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *); ++void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *); + void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *); + void EC_nistz256_pre_comp_free(NISTZ256_PRE_COMP *); + void EC_ec_pre_comp_free(EC_PRE_COMP *); +@@ -543,6 +547,27 @@ int ec_GFp_nistp256_points_mul(const EC_ + int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx); + int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group); + ++/* method functions in ecp_nistp384.c */ ++int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group); ++int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p, ++ const BIGNUM *a, const BIGNUM *n, ++ BN_CTX *); ++int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group, ++ const EC_POINT *point, ++ BIGNUM *x, BIGNUM *y, ++ BN_CTX *ctx); ++int ossl_ec_GFp_nistp384_mul(const EC_GROUP *group, EC_POINT *r, ++ const BIGNUM *scalar, size_t num, ++ const EC_POINT *points[], const BIGNUM *scalars[], ++ BN_CTX *); ++int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r, ++ const BIGNUM *scalar, size_t num, ++ const EC_POINT *points[], ++ const BIGNUM *scalars[], BN_CTX *ctx); ++int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx); ++int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group); ++const EC_METHOD *ossl_ec_GFp_nistp384_method(void); ++ + /* method functions in ecp_nistp521.c */ + int ec_GFp_nistp521_group_init(EC_GROUP *group); + int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, +Index: openssl-1.1.1w/crypto/ec/ecp_nistp384.c +=================================================================== +--- /dev/null ++++ openssl-1.1.1w/crypto/ec/ecp_nistp384.c +@@ -0,0 +1,1993 @@ ++/* ++ * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++/* Copyright 2023 IBM Corp. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++/* ++ * Designed for 56-bit limbs by Rohan McLure . ++ * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm ++ * acceleration of felem_{square,mul} as supported in these files. 
++ */ ++ ++#include ++#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128 ++NON_EMPTY_TRANSLATION_UNIT ++#else ++#include ++#include ++#include "ec_local.h" ++ ++#include "internal/numbers.h" ++ ++#ifndef INT128_MAX ++# error "Your compiler doesn't appear to support 128-bit integer types" ++#endif ++ ++typedef uint8_t u8; ++typedef uint64_t u64; ++ ++/* ++ * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We ++ * can serialize an element of this field into 48 bytes. We call this an ++ * felem_bytearray. ++ */ ++ ++typedef u8 felem_bytearray[48]; ++ ++/* ++ * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4. ++ * These values are big-endian. ++ */ ++static const felem_bytearray nistp384_curve_params[5] = { ++ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */ ++ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, ++ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, ++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}, ++ {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */ ++ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, ++ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, ++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC}, ++ {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */ ++ 0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12, ++ 0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D, ++ 0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF}, ++ {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */ ++ 0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98, ++ 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D, ++ 0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7}, ++ {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */ ++ 0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C, ++ 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE, ++ 0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F}, ++}; ++ ++/*- ++ * The representation of field elements. ++ * ------------------------------------ ++ * ++ * We represent field elements with seven values. These values are either 64 or ++ * 128 bits and the field element represented is: ++ * v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336 (mod p) ++ * Each of the seven values is called a 'limb'. Since the limbs are spaced only ++ * 56 bits apart, but are greater than 56 bits in length, the most significant ++ * bits of each limb overlap with the least significant bits of the next ++ * ++ * This representation is considered to be 'redundant' in the sense that ++ * intermediate values can each contain more than a 56-bit value in each limb. ++ * Reduction causes all but the final limb to be reduced to contain a value less ++ * than 2^56, with the final value represented allowed to be larger than 2^384, ++ * inasmuch as we can be sure that arithmetic overflow remains impossible. The ++ * reduced value must of course be congruent to the unreduced value. ++ * ++ * A field element with 64-bit limbs is an 'felem'. 
One with 128-bit limbs is a ++ * 'widefelem', featuring enough bits to store the result of a multiplication ++ * and even some further arithmetic without need for immediate reduction. ++ */ ++ ++#define NLIMBS 7 ++ ++typedef uint64_t limb; ++typedef uint128_t widelimb; ++typedef limb limb_aX __attribute((__aligned__(1))); ++typedef limb felem[NLIMBS]; ++typedef widelimb widefelem[2*NLIMBS-1]; ++ ++static const limb bottom56bits = 0xffffffffffffff; ++ ++/* Helper functions (de)serialising reduced field elements in little endian */ ++static void bin48_to_felem(felem out, const u8 in[48]) ++{ ++ memset(out, 0, 56); ++ out[0] = (*((limb *) & in[0])) & bottom56bits; ++ out[1] = (*((limb_aX *) & in[7])) & bottom56bits; ++ out[2] = (*((limb_aX *) & in[14])) & bottom56bits; ++ out[3] = (*((limb_aX *) & in[21])) & bottom56bits; ++ out[4] = (*((limb_aX *) & in[28])) & bottom56bits; ++ out[5] = (*((limb_aX *) & in[35])) & bottom56bits; ++ memmove(&out[6], &in[42], 6); ++} ++ ++static void felem_to_bin48(u8 out[48], const felem in) ++{ ++ memset(out, 0, 48); ++ (*((limb *) & out[0])) |= (in[0] & bottom56bits); ++ (*((limb_aX *) & out[7])) |= (in[1] & bottom56bits); ++ (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits); ++ (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits); ++ (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits); ++ (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits); ++ memmove(&out[42], &in[6], 6); ++} ++ ++/* BN_to_felem converts an OpenSSL BIGNUM into an felem */ ++static int BN_to_felem(felem out, const BIGNUM *bn) ++{ ++ felem_bytearray b_out; ++ int num_bytes; ++ ++ if (BN_is_negative(bn)) { ++ ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); ++ return 0; ++ } ++ num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out)); ++ if (num_bytes < 0) { ++ ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); ++ return 0; ++ } ++ bin48_to_felem(out, b_out); ++ return 1; ++} ++ ++/* felem_to_BN converts an felem into an OpenSSL BIGNUM */ ++static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) ++{ ++ felem_bytearray b_out; ++ ++ felem_to_bin48(b_out, in); ++ return BN_lebin2bn(b_out, sizeof(b_out), out); ++} ++ ++/*- ++ * Field operations ++ * ---------------- ++ */ ++ ++static void felem_one(felem out) ++{ ++ out[0] = 1; ++ memset(&out[1], 0, sizeof(limb) * (NLIMBS-1)); ++} ++ ++static void felem_assign(felem out, const felem in) ++{ ++ memcpy(out, in, sizeof(felem)); ++} ++ ++/* felem_sum64 sets out = out + in. */ ++static void felem_sum64(felem out, const felem in) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] += in[i]; ++} ++ ++/* felem_scalar sets out = in * scalar */ ++static void felem_scalar(felem out, const felem in, limb scalar) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] = in[i] * scalar; ++} ++ ++/* felem_scalar64 sets out = out * scalar */ ++static void felem_scalar64(felem out, limb scalar) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] *= scalar; ++} ++ ++/* felem_scalar128 sets out = out * scalar */ ++static void felem_scalar128(widefelem out, limb scalar) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < 2*NLIMBS-1; i++) ++ out[i] *= scalar; ++} ++ ++/*- ++ * felem_neg sets |out| to |-in| ++ * On entry: ++ * in[i] < 2^60 - 2^29 ++ * On exit: ++ * out[i] < 2^60 ++ */ ++static void felem_neg(felem out, const felem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^12 * p redundantly with each limb ++ * of the form 2^60 + ... 
++ */ ++ static const limb two60m52m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 52) ++ - (((limb) 1) << 4); ++ static const limb two60p44m12 = (((limb) 1) << 60) ++ + (((limb) 1) << 44) ++ - (((limb) 1) << 12); ++ static const limb two60m28m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 28) ++ - (((limb) 1) << 4); ++ static const limb two60m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 4); ++ ++ out[0] = two60p44m12 - in[0]; ++ out[1] = two60m52m4 - in[1]; ++ out[2] = two60m28m4 - in[2]; ++ out[3] = two60m4 - in[3]; ++ out[4] = two60m4 - in[4]; ++ out[5] = two60m4 - in[5]; ++ out[6] = two60m4 - in[6]; ++} ++ ++/*- ++ * felem_diff64 subtracts |in| from |out| ++ * On entry: ++ * in[i] < 2^60 - 2^52 - 2^4 ++ * On exit: ++ * out[i] < out_orig[i] + 2^60 + 2^44 ++ */ ++static void felem_diff64(felem out, const felem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^12 * p redundantly with each limb ++ * of the form 2^60 + ... ++ */ ++ ++ static const limb two60m52m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 52) ++ - (((limb) 1) << 4); ++ static const limb two60p44m12 = (((limb) 1) << 60) ++ + (((limb) 1) << 44) ++ - (((limb) 1) << 12); ++ static const limb two60m28m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 28) ++ - (((limb) 1) << 4); ++ static const limb two60m4 = (((limb) 1) << 60) ++ - (((limb) 1) << 4); ++ ++ out[0] += two60p44m12 - in[0]; ++ out[1] += two60m52m4 - in[1]; ++ out[2] += two60m28m4 - in[2]; ++ out[3] += two60m4 - in[3]; ++ out[4] += two60m4 - in[4]; ++ out[5] += two60m4 - in[5]; ++ out[6] += two60m4 - in[6]; ++} ++ ++/* ++ * in[i] < 2^63 ++ * out[i] < out_orig[i] + 2^64 + 2^48 ++ */ ++static void felem_diff_128_64(widefelem out, const felem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^16 * p redundantly with each limb ++ * of the form 2^64 + ... ++ */ ++ ++ static const widelimb two64m56m8 = (((widelimb) 1) << 64) ++ - (((widelimb) 1) << 56) ++ - (((widelimb) 1) << 8); ++ static const widelimb two64m32m8 = (((widelimb) 1) << 64) ++ - (((widelimb) 1) << 32) ++ - (((widelimb) 1) << 8); ++ static const widelimb two64m8 = (((widelimb) 1) << 64) ++ - (((widelimb) 1) << 8); ++ static const widelimb two64p48m16 = (((widelimb) 1) << 64) ++ + (((widelimb) 1) << 48) ++ - (((widelimb) 1) << 16); ++ unsigned int i; ++ ++ out[0] += two64p48m16; ++ out[1] += two64m56m8; ++ out[2] += two64m32m8; ++ out[3] += two64m8; ++ out[4] += two64m8; ++ out[5] += two64m8; ++ out[6] += two64m8; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] -= in[i]; ++} ++ ++/* ++ * in[i] < 2^127 - 2^119 - 2^71 ++ * out[i] < out_orig[i] + 2^127 + 2^111 ++ */ ++static void felem_diff128(widefelem out, const widefelem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^415 * p redundantly with each limb ++ * of the form 2^127 + ... 
++ */ ++ ++ static const widelimb two127 = ((widelimb) 1) << 127; ++ static const widelimb two127m71 = (((widelimb) 1) << 127) ++ - (((widelimb) 1) << 71); ++ static const widelimb two127p111m79m71 = (((widelimb) 1) << 127) ++ + (((widelimb) 1) << 111) ++ - (((widelimb) 1) << 79) ++ - (((widelimb) 1) << 71); ++ static const widelimb two127m119m71 = (((widelimb) 1) << 127) ++ - (((widelimb) 1) << 119) ++ - (((widelimb) 1) << 71); ++ static const widelimb two127m95m71 = (((widelimb) 1) << 127) ++ - (((widelimb) 1) << 95) ++ - (((widelimb) 1) << 71); ++ unsigned int i; ++ ++ out[0] += two127; ++ out[1] += two127m71; ++ out[2] += two127m71; ++ out[3] += two127m71; ++ out[4] += two127m71; ++ out[5] += two127m71; ++ out[6] += two127p111m79m71; ++ out[7] += two127m119m71; ++ out[8] += two127m95m71; ++ out[9] += two127m71; ++ out[10] += two127m71; ++ out[11] += two127m71; ++ out[12] += two127m71; ++ ++ for (i = 0; i < 2*NLIMBS-1; i++) ++ out[i] -= in[i]; ++} ++ ++static void felem_square_ref(widefelem out, const felem in) ++{ ++ felem inx2; ++ felem_scalar(inx2, in, 2); ++ ++ out[0] = ((uint128_t) in[0]) * in[0]; ++ ++ out[1] = ((uint128_t) in[0]) * inx2[1]; ++ ++ out[2] = ((uint128_t) in[0]) * inx2[2] ++ + ((uint128_t) in[1]) * in[1]; ++ ++ out[3] = ((uint128_t) in[0]) * inx2[3] ++ + ((uint128_t) in[1]) * inx2[2]; ++ ++ out[4] = ((uint128_t) in[0]) * inx2[4] ++ + ((uint128_t) in[1]) * inx2[3] ++ + ((uint128_t) in[2]) * in[2]; ++ ++ out[5] = ((uint128_t) in[0]) * inx2[5] ++ + ((uint128_t) in[1]) * inx2[4] ++ + ((uint128_t) in[2]) * inx2[3]; ++ ++ out[6] = ((uint128_t) in[0]) * inx2[6] ++ + ((uint128_t) in[1]) * inx2[5] ++ + ((uint128_t) in[2]) * inx2[4] ++ + ((uint128_t) in[3]) * in[3]; ++ ++ out[7] = ((uint128_t) in[1]) * inx2[6] ++ + ((uint128_t) in[2]) * inx2[5] ++ + ((uint128_t) in[3]) * inx2[4]; ++ ++ out[8] = ((uint128_t) in[2]) * inx2[6] ++ + ((uint128_t) in[3]) * inx2[5] ++ + ((uint128_t) in[4]) * in[4]; ++ ++ out[9] = ((uint128_t) in[3]) * inx2[6] ++ + ((uint128_t) in[4]) * inx2[5]; ++ ++ out[10] = ((uint128_t) in[4]) * inx2[6] ++ + ((uint128_t) in[5]) * in[5]; ++ ++ out[11] = ((uint128_t) in[5]) * inx2[6]; ++ ++ out[12] = ((uint128_t) in[6]) * in[6]; ++} ++ ++static void felem_mul_ref(widefelem out, const felem in1, const felem in2) ++{ ++ out[0] = ((uint128_t) in1[0]) * in2[0]; ++ ++ out[1] = ((uint128_t) in1[0]) * in2[1] ++ + ((uint128_t) in1[1]) * in2[0]; ++ ++ out[2] = ((uint128_t) in1[0]) * in2[2] ++ + ((uint128_t) in1[1]) * in2[1] ++ + ((uint128_t) in1[2]) * in2[0]; ++ ++ out[3] = ((uint128_t) in1[0]) * in2[3] ++ + ((uint128_t) in1[1]) * in2[2] ++ + ((uint128_t) in1[2]) * in2[1] ++ + ((uint128_t) in1[3]) * in2[0]; ++ ++ out[4] = ((uint128_t) in1[0]) * in2[4] ++ + ((uint128_t) in1[1]) * in2[3] ++ + ((uint128_t) in1[2]) * in2[2] ++ + ((uint128_t) in1[3]) * in2[1] ++ + ((uint128_t) in1[4]) * in2[0]; ++ ++ out[5] = ((uint128_t) in1[0]) * in2[5] ++ + ((uint128_t) in1[1]) * in2[4] ++ + ((uint128_t) in1[2]) * in2[3] ++ + ((uint128_t) in1[3]) * in2[2] ++ + ((uint128_t) in1[4]) * in2[1] ++ + ((uint128_t) in1[5]) * in2[0]; ++ ++ out[6] = ((uint128_t) in1[0]) * in2[6] ++ + ((uint128_t) in1[1]) * in2[5] ++ + ((uint128_t) in1[2]) * in2[4] ++ + ((uint128_t) in1[3]) * in2[3] ++ + ((uint128_t) in1[4]) * in2[2] ++ + ((uint128_t) in1[5]) * in2[1] ++ + ((uint128_t) in1[6]) * in2[0]; ++ ++ out[7] = ((uint128_t) in1[1]) * in2[6] ++ + ((uint128_t) in1[2]) * in2[5] ++ + ((uint128_t) in1[3]) * in2[4] ++ + ((uint128_t) in1[4]) * in2[3] ++ + ((uint128_t) in1[5]) * in2[2] ++ + ((uint128_t) in1[6]) * in2[1]; 
++ ++ out[8] = ((uint128_t) in1[2]) * in2[6] ++ + ((uint128_t) in1[3]) * in2[5] ++ + ((uint128_t) in1[4]) * in2[4] ++ + ((uint128_t) in1[5]) * in2[3] ++ + ((uint128_t) in1[6]) * in2[2]; ++ ++ out[9] = ((uint128_t) in1[3]) * in2[6] ++ + ((uint128_t) in1[4]) * in2[5] ++ + ((uint128_t) in1[5]) * in2[4] ++ + ((uint128_t) in1[6]) * in2[3]; ++ ++ out[10] = ((uint128_t) in1[4]) * in2[6] ++ + ((uint128_t) in1[5]) * in2[5] ++ + ((uint128_t) in1[6]) * in2[4]; ++ ++ out[11] = ((uint128_t) in1[5]) * in2[6] ++ + ((uint128_t) in1[6]) * in2[5]; ++ ++ out[12] = ((uint128_t) in1[6]) * in2[6]; ++} ++ ++/*- ++ * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients. ++ * in[i] < 2^128 - 2^125 ++ * out[i] < 2^56 for i < 6, ++ * out[6] <= 2^48 ++ * ++ * The technique in use here stems from the format of the prime modulus: ++ * P384 = 2^384 - delta ++ * ++ * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting ++ * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These ++ * coefficients are still quite large, and so we repeatedly apply this ++ * technique on high-order bits in order to guarantee the desired bounds on ++ * the size of our output. ++ * ++ * The three phases of elimination are as follows: ++ * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9]) ++ * [2]: Y = 2^8 (acc[8] | acc[7]) ++ * [3]: Y = 2^48 (acc[6] >> 48) ++ * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d) ++ */ ++static void felem_reduce(felem out, const widefelem in) ++{ ++ /* ++ * In order to prevent underflow, we add a multiple of p before subtracting. ++ * Use telescopic sums to represent 2^76 * p redundantly with each limb ++ * of the form 2^124 + ... ++ */ ++ static const widelimb two124m68 = (((widelimb) 1) << 124) ++ - (((widelimb) 1) << 68); ++ static const widelimb two124m116m68 = (((widelimb) 1) << 124) ++ - (((widelimb) 1) << 116) ++ - (((widelimb) 1) << 68); ++ static const widelimb two124p108m76 = (((widelimb) 1) << 124) ++ + (((widelimb) 1) << 108) ++ - (((widelimb) 1) << 76); ++ static const widelimb two124m92m68 = (((widelimb) 1) << 124) ++ - (((widelimb) 1) << 92) ++ - (((widelimb) 1) << 68); ++ widelimb temp, acc[9]; ++ unsigned int i; ++ ++ memcpy(acc, in, sizeof(widelimb) * 9); ++ ++ acc[0] += two124p108m76; ++ acc[1] += two124m116m68; ++ acc[2] += two124m92m68; ++ acc[3] += two124m68; ++ acc[4] += two124m68; ++ acc[5] += two124m68; ++ acc[6] += two124m68; ++ ++ /* [1]: Eliminate in[9], ..., in[12] */ ++ acc[8] += in[12] >> 32; ++ acc[7] += (in[12] & 0xffffffff) << 24; ++ acc[7] += in[12] >> 8; ++ acc[6] += (in[12] & 0xff) << 48; ++ acc[6] -= in[12] >> 16; ++ acc[5] -= ((in[12] & 0xffff) << 40); ++ acc[6] += in[12] >> 48; ++ acc[5] += (in[12] & 0xffffffffffff) << 8; ++ ++ acc[7] += in[11] >> 32; ++ acc[6] += (in[11] & 0xffffffff) << 24; ++ acc[6] += in[11] >> 8; ++ acc[5] += (in[11] & 0xff) << 48; ++ acc[5] -= in[11] >> 16; ++ acc[4] -= ((in[11] & 0xffff) << 40); ++ acc[5] += in[11] >> 48; ++ acc[4] += (in[11] & 0xffffffffffff) << 8; ++ ++ acc[6] += in[10] >> 32; ++ acc[5] += (in[10] & 0xffffffff) << 24; ++ acc[5] += in[10] >> 8; ++ acc[4] += (in[10] & 0xff) << 48; ++ acc[4] -= in[10] >> 16; ++ acc[3] -= ((in[10] & 0xffff) << 40); ++ acc[4] += in[10] >> 48; ++ acc[3] += (in[10] & 0xffffffffffff) << 8; ++ ++ acc[5] += in[9] >> 32; ++ acc[4] += (in[9] & 0xffffffff) << 24; ++ acc[4] += in[9] >> 8; ++ acc[3] += (in[9] & 0xff) << 48; ++ acc[3] -= in[9] >> 16; ++ acc[2] -= ((in[9] & 0xffff) << 40); ++ acc[3] += in[9] >> 48; ++ acc[2] += (in[9] & 0xffffffffffff) << 
8; ++ ++ /* ++ * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as ++ * well as the contributions made from eliminating higher limbs. ++ * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121 ++ * acc[8] < in[8] + 2^96 ++ */ ++ acc[4] += acc[8] >> 32; ++ acc[3] += (acc[8] & 0xffffffff) << 24; ++ acc[3] += acc[8] >> 8; ++ acc[2] += (acc[8] & 0xff) << 48; ++ acc[2] -= acc[8] >> 16; ++ acc[1] -= ((acc[8] & 0xffff) << 40); ++ acc[2] += acc[8] >> 48; ++ acc[1] += (acc[8] & 0xffffffffffff) << 8; ++ ++ acc[3] += acc[7] >> 32; ++ acc[2] += (acc[7] & 0xffffffff) << 24; ++ acc[2] += acc[7] >> 8; ++ acc[1] += (acc[7] & 0xff) << 48; ++ acc[1] -= acc[7] >> 16; ++ acc[0] -= ((acc[7] & 0xffff) << 40); ++ acc[1] += acc[7] >> 48; ++ acc[0] += (acc[7] & 0xffffffffffff) << 8; ++ ++ /*- ++ * acc[k] < in[k] + 2^124 + 2^121 ++ * < in[k] + 2^125 ++ * < 2^128, for k <= 6 ++ */ ++ ++ /* ++ * Carry 4 -> 5 -> 6 ++ * This has the effect of ensuring that these more significant limbs ++ * will be small in value after eliminating high bits from acc[6]. ++ */ ++ acc[5] += acc[4] >> 56; ++ acc[4] &= 0x00ffffffffffffff; ++ ++ acc[6] += acc[5] >> 56; ++ acc[5] &= 0x00ffffffffffffff; ++ ++ /*- ++ * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16 ++ * < in[6] + 2^125 ++ * < 2^128 ++ */ ++ ++ /* [3]: Eliminate high bits of acc[6] */ ++ temp = acc[6] >> 48; ++ acc[6] &= 0x0000ffffffffffff; ++ ++ /* temp < 2^80 */ ++ ++ acc[3] += temp >> 40; ++ acc[2] += (temp & 0xffffffffff) << 16; ++ acc[2] += temp >> 16; ++ acc[1] += (temp & 0xffff) << 40; ++ acc[1] -= temp >> 24; ++ acc[0] -= (temp & 0xffffff) << 32; ++ acc[0] += temp; ++ ++ /*- ++ * acc[k] < acc_old[k] + 2^64 + 2^56 ++ * < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4 ++ */ ++ ++ /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ ++ acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */ ++ acc[0] &= 0x00ffffffffffffff; ++ ++ acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */ ++ acc[1] &= 0x00ffffffffffffff; ++ ++ acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */ ++ acc[2] &= 0x00ffffffffffffff; ++ ++ /*- ++ * acc[k] < acc_old[k] + 2^72 + 2^16 ++ * < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17 ++ * < in[k] + 2^125 ++ * < 2^128 , k < 4 ++ */ ++ ++ acc[4] += acc[3] >> 56; /*- ++ * acc[4] < acc_old[4] + 2^72 + 2^16 ++ * < 2^72 + 2^56 + 2^16 ++ */ ++ acc[3] &= 0x00ffffffffffffff; ++ ++ acc[5] += acc[4] >> 56; /*- ++ * acc[5] < acc_old[5] + 2^16 + 1 ++ * < 2^56 + 2^16 + 1 ++ */ ++ acc[4] &= 0x00ffffffffffffff; ++ ++ acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */ ++ acc[5] &= 0x00ffffffffffffff; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] = acc[i]; ++} ++ ++#if defined(ECP_NISTP384_ASM) ++static void felem_square_wrapper(widefelem out, const felem in); ++static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2); ++ ++static void (*felem_square_p)(widefelem out, const felem in) = ++ felem_square_wrapper; ++static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) = ++ felem_mul_wrapper; ++ ++void p384_felem_square(widefelem out, const felem in); ++void p384_felem_mul(widefelem out, const felem in1, const felem in2); ++ ++# if defined(_ARCH_PPC64) ++# include "crypto/ppc_arch.h" ++# endif ++ ++static void felem_select(void) ++{ ++ /* Default */ ++ felem_square_p = felem_square_ref; ++ felem_mul_p = felem_mul_ref; ++} ++ ++static void felem_square_wrapper(widefelem out, const felem in) ++{ ++ felem_select(); ++ felem_square_p(out, in); ++} ++ ++static void felem_mul_wrapper(widefelem out, 
const felem in1, const felem in2) ++{ ++ felem_select(); ++ felem_mul_p(out, in1, in2); ++} ++ ++# define felem_square felem_square_p ++# define felem_mul felem_mul_p ++#else ++# define felem_square felem_square_ref ++# define felem_mul felem_mul_ref ++#endif ++ ++static ossl_inline void felem_square_reduce(felem out, const felem in) ++{ ++ widefelem tmp; ++ ++ felem_square(tmp, in); ++ felem_reduce(out, tmp); ++} ++ ++static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2) ++{ ++ widefelem tmp; ++ ++ felem_mul(tmp, in1, in2); ++ felem_reduce(out, tmp); ++} ++ ++/*- ++ * felem_inv calculates |out| = |in|^{-1} ++ * ++ * Based on Fermat's Little Theorem: ++ * a^p = a (mod p) ++ * a^{p-1} = 1 (mod p) ++ * a^{p-2} = a^{-1} (mod p) ++ */ ++static void felem_inv(felem out, const felem in) ++{ ++ felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6; ++ unsigned int i = 0; ++ ++ felem_square_reduce(ftmp, in); /* 2^1 */ ++ felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */ ++ felem_assign(ftmp2, ftmp); ++ ++ felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */ ++ felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 * 2^0 */ ++ felem_assign(ftmp3, ftmp); ++ ++ for (i = 0; i < 3; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */ ++ felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */ ++ felem_assign(ftmp4, ftmp); ++ ++ for (i = 0; i < 6; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */ ++ felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */ ++ ++ for (i = 0; i < 3; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */ ++ felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */ ++ felem_assign(ftmp5, ftmp); ++ ++ for (i = 0; i < 15; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */ ++ felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */ ++ felem_assign(ftmp6, ftmp); ++ ++ for (i = 0; i < 30; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */ ++ felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */ ++ felem_assign(ftmp4, ftmp); ++ ++ for (i = 0; i < 60; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */ ++ felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */ ++ felem_assign(ftmp4, ftmp); ++ ++ for (i = 0; i < 120; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^239 + ... + 2^120 */ ++ felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */ ++ ++ for (i = 0; i < 15; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */ ++ felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */ ++ ++ for (i = 0; i < 31; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */ ++ felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */ ++ ++ for (i = 0; i < 2; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */ ++ felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */ ++ ++ for (i = 0; i < 94; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */ ++ felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */ ++ ++ for (i = 0; i < 2; i++) ++ felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */ ++ felem_mul_reduce(ftmp, in, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */ ++ ++ memcpy(out, ftmp, sizeof(felem)); ++} ++ ++/* ++ * Zero-check: returns a limb with all bits set if |in| == 0 (mod p) ++ * and 0 otherwise. 
We know that field elements are reduced to ++ * 0 < in < 2p, so we only need to check two cases: ++ * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1 ++ * in[k] < 2^56, k < 6 ++ * in[6] <= 2^48 ++ */ ++static limb felem_is_zero(const felem in) ++{ ++ limb zero, p384; ++ ++ zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6]; ++ zero = ((int64_t) (zero) - 1) >> 63; ++ p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000) ++ | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff) ++ | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff) ++ | (in[6] ^ 0xffffffffffff); ++ p384 = ((int64_t) (p384) - 1) >> 63; ++ ++ return (zero | p384); ++} ++ ++static int felem_is_zero_int(const void *in) ++{ ++ return (int)(felem_is_zero(in) & ((limb) 1)); ++} ++ ++/*- ++ * felem_contract converts |in| to its unique, minimal representation. ++ * Assume we've removed all redundant bits. ++ * On entry: ++ * in[k] < 2^56, k < 6 ++ * in[6] <= 2^48 ++ */ ++static void felem_contract(felem out, const felem in) ++{ ++ static const int64_t two56 = ((limb) 1) << 56; ++ ++ /* ++ * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1 ++ * Perform two successive, idempotent subtractions to reduce if |in| >= p. ++ */ ++ ++ int64_t tmp[NLIMBS], cond[5], a; ++ unsigned int i; ++ ++ memcpy(tmp, in, sizeof(felem)); ++ ++ /* Case 1: a = 1 iff |in| >= 2^384 */ ++ a = (in[6] >> 48); ++ tmp[0] += a; ++ tmp[0] -= a << 32; ++ tmp[1] += a << 40; ++ tmp[2] += a << 16; ++ tmp[6] &= 0x0000ffffffffffff; ++ ++ /* ++ * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be ++ * non-zero, so we only need one step ++ */ ++ ++ a = tmp[0] >> 63; ++ tmp[0] += a & two56; ++ tmp[1] -= a & 1; ++ ++ /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ ++ tmp[2] += tmp[1] >> 56; ++ tmp[1] &= 0x00ffffffffffffff; ++ ++ tmp[3] += tmp[2] >> 56; ++ tmp[2] &= 0x00ffffffffffffff; ++ ++ tmp[4] += tmp[3] >> 56; ++ tmp[3] &= 0x00ffffffffffffff; ++ ++ tmp[5] += tmp[4] >> 56; ++ tmp[4] &= 0x00ffffffffffffff; ++ ++ tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */ ++ tmp[5] &= 0x00ffffffffffffff; ++ ++ /* ++ * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise ++ */ ++ ++ /* 0 iff (2^129..2^383) are all one */ ++ cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1; ++ /* 0 iff 2^128 bit is one */ ++ cond[1] = (tmp[2] | ~0x00000000010000) + 1; ++ /* 0 iff (2^96..2^127) bits are all one */ ++ cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1; ++ /* 0 iff (2^32..2^95) bits are all zero */ ++ cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff)); ++ /* 0 iff (2^0..2^31) bits are all one */ ++ cond[4] = (tmp[0] | 0xffffff00000000) + 1; ++ ++ /* ++ * In effect, invert our conditions, so that 0 values become all 1's, ++ * any non-zero value in the low-order 56 bits becomes all 0's ++ */ ++ for (i = 0; i < 5; i++) ++ cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63; ++ ++ /* ++ * The condition for determining whether in is greater than our ++ * prime is given by the following condition. ++ */ ++ ++ /* First subtract 2^384 - 2^129 cheaply */ ++ a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4]))); ++ tmp[6] &= ~a; ++ tmp[5] &= ~a; ++ tmp[4] &= ~a; ++ tmp[3] &= ~a; ++ tmp[2] &= ~a | 0x0000000001ffff; ++ ++ /* ++ * Subtract 2^128 - 2^96 by ++ * means of disjoint cases. 
++ */ ++ ++ /* subtract 2^128 if that bit is present, and add 2^96 */ ++ a = cond[0] & cond[1]; ++ tmp[2] &= ~a | 0xfffffffffeffff; ++ tmp[1] += a & ((int64_t) 1 << 40); ++ ++ /* otherwise, clear bits 2^127 .. 2^96 */ ++ a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4])); ++ tmp[2] &= ~a | 0xffffffffff0000; ++ tmp[1] &= ~a | 0x0000ffffffffff; ++ ++ /* finally, subtract the last 2^32 - 1 */ ++ a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4]))); ++ tmp[0] += a & (-((int64_t) 1 << 32) + 1); ++ ++ /* ++ * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be ++ * non-zero, so we only need one step ++ */ ++ a = tmp[0] >> 63; ++ tmp[0] += a & two56; ++ tmp[1] -= a & 1; ++ ++ /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ ++ tmp[2] += tmp[1] >> 56; ++ tmp[1] &= 0x00ffffffffffffff; ++ ++ tmp[3] += tmp[2] >> 56; ++ tmp[2] &= 0x00ffffffffffffff; ++ ++ tmp[4] += tmp[3] >> 56; ++ tmp[3] &= 0x00ffffffffffffff; ++ ++ tmp[5] += tmp[4] >> 56; ++ tmp[4] &= 0x00ffffffffffffff; ++ ++ tmp[6] += tmp[5] >> 56; ++ tmp[5] &= 0x00ffffffffffffff; ++ ++ memcpy(out, tmp, sizeof(felem)); ++} ++ ++/*- ++ * Group operations ++ * ---------------- ++ * ++ * Building on top of the field operations we have the operations on the ++ * elliptic curve group itself. Points on the curve are represented in Jacobian ++ * coordinates ++ */ ++ ++/*- ++ * point_double calculates 2*(x_in, y_in, z_in) ++ * ++ * The method is taken from: ++ * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b ++ * ++ * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. ++ * while x_out == y_in is not (maybe this works, but it's not tested). ++ */ ++static void ++point_double(felem x_out, felem y_out, felem z_out, ++ const felem x_in, const felem y_in, const felem z_in) ++{ ++ widefelem tmp, tmp2; ++ felem delta, gamma, beta, alpha, ftmp, ftmp2; ++ ++ felem_assign(ftmp, x_in); ++ felem_assign(ftmp2, x_in); ++ ++ /* delta = z^2 */ ++ felem_square_reduce(delta, z_in); /* delta[i] < 2^56 */ ++ ++ /* gamma = y^2 */ ++ felem_square_reduce(gamma, y_in); /* gamma[i] < 2^56 */ ++ ++ /* beta = x*gamma */ ++ felem_mul_reduce(beta, x_in, gamma); /* beta[i] < 2^56 */ ++ ++ /* alpha = 3*(x-delta)*(x+delta) */ ++ felem_diff64(ftmp, delta); /* ftmp[i] < 2^60 + 2^58 + 2^44 */ ++ felem_sum64(ftmp2, delta); /* ftmp2[i] < 2^59 */ ++ felem_scalar64(ftmp2, 3); /* ftmp2[i] < 2^61 */ ++ felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */ ++ ++ /* x' = alpha^2 - 8*beta */ ++ felem_square(tmp, alpha); /* tmp[i] < 2^115 */ ++ felem_assign(ftmp, beta); /* ftmp[i] < 2^56 */ ++ felem_scalar64(ftmp, 8); /* ftmp[i] < 2^59 */ ++ felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^115 + 2^64 + 2^48 */ ++ felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */ ++ ++ /* z' = (y + z)^2 - gamma - delta */ ++ felem_sum64(delta, gamma); /* delta[i] < 2^57 */ ++ felem_assign(ftmp, y_in); /* ftmp[i] < 2^56 */ ++ felem_sum64(ftmp, z_in); /* ftmp[i] < 2^56 */ ++ felem_square(tmp, ftmp); /* tmp[i] < 2^115 */ ++ felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */ ++ felem_reduce(z_out, tmp); /* z_out[i] < 2^56 */ ++ ++ /* y' = alpha*(4*beta - x') - 8*gamma^2 */ ++ felem_scalar64(beta, 4); /* beta[i] < 2^58 */ ++ felem_diff64(beta, x_out); /* beta[i] < 2^60 + 2^58 + 2^44 */ ++ felem_mul(tmp, alpha, beta); /* tmp[i] < 2^119 */ ++ felem_square(tmp2, gamma); /* tmp2[i] < 2^115 */ ++ felem_scalar128(tmp2, 8); /* tmp2[i] < 2^118 */ ++ felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^119 + 2^111 */ ++ felem_reduce(y_out, tmp); 
/* tmp[i] < 2^56 */ ++} ++ ++/* copy_conditional copies in to out iff mask is all ones. */ ++static void copy_conditional(felem out, const felem in, limb mask) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out[i] ^= mask & (in[i] ^ out[i]); ++} ++ ++/*- ++ * point_add calculates (x1, y1, z1) + (x2, y2, z2) ++ * ++ * The method is taken from ++ * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, ++ * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). ++ * ++ * This function includes a branch for checking whether the two input points ++ * are equal (while not equal to the point at infinity). See comment below ++ * on constant-time. ++ */ ++static void point_add(felem x3, felem y3, felem z3, ++ const felem x1, const felem y1, const felem z1, ++ const int mixed, const felem x2, const felem y2, ++ const felem z2) ++{ ++ felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out; ++ widefelem tmp, tmp2; ++ limb x_equal, y_equal, z1_is_zero, z2_is_zero; ++ limb points_equal; ++ ++ z1_is_zero = felem_is_zero(z1); ++ z2_is_zero = felem_is_zero(z2); ++ ++ /* ftmp = z1z1 = z1**2 */ ++ felem_square_reduce(ftmp, z1); /* ftmp[i] < 2^56 */ ++ ++ if (!mixed) { ++ /* ftmp2 = z2z2 = z2**2 */ ++ felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */ ++ ++ /* u1 = ftmp3 = x1*z2z2 */ ++ felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */ ++ ++ /* ftmp5 = z1 + z2 */ ++ felem_assign(ftmp5, z1); /* ftmp5[i] < 2^56 */ ++ felem_sum64(ftmp5, z2); /* ftmp5[i] < 2^57 */ ++ ++ /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */ ++ felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */ ++ felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^117 + 2^64 + 2^48 */ ++ felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */ ++ felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */ ++ ++ /* ftmp2 = z2 * z2z2 */ ++ felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */ ++ ++ /* s1 = ftmp6 = y1 * z2**3 */ ++ felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */ ++ } else { ++ /* ++ * We'll assume z2 = 1 (special case z2 = 0 is handled later) ++ */ ++ ++ /* u1 = ftmp3 = x1*z2z2 */ ++ felem_assign(ftmp3, x1); /* ftmp3[i] < 2^56 */ ++ ++ /* ftmp5 = 2*z1z2 */ ++ felem_scalar(ftmp5, z1, 2); /* ftmp5[i] < 2^57 */ ++ ++ /* s1 = ftmp6 = y1 * z2**3 */ ++ felem_assign(ftmp6, y1); /* ftmp6[i] < 2^56 */ ++ } ++ /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */ ++ ++ /* u2 = x2*z1z1 */ ++ felem_mul(tmp, x2, ftmp); /* tmp[i] < 2^115 */ ++ ++ /* h = ftmp4 = u2 - u1 */ ++ felem_diff_128_64(tmp, ftmp3); /* tmp[i] < 2^115 + 2^64 + 2^48 */ ++ felem_reduce(ftmp4, tmp); /* ftmp[4] < 2^56 */ ++ ++ x_equal = felem_is_zero(ftmp4); ++ ++ /* z_out = ftmp5 * h */ ++ felem_mul_reduce(z_out, ftmp5, ftmp4); /* z_out[i] < 2^56 */ ++ ++ /* ftmp = z1 * z1z1 */ ++ felem_mul_reduce(ftmp, ftmp, z1); /* ftmp[i] < 2^56 */ ++ ++ /* s2 = tmp = y2 * z1**3 */ ++ felem_mul(tmp, y2, ftmp); /* tmp[i] < 2^115 */ ++ ++ /* r = ftmp5 = (s2 - s1)*2 */ ++ felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */ ++ felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */ ++ y_equal = felem_is_zero(ftmp5); ++ felem_scalar64(ftmp5, 2); /* ftmp5[i] < 2^57 */ ++ ++ /* ++ * The formulae are incorrect if the points are equal, in affine coordinates ++ * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this ++ * happens. ++ * ++ * We use bitwise operations to avoid potential side-channels introduced by ++ * the short-circuiting behaviour of boolean operators. 
++ * ++ * The special case of either point being the point at infinity (z1 and/or ++ * z2 are zero), is handled separately later on in this function, so we ++ * avoid jumping to point_double here in those special cases. ++ * ++ * Notice the comment below on the implications of this branching for timing ++ * leaks and why it is considered practically irrelevant. ++ */ ++ points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero)); ++ ++ if (points_equal) { ++ /* ++ * This is obviously not constant-time but it will almost-never happen ++ * for ECDH / ECDSA. ++ */ ++ point_double(x3, y3, z3, x1, y1, z1); ++ return; ++ } ++ ++ /* I = ftmp = (2h)**2 */ ++ felem_assign(ftmp, ftmp4); /* ftmp[i] < 2^56 */ ++ felem_scalar64(ftmp, 2); /* ftmp[i] < 2^57 */ ++ felem_square_reduce(ftmp, ftmp); /* ftmp[i] < 2^56 */ ++ ++ /* J = ftmp2 = h * I */ ++ felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */ ++ ++ /* V = ftmp4 = U1 * I */ ++ felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */ ++ ++ /* x_out = r**2 - J - 2V */ ++ felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */ ++ felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */ ++ felem_assign(ftmp3, ftmp4); /* ftmp3[i] < 2^56 */ ++ felem_scalar64(ftmp4, 2); /* ftmp4[i] < 2^57 */ ++ felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */ ++ felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */ ++ ++ /* y_out = r(V-x_out) - 2 * s1 * J */ ++ felem_diff64(ftmp3, x_out); /* ftmp3[i] < 2^60 + 2^56 + 2^44 */ ++ felem_mul(tmp, ftmp5, ftmp3); /* tmp[i] < 2^116 */ ++ felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */ ++ felem_scalar128(tmp2, 2); /* tmp2[i] < 2^116 */ ++ felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^116 + 2^111 */ ++ felem_reduce(y_out, tmp); /* y_out[i] < 2^56 */ ++ ++ copy_conditional(x_out, x2, z1_is_zero); ++ copy_conditional(x_out, x1, z2_is_zero); ++ copy_conditional(y_out, y2, z1_is_zero); ++ copy_conditional(y_out, y1, z2_is_zero); ++ copy_conditional(z_out, z2, z1_is_zero); ++ copy_conditional(z_out, z1, z2_is_zero); ++ felem_assign(x3, x_out); ++ felem_assign(y3, y_out); ++ felem_assign(z3, z_out); ++} ++ ++/*- ++ * Base point pre computation ++ * -------------------------- ++ * ++ * Two different sorts of precomputed tables are used in the following code. ++ * Each contain various points on the curve, where each point is three field ++ * elements (x, y, z). ++ * ++ * For the base point table, z is usually 1 (0 for the point at infinity). ++ * This table has 16 elements: ++ * index | bits | point ++ * ------+---------+------------------------------ ++ * 0 | 0 0 0 0 | 0G ++ * 1 | 0 0 0 1 | 1G ++ * 2 | 0 0 1 0 | 2^95G ++ * 3 | 0 0 1 1 | (2^95 + 1)G ++ * 4 | 0 1 0 0 | 2^190G ++ * 5 | 0 1 0 1 | (2^190 + 1)G ++ * 6 | 0 1 1 0 | (2^190 + 2^95)G ++ * 7 | 0 1 1 1 | (2^190 + 2^95 + 1)G ++ * 8 | 1 0 0 0 | 2^285G ++ * 9 | 1 0 0 1 | (2^285 + 1)G ++ * 10 | 1 0 1 0 | (2^285 + 2^95)G ++ * 11 | 1 0 1 1 | (2^285 + 2^95 + 1)G ++ * 12 | 1 1 0 0 | (2^285 + 2^190)G ++ * 13 | 1 1 0 1 | (2^285 + 2^190 + 1)G ++ * 14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G ++ * 15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G ++ * ++ * The reason for this is so that we can clock bits into four different ++ * locations when doing simple scalar multiplies against the base point. ++ * ++ * Tables for other points have table[i] = iG for i in 0 .. 16. 
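++ *
++ * For example, for i < 95 the generator part of batch_mul below combines
++ * b0 = bit i, b1 = bit i+95, b2 = bit i+190 and b3 = bit i+285 of the
++ * scalar into the index (8*b3 + 4*b2 + 2*b1 + b0); the selected entry
++ * (b3*2^285 + b2*2^190 + b1*2^95 + b0)*G is then multiplied by 2^i by
++ * the remaining doublings.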
++ */ ++ ++/* gmul is the table of precomputed base points */ ++static const felem gmul[16][3] = { ++{{0, 0, 0, 0, 0, 0, 0}, ++ {0, 0, 0, 0, 0, 0, 0}, ++ {0, 0, 0, 0, 0, 0, 0}}, ++{{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741, ++ 0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b}, ++ {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31, ++ 0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de, ++ 0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c}, ++ {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731, ++ 0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4, ++ 0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10}, ++ {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b, ++ 0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68, ++ 0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf}, ++ {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c, ++ 0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736, ++ 0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d}, ++ {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4, ++ 0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589, ++ 0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca}, ++ {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3, ++ 0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5, ++ 0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52}, ++ {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562, ++ 0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27, ++ 0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88}, ++ {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088, ++ 0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37, ++ 0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7}, ++ {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964, ++ 0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f, ++ 0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f}, ++ {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54, ++ 0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6, ++ 0x00e8e89eee56802a, 0x0049d806a774aaf2, 
0x0000147e2af0ae24}, ++ {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7, ++ 0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542, ++ 0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539}, ++ {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882, ++ 0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50, ++ 0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c}, ++ {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264, ++ 0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d, ++ 0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a}, ++ {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e, ++ 0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf}, ++ {1, 0, 0, 0, 0, 0, 0}}, ++{{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c, ++ 0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc}, ++ {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572, ++ 0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973}, ++ {1, 0, 0, 0, 0, 0, 0}} ++}; ++ ++/* ++ * select_point selects the |idx|th point from a precomputation table and ++ * copies it to out. ++ * ++ * pre_comp below is of the size provided in |size|. ++ */ ++static void select_point(const limb idx, unsigned int size, ++ const felem pre_comp[][3], felem out[3]) ++{ ++ unsigned int i, j; ++ limb *outlimbs = &out[0][0]; ++ ++ memset(out, 0, sizeof(*out) * 3); ++ ++ for (i = 0; i < size; i++) { ++ const limb *inlimbs = &pre_comp[i][0][0]; ++ limb mask = i ^ idx; ++ ++ mask |= mask >> 4; ++ mask |= mask >> 2; ++ mask |= mask >> 1; ++ mask &= 1; ++ mask--; ++ for (j = 0; j < NLIMBS * 3; j++) ++ outlimbs[j] |= inlimbs[j] & mask; ++ } ++} ++ ++/* get_bit returns the |i|th bit in |in| */ ++static char get_bit(const felem_bytearray in, int i) ++{ ++ if (i < 0 || i >= 384) ++ return 0; ++ return (in[i >> 3] >> (i & 7)) & 1; ++} ++ ++/* ++ * Interleaved point multiplication using precomputed point multiples: The ++ * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars ++ * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the ++ * generator, using certain (large) precomputed multiples in g_pre_comp. ++ * Output point (X, Y, Z) is stored in x_out, y_out, z_out ++ */ ++static void batch_mul(felem x_out, felem y_out, felem z_out, ++ const felem_bytearray scalars[], ++ const unsigned int num_points, const u8 *g_scalar, ++ const int mixed, const felem pre_comp[][17][3], ++ const felem g_pre_comp[16][3]) ++{ ++ int i, skip; ++ unsigned int num, gen_mul = (g_scalar != NULL); ++ felem nq[3], tmp[4]; ++ limb bits; ++ u8 sign, digit; ++ ++ /* set nq to the point at infinity */ ++ memset(nq, 0, sizeof(nq)); ++ ++ /* ++ * Loop over all scalars msb-to-lsb, interleaving additions of multiples ++ * of the generator (last quarter of rounds) and additions of other ++ * points multiples (every 5th round). ++ */ ++ skip = 1; /* save two point operations in the first ++ * round */ ++ for (i = (num_points ? 
380 : 98); i >= 0; --i) { ++ /* double */ ++ if (!skip) ++ point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); ++ ++ /* add multiples of the generator */ ++ if (gen_mul && (i <= 98)) { ++ bits = get_bit(g_scalar, i + 285) << 3; ++ if (i < 95) { ++ bits |= get_bit(g_scalar, i + 190) << 2; ++ bits |= get_bit(g_scalar, i + 95) << 1; ++ bits |= get_bit(g_scalar, i); ++ } ++ /* select the point to add, in constant time */ ++ select_point(bits, 16, g_pre_comp, tmp); ++ if (!skip) { ++ /* The 1 argument below is for "mixed" */ ++ point_add(nq[0], nq[1], nq[2], ++ nq[0], nq[1], nq[2], 1, ++ tmp[0], tmp[1], tmp[2]); ++ } else { ++ memcpy(nq, tmp, 3 * sizeof(felem)); ++ skip = 0; ++ } ++ } ++ ++ /* do other additions every 5 doublings */ ++ if (num_points && (i % 5 == 0)) { ++ /* loop over all scalars */ ++ for (num = 0; num < num_points; ++num) { ++ bits = get_bit(scalars[num], i + 4) << 5; ++ bits |= get_bit(scalars[num], i + 3) << 4; ++ bits |= get_bit(scalars[num], i + 2) << 3; ++ bits |= get_bit(scalars[num], i + 1) << 2; ++ bits |= get_bit(scalars[num], i) << 1; ++ bits |= get_bit(scalars[num], i - 1); ++ ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); ++ ++ /* ++ * select the point to add or subtract, in constant time ++ */ ++ select_point(digit, 17, pre_comp[num], tmp); ++ felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative ++ * point */ ++ copy_conditional(tmp[1], tmp[3], (-(limb) sign)); ++ ++ if (!skip) { ++ point_add(nq[0], nq[1], nq[2], ++ nq[0], nq[1], nq[2], mixed, ++ tmp[0], tmp[1], tmp[2]); ++ } else { ++ memcpy(nq, tmp, 3 * sizeof(felem)); ++ skip = 0; ++ } ++ } ++ } ++ } ++ felem_assign(x_out, nq[0]); ++ felem_assign(y_out, nq[1]); ++ felem_assign(z_out, nq[2]); ++} ++ ++/* Precomputation for the group generator. */ ++struct nistp384_pre_comp_st { ++ felem g_pre_comp[16][3]; ++ CRYPTO_REF_COUNT refcnt; ++ CRYPTO_RWLOCK *refcnt_lock; ++}; ++ ++const EC_METHOD *ossl_ec_GFp_nistp384_method(void) ++{ ++ static const EC_METHOD ret = { ++ EC_FLAGS_DEFAULT_OCT, ++ NID_X9_62_prime_field, ++ ossl_ec_GFp_nistp384_group_init, ++ ec_GFp_simple_group_finish, ++ ec_GFp_simple_group_clear_finish, ++ ec_GFp_nist_group_copy, ++ ossl_ec_GFp_nistp384_group_set_curve, ++ ec_GFp_simple_group_get_curve, ++ ec_GFp_simple_group_get_degree, ++ ec_group_simple_order_bits, ++ ec_GFp_simple_group_check_discriminant, ++ ec_GFp_simple_point_init, ++ ec_GFp_simple_point_finish, ++ ec_GFp_simple_point_clear_finish, ++ ec_GFp_simple_point_copy, ++ ec_GFp_simple_point_set_to_infinity, ++ ec_GFp_simple_set_Jprojective_coordinates_GFp, ++ ec_GFp_simple_get_Jprojective_coordinates_GFp, ++ ec_GFp_simple_point_set_affine_coordinates, ++ ossl_ec_GFp_nistp384_point_get_affine_coordinates, ++ 0, /* point_set_compressed_coordinates */ ++ 0, /* point2oct */ ++ 0, /* oct2point */ ++ ec_GFp_simple_add, ++ ec_GFp_simple_dbl, ++ ec_GFp_simple_invert, ++ ec_GFp_simple_is_at_infinity, ++ ec_GFp_simple_is_on_curve, ++ ec_GFp_simple_cmp, ++ ec_GFp_simple_make_affine, ++ ec_GFp_simple_points_make_affine, ++ ossl_ec_GFp_nistp384_points_mul, ++ ossl_ec_GFp_nistp384_precompute_mult, ++ ossl_ec_GFp_nistp384_have_precompute_mult, ++ ec_GFp_nist_field_mul, ++ ec_GFp_nist_field_sqr, ++ 0, /* field_div */ ++ ec_GFp_simple_field_inv, ++ 0, /* field_encode */ ++ 0, /* field_decode */ ++ 0, /* field_set_to_one */ ++ ec_key_simple_priv2oct, ++ ec_key_simple_oct2priv, ++ 0, /* set private */ ++ ec_key_simple_generate_key, ++ ec_key_simple_check_key, ++ ec_key_simple_generate_public_key, ++ 0, /* keycopy */ ++ 0, /* keyfinish */ ++ 
ecdh_simple_compute_key, ++ ecdsa_simple_sign_setup, ++ ecdsa_simple_sign_sig, ++ ecdsa_simple_verify_sig, ++ 0, /* field_inverse_mod_ord */ ++ 0, /* blind_coordinates */ ++ 0, /* ladder_pre */ ++ 0, /* ladder_step */ ++ 0 /* ladder_post */ ++ }; ++ ++ return &ret; ++} ++ ++/******************************************************************************/ ++/* ++ * FUNCTIONS TO MANAGE PRECOMPUTATION ++ */ ++ ++static NISTP384_PRE_COMP *nistp384_pre_comp_new(void) ++{ ++ NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret)); ++ ++ if (ret == NULL || (ret->refcnt_lock = CRYPTO_THREAD_lock_new()) == NULL) { ++ OPENSSL_free(ret); ++ return NULL; ++ } ++ ++ ret->refcnt = 1; ++ return ret; ++} ++ ++NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p) ++{ ++ int i; ++ ++ if (p != NULL) ++ CRYPTO_UP_REF(&p->refcnt, &i, p->refcnt_lock); ++ return p; ++} ++ ++void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p) ++{ ++ int i; ++ ++ if (p == NULL) ++ return; ++ ++ CRYPTO_DOWN_REF(&p->refcnt, &i, p->refcnt_lock); ++ REF_PRINT_COUNT("ec_nistp384", p); ++ if (i > 0) ++ return; ++ REF_ASSERT_ISNT(i < 0); ++ ++ CRYPTO_THREAD_lock_free(p->refcnt_lock); ++ OPENSSL_free(p); ++} ++ ++/******************************************************************************/ ++/* ++ * OPENSSL EC_METHOD FUNCTIONS ++ */ ++ ++int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group) ++{ ++ int ret; ++ ++ ret = ec_GFp_simple_group_init(group); ++ group->a_is_minus3 = 1; ++ return ret; ++} ++ ++int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p, ++ const BIGNUM *a, const BIGNUM *b, ++ BN_CTX *ctx) ++{ ++ int ret = 0; ++ BIGNUM *curve_p, *curve_a, *curve_b; ++#ifndef FIPS_MODULE ++ BN_CTX *new_ctx = NULL; ++ ++ if (ctx == NULL) ++ ctx = new_ctx = BN_CTX_new(); ++#endif ++ if (ctx == NULL) ++ return 0; ++ ++ BN_CTX_start(ctx); ++ curve_p = BN_CTX_get(ctx); ++ curve_a = BN_CTX_get(ctx); ++ curve_b = BN_CTX_get(ctx); ++ if (curve_b == NULL) ++ goto err; ++ BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p); ++ BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a); ++ BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b); ++ if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) { ++ ECerr(EC_F_EC_GFP_NISTP384_GROUP_SET_CURVE, EC_R_WRONG_CURVE_PARAMETERS); ++ goto err; ++ } ++ group->field_mod_func = BN_nist_mod_384; ++ ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); ++ err: ++ BN_CTX_end(ctx); ++#ifndef FIPS_MODULE ++ BN_CTX_free(new_ctx); ++#endif ++ return ret; ++} ++ ++/* ++ * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') = ++ * (X/Z^2, Y/Z^3) ++ */ ++int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group, ++ const EC_POINT *point, ++ BIGNUM *x, BIGNUM *y, ++ BN_CTX *ctx) ++{ ++ felem z1, z2, x_in, y_in, x_out, y_out; ++ widefelem tmp; ++ ++ if (EC_POINT_is_at_infinity(group, point)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES, EC_R_POINT_AT_INFINITY); ++ return 0; ++ } ++ if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) || ++ (!BN_to_felem(z1, point->Z))) ++ return 0; ++ felem_inv(z2, z1); ++ felem_square(tmp, z2); ++ felem_reduce(z1, tmp); ++ felem_mul(tmp, x_in, z1); ++ felem_reduce(x_in, tmp); ++ felem_contract(x_out, x_in); ++ if (x != NULL) { ++ if (!felem_to_BN(x, x_out)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB); ++ return 0; ++ } ++ } ++ felem_mul(tmp, z1, z2); ++ felem_reduce(z1, 
tmp); ++ felem_mul(tmp, y_in, z1); ++ felem_reduce(y_in, tmp); ++ felem_contract(y_out, y_in); ++ if (y != NULL) { ++ if (!felem_to_BN(y, y_out)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB); ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++/* points below is of size |num|, and tmp_felems is of size |num+1/ */ ++static void make_points_affine(size_t num, felem points[][3], ++ felem tmp_felems[]) ++{ ++ /* ++ * Runs in constant time, unless an input is the point at infinity (which ++ * normally shouldn't happen). ++ */ ++ ec_GFp_nistp_points_make_affine_internal(num, ++ points, ++ sizeof(felem), ++ tmp_felems, ++ (void (*)(void *))felem_one, ++ felem_is_zero_int, ++ (void (*)(void *, const void *)) ++ felem_assign, ++ (void (*)(void *, const void *)) ++ felem_square_reduce, ++ (void (*)(void *, const void *, const void*)) ++ felem_mul_reduce, ++ (void (*)(void *, const void *)) ++ felem_inv, ++ (void (*)(void *, const void *)) ++ felem_contract); ++} ++ ++/* ++ * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL ++ * values Result is stored in r (r can equal one of the inputs). ++ */ ++int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r, ++ const BIGNUM *scalar, size_t num, ++ const EC_POINT *points[], ++ const BIGNUM *scalars[], BN_CTX *ctx) ++{ ++ int ret = 0; ++ int j; ++ int mixed = 0; ++ BIGNUM *x, *y, *z, *tmp_scalar; ++ felem_bytearray g_secret; ++ felem_bytearray *secrets = NULL; ++ felem (*pre_comp)[17][3] = NULL; ++ felem *tmp_felems = NULL; ++ unsigned int i; ++ int num_bytes; ++ int have_pre_comp = 0; ++ size_t num_points = num; ++ felem x_in, y_in, z_in, x_out, y_out, z_out; ++ NISTP384_PRE_COMP *pre = NULL; ++ felem(*g_pre_comp)[3] = NULL; ++ EC_POINT *generator = NULL; ++ const EC_POINT *p = NULL; ++ const BIGNUM *p_scalar = NULL; ++ ++ BN_CTX_start(ctx); ++ x = BN_CTX_get(ctx); ++ y = BN_CTX_get(ctx); ++ z = BN_CTX_get(ctx); ++ tmp_scalar = BN_CTX_get(ctx); ++ if (tmp_scalar == NULL) ++ goto err; ++ ++ if (scalar != NULL) { ++ pre = group->pre_comp.nistp384; ++ if (pre) ++ /* we have precomputation, try to use it */ ++ g_pre_comp = &pre->g_pre_comp[0]; ++ else ++ /* try to use the standard precomputation */ ++ g_pre_comp = (felem(*)[3]) gmul; ++ generator = EC_POINT_new(group); ++ if (generator == NULL) ++ goto err; ++ /* get the generator from precomputation */ ++ if (!felem_to_BN(x, g_pre_comp[1][0]) || ++ !felem_to_BN(y, g_pre_comp[1][1]) || ++ !felem_to_BN(z, g_pre_comp[1][2])) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ if (!ec_GFp_simple_set_Jprojective_coordinates_GFp(group, ++ generator, ++ x, y, z, ctx)) ++ goto err; ++ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) ++ /* precomputation matches generator */ ++ have_pre_comp = 1; ++ else ++ /* ++ * we don't have valid precomputation: treat the generator as a ++ * random point ++ */ ++ num_points++; ++ } ++ ++ if (num_points > 0) { ++ if (num_points >= 2) { ++ /* ++ * unless we precompute multiples for just one point, converting ++ * those into affine form is time well spent ++ */ ++ mixed = 1; ++ } ++ secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points); ++ pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points); ++ if (mixed) ++ tmp_felems = ++ OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1)); ++ if ((secrets == NULL) || (pre_comp == NULL) ++ || (mixed && (tmp_felems == NULL))) ++ goto err; ++ ++ /* ++ * we treat NULL scalars as 0, and NULL points as points at infinity, ++ * i.e., 
they contribute nothing to the linear combination ++ */ ++ for (i = 0; i < num_points; ++i) { ++ if (i == num) { ++ /* ++ * we didn't have a valid precomputation, so we pick the ++ * generator ++ */ ++ p = EC_GROUP_get0_generator(group); ++ p_scalar = scalar; ++ } else { ++ /* the i^th point */ ++ p = points[i]; ++ p_scalar = scalars[i]; ++ } ++ if (p_scalar != NULL && p != NULL) { ++ /* reduce scalar to 0 <= scalar < 2^384 */ ++ if ((BN_num_bits(p_scalar) > 384) ++ || (BN_is_negative(p_scalar))) { ++ /* ++ * this is an unusual input, and we don't guarantee ++ * constant-timeness ++ */ ++ if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ num_bytes = BN_bn2lebinpad(tmp_scalar, ++ secrets[i], sizeof(secrets[i])); ++ } else { ++ num_bytes = BN_bn2lebinpad(p_scalar, ++ secrets[i], sizeof(secrets[i])); ++ } ++ if (num_bytes < 0) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ /* precompute multiples */ ++ if ((!BN_to_felem(x_out, p->X)) || ++ (!BN_to_felem(y_out, p->Y)) || ++ (!BN_to_felem(z_out, p->Z))) ++ goto err; ++ memcpy(pre_comp[i][1][0], x_out, sizeof(felem)); ++ memcpy(pre_comp[i][1][1], y_out, sizeof(felem)); ++ memcpy(pre_comp[i][1][2], z_out, sizeof(felem)); ++ for (j = 2; j <= 16; ++j) { ++ if (j & 1) { ++ point_add(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], ++ pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], 0, ++ pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]); ++ } else { ++ point_double(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], ++ pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]); ++ } ++ } ++ } ++ } ++ if (mixed) ++ make_points_affine(num_points * 17, pre_comp[0], tmp_felems); ++ } ++ ++ /* the scalar for the generator */ ++ if (scalar != NULL && have_pre_comp) { ++ memset(g_secret, 0, sizeof(g_secret)); ++ /* reduce scalar to 0 <= scalar < 2^384 */ ++ if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) { ++ /* ++ * this is an unusual input, and we don't guarantee ++ * constant-timeness ++ */ ++ if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret)); ++ } else { ++ num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret)); ++ } ++ /* do the multiplication with generator precomputation */ ++ batch_mul(x_out, y_out, z_out, ++ (const felem_bytearray(*))secrets, num_points, ++ g_secret, ++ mixed, (const felem(*)[17][3])pre_comp, ++ (const felem(*)[3])g_pre_comp); ++ } else { ++ /* do the multiplication without generator precomputation */ ++ batch_mul(x_out, y_out, z_out, ++ (const felem_bytearray(*))secrets, num_points, ++ NULL, mixed, (const felem(*)[17][3])pre_comp, NULL); ++ } ++ /* reduce the output to its unique minimal representation */ ++ felem_contract(x_in, x_out); ++ felem_contract(y_in, y_out); ++ felem_contract(z_in, z_out); ++ if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || ++ (!felem_to_BN(z, z_in))) { ++ ECerr(EC_F_EC_GFP_NISTP384_POINTS_MUL, ERR_R_BN_LIB); ++ goto err; ++ } ++ ret = ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z, ++ ctx); ++ ++ err: ++ BN_CTX_end(ctx); ++ EC_POINT_free(generator); ++ OPENSSL_free(secrets); ++ OPENSSL_free(pre_comp); ++ OPENSSL_free(tmp_felems); ++ return ret; ++} ++ ++int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx) ++{ ++ int ret = 0; 
++ NISTP384_PRE_COMP *pre = NULL; ++ int i, j; ++ BIGNUM *x, *y; ++ EC_POINT *generator = NULL; ++ felem tmp_felems[16]; ++#ifndef FIPS_MODULE ++ BN_CTX *new_ctx = NULL; ++#endif ++ ++ /* throw away old precomputation */ ++ EC_pre_comp_free(group); ++ ++#ifndef FIPS_MODULE ++ if (ctx == NULL) ++ ctx = new_ctx = BN_CTX_new(); ++#endif ++ if (ctx == NULL) ++ return 0; ++ ++ BN_CTX_start(ctx); ++ x = BN_CTX_get(ctx); ++ y = BN_CTX_get(ctx); ++ if (y == NULL) ++ goto err; ++ /* get the generator */ ++ if (group->generator == NULL) ++ goto err; ++ generator = EC_POINT_new(group); ++ if (generator == NULL) ++ goto err; ++ BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x); ++ BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y); ++ if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx)) ++ goto err; ++ if ((pre = nistp384_pre_comp_new()) == NULL) ++ goto err; ++ /* ++ * if the generator is the standard one, use built-in precomputation ++ */ ++ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { ++ memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); ++ goto done; ++ } ++ if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) || ++ (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) || ++ (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z))) ++ goto err; ++ /* compute 2^95*G, 2^190*G, 2^285*G */ ++ for (i = 1; i <= 4; i <<= 1) { ++ point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], ++ pre->g_pre_comp[i][0], pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]); ++ for (j = 0; j < 94; ++j) { ++ point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], ++ pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]); ++ } ++ } ++ /* g_pre_comp[0] is the point at infinity */ ++ memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0])); ++ /* the remaining multiples */ ++ /* 2^95*G + 2^190*G */ ++ point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], pre->g_pre_comp[6][2], ++ pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], 0, ++ pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]); ++ /* 2^95*G + 2^285*G */ ++ point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2], ++ pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0, ++ pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]); ++ /* 2^190*G + 2^285*G */ ++ point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], ++ pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0, ++ pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2]); ++ /* 2^95*G + 2^190*G + 2^285*G */ ++ point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2], ++ pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0, ++ pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]); ++ for (i = 1; i < 8; ++i) { ++ /* odd multiples: add G */ ++ point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2], ++ pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0, ++ pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], pre->g_pre_comp[1][2]); ++ } ++ make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems); ++ ++ done: ++ SETPRECOMP(group, nistp384, pre); ++ ret = 1; ++ pre = NULL; ++ err: ++ BN_CTX_end(ctx); ++ EC_POINT_free(generator); ++#ifndef FIPS_MODULE ++ 
BN_CTX_free(new_ctx); ++#endif ++ ossl_ec_nistp384_pre_comp_free(pre); ++ return ret; ++} ++ ++int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group) ++{ ++ return HAVEPRECOMP(group, nistp384); ++} ++#endif /* OPENSSL_NO_EC_NISTP_64_GCC_128 */ +Index: openssl-1.1.1w/crypto/ec/build.info +=================================================================== +--- openssl-1.1.1w.orig/crypto/ec/build.info ++++ openssl-1.1.1w/crypto/ec/build.info +@@ -3,7 +3,8 @@ SOURCE[../../libcrypto]=\ + ec_lib.c ecp_smpl.c ecp_mont.c ecp_nist.c ec_cvt.c ec_mult.c \ + ec_err.c ec_curve.c ec_check.c ec_print.c ec_asn1.c ec_key.c \ + ec2_smpl.c ec_ameth.c ec_pmeth.c eck_prn.c \ +- ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \ ++ ecp_nistp224.c ecp_nistp256.c ecp_nistp384.c ecp_nistp521.c \ ++ ecp_nistputil.c \ + ecp_oct.c ec2_oct.c ec_oct.c ec_kmeth.c ecdh_ossl.c ecdh_kdf.c \ + ecdsa_ossl.c ecdsa_sign.c ecdsa_vrf.c curve25519.c ecx_meth.c \ + curve448/f_generic.c curve448/scalar.c \ +Index: openssl-1.1.1w/crypto/err/openssl.txt +=================================================================== +--- openssl-1.1.1w.orig/crypto/err/openssl.txt ++++ openssl-1.1.1w/crypto/err/openssl.txt +@@ -562,6 +562,10 @@ EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE:230 + EC_F_EC_GFP_NISTP256_POINTS_MUL:231:ec_GFp_nistp256_points_mul + EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES:232:\ + ec_GFp_nistp256_point_get_affine_coordinates ++EC_F_EC_GFP_NISTP384_GROUP_SET_CURVE:315:ec_GFp_nistp384_group_set_curve ++EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES:316:\ ++ ec_GFp_nistp385_point_get_affine_coordinates ++EC_F_EC_GFP_NISTP384_POINTS_MUL:317:ec_GFp_nistp384_points_mul + EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE:233:ec_GFp_nistp521_group_set_curve + EC_F_EC_GFP_NISTP521_POINTS_MUL:234:ec_GFp_nistp521_points_mul + EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES:235:\ +Index: openssl-1.1.1w/include/openssl/ecerr.h +=================================================================== +--- openssl-1.1.1w.orig/include/openssl/ecerr.h ++++ openssl-1.1.1w/include/openssl/ecerr.h +@@ -93,6 +93,9 @@ int ERR_load_EC_strings(void); + # define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE 230 + # define EC_F_EC_GFP_NISTP256_POINTS_MUL 231 + # define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232 ++# define EC_F_EC_GFP_NISTP384_GROUP_SET_CURVE 315 ++# define EC_F_EC_GFP_NISTP384_POINT_GET_AFFINE_COORDINATES 316 ++# define EC_F_EC_GFP_NISTP384_POINTS_MUL 317 + # define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE 233 + # define EC_F_EC_GFP_NISTP521_POINTS_MUL 234 + # define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235 diff --git a/openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch b/openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch new file mode 100644 index 0000000..85685cf --- /dev/null +++ b/openssl-ec-Use-static-linkage-on-nistp521-felem_-square-mul-.patch @@ -0,0 +1,65 @@ +From 3e47a286dc3274bda72a196c3a4030a1fc8302f1 Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Fri, 23 Jun 2023 16:41:48 +1000 +Subject: [PATCH] ec: Use static linkage on nistp521 felem_{square,mul} + wrappers + +Runtime selection of implementations for felem_{square,mul} depends on +felem_{square,mul}_wrapper functions, which overwrite function points in +a similar design to that of .plt.got sections used by program loaders +during dynamic linking. + +There's no reason why these functions need to have external linkage. +Mark static. 
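+
+For illustration only (abbreviated from the ecp_nistp521.c code changed in
+the hunks below), the dispatch pattern being made static looks like this:
+
+    static void felem_square_wrapper(largefelem out, const felem in);
+
+    /* Initially points at the wrapper; the first call runs felem_select(),
+     * which repoints felem_square_p at the chosen implementation, so all
+     * later calls bypass the wrapper entirely. */
+    static void (*felem_square_p)(largefelem out, const felem in) =
+        felem_square_wrapper;
+
+    static void felem_square_wrapper(largefelem out, const felem in)
+    {
+        felem_select();
+        felem_square_p(out, in);
+    }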
+ +Signed-off-by: Rohan McLure + +Reviewed-by: Paul Dale +Reviewed-by: Shane Lontis +Reviewed-by: Dmitry Belyavskiy +Reviewed-by: Todd Short +(Merged from https://github.com/openssl/openssl/pull/21471) +--- + crypto/ec/ecp_nistp521.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/crypto/ec/ecp_nistp521.c b/crypto/ec/ecp_nistp521.c +index 97815cac1f13..32a9268ecf17 100644 +--- a/crypto/ec/ecp_nistp521.c ++++ b/crypto/ec/ecp_nistp521.c +@@ -676,8 +676,8 @@ static void felem_reduce(felem out, const largefelem in) + } + + #if defined(ECP_NISTP521_ASM) +-void felem_square_wrapper(largefelem out, const felem in); +-void felem_mul_wrapper(largefelem out, const felem in1, const felem in2); ++static void felem_square_wrapper(largefelem out, const felem in); ++static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2); + + static void (*felem_square_p)(largefelem out, const felem in) = + felem_square_wrapper; +@@ -691,7 +691,7 @@ void p521_felem_mul(largefelem out, const felem in1, const felem in2); + # include "../ppc_arch.h" + # endif + +-void felem_select(void) ++static void felem_select(void) + { + # if defined(_ARCH_PPC64) + if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { +@@ -707,13 +707,13 @@ void felem_select(void) + felem_mul_p = felem_mul_ref; + } + +-void felem_square_wrapper(largefelem out, const felem in) ++static void felem_square_wrapper(largefelem out, const felem in) + { + felem_select(); + felem_square_p(out, in); + } + +-void felem_mul_wrapper(largefelem out, const felem in1, const felem in2) ++static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2) + { + felem_select(); + felem_mul_p(out, in1, in2); diff --git a/openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch b/openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch new file mode 100644 index 0000000..f120df2 --- /dev/null +++ b/openssl-ec-powerpc64le-Add-asm-implementation-of-felem_-squa.patch @@ -0,0 +1,410 @@ +From 966047ee13188e8634af25af348940acceb9316d Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Wed, 31 May 2023 14:32:26 +1000 +Subject: [PATCH] ec: powerpc64le: Add asm implementation of felem_{square,mul} + +Add an assembly implementation of felem_{square,mul}, which will be +implemented whenever Altivec support is present and the core implements +ISA 3.0 (Power 9) or greater. + +Signed-off-by: Rohan McLure + +Reviewed-by: Paul Dale +Reviewed-by: Shane Lontis +Reviewed-by: Dmitry Belyavskiy +Reviewed-by: Todd Short +(Merged from https://github.com/openssl/openssl/pull/21471) +--- + crypto/ec/asm/ecp_nistp384-ppc64.pl | 355 ++++++++++++++++++++++++++++++++++++ + crypto/ec/build.info | 2 + crypto/ec/ecp_nistp384.c | 9 + 3 files changed, 366 insertions(+) + create mode 100755 crypto/ec/asm/ecp_nistp384-ppc64.pl + +--- /dev/null ++++ b/crypto/ec/asm/ecp_nistp384-ppc64.pl +@@ -0,0 +1,355 @@ ++#! /usr/bin/env perl ++# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# ==================================================================== ++# Written by Rohan McLure for the OpenSSL ++# project. 
++# ==================================================================== ++# ++# p384 lower-level primitives for PPC64 using vector instructions. ++# ++ ++use strict; ++use warnings; ++ ++my $flavour = shift; ++my $output = ""; ++while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} ++if (!$output) { ++ $output = "-"; ++} ++ ++my ($xlate, $dir); ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour $output"; ++*STDOUT=*OUT; ++ ++my $code = ""; ++ ++my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12"); ++ ++my $vzero = "v32"; ++ ++sub startproc($) ++{ ++ my ($name) = @_; ++ ++ $code.=<<___; ++ .globl ${name} ++ .align 5 ++${name}: ++ ++___ ++} ++ ++sub endproc($) ++{ ++ my ($name) = @_; ++ ++ $code.=<<___; ++ blr ++ .size ${name},.-${name} ++ ++___ ++} ++ ++ ++sub push_vrs($$) ++{ ++ my ($min, $max) = @_; ++ ++ my $count = $max - $min + 1; ++ ++ $code.=<<___; ++ mr $savesp,$sp ++ stdu $sp,-16*`$count+1`($sp) ++ ++___ ++ for (my $i = $min; $i <= $max; $i++) { ++ my $mult = $max - $i + 1; ++ $code.=<<___; ++ stxv $i,-16*$mult($savesp) ++___ ++ ++ } ++ ++ $code.=<<___; ++ ++___ ++} ++ ++sub pop_vrs($$) ++{ ++ my ($min, $max) = @_; ++ ++ $code.=<<___; ++ ld $savesp,0($sp) ++___ ++ for (my $i = $min; $i <= $max; $i++) { ++ my $mult = $max - $i + 1; ++ $code.=<<___; ++ lxv $i,-16*$mult($savesp) ++___ ++ } ++ ++ $code.=<<___; ++ mr $sp,$savesp ++ ++___ ++} ++ ++sub load_vrs($$) ++{ ++ my ($pointer, $reg_list) = @_; ++ ++ for (my $i = 0; $i <= 6; $i++) { ++ my $offset = $i * 8; ++ $code.=<<___; ++ lxsd $reg_list->[$i],$offset($pointer) ++___ ++ } ++ ++ $code.=<<___; ++ ++___ ++} ++ ++sub store_vrs($$) ++{ ++ my ($pointer, $reg_list) = @_; ++ ++ for (my $i = 0; $i <= 12; $i++) { ++ my $offset = $i * 16; ++ $code.=<<___; ++ stxv $reg_list->[$i],$offset($pointer) ++___ ++ } ++ ++ $code.=<<___; ++ ++___ ++} ++ ++$code.=<<___; ++.machine "any" ++.text ++ ++___ ++ ++{ ++ # mul/square common ++ my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v42", "v43"); ++ my ($zero, $one) = ("r8", "r9"); ++ my $out = "v51"; ++ ++ { ++ # ++ # p384_felem_mul ++ # ++ ++ my ($in1p, $in2p) = ("r4", "r5"); ++ my @in1 = map("v$_",(44..50)); ++ my @in2 = map("v$_",(35..41)); ++ ++ startproc("p384_felem_mul"); ++ ++ push_vrs(52, 63); ++ ++ $code.=<<___; ++ vspltisw $vzero,0 ++ ++___ ++ ++ load_vrs($in1p, \@in1); ++ load_vrs($in2p, \@in2); ++ ++ $code.=<<___; ++ vmsumudm $out,$in1[0],$in2[0],$vzero ++ stxv $out,0($outp) ++ ++ xxpermdi $t1,$in1[0],$in1[1],0b00 ++ xxpermdi $t2,$in2[1],$in2[0],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ stxv $out,16($outp) ++ ++ xxpermdi $t2,$in2[2],$in2[1],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$in1[2],$in2[0],$out ++ stxv $out,32($outp) ++ ++ xxpermdi $t2,$in2[1],$in2[0],0b00 ++ xxpermdi $t3,$in1[2],$in1[3],0b00 ++ xxpermdi $t4,$in2[3],$in2[2],0b00 ++ vmsumudm $out,$t1,$t4,$vzero ++ vmsumudm $out,$t3,$t2,$out ++ stxv $out,48($outp) ++ ++ xxpermdi $t2,$in2[4],$in2[3],0b00 ++ xxpermdi $t4,$in2[2],$in2[1],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ vmsumudm $out,$in1[4],$in2[0],$out ++ stxv $out,64($outp) ++ ++ xxpermdi $t2,$in2[5],$in2[4],0b00 ++ xxpermdi $t4,$in2[3],$in2[2],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ xxpermdi $t4,$in2[1],$in2[0],0b00 ++ xxpermdi $t1,$in1[4],$in1[5],0b00 ++ vmsumudm $out,$t1,$t4,$out ++ stxv $out,80($outp) ++ ++ 
xxpermdi $t1,$in1[0],$in1[1],0b00 ++ xxpermdi $t2,$in2[6],$in2[5],0b00 ++ xxpermdi $t4,$in2[4],$in2[3],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ xxpermdi $t2,$in2[2],$in2[1],0b00 ++ xxpermdi $t1,$in1[4],$in1[5],0b00 ++ vmsumudm $out,$t1,$t2,$out ++ vmsumudm $out,$in1[6],$in2[0],$out ++ stxv $out,96($outp) ++ ++ xxpermdi $t1,$in1[1],$in1[2],0b00 ++ xxpermdi $t2,$in2[6],$in2[5],0b00 ++ xxpermdi $t3,$in1[3],$in1[4],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ xxpermdi $t3,$in2[2],$in2[1],0b00 ++ xxpermdi $t1,$in1[5],$in1[6],0b00 ++ vmsumudm $out,$t1,$t3,$out ++ stxv $out,112($outp) ++ ++ xxpermdi $t1,$in1[2],$in1[3],0b00 ++ xxpermdi $t3,$in1[4],$in1[5],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$t3,$t4,$out ++ vmsumudm $out,$in1[6],$in2[2],$out ++ stxv $out,128($outp) ++ ++ xxpermdi $t1,$in1[3],$in1[4],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ xxpermdi $t1,$in1[5],$in1[6],0b00 ++ vmsumudm $out,$t1,$t4,$out ++ stxv $out,144($outp) ++ ++ vmsumudm $out,$t3,$t2,$vzero ++ vmsumudm $out,$in1[6],$in2[4],$out ++ stxv $out,160($outp) ++ ++ vmsumudm $out,$t1,$t2,$vzero ++ stxv $out,176($outp) ++ ++ vmsumudm $out,$in1[6],$in2[6],$vzero ++ stxv $out,192($outp) ++___ ++ ++ endproc("p384_felem_mul"); ++ } ++ ++ { ++ # ++ # p384_felem_square ++ # ++ ++ my ($inp) = ("r4"); ++ my @in = map("v$_",(44..50)); ++ my @inx2 = map("v$_",(35..41)); ++ ++ startproc("p384_felem_square"); ++ ++ push_vrs(52, 63); ++ ++ $code.=<<___; ++ vspltisw $vzero,0 ++ ++___ ++ ++ load_vrs($inp, \@in); ++ ++ $code.=<<___; ++ li $zero,0 ++ li $one,1 ++ mtvsrdd $t1,$one,$zero ++___ ++ ++ for (my $i = 0; $i <= 6; $i++) { ++ $code.=<<___; ++ vsld $inx2[$i],$in[$i],$t1 ++___ ++ } ++ ++ $code.=<<___; ++ vmsumudm $out,$in[0],$in[0],$vzero ++ stxv $out,0($outp) ++ ++ vmsumudm $out,$in[0],$inx2[1],$vzero ++ stxv $out,16($outp) ++ ++ vmsumudm $out,$in[0],$inx2[2],$vzero ++ vmsumudm $out,$in[1],$in[1],$out ++ stxv $out,32($outp) ++ ++ xxpermdi $t1,$in[0],$in[1],0b00 ++ xxpermdi $t2,$inx2[3],$inx2[2],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ stxv $out,48($outp) ++ ++ xxpermdi $t4,$inx2[4],$inx2[3],0b00 ++ vmsumudm $out,$t1,$t4,$vzero ++ vmsumudm $out,$in[2],$in[2],$out ++ stxv $out,64($outp) ++ ++ xxpermdi $t2,$inx2[5],$inx2[4],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$in[2],$inx2[3],$out ++ stxv $out,80($outp) ++ ++ xxpermdi $t2,$inx2[6],$inx2[5],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$in[2],$inx2[4],$out ++ vmsumudm $out,$in[3],$in[3],$out ++ stxv $out,96($outp) ++ ++ xxpermdi $t3,$in[1],$in[2],0b00 ++ vmsumudm $out,$t3,$t2,$vzero ++ vmsumudm $out,$in[3],$inx2[4],$out ++ stxv $out,112($outp) ++ ++ xxpermdi $t1,$in[2],$in[3],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ vmsumudm $out,$in[4],$in[4],$out ++ stxv $out,128($outp) ++ ++ xxpermdi $t1,$in[3],$in[4],0b00 ++ vmsumudm $out,$t1,$t2,$vzero ++ stxv $out,144($outp) ++ ++ vmsumudm $out,$in[4],$inx2[6],$vzero ++ vmsumudm $out,$in[5],$in[5],$out ++ stxv $out,160($outp) ++ ++ vmsumudm $out,$in[5],$inx2[6],$vzero ++ stxv $out,176($outp) ++ ++ vmsumudm $out,$in[6],$in[6],$vzero ++ stxv $out,192($outp) ++___ ++ ++ endproc("p384_felem_square"); ++ } ++} ++ ++$code =~ s/\`([^\`]*)\`/eval $1/gem; ++print $code; ++close STDOUT or die "error closing STDOUT: $!"; +--- a/crypto/ec/build.info ++++ b/crypto/ec/build.info +@@ -31,6 +31,8 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_n + INCLUDE[ecp_nistz256-armv8.o]=.. 
+ GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl $(PERLASM_SCHEME) + ++GENERATE[ecp_nistp384-ppc64.s]=asm/ecp_nistp384-ppc64.pl $(PERLASM_SCHEME) ++INCLUDE[ecp_nistp384.o]=.. + GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl $(PERLASM_SCHEME) + + GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl $(PERLASM_SCHEME) +--- a/crypto/ec/ecp_nistp384.c ++++ b/crypto/ec/ecp_nistp384.c +@@ -691,6 +691,15 @@ void p384_felem_mul(widefelem out, const + + static void felem_select(void) + { ++# if defined(_ARCH_PPC64) ++ if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { ++ felem_square_p = p384_felem_square; ++ felem_mul_p = p384_felem_mul; ++ ++ return; ++ } ++# endif ++ + /* Default */ + felem_square_p = felem_square_ref; + felem_mul_p = felem_mul_ref; diff --git a/openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch b/openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch new file mode 100644 index 0000000..a2918d9 --- /dev/null +++ b/openssl-ecc-Remove-extraneous-parentheses-in-secp384r1.patch @@ -0,0 +1,76 @@ +From 670e73d9084465384b11ef24802ca4a313e1d2f4 Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Tue, 15 Aug 2023 15:20:20 +1000 +Subject: [PATCH] ecc: Remove extraneous parentheses in secp384r1 + +Substitutions in the felem_reduce() method feature unecessary +parentheses, remove them. + +Signed-off-by: Rohan McLure + +Reviewed-by: Tomas Mraz +Reviewed-by: Shane Lontis +Reviewed-by: Hugo Landau +(Merged from https://github.com/openssl/openssl/pull/21749) +--- + crypto/ec/ecp_nistp384.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/crypto/ec/ecp_nistp384.c b/crypto/ec/ecp_nistp384.c +index 14f9530d07c6..ff68f9cc7ad0 100644 +--- a/crypto/ec/ecp_nistp384.c ++++ b/crypto/ec/ecp_nistp384.c +@@ -540,7 +540,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[7] += in[12] >> 8; + acc[6] += (in[12] & 0xff) << 48; + acc[6] -= in[12] >> 16; +- acc[5] -= ((in[12] & 0xffff) << 40); ++ acc[5] -= (in[12] & 0xffff) << 40; + acc[6] += in[12] >> 48; + acc[5] += (in[12] & 0xffffffffffff) << 8; + +@@ -549,7 +549,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[6] += in[11] >> 8; + acc[5] += (in[11] & 0xff) << 48; + acc[5] -= in[11] >> 16; +- acc[4] -= ((in[11] & 0xffff) << 40); ++ acc[4] -= (in[11] & 0xffff) << 40; + acc[5] += in[11] >> 48; + acc[4] += (in[11] & 0xffffffffffff) << 8; + +@@ -558,7 +558,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[5] += in[10] >> 8; + acc[4] += (in[10] & 0xff) << 48; + acc[4] -= in[10] >> 16; +- acc[3] -= ((in[10] & 0xffff) << 40); ++ acc[3] -= (in[10] & 0xffff) << 40; + acc[4] += in[10] >> 48; + acc[3] += (in[10] & 0xffffffffffff) << 8; + +@@ -567,7 +567,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[4] += in[9] >> 8; + acc[3] += (in[9] & 0xff) << 48; + acc[3] -= in[9] >> 16; +- acc[2] -= ((in[9] & 0xffff) << 40); ++ acc[2] -= (in[9] & 0xffff) << 40; + acc[3] += in[9] >> 48; + acc[2] += (in[9] & 0xffffffffffff) << 8; + +@@ -582,7 +582,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[3] += acc[8] >> 8; + acc[2] += (acc[8] & 0xff) << 48; + acc[2] -= acc[8] >> 16; +- acc[1] -= ((acc[8] & 0xffff) << 40); ++ acc[1] -= (acc[8] & 0xffff) << 40; + acc[2] += acc[8] >> 48; + acc[1] += (acc[8] & 0xffffffffffff) << 8; + +@@ -591,7 +591,7 @@ static void felem_reduce(felem out, const widefelem in) + acc[2] += acc[7] >> 8; + acc[1] += (acc[7] & 0xff) << 48; + acc[1] -= acc[7] >> 16; +- acc[0] -= ((acc[7] & 
0xffff) << 40); ++ acc[0] -= (acc[7] & 0xffff) << 40; + acc[1] += acc[7] >> 48; + acc[0] += (acc[7] & 0xffffffffffff) << 8; + diff --git a/openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch b/openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch new file mode 100644 index 0000000..ecfecb5 --- /dev/null +++ b/openssl-powerpc-ecc-Fix-stack-allocation-secp384r1-asm.patch @@ -0,0 +1,96 @@ +From 50f8b936b00dc18ce1f622a7a6aa46daf03da48b Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Wed, 16 Aug 2023 16:52:47 +1000 +Subject: [PATCH] powerpc: ecc: Fix stack allocation secp384r1 asm + +Assembly acceleration secp384r1 opts to not use any callee-save VSRs, as +VSX enabled systems make extensive use of renaming, and so writebacks in +felem_{mul,square}() can be reordered for best cache effects. + +Remove stack allocations. This in turn fixes unmatched push/pops in +felem_{mul,square}(). + +Signed-off-by: Rohan McLure + +Reviewed-by: Tomas Mraz +Reviewed-by: Shane Lontis +Reviewed-by: Hugo Landau +(Merged from https://github.com/openssl/openssl/pull/21749) +--- + crypto/ec/asm/ecp_nistp384-ppc64.pl | 49 ----------------------------- + 1 file changed, 49 deletions(-) + +diff --git a/crypto/ec/asm/ecp_nistp384-ppc64.pl b/crypto/ec/asm/ecp_nistp384-ppc64.pl +index 3f86b391af69..28f4168e5218 100755 +--- a/crypto/ec/asm/ecp_nistp384-ppc64.pl ++++ b/crypto/ec/asm/ecp_nistp384-ppc64.pl +@@ -62,51 +62,6 @@ ($) + ___ + } + +- +-sub push_vrs($$) +-{ +- my ($min, $max) = @_; +- +- my $count = $max - $min + 1; +- +- $code.=<<___; +- mr $savesp,$sp +- stdu $sp,-16*`$count+1`($sp) +- +-___ +- for (my $i = $min; $i <= $max; $i++) { +- my $mult = $max - $i + 1; +- $code.=<<___; +- stxv $i,-16*$mult($savesp) +-___ +- +- } +- +- $code.=<<___; +- +-___ +-} +- +-sub pop_vrs($$) +-{ +- my ($min, $max) = @_; +- +- $code.=<<___; +- ld $savesp,0($sp) +-___ +- for (my $i = $min; $i <= $max; $i++) { +- my $mult = $max - $i + 1; +- $code.=<<___; +- lxv $i,-16*$mult($savesp) +-___ +- } +- +- $code.=<<___; +- mr $sp,$savesp +- +-___ +-} +- + sub load_vrs($$) + { + my ($pointer, $reg_list) = @_; +@@ -162,8 +117,6 @@ ($$) + + startproc("p384_felem_mul"); + +- push_vrs(52, 63); +- + $code.=<<___; + vspltisw $vzero,0 + +@@ -268,8 +221,6 @@ ($$) + + startproc("p384_felem_square"); + +- push_vrs(52, 63); +- + $code.=<<___; + vspltisw $vzero,0 +