openssl-1_1/openssl-1_1-Optimize-RSA-armv8.patch

576 lines
16 KiB
Diff

From 5ea64b456b1a27ae046f23d632a968a7583bb9eb Mon Sep 17 00:00:00 2001
From: "Fangming.Fang" <fangming.fang@arm.com>
Date: Tue, 28 Apr 2020 02:33:50 +0000
Subject: [PATCH] Read MIDR_EL1 system register on aarch64
MIDR_EL1 system register exposes microarchitecture information so that
people can make micro-arch related optimization such as exposing as
much instruction level parallelism as possible.
MIDR_EL1 register can be read only if HWCAP_CPUID feature is supported.
Change-Id: Iabb8a36c5d31b184dba6399f378598058d394d4e
Reviewed-by: Paul Dale <paul.dale@oracle.com>
Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org>
(Merged from https://github.com/openssl/openssl/pull/11744)
---
crypto/arm64cpuid.pl | 7 +++++++
crypto/arm_arch.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
crypto/armcap.c | 11 +++++++++++
3 files changed, 62 insertions(+)
Index: openssl-1.1.1d/crypto/arm64cpuid.pl
===================================================================
--- openssl-1.1.1d.orig/crypto/arm64cpuid.pl
+++ openssl-1.1.1d/crypto/arm64cpuid.pl
@@ -78,6 +78,13 @@ _armv8_sha512_probe:
ret
.size _armv8_sha512_probe,.-_armv8_sha512_probe
+.globl _armv8_cpuid_probe
+.type _armv8_cpuid_probe,%function
+_armv8_cpuid_probe:
+ mrs x0, midr_el1
+ ret
+.size _armv8_cpuid_probe,.-_armv8_cpuid_probe
+
.globl OPENSSL_cleanse
.type OPENSSL_cleanse,%function
.align 5
Index: openssl-1.1.1d/crypto/arm_arch.h
===================================================================
--- openssl-1.1.1d.orig/crypto/arm_arch.h
+++ openssl-1.1.1d/crypto/arm_arch.h
@@ -71,6 +71,7 @@
# ifndef __ASSEMBLER__
extern unsigned int OPENSSL_armcap_P;
+extern unsigned int OPENSSL_arm_midr;
# endif
# define ARMV7_NEON (1<<0)
@@ -80,5 +81,48 @@ extern unsigned int OPENSSL_armcap_P;
# define ARMV8_SHA256 (1<<4)
# define ARMV8_PMULL (1<<5)
# define ARMV8_SHA512 (1<<6)
+# define ARMV8_CPUID (1<<7)
+/*
+ * MIDR_EL1 system register
+ *
+ * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0
+ * | | | | | | |
+ * |RES0 | Implementer | Variant | Arch | PartNum |Revision|
+ * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________|
+ *
+ */
+
+# define ARM_CPU_IMP_ARM 0x41
+
+# define ARM_CPU_PART_CORTEX_A72 0xD08
+# define ARM_CPU_PART_N1 0xD0C
+
+# define MIDR_PARTNUM_SHIFT 4
+# define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT)
+# define MIDR_PARTNUM(midr) \
+ (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT)
+
+# define MIDR_IMPLEMENTER_SHIFT 24
+# define MIDR_IMPLEMENTER_MASK (0xff << MIDR_IMPLEMENTER_SHIFT)
+# define MIDR_IMPLEMENTER(midr) \
+ (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT)
+
+# define MIDR_ARCHITECTURE_SHIFT 16
+# define MIDR_ARCHITECTURE_MASK (0xf << MIDR_ARCHITECTURE_SHIFT)
+# define MIDR_ARCHITECTURE(midr) \
+ (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT)
+
+# define MIDR_CPU_MODEL_MASK \
+ (MIDR_IMPLEMENTER_MASK | \
+ MIDR_PARTNUM_MASK | \
+ MIDR_ARCHITECTURE_MASK)
+
+# define MIDR_CPU_MODEL(imp, partnum) \
+ (((imp) << MIDR_IMPLEMENTER_SHIFT) | \
+ (0xf << MIDR_ARCHITECTURE_SHIFT) | \
+ ((partnum) << MIDR_PARTNUM_SHIFT))
+
+# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \
+ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum))
#endif
Index: openssl-1.1.1d/crypto/armcap.c
===================================================================
--- openssl-1.1.1d.orig/crypto/armcap.c
+++ openssl-1.1.1d/crypto/armcap.c
@@ -18,6 +18,8 @@
#include "arm_arch.h"
unsigned int OPENSSL_armcap_P = 0;
+unsigned int OPENSSL_arm_midr = 0;
+unsigned int OPENSSL_armv8_rsa_neonized = 0;
#if __ARM_MAX_ARCH__<7
void OPENSSL_cpuid_setup(void)
@@ -48,6 +50,7 @@ void _armv8_sha256_probe(void);
void _armv8_pmull_probe(void);
# ifdef __aarch64__
void _armv8_sha512_probe(void);
+unsigned int _armv8_cpuid_probe(void);
# endif
uint32_t _armv7_tick(void);
@@ -95,6 +98,7 @@ void OPENSSL_cpuid_setup(void) __attribu
# define HWCAP_CE_PMULL (1 << 4)
# define HWCAP_CE_SHA1 (1 << 5)
# define HWCAP_CE_SHA256 (1 << 6)
+# define HWCAP_CPUID (1 << 11)
# define HWCAP_CE_SHA512 (1 << 21)
# endif
@@ -155,6 +159,9 @@ void OPENSSL_cpuid_setup(void)
# ifdef __aarch64__
if (hwcap & HWCAP_CE_SHA512)
OPENSSL_armcap_P |= ARMV8_SHA512;
+
+ if (hwcap & HWCAP_CPUID)
+ OPENSSL_armcap_P |= ARMV8_CPUID;
# endif
}
# endif
@@ -210,5 +217,16 @@ void OPENSSL_cpuid_setup(void)
sigaction(SIGILL, &ill_oact, NULL);
sigprocmask(SIG_SETMASK, &oset, NULL);
+
+# ifdef __aarch64__
+ if (OPENSSL_armcap_P & ARMV8_CPUID)
+ OPENSSL_arm_midr = _armv8_cpuid_probe();
+
+ if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) ||
+ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) &&
+ (OPENSSL_armcap_P & ARMV7_NEON)) {
+ OPENSSL_armv8_rsa_neonized = 1;
+ }
+# endif
}
#endif
Index: openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl
===================================================================
--- openssl-1.1.1d.orig/crypto/bn/asm/armv8-mont.pl
+++ openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl
@@ -64,16 +64,34 @@ $n0="x4"; # const BN_ULONG *n0,
$num="x5"; # int num);
$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armv8_rsa_neonized
+.hidden OPENSSL_armv8_rsa_neonized
+#endif
.text
.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
+.Lbn_mul_mont:
+ tst $num,#3
+ b.ne .Lmul_mont
+ cmp $num,#32
+ b.le .Lscalar_impl
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armv8_rsa_neonized
+ ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
+ cbnz w17, bn_mul8x_mont_neon
+#endif
+
+.Lscalar_impl:
tst $num,#7
b.eq __bn_sqr8x_mont
tst $num,#3
b.eq __bn_mul4x_mont
+
.Lmul_mont:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
@@ -271,6 +289,369 @@ bn_mul_mont:
.size bn_mul_mont,.-bn_mul_mont
___
{
+my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
+my ($Z,$Temp)=("v4.16b","v5");
+my @ACC=map("v$_",(6..13));
+my ($Bi,$Ni,$M0)=map("v$_",(28..30));
+my $sBi="s28";
+my $sM0="s30";
+my $zero="v14";
+my $temp="v15";
+my $ACCTemp="v16";
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
+
+$code.=<<___;
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ stp x29,x30,[sp,#-80]!
+ mov x16,sp
+ stp d8,d9,[sp,#16]
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+ lsl $num,$num,#1
+ eor $zero.16b,$zero.16b,$zero.16b
+
+.align 4
+.LNEON_8n:
+ eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
+ sub $toutptr,sp,#128
+ eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
+ sub $toutptr,$toutptr,$num,lsl#4
+ eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
+ and $toutptr,$toutptr,#-64
+ eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
+ mov sp,$toutptr // alloca
+ eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
+ add $toutptr,$toutptr,#256
+ eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
+ sub $inner,$num,#8
+ eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
+ eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
+
+.LNEON_8n_init:
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+ subs $inner,$inner,#8
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+ st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
+ bne .LNEON_8n_init
+
+ add $tinptr,sp,#256
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
+ add $bnptr,sp,#8
+ ldr $sM0,[$n0],#4
+ mov $outer,$num
+ b .LNEON_8n_outer
+
+.align 4
+.LNEON_8n_outer:
+ ldr $sBi,[$bptr],#4 // *b++
+ uxtl $Bi.4s,$Bi.4h
+ add $toutptr,sp,#128
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
+
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ shl $Ni.2d,@ACC[0].2d,#16
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ mul $Ni.2s,$Ni.2s,$M0.2s
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ uxtl $Ni.4s,$Ni.4h
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=0; $i<7;) {
+$code.=<<___;
+ ldr $sBi,[$bptr],#4 // *b++
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ uxtl $Bi.4s,$Bi.4h
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ ushr $temp.2d,@ACC[0].2d,#16
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ ushr @ACC[0].2d,@ACC[0].2d,#16
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+ add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
+ ins @ACC[1].d[0],$ACCTemp.d[0]
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
+___
+ push(@ACC,shift(@ACC)); $i++;
+$code.=<<___;
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ ld1 {@ACC[7].2d},[$tinptr],#16
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ shl $Ni.2d,@ACC[0].2d,#16
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ mul $Ni.2s,$Ni.2s,$M0.2s
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ uxtl $Ni.4s,$Ni.4h
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ mov $Temp.16b,@ACC[0].16b
+ ushr $Temp.2d,$Temp.2d,#16
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ add @ACC[0].2d,@ACC[0].2d,$Temp.2d
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ ushr @ACC[0].2d,@ACC[0].2d,#16
+ eor $temp.16b,$temp.16b,$temp.16b
+ ins @ACC[0].d[1],$temp.d[0]
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+ add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
+ add $bnptr,sp,#8 // rewind
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ sub $inner,$num,#8
+ b .LNEON_8n_inner
+
+.align 4
+.LNEON_8n_inner:
+ subs $inner,$inner,#8
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ ld1 {@ACC[7].2d},[$tinptr]
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ b.eq .LInner_jump
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
+.LInner_jump:
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+ ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+ st1 {@ACC[0].2d},[$toutptr],#16
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
+ ld1 {@ACC[7].2d},[$tinptr]
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
+ b.eq .LInner_jump$i
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
+.LInner_jump$i:
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
+___
+}
+$code.=<<___;
+ b.ne .LInner_after_rewind$i
+ sub $aptr,$aptr,$num,lsl#2 // rewind
+.LInner_after_rewind$i:
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
+ add $bnptr,sp,#8 // rewind
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
+ st1 {@ACC[0].2d},[$toutptr],#16
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
+
+ bne .LNEON_8n_inner
+___
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ add $tinptr,sp,#128
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
+ eor $N0.16b,$N0.16b,$N0.16b // $N0
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
+ eor $N1.16b,$N1.16b,$N1.16b // $N1
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
+ st1 {@ACC[6].2d},[$toutptr]
+
+ subs $outer,$outer,#8
+ ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
+ ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
+ ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
+ ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
+
+ b.eq .LInner_8n_jump_2steps
+ sub $nptr,$nptr,$num,lsl#2 // rewind
+ b .LNEON_8n_outer
+
+.LInner_8n_jump_2steps:
+ add $toutptr,sp,#128
+ st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
+ mov $Temp.16b,@ACC[0].16b
+ ushr $temp.2d,@ACC[0].2d,#16
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ st1 {$N0.2d,$N1.2d}, [sp],#32
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ st1 {$N0.2d,$N1.2d}, [sp],#32
+ ushr $temp.2d,@ACC[0].2d,#16
+ st1 {$N0.2d,$N1.2d}, [sp],#32
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
+ ins $temp.d[1],$zero.d[0]
+
+ mov $inner,$num
+ b .LNEON_tail_entry
+
+.align 4
+.LNEON_tail:
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ mov $Temp.16b,@ACC[0].16b
+ ushr $temp.2d,@ACC[0].2d,#16
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
+ ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
+ ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
+ ushr $temp.2d,@ACC[0].2d,#16
+ ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
+ ins $temp.d[1],$zero.d[0]
+
+.LNEON_tail_entry:
+___
+for ($i=1; $i<8; $i++) {
+$code.=<<___;
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
+ st1 {@ACC[0].s}[0], [$toutptr],#4
+ ushr $temp.2d,@ACC[1].2d,#16
+ mov $Temp.16b,@ACC[1].16b
+ ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
+ ushr $temp.2d,@ACC[1].2d,#16
+ zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
+ ins $temp.d[1],$zero.d[0]
+___
+ push(@ACC,shift(@ACC));
+}
+ push(@ACC,shift(@ACC));
+$code.=<<___;
+ ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
+ subs $inner,$inner,#8
+ st1 {@ACC[7].s}[0], [$toutptr],#4
+ bne .LNEON_tail
+
+ st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
+ sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
+ subs $aptr,sp,#0 // clear carry flag
+ add $bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+ ldp w4,w5,[$aptr],#8
+ ldp w6,w7,[$aptr],#8
+ ldp w8,w9,[$nptr],#8
+ ldp w10,w11,[$nptr],#8
+ sbcs w8,w4,w8
+ sbcs w9,w5,w9
+ sbcs w10,w6,w10
+ sbcs w11,w7,w11
+ sub x17,$bptr,$aptr
+ stp w8,w9,[$rptr],#8
+ stp w10,w11,[$rptr],#8
+ cbnz x17,.LNEON_sub
+
+ ldr w10, [$aptr] // load top-most bit
+ mov x11,sp
+ eor v0.16b,v0.16b,v0.16b
+ sub x11,$bptr,x11 // this is num*4
+ eor v1.16b,v1.16b,v1.16b
+ mov $aptr,sp
+ sub $rptr,$rptr,x11 // rewind $rptr
+ mov $nptr,$bptr // second 3/4th of frame
+ sbcs w10,w10,wzr // result is carry flag
+
+.LNEON_copy_n_zap:
+ ldp w4,w5,[$aptr],#8
+ ldp w6,w7,[$aptr],#8
+ ldp w8,w9,[$rptr],#8
+ ldp w10,w11,[$rptr]
+ sub $rptr,$rptr,#8
+ b.cs .LCopy_1
+ mov w8,w4
+ mov w9,w5
+ mov w10,w6
+ mov w11,w7
+.LCopy_1:
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
+ ldp w4,w5,[$aptr],#8
+ ldp w6,w7,[$aptr],#8
+ stp w8,w9,[$rptr],#8
+ stp w10,w11,[$rptr],#8
+ sub $aptr,$aptr,#32
+ ldp w8,w9,[$rptr],#8
+ ldp w10,w11,[$rptr]
+ sub $rptr,$rptr,#8
+ b.cs .LCopy_2
+ mov w8, w4
+ mov w9, w5
+ mov w10, w6
+ mov w11, w7
+.LCopy_2:
+ st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
+ sub x17,$bptr,$aptr // preserves carry
+ stp w8,w9,[$rptr],#8
+ stp w10,w11,[$rptr],#8
+ cbnz x17,.LNEON_copy_n_zap
+
+ mov sp,x16
+ ldp d14,d15,[sp,#64]
+ ldp d12,d13,[sp,#48]
+ ldp d10,d11,[sp,#32]
+ ldp d8,d9,[sp,#16]
+ ldr x29,[sp],#80
+ ret // bx lr
+
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+___
+}
+{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
Index: openssl-1.1.1d/crypto/bn/build.info
===================================================================
--- openssl-1.1.1d.orig/crypto/bn/build.info
+++ openssl-1.1.1d/crypto/bn/build.info
@@ -65,3 +65,4 @@ INCLUDE[armv4-mont.o]=..
GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl $(PERLASM_SCHEME)
INCLUDE[armv4-gf2m.o]=..
GENERATE[armv8-mont.S]=asm/armv8-mont.pl $(PERLASM_SCHEME)
+INCLUDE[armv8-mont.o]=..