forked from pool/openssl-1_1
576 lines
16 KiB
Diff
576 lines
16 KiB
Diff
|
From 5ea64b456b1a27ae046f23d632a968a7583bb9eb Mon Sep 17 00:00:00 2001
|
||
|
From: "Fangming.Fang" <fangming.fang@arm.com>
|
||
|
Date: Tue, 28 Apr 2020 02:33:50 +0000
|
||
|
Subject: [PATCH] Read MIDR_EL1 system register on aarch64
|
||
|
|
||
|
MIDR_EL1 system register exposes microarchitecture information so that
|
||
|
people can make micro-arch related optimizations such as exposing as
|
||
|
much instruction level parallelism as possible.
|
||
|
|
||
|
The MIDR_EL1 register can be read only if the HWCAP_CPUID feature is supported.
|
||
|
|
||
|
Change-Id: Iabb8a36c5d31b184dba6399f378598058d394d4e
|
||
|
|
||
|
Reviewed-by: Paul Dale <paul.dale@oracle.com>
|
||
|
Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org>
|
||
|
(Merged from https://github.com/openssl/openssl/pull/11744)
|
||
|
---
|
||
|
crypto/arm64cpuid.pl | 7 +++++++
|
||
|
crypto/arm_arch.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
|
||
|
crypto/armcap.c | 11 +++++++++++
|
||
|
3 files changed, 62 insertions(+)
|
||
|
|
||
|
Index: openssl-1.1.1d/crypto/arm64cpuid.pl
|
||
|
===================================================================
|
||
|
--- openssl-1.1.1d.orig/crypto/arm64cpuid.pl
|
||
|
+++ openssl-1.1.1d/crypto/arm64cpuid.pl
|
||
|
@@ -78,6 +78,13 @@ _armv8_sha512_probe:
|
||
|
ret
|
||
|
.size _armv8_sha512_probe,.-_armv8_sha512_probe
|
||
|
|
||
|
+.globl _armv8_cpuid_probe
|
||
|
+.type _armv8_cpuid_probe,%function
|
||
|
+_armv8_cpuid_probe:
|
||
|
+ mrs x0, midr_el1 // return MIDR_EL1 in x0; armcap.c calls this only when HWCAP_CPUID is set and keeps the low 32 bits
|
||
|
+ ret
|
||
|
+.size _armv8_cpuid_probe,.-_armv8_cpuid_probe
|
||
|
+
|
||
|
.globl OPENSSL_cleanse
|
||
|
.type OPENSSL_cleanse,%function
|
||
|
.align 5
|
||
|
Index: openssl-1.1.1d/crypto/arm_arch.h
|
||
|
===================================================================
|
||
|
--- openssl-1.1.1d.orig/crypto/arm_arch.h
|
||
|
+++ openssl-1.1.1d/crypto/arm_arch.h
|
||
|
@@ -71,6 +71,7 @@
|
||
|
|
||
|
# ifndef __ASSEMBLER__
|
||
|
extern unsigned int OPENSSL_armcap_P;
|
||
|
+extern unsigned int OPENSSL_arm_midr;
|
||
|
# endif
|
||
|
|
||
|
# define ARMV7_NEON (1<<0)
|
||
|
@@ -80,5 +81,48 @@ extern unsigned int OPENSSL_armcap_P;
|
||
|
# define ARMV8_SHA256 (1<<4)
|
||
|
# define ARMV8_PMULL (1<<5)
|
||
|
# define ARMV8_SHA512 (1<<6)
|
||
|
+# define ARMV8_CPUID (1<<7)
|
||
|
|
||
|
+/*
|
||
|
+ * MIDR_EL1 system register
|
||
|
+ *
|
||
|
+ * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0
|
||
|
+ * | | | | | | |
|
||
|
+ * |RES0 | Implementer | Variant | Arch | PartNum |Revision|
|
||
|
+ * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________|
|
||
|
+ *
|
||
|
+ */
|
||
|
+
|
||
|
+# define ARM_CPU_IMP_ARM 0x41
|
||
|
+
|
||
|
+# define ARM_CPU_PART_CORTEX_A72 0xD08
|
||
|
+# define ARM_CPU_PART_N1 0xD0C
|
||
|
+
|
||
|
+# define MIDR_PARTNUM_SHIFT 4
|
||
|
+# define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT)
|
||
|
+# define MIDR_PARTNUM(midr) \
|
||
|
+ (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT)
|
||
|
+
|
||
|
+# define MIDR_IMPLEMENTER_SHIFT 24
|
||
|
+# define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT) /* unsigned: 0xff << 24 does not fit in int */
|
||
|
+# define MIDR_IMPLEMENTER(midr) \
|
||
|
+ (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT)
|
||
|
+
|
||
|
+# define MIDR_ARCHITECTURE_SHIFT 16
|
||
|
+# define MIDR_ARCHITECTURE_MASK (0xf << MIDR_ARCHITECTURE_SHIFT)
|
||
|
+# define MIDR_ARCHITECTURE(midr) \
|
||
|
+ (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT)
|
||
|
+
|
||
|
+# define MIDR_CPU_MODEL_MASK \
|
||
|
+ (MIDR_IMPLEMENTER_MASK | \
|
||
|
+ MIDR_PARTNUM_MASK | \
|
||
|
+ MIDR_ARCHITECTURE_MASK)
|
||
|
+
|
||
|
+# define MIDR_CPU_MODEL(imp, partnum) \
|
||
|
+ (((imp) << MIDR_IMPLEMENTER_SHIFT) | \
|
||
|
+ (0xf << MIDR_ARCHITECTURE_SHIFT) | \
|
||
|
+ ((partnum) << MIDR_PARTNUM_SHIFT))
|
||
|
+
|
||
|
+# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \
|
||
|
+ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum))
|
||
|
#endif
|
||
|
Index: openssl-1.1.1d/crypto/armcap.c
|
||
|
===================================================================
|
||
|
--- openssl-1.1.1d.orig/crypto/armcap.c
|
||
|
+++ openssl-1.1.1d/crypto/armcap.c
|
||
|
@@ -18,6 +18,8 @@
|
||
|
#include "arm_arch.h"
|
||
|
|
||
|
unsigned int OPENSSL_armcap_P = 0;
|
||
|
+unsigned int OPENSSL_arm_midr = 0;
|
||
|
+unsigned int OPENSSL_armv8_rsa_neonized = 0;
|
||
|
|
||
|
#if __ARM_MAX_ARCH__<7
|
||
|
void OPENSSL_cpuid_setup(void)
|
||
|
@@ -48,6 +50,7 @@ void _armv8_sha256_probe(void);
|
||
|
void _armv8_pmull_probe(void);
|
||
|
# ifdef __aarch64__
|
||
|
void _armv8_sha512_probe(void);
|
||
|
+unsigned int _armv8_cpuid_probe(void);
|
||
|
# endif
|
||
|
uint32_t _armv7_tick(void);
|
||
|
|
||
|
@@ -95,6 +98,7 @@ void OPENSSL_cpuid_setup(void) __attribu
|
||
|
# define HWCAP_CE_PMULL (1 << 4)
|
||
|
# define HWCAP_CE_SHA1 (1 << 5)
|
||
|
# define HWCAP_CE_SHA256 (1 << 6)
|
||
|
+# define HWCAP_CPUID (1 << 11)
|
||
|
# define HWCAP_CE_SHA512 (1 << 21)
|
||
|
# endif
|
||
|
|
||
|
@@ -155,6 +159,9 @@ void OPENSSL_cpuid_setup(void)
|
||
|
# ifdef __aarch64__
|
||
|
if (hwcap & HWCAP_CE_SHA512)
|
||
|
OPENSSL_armcap_P |= ARMV8_SHA512;
|
||
|
+
|
||
|
+ if (hwcap & HWCAP_CPUID)
|
||
|
+ OPENSSL_armcap_P |= ARMV8_CPUID;
|
||
|
# endif
|
||
|
}
|
||
|
# endif
|
||
|
@@ -210,5 +217,16 @@ void OPENSSL_cpuid_setup(void)
|
||
|
|
||
|
sigaction(SIGILL, &ill_oact, NULL);
|
||
|
sigprocmask(SIG_SETMASK, &oset, NULL);
|
||
|
+
|
||
|
+# ifdef __aarch64__
|
||
|
+ if (OPENSSL_armcap_P & ARMV8_CPUID) /* MIDR_EL1 is read only when HWCAP_CPUID was reported */
|
||
|
+ OPENSSL_arm_midr = _armv8_cpuid_probe(); /* otherwise stays 0, so neither model check below matches */
|
||
|
+
|
||
|
+ if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) ||
|
||
|
+ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) &&
|
||
|
+ (OPENSSL_armcap_P & ARMV7_NEON)) { /* the NEON Montgomery code also needs NEON itself */
|
||
|
+ OPENSSL_armv8_rsa_neonized = 1; /* read by bn_mul_mont to select bn_mul8x_mont_neon */
|
||
|
+ }
|
||
|
+# endif
|
||
|
}
|
||
|
#endif
|
||
|
Index: openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl
|
||
|
===================================================================
|
||
|
--- openssl-1.1.1d.orig/crypto/bn/asm/armv8-mont.pl
|
||
|
+++ openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl
|
||
|
@@ -64,16 +64,34 @@ $n0="x4"; # const BN_ULONG *n0,
|
||
|
$num="x5"; # int num);
|
||
|
|
||
|
$code.=<<___;
|
||
|
+#ifndef __KERNEL__
|
||
|
+# include "arm_arch.h"
|
||
|
+.extern OPENSSL_armv8_rsa_neonized
|
||
|
+.hidden OPENSSL_armv8_rsa_neonized
|
||
|
+#endif
|
||
|
.text
|
||
|
|
||
|
.globl bn_mul_mont
|
||
|
.type bn_mul_mont,%function
|
||
|
.align 5
|
||
|
bn_mul_mont:
|
||
|
+.Lbn_mul_mont:
|
||
|
+ tst $num,#3 // low two bits of num set -> not a multiple of 4
|
||
|
+ b.ne .Lmul_mont // ... so use the generic loop
|
||
|
+ cmp $num,#32 // the NEON path is considered only for num > 32
|
||
|
+ b.le .Lscalar_impl
|
||
|
+#ifndef __KERNEL__
|
||
|
+ adrp x17,OPENSSL_armv8_rsa_neonized // x17 = page address of the flag set in armcap.c
|
||
|
+ ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
|
||
|
+ cbnz w17, bn_mul8x_mont_neon // flag non-zero -> branch (no link) to the NEON routine
|
||
|
+#endif
|
||
|
+
|
||
|
+.Lscalar_impl:
|
||
|
tst $num,#7
|
||
|
b.eq __bn_sqr8x_mont
|
||
|
tst $num,#3
|
||
|
b.eq __bn_mul4x_mont
|
||
|
+
|
||
|
.Lmul_mont:
|
||
|
stp x29,x30,[sp,#-64]!
|
||
|
add x29,sp,#0
|
||
|
@@ -271,6 +289,369 @@ bn_mul_mont:
|
||
|
.size bn_mul_mont,.-bn_mul_mont
|
||
|
___
|
||
|
{
|
||
|
+my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
|
||
|
+my ($Z,$Temp)=("v4.16b","v5");
|
||
|
+my @ACC=map("v$_",(6..13));
|
||
|
+my ($Bi,$Ni,$M0)=map("v$_",(28..30));
|
||
|
+my $sBi="s28";
|
||
|
+my $sM0="s30";
|
||
|
+my $zero="v14";
|
||
|
+my $temp="v15";
|
||
|
+my $ACCTemp="v16";
|
||
|
+
|
||
|
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
|
||
|
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
|
||
|
+
|
||
|
+$code.=<<___;
|
||
|
+.type bn_mul8x_mont_neon,%function
|
||
|
+.align 5
|
||
|
+bn_mul8x_mont_neon:
|
||
|
+ stp x29,x30,[sp,#-80]! // 80-byte frame: x29/x30 plus d8-d15
|
||
|
+ mov x16,sp // x16 = frame base, used to restore sp after the alloca below
|
||
|
+ stp d8,d9,[sp,#16] // d8-d15 are callee-saved; v8-v15 are used as scratch below
|
||
|
+ stp d10,d11,[sp,#32]
|
||
|
+ stp d12,d13,[sp,#48]
|
||
|
+ stp d14,d15,[sp,#64]
|
||
|
+ lsl $num,$num,#1 // double num: operands are processed as 32-bit words from here on
|
||
|
+ eor $zero.16b,$zero.16b,$zero.16b // all-zero vector, used in the tail to clear upper lanes
|
||
|
+
|
||
|
+.align 4
|
||
|
+.LNEON_8n:
|
||
|
+ eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
|
||
|
+ sub $toutptr,sp,#128
|
||
|
+ eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
|
||
|
+ sub $toutptr,$toutptr,$num,lsl#4
|
||
|
+ eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
|
||
|
+ and $toutptr,$toutptr,#-64
|
||
|
+ eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
|
||
|
+ mov sp,$toutptr // alloca
|
||
|
+ eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
|
||
|
+ add $toutptr,$toutptr,#256
|
||
|
+ eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
|
||
|
+ sub $inner,$num,#8
|
||
|
+ eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
|
||
|
+ eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
|
||
|
+
|
||
|
+.LNEON_8n_init:
|
||
|
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
|
||
|
+ subs $inner,$inner,#8
|
||
|
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
|
||
|
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
|
||
|
+ st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
|
||
|
+ bne .LNEON_8n_init
|
||
|
+
|
||
|
+ add $tinptr,sp,#256
|
||
|
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
|
||
|
+ add $bnptr,sp,#8
|
||
|
+ ldr $sM0,[$n0],#4
|
||
|
+ mov $outer,$num
|
||
|
+ b .LNEON_8n_outer
|
||
|
+
|
||
|
+.align 4
|
||
|
+.LNEON_8n_outer:
|
||
|
+ ldr $sBi,[$bptr],#4 // *b++
|
||
|
+ uxtl $Bi.4s,$Bi.4h
|
||
|
+ add $toutptr,sp,#128
|
||
|
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
|
||
|
+
|
||
|
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
|
||
|
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
|
||
|
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
|
||
|
+ shl $Ni.2d,@ACC[0].2d,#16
|
||
|
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
|
||
|
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
|
||
|
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
|
||
|
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
|
||
|
+ mul $Ni.2s,$Ni.2s,$M0.2s
|
||
|
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
|
||
|
+ st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
|
||
|
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
|
||
|
+ uxtl $Ni.4s,$Ni.4h
|
||
|
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
|
||
|
+___
|
||
|
+for ($i=0; $i<7;) {
|
||
|
+$code.=<<___;
|
||
|
+ ldr $sBi,[$bptr],#4 // *b++
|
||
|
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
|
||
|
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
|
||
|
+ uxtl $Bi.4s,$Bi.4h
|
||
|
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
|
||
|
+ ushr $temp.2d,@ACC[0].2d,#16
|
||
|
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
|
||
|
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
|
||
|
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
|
||
|
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
|
||
|
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
|
||
|
+ ushr @ACC[0].2d,@ACC[0].2d,#16
|
||
|
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
|
||
|
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
|
||
|
+ add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
|
||
|
+ ins @ACC[1].d[0],$ACCTemp.d[0]
|
||
|
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
|
||
|
+___
|
||
|
+ push(@ACC,shift(@ACC)); $i++;
|
||
|
+$code.=<<___;
|
||
|
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
|
||
|
+ ld1 {@ACC[7].2d},[$tinptr],#16
|
||
|
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
|
||
|
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
|
||
|
+ shl $Ni.2d,@ACC[0].2d,#16
|
||
|
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
|
||
|
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
|
||
|
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
|
||
|
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
|
||
|
+ mul $Ni.2s,$Ni.2s,$M0.2s
|
||
|
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
|
||
|
+ st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
|
||
|
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
|
||
|
+ uxtl $Ni.4s,$Ni.4h
|
||
|
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
|
||
|
+___
|
||
|
+}
|
||
|
+$code.=<<___;
|
||
|
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
|
||
|
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
|
||
|
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
|
||
|
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
|
||
|
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
|
||
|
+ mov $Temp.16b,@ACC[0].16b
|
||
|
+ ushr $Temp.2d,$Temp.2d,#16
|
||
|
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
|
||
|
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
|
||
|
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
|
||
|
+ add @ACC[0].2d,@ACC[0].2d,$Temp.2d
|
||
|
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
|
||
|
+ ushr @ACC[0].2d,@ACC[0].2d,#16
|
||
|
+ eor $temp.16b,$temp.16b,$temp.16b
|
||
|
+ ins @ACC[0].d[1],$temp.d[0]
|
||
|
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
|
||
|
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
|
||
|
+ add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
|
||
|
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
|
||
|
+ add $bnptr,sp,#8 // rewind
|
||
|
+___
|
||
|
+ push(@ACC,shift(@ACC));
|
||
|
+$code.=<<___;
|
||
|
+ sub $inner,$num,#8
|
||
|
+ b .LNEON_8n_inner
|
||
|
+
|
||
|
+.align 4
|
||
|
+.LNEON_8n_inner:
|
||
|
+ subs $inner,$inner,#8
|
||
|
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
|
||
|
+ ld1 {@ACC[7].2d},[$tinptr]
|
||
|
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
|
||
|
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
|
||
|
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
|
||
|
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
|
||
|
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
|
||
|
+ b.eq .LInner_jump
|
||
|
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
|
||
|
+.LInner_jump:
|
||
|
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
|
||
|
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
|
||
|
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
|
||
|
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
|
||
|
+___
|
||
|
+for ($i=1; $i<8; $i++) {
|
||
|
+$code.=<<___;
|
||
|
+ ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
|
||
|
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
|
||
|
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
|
||
|
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
|
||
|
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
|
||
|
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
|
||
|
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
|
||
|
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
|
||
|
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
|
||
|
+ st1 {@ACC[0].2d},[$toutptr],#16
|
||
|
+___
|
||
|
+ push(@ACC,shift(@ACC));
|
||
|
+$code.=<<___;
|
||
|
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
|
||
|
+ ld1 {@ACC[7].2d},[$tinptr]
|
||
|
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
|
||
|
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
|
||
|
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
|
||
|
+ b.eq .LInner_jump$i
|
||
|
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
|
||
|
+.LInner_jump$i:
|
||
|
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
|
||
|
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
|
||
|
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
|
||
|
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
|
||
|
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
|
||
|
+___
|
||
|
+}
|
||
|
+$code.=<<___;
|
||
|
+ b.ne .LInner_after_rewind$i
|
||
|
+ sub $aptr,$aptr,$num,lsl#2 // rewind
|
||
|
+.LInner_after_rewind$i:
|
||
|
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
|
||
|
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
|
||
|
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
|
||
|
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
|
||
|
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
|
||
|
+ add $bnptr,sp,#8 // rewind
|
||
|
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
|
||
|
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
|
||
|
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
|
||
|
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
|
||
|
+ st1 {@ACC[0].2d},[$toutptr],#16
|
||
|
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
|
||
|
+
|
||
|
+ bne .LNEON_8n_inner
|
||
|
+___
|
||
|
+ push(@ACC,shift(@ACC));
|
||
|
+$code.=<<___;
|
||
|
+ add $tinptr,sp,#128
|
||
|
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
|
||
|
+ eor $N0.16b,$N0.16b,$N0.16b // $N0
|
||
|
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
|
||
|
+ eor $N1.16b,$N1.16b,$N1.16b // $N1
|
||
|
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
|
||
|
+ st1 {@ACC[6].2d},[$toutptr]
|
||
|
+
|
||
|
+ subs $outer,$outer,#8
|
||
|
+ ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
|
||
|
+ ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
|
||
|
+ ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
|
||
|
+ ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
|
||
|
+
|
||
|
+ b.eq .LInner_8n_jump_2steps
|
||
|
+ sub $nptr,$nptr,$num,lsl#2 // rewind
|
||
|
+ b .LNEON_8n_outer
|
||
|
+
|
||
|
+.LInner_8n_jump_2steps:
|
||
|
+ add $toutptr,sp,#128
|
||
|
+ st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
|
||
|
+ mov $Temp.16b,@ACC[0].16b
|
||
|
+ ushr $temp.2d,@ACC[0].2d,#16
|
||
|
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
|
||
|
+ st1 {$N0.2d,$N1.2d}, [sp],#32
|
||
|
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
|
||
|
+ st1 {$N0.2d,$N1.2d}, [sp],#32
|
||
|
+ ushr $temp.2d,@ACC[0].2d,#16
|
||
|
+ st1 {$N0.2d,$N1.2d}, [sp],#32
|
||
|
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
|
||
|
+ ins $temp.d[1],$zero.d[0]
|
||
|
+
|
||
|
+ mov $inner,$num
|
||
|
+ b .LNEON_tail_entry
|
||
|
+
|
||
|
+.align 4
|
||
|
+.LNEON_tail:
|
||
|
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
|
||
|
+ mov $Temp.16b,@ACC[0].16b
|
||
|
+ ushr $temp.2d,@ACC[0].2d,#16
|
||
|
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
|
||
|
+ ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
|
||
|
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
|
||
|
+ ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
|
||
|
+ ushr $temp.2d,@ACC[0].2d,#16
|
||
|
+ ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
|
||
|
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
|
||
|
+ ins $temp.d[1],$zero.d[0]
|
||
|
+
|
||
|
+.LNEON_tail_entry:
|
||
|
+___
|
||
|
+for ($i=1; $i<8; $i++) {
|
||
|
+$code.=<<___;
|
||
|
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
|
||
|
+ st1 {@ACC[0].s}[0], [$toutptr],#4
|
||
|
+ ushr $temp.2d,@ACC[1].2d,#16
|
||
|
+ mov $Temp.16b,@ACC[1].16b
|
||
|
+ ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
|
||
|
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
|
||
|
+ ushr $temp.2d,@ACC[1].2d,#16
|
||
|
+ zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
|
||
|
+ ins $temp.d[1],$zero.d[0]
|
||
|
+___
|
||
|
+ push(@ACC,shift(@ACC));
|
||
|
+}
|
||
|
+ push(@ACC,shift(@ACC));
|
||
|
+$code.=<<___;
|
||
|
+ ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
|
||
|
+ subs $inner,$inner,#8
|
||
|
+ st1 {@ACC[7].s}[0], [$toutptr],#4
|
||
|
+ bne .LNEON_tail
|
||
|
+
|
||
|
+ st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
|
||
|
+ sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
|
||
|
+ subs $aptr,sp,#0 // copy sp and set the carry flag (no borrow) for the sbcs chain below
|
||
|
+ add $bptr,sp,$num,lsl#2
|
||
|
+
|
||
|
+.LNEON_sub:
|
||
|
+ ldp w4,w5,[$aptr],#8
|
||
|
+ ldp w6,w7,[$aptr],#8
|
||
|
+ ldp w8,w9,[$nptr],#8
|
||
|
+ ldp w10,w11,[$nptr],#8
|
||
|
+ sbcs w8,w4,w8
|
||
|
+ sbcs w9,w5,w9
|
||
|
+ sbcs w10,w6,w10
|
||
|
+ sbcs w11,w7,w11
|
||
|
+ sub x17,$bptr,$aptr
|
||
|
+ stp w8,w9,[$rptr],#8
|
||
|
+ stp w10,w11,[$rptr],#8
|
||
|
+ cbnz x17,.LNEON_sub
|
||
|
+
|
||
|
+ ldr w10, [$aptr] // load top-most bit
|
||
|
+ mov x11,sp
|
||
|
+ eor v0.16b,v0.16b,v0.16b
|
||
|
+ sub x11,$bptr,x11 // this is num*4
|
||
|
+ eor v1.16b,v1.16b,v1.16b
|
||
|
+ mov $aptr,sp
|
||
|
+ sub $rptr,$rptr,x11 // rewind $rptr
|
||
|
+ mov $nptr,$bptr // second 3/4th of frame
|
||
|
+ sbcs w10,w10,wzr // result is carry flag
|
||
|
+
|
||
|
+.LNEON_copy_n_zap:
|
||
|
+ ldp w4,w5,[$aptr],#8
|
||
|
+ ldp w6,w7,[$aptr],#8
|
||
|
+ ldp w8,w9,[$rptr],#8
|
||
|
+ ldp w10,w11,[$rptr]
|
||
|
+ sub $rptr,$rptr,#8
|
||
|
+ b.cs .LCopy_1
|
||
|
+ mov w8,w4
|
||
|
+ mov w9,w5
|
||
|
+ mov w10,w6
|
||
|
+ mov w11,w7
|
||
|
+.LCopy_1:
|
||
|
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
|
||
|
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
|
||
|
+ ldp w4,w5,[$aptr],#8
|
||
|
+ ldp w6,w7,[$aptr],#8
|
||
|
+ stp w8,w9,[$rptr],#8
|
||
|
+ stp w10,w11,[$rptr],#8
|
||
|
+ sub $aptr,$aptr,#32
|
||
|
+ ldp w8,w9,[$rptr],#8
|
||
|
+ ldp w10,w11,[$rptr]
|
||
|
+ sub $rptr,$rptr,#8
|
||
|
+ b.cs .LCopy_2
|
||
|
+ mov w8, w4
|
||
|
+ mov w9, w5
|
||
|
+ mov w10, w6
|
||
|
+ mov w11, w7
|
||
|
+.LCopy_2:
|
||
|
+ st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
|
||
|
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
|
||
|
+ sub x17,$bptr,$aptr // preserves carry
|
||
|
+ stp w8,w9,[$rptr],#8
|
||
|
+ stp w10,w11,[$rptr],#8
|
||
|
+ cbnz x17,.LNEON_copy_n_zap
|
||
|
+
|
||
|
+ mov sp,x16 // drop the scratch area: sp is back at the 80-byte frame
|
||
|
+ ldp d14,d15,[sp,#64] // restore callee-saved d8-d15
|
||
|
+ ldp d12,d13,[sp,#48]
|
||
|
+ ldp d10,d11,[sp,#32]
|
||
|
+ ldp d8,d9,[sp,#16]
|
||
|
+ ldr x29,[sp],#80 // pop the frame; x30 is never modified in this routine, so only x29 is reloaded
|
||
|
+ ret // bx lr
|
||
|
+
|
||
|
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
|
||
|
+___
|
||
|
+}
|
||
|
+{
|
||
|
########################################################################
|
||
|
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
|
||
|
|
||
|
Index: openssl-1.1.1d/crypto/bn/build.info
|
||
|
===================================================================
|
||
|
--- openssl-1.1.1d.orig/crypto/bn/build.info
|
||
|
+++ openssl-1.1.1d/crypto/bn/build.info
|
||
|
@@ -65,3 +65,4 @@ INCLUDE[armv4-mont.o]=..
|
||
|
GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl $(PERLASM_SCHEME)
|
||
|
INCLUDE[armv4-gf2m.o]=..
|
||
|
GENERATE[armv8-mont.S]=asm/armv8-mont.pl $(PERLASM_SCHEME)
|
||
|
+INCLUDE[armv8-mont.o]=..
|