From 5ea64b456b1a27ae046f23d632a968a7583bb9eb Mon Sep 17 00:00:00 2001 From: "Fangming.Fang" Date: Tue, 28 Apr 2020 02:33:50 +0000 Subject: [PATCH] Read MIDR_EL1 system register on aarch64 MIDR_EL1 system register exposes microarchitecture information so that people can make micro-arch related optimization such as exposing as much instruction level parallelism as possible. MIDR_EL1 register can be read only if HWCAP_CPUID feature is supported. Change-Id: Iabb8a36c5d31b184dba6399f378598058d394d4e Reviewed-by: Paul Dale Reviewed-by: Tomas Mraz (Merged from https://github.com/openssl/openssl/pull/11744) --- crypto/arm64cpuid.pl | 7 +++++++ crypto/arm_arch.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ crypto/armcap.c | 11 +++++++++++ 3 files changed, 62 insertions(+) Index: openssl-1.1.1d/crypto/arm64cpuid.pl =================================================================== --- openssl-1.1.1d.orig/crypto/arm64cpuid.pl +++ openssl-1.1.1d/crypto/arm64cpuid.pl @@ -78,6 +78,13 @@ _armv8_sha512_probe: ret .size _armv8_sha512_probe,.-_armv8_sha512_probe +.globl _armv8_cpuid_probe +.type _armv8_cpuid_probe,%function +_armv8_cpuid_probe: + mrs x0, midr_el1 + ret +.size _armv8_cpuid_probe,.-_armv8_cpuid_probe + .globl OPENSSL_cleanse .type OPENSSL_cleanse,%function .align 5 Index: openssl-1.1.1d/crypto/arm_arch.h =================================================================== --- openssl-1.1.1d.orig/crypto/arm_arch.h +++ openssl-1.1.1d/crypto/arm_arch.h @@ -71,6 +71,7 @@ # ifndef __ASSEMBLER__ extern unsigned int OPENSSL_armcap_P; +extern unsigned int OPENSSL_arm_midr; # endif # define ARMV7_NEON (1<<0) @@ -80,5 +81,48 @@ extern unsigned int OPENSSL_armcap_P; # define ARMV8_SHA256 (1<<4) # define ARMV8_PMULL (1<<5) # define ARMV8_SHA512 (1<<6) +# define ARMV8_CPUID (1<<7) +/* + * MIDR_EL1 system register + * + * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0 + * | | | | | | | + * |RES0 | Implementer | Variant | Arch | PartNum |Revision| + * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________| + * + */ + +# define ARM_CPU_IMP_ARM 0x41 + +# define ARM_CPU_PART_CORTEX_A72 0xD08 +# define ARM_CPU_PART_N1 0xD0C + +# define MIDR_PARTNUM_SHIFT 4 +# define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT) +# define MIDR_PARTNUM(midr) \ + (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) + +# define MIDR_IMPLEMENTER_SHIFT 24 +# define MIDR_IMPLEMENTER_MASK (0xff << MIDR_IMPLEMENTER_SHIFT) +# define MIDR_IMPLEMENTER(midr) \ + (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) + +# define MIDR_ARCHITECTURE_SHIFT 16 +# define MIDR_ARCHITECTURE_MASK (0xf << MIDR_ARCHITECTURE_SHIFT) +# define MIDR_ARCHITECTURE(midr) \ + (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) + +# define MIDR_CPU_MODEL_MASK \ + (MIDR_IMPLEMENTER_MASK | \ + MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +# define MIDR_CPU_MODEL(imp, partnum) \ + (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ + (0xf << MIDR_ARCHITECTURE_SHIFT) | \ + ((partnum) << MIDR_PARTNUM_SHIFT)) + +# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ + (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) #endif Index: openssl-1.1.1d/crypto/armcap.c =================================================================== --- openssl-1.1.1d.orig/crypto/armcap.c +++ openssl-1.1.1d/crypto/armcap.c @@ -18,6 +18,8 @@ #include "arm_arch.h" unsigned int OPENSSL_armcap_P = 0; +unsigned int OPENSSL_arm_midr = 0; +unsigned int OPENSSL_armv8_rsa_neonized = 0; #if __ARM_MAX_ARCH__<7 void OPENSSL_cpuid_setup(void) @@ -48,6 +50,7 @@ void _armv8_sha256_probe(void); void _armv8_pmull_probe(void); # ifdef __aarch64__ void _armv8_sha512_probe(void); +unsigned int _armv8_cpuid_probe(void); # endif uint32_t _armv7_tick(void); @@ -95,6 +98,7 @@ void OPENSSL_cpuid_setup(void) __attribu # define HWCAP_CE_PMULL (1 << 4) # define HWCAP_CE_SHA1 (1 << 5) # define HWCAP_CE_SHA256 (1 << 6) +# define HWCAP_CPUID (1 << 11) # define HWCAP_CE_SHA512 (1 << 21) # endif @@ -155,6 +159,9 @@ void OPENSSL_cpuid_setup(void) # ifdef __aarch64__ if (hwcap & HWCAP_CE_SHA512) OPENSSL_armcap_P |= ARMV8_SHA512; + + if (hwcap & HWCAP_CPUID) + OPENSSL_armcap_P |= ARMV8_CPUID; # endif } # endif @@ -210,5 +217,16 @@ void OPENSSL_cpuid_setup(void) sigaction(SIGILL, &ill_oact, NULL); sigprocmask(SIG_SETMASK, &oset, NULL); + +# ifdef __aarch64__ + if (OPENSSL_armcap_P & ARMV8_CPUID) + OPENSSL_arm_midr = _armv8_cpuid_probe(); + + if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) || + MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) && + (OPENSSL_armcap_P & ARMV7_NEON)) { + OPENSSL_armv8_rsa_neonized = 1; + } +# endif } #endif Index: openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl =================================================================== --- openssl-1.1.1d.orig/crypto/bn/asm/armv8-mont.pl +++ openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl @@ -64,16 +64,34 @@ $n0="x4"; # const BN_ULONG *n0, $num="x5"; # int num); $code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armv8_rsa_neonized +.hidden OPENSSL_armv8_rsa_neonized +#endif .text .globl bn_mul_mont .type bn_mul_mont,%function .align 5 bn_mul_mont: +.Lbn_mul_mont: + tst $num,#3 + b.ne .Lmul_mont + cmp $num,#32 + b.le .Lscalar_impl +#ifndef __KERNEL__ + adrp x17,OPENSSL_armv8_rsa_neonized + ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] + cbnz w17, bn_mul8x_mont_neon +#endif + +.Lscalar_impl: tst $num,#7 b.eq __bn_sqr8x_mont tst $num,#3 b.eq __bn_mul4x_mont + .Lmul_mont: stp x29,x30,[sp,#-64]! add x29,sp,#0 @@ -271,6 +289,369 @@ bn_mul_mont: .size bn_mul_mont,.-bn_mul_mont ___ { +my ($A0,$A1,$N0,$N1)=map("v$_",(0..3)); +my ($Z,$Temp)=("v4.16b","v5"); +my @ACC=map("v$_",(6..13)); +my ($Bi,$Ni,$M0)=map("v$_",(28..30)); +my $sBi="s28"; +my $sM0="s30"; +my $zero="v14"; +my $temp="v15"; +my $ACCTemp="v16"; + +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5)); +my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11)); + +$code.=<<___; +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + stp x29,x30,[sp,#-80]! + mov x16,sp + stp d8,d9,[sp,#16] + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + lsl $num,$num,#1 + eor $zero.16b,$zero.16b,$zero.16b + +.align 4 +.LNEON_8n: + eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b + sub $toutptr,sp,#128 + eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b + sub $toutptr,$toutptr,$num,lsl#4 + eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b + and $toutptr,$toutptr,#-64 + eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b + mov sp,$toutptr // alloca + eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b + add $toutptr,$toutptr,#256 + eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b + sub $inner,$num,#8 + eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b + eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b + +.LNEON_8n_init: + st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 + subs $inner,$inner,#8 + st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 + st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 + st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32 + bne .LNEON_8n_init + + add $tinptr,sp,#256 + ld1 {$A0.4s,$A1.4s},[$aptr],#32 + add $bnptr,sp,#8 + ldr $sM0,[$n0],#4 + mov $outer,$num + b .LNEON_8n_outer + +.align 4 +.LNEON_8n_outer: + ldr $sBi,[$bptr],#4 // *b++ + uxtl $Bi.4s,$Bi.4h + add $toutptr,sp,#128 + ld1 {$N0.4s,$N1.4s},[$nptr],#32 + + umlal @ACC[0].2d,$Bi.2s,$A0.s[0] + umlal @ACC[1].2d,$Bi.2s,$A0.s[1] + umlal @ACC[2].2d,$Bi.2s,$A0.s[2] + shl $Ni.2d,@ACC[0].2d,#16 + ext $Ni.16b,$Ni.16b,$Ni.16b,#8 + umlal @ACC[3].2d,$Bi.2s,$A0.s[3] + add $Ni.2d,$Ni.2d,@ACC[0].2d + umlal @ACC[4].2d,$Bi.2s,$A1.s[0] + mul $Ni.2s,$Ni.2s,$M0.2s + umlal @ACC[5].2d,$Bi.2s,$A1.s[1] + st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0] + umlal @ACC[6].2d,$Bi.2s,$A1.s[2] + uxtl $Ni.4s,$Ni.4h + umlal @ACC[7].2d,$Bi.2s,$A1.s[3] +___ +for ($i=0; $i<7;) { +$code.=<<___; + ldr $sBi,[$bptr],#4 // *b++ + umlal @ACC[0].2d,$Ni.2s,$N0.s[0] + umlal @ACC[1].2d,$Ni.2s,$N0.s[1] + uxtl $Bi.4s,$Bi.4h + umlal @ACC[2].2d,$Ni.2s,$N0.s[2] + ushr $temp.2d,@ACC[0].2d,#16 + umlal @ACC[3].2d,$Ni.2s,$N0.s[3] + umlal @ACC[4].2d,$Ni.2s,$N1.s[0] + ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 + add @ACC[0].2d,@ACC[0].2d,$temp.2d + umlal @ACC[5].2d,$Ni.2s,$N1.s[1] + ushr @ACC[0].2d,@ACC[0].2d,#16 + umlal @ACC[6].2d,$Ni.2s,$N1.s[2] + umlal @ACC[7].2d,$Ni.2s,$N1.s[3] + add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d + ins @ACC[1].d[0],$ACCTemp.d[0] + st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] +___ + push(@ACC,shift(@ACC)); $i++; +$code.=<<___; + umlal @ACC[0].2d,$Bi.2s,$A0.s[0] + ld1 {@ACC[7].2d},[$tinptr],#16 + umlal @ACC[1].2d,$Bi.2s,$A0.s[1] + umlal @ACC[2].2d,$Bi.2s,$A0.s[2] + shl $Ni.2d,@ACC[0].2d,#16 + ext $Ni.16b,$Ni.16b,$Ni.16b,#8 + umlal @ACC[3].2d,$Bi.2s,$A0.s[3] + add $Ni.2d,$Ni.2d,@ACC[0].2d + umlal @ACC[4].2d,$Bi.2s,$A1.s[0] + mul $Ni.2s,$Ni.2s,$M0.2s + umlal @ACC[5].2d,$Bi.2s,$A1.s[1] + st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i] + umlal @ACC[6].2d,$Bi.2s,$A1.s[2] + uxtl $Ni.4s,$Ni.4h + umlal @ACC[7].2d,$Bi.2s,$A1.s[3] +___ +} +$code.=<<___; + ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] + umlal @ACC[0].2d,$Ni.2s,$N0.s[0] + ld1 {$A0.4s,$A1.4s},[$aptr],#32 + umlal @ACC[1].2d,$Ni.2s,$N0.s[1] + umlal @ACC[2].2d,$Ni.2s,$N0.s[2] + mov $Temp.16b,@ACC[0].16b + ushr $Temp.2d,$Temp.2d,#16 + ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 + umlal @ACC[3].2d,$Ni.2s,$N0.s[3] + umlal @ACC[4].2d,$Ni.2s,$N1.s[0] + add @ACC[0].2d,@ACC[0].2d,$Temp.2d + umlal @ACC[5].2d,$Ni.2s,$N1.s[1] + ushr @ACC[0].2d,@ACC[0].2d,#16 + eor $temp.16b,$temp.16b,$temp.16b + ins @ACC[0].d[1],$temp.d[0] + umlal @ACC[6].2d,$Ni.2s,$N1.s[2] + umlal @ACC[7].2d,$Ni.2s,$N1.s[3] + add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d + st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] + add $bnptr,sp,#8 // rewind +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + sub $inner,$num,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs $inner,$inner,#8 + umlal @ACC[0].2d,$Bi.2s,$A0.s[0] + ld1 {@ACC[7].2d},[$tinptr] + umlal @ACC[1].2d,$Bi.2s,$A0.s[1] + ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0] + umlal @ACC[2].2d,$Bi.2s,$A0.s[2] + ld1 {$N0.4s,$N1.4s},[$nptr],#32 + umlal @ACC[3].2d,$Bi.2s,$A0.s[3] + b.eq .LInner_jump + add $tinptr,$tinptr,#16 // don't advance in last iteration +.LInner_jump: + umlal @ACC[4].2d,$Bi.2s,$A1.s[0] + umlal @ACC[5].2d,$Bi.2s,$A1.s[1] + umlal @ACC[6].2d,$Bi.2s,$A1.s[2] + umlal @ACC[7].2d,$Bi.2s,$A1.s[3] +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i] + umlal @ACC[0].2d,$Ni.2s,$N0.s[0] + umlal @ACC[1].2d,$Ni.2s,$N0.s[1] + umlal @ACC[2].2d,$Ni.2s,$N0.s[2] + umlal @ACC[3].2d,$Ni.2s,$N0.s[3] + umlal @ACC[4].2d,$Ni.2s,$N1.s[0] + umlal @ACC[5].2d,$Ni.2s,$N1.s[1] + umlal @ACC[6].2d,$Ni.2s,$N1.s[2] + umlal @ACC[7].2d,$Ni.2s,$N1.s[3] + st1 {@ACC[0].2d},[$toutptr],#16 +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + umlal @ACC[0].2d,$Bi.2s,$A0.s[0] + ld1 {@ACC[7].2d},[$tinptr] + umlal @ACC[1].2d,$Bi.2s,$A0.s[1] + ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i] + umlal @ACC[2].2d,$Bi.2s,$A0.s[2] + b.eq .LInner_jump$i + add $tinptr,$tinptr,#16 // don't advance in last iteration +.LInner_jump$i: + umlal @ACC[3].2d,$Bi.2s,$A0.s[3] + umlal @ACC[4].2d,$Bi.2s,$A1.s[0] + umlal @ACC[5].2d,$Bi.2s,$A1.s[1] + umlal @ACC[6].2d,$Bi.2s,$A1.s[2] + umlal @ACC[7].2d,$Bi.2s,$A1.s[3] +___ +} +$code.=<<___; + b.ne .LInner_after_rewind$i + sub $aptr,$aptr,$num,lsl#2 // rewind +.LInner_after_rewind$i: + umlal @ACC[0].2d,$Ni.2s,$N0.s[0] + ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] + umlal @ACC[1].2d,$Ni.2s,$N0.s[1] + ld1 {$A0.4s,$A1.4s},[$aptr],#32 + umlal @ACC[2].2d,$Ni.2s,$N0.s[2] + add $bnptr,sp,#8 // rewind + umlal @ACC[3].2d,$Ni.2s,$N0.s[3] + umlal @ACC[4].2d,$Ni.2s,$N1.s[0] + umlal @ACC[5].2d,$Ni.2s,$N1.s[1] + umlal @ACC[6].2d,$Ni.2s,$N1.s[2] + st1 {@ACC[0].2d},[$toutptr],#16 + umlal @ACC[7].2d,$Ni.2s,$N1.s[3] + + bne .LNEON_8n_inner +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + add $tinptr,sp,#128 + st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 + eor $N0.16b,$N0.16b,$N0.16b // $N0 + st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 + eor $N1.16b,$N1.16b,$N1.16b // $N1 + st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 + st1 {@ACC[6].2d},[$toutptr] + + subs $outer,$outer,#8 + ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32 + ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32 + ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32 + ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32 + + b.eq .LInner_8n_jump_2steps + sub $nptr,$nptr,$num,lsl#2 // rewind + b .LNEON_8n_outer + +.LInner_8n_jump_2steps: + add $toutptr,sp,#128 + st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame + mov $Temp.16b,@ACC[0].16b + ushr $temp.2d,@ACC[0].2d,#16 + ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 + st1 {$N0.2d,$N1.2d}, [sp],#32 + add @ACC[0].2d,@ACC[0].2d,$temp.2d + st1 {$N0.2d,$N1.2d}, [sp],#32 + ushr $temp.2d,@ACC[0].2d,#16 + st1 {$N0.2d,$N1.2d}, [sp],#32 + zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h + ins $temp.d[1],$zero.d[0] + + mov $inner,$num + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + add @ACC[0].2d,@ACC[0].2d,$temp.2d + mov $Temp.16b,@ACC[0].16b + ushr $temp.2d,@ACC[0].2d,#16 + ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 + ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32 + add @ACC[0].2d,@ACC[0].2d,$temp.2d + ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32 + ushr $temp.2d,@ACC[0].2d,#16 + ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32 + zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h + ins $temp.d[1],$zero.d[0] + +.LNEON_tail_entry: +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + add @ACC[1].2d,@ACC[1].2d,$temp.2d + st1 {@ACC[0].s}[0], [$toutptr],#4 + ushr $temp.2d,@ACC[1].2d,#16 + mov $Temp.16b,@ACC[1].16b + ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8 + add @ACC[1].2d,@ACC[1].2d,$temp.2d + ushr $temp.2d,@ACC[1].2d,#16 + zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h + ins $temp.d[1],$zero.d[0] +___ + push(@ACC,shift(@ACC)); +} + push(@ACC,shift(@ACC)); +$code.=<<___; + ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32 + subs $inner,$inner,#8 + st1 {@ACC[7].s}[0], [$toutptr],#4 + bne .LNEON_tail + + st1 {$temp.s}[0], [$toutptr],#4 // top-most bit + sub $nptr,$nptr,$num,lsl#2 // rewind $nptr + subs $aptr,sp,#0 // clear carry flag + add $bptr,sp,$num,lsl#2 + +.LNEON_sub: + ldp w4,w5,[$aptr],#8 + ldp w6,w7,[$aptr],#8 + ldp w8,w9,[$nptr],#8 + ldp w10,w11,[$nptr],#8 + sbcs w8,w4,w8 + sbcs w9,w5,w9 + sbcs w10,w6,w10 + sbcs w11,w7,w11 + sub x17,$bptr,$aptr + stp w8,w9,[$rptr],#8 + stp w10,w11,[$rptr],#8 + cbnz x17,.LNEON_sub + + ldr w10, [$aptr] // load top-most bit + mov x11,sp + eor v0.16b,v0.16b,v0.16b + sub x11,$bptr,x11 // this is num*4 + eor v1.16b,v1.16b,v1.16b + mov $aptr,sp + sub $rptr,$rptr,x11 // rewind $rptr + mov $nptr,$bptr // second 3/4th of frame + sbcs w10,w10,wzr // result is carry flag + +.LNEON_copy_n_zap: + ldp w4,w5,[$aptr],#8 + ldp w6,w7,[$aptr],#8 + ldp w8,w9,[$rptr],#8 + ldp w10,w11,[$rptr] + sub $rptr,$rptr,#8 + b.cs .LCopy_1 + mov w8,w4 + mov w9,w5 + mov w10,w6 + mov w11,w7 +.LCopy_1: + st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe + st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe + ldp w4,w5,[$aptr],#8 + ldp w6,w7,[$aptr],#8 + stp w8,w9,[$rptr],#8 + stp w10,w11,[$rptr],#8 + sub $aptr,$aptr,#32 + ldp w8,w9,[$rptr],#8 + ldp w10,w11,[$rptr] + sub $rptr,$rptr,#8 + b.cs .LCopy_2 + mov w8, w4 + mov w9, w5 + mov w10, w6 + mov w11, w7 +.LCopy_2: + st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe + st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe + sub x17,$bptr,$aptr // preserves carry + stp w8,w9,[$rptr],#8 + stp w10,w11,[$rptr],#8 + cbnz x17,.LNEON_copy_n_zap + + mov sp,x16 + ldp d14,d15,[sp,#64] + ldp d12,d13,[sp,#48] + ldp d10,d11,[sp,#32] + ldp d8,d9,[sp,#16] + ldr x29,[sp],#80 + ret // bx lr + +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +___ +} +{ ######################################################################## # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. Index: openssl-1.1.1d/crypto/bn/build.info =================================================================== --- openssl-1.1.1d.orig/crypto/bn/build.info +++ openssl-1.1.1d/crypto/bn/build.info @@ -65,3 +65,4 @@ INCLUDE[armv4-mont.o]=.. GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl $(PERLASM_SCHEME) INCLUDE[armv4-gf2m.o]=.. GENERATE[armv8-mont.S]=asm/armv8-mont.pl $(PERLASM_SCHEME) +INCLUDE[armv8-mont.o]=..