Accepting request 949750 from home:pmonrealgonzalez:branches:security:tls
- Backport cryptographic improvements from OpenSSL 3 [jsc#SLE-19766] * Optimize RSA on armv8: openssl-1_1-Optimize-RSA-armv8.patch * Optimize AES-XTS mode for aarch64: openssl-1_1-Optimize-AES-XTS-aarch64.patch * Optimize AES-GCM for uarchs with unroll and new instructions: openssl-1_1-Optimize-AES-GCM-uarchs.patch - POWER10 performance enhancements for cryptography [jsc#SLE-19409] * openssl-1_1-Optimize-ppc64.patch OBS-URL: https://build.opensuse.org/request/show/949750 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=102
This commit is contained in:
parent
4ef4397187
commit
8903999f6a
7709
openssl-1_1-Optimize-AES-GCM-uarchs.patch
Normal file
7709
openssl-1_1-Optimize-AES-GCM-uarchs.patch
Normal file
File diff suppressed because it is too large
Load Diff
1616
openssl-1_1-Optimize-AES-XTS-aarch64.patch
Normal file
1616
openssl-1_1-Optimize-AES-XTS-aarch64.patch
Normal file
File diff suppressed because it is too large
Load Diff
575
openssl-1_1-Optimize-RSA-armv8.patch
Normal file
575
openssl-1_1-Optimize-RSA-armv8.patch
Normal file
@ -0,0 +1,575 @@
|
||||
From 5ea64b456b1a27ae046f23d632a968a7583bb9eb Mon Sep 17 00:00:00 2001
|
||||
From: "Fangming.Fang" <fangming.fang@arm.com>
|
||||
Date: Tue, 28 Apr 2020 02:33:50 +0000
|
||||
Subject: [PATCH] Read MIDR_EL1 system register on aarch64
|
||||
|
||||
MIDR_EL1 system register exposes microarchitecture information so that
|
||||
people can make micro-arch related optimization such as exposing as
|
||||
much instruction level parallelism as possible.
|
||||
|
||||
MIDR_EL1 register can be read only if HWCAP_CPUID feature is supported.
|
||||
|
||||
Change-Id: Iabb8a36c5d31b184dba6399f378598058d394d4e
|
||||
|
||||
Reviewed-by: Paul Dale <paul.dale@oracle.com>
|
||||
Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org>
|
||||
(Merged from https://github.com/openssl/openssl/pull/11744)
|
||||
---
|
||||
crypto/arm64cpuid.pl | 7 +++++++
|
||||
crypto/arm_arch.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
|
||||
crypto/armcap.c | 11 +++++++++++
|
||||
3 files changed, 62 insertions(+)
|
||||
|
||||
Index: openssl-1.1.1d/crypto/arm64cpuid.pl
|
||||
===================================================================
|
||||
--- openssl-1.1.1d.orig/crypto/arm64cpuid.pl
|
||||
+++ openssl-1.1.1d/crypto/arm64cpuid.pl
|
||||
@@ -78,6 +78,13 @@ _armv8_sha512_probe:
|
||||
ret
|
||||
.size _armv8_sha512_probe,.-_armv8_sha512_probe
|
||||
|
||||
+.globl _armv8_cpuid_probe
|
||||
+.type _armv8_cpuid_probe,%function
|
||||
+_armv8_cpuid_probe:
|
||||
+ mrs x0, midr_el1
|
||||
+ ret
|
||||
+.size _armv8_cpuid_probe,.-_armv8_cpuid_probe
|
||||
+
|
||||
.globl OPENSSL_cleanse
|
||||
.type OPENSSL_cleanse,%function
|
||||
.align 5
|
||||
Index: openssl-1.1.1d/crypto/arm_arch.h
|
||||
===================================================================
|
||||
--- openssl-1.1.1d.orig/crypto/arm_arch.h
|
||||
+++ openssl-1.1.1d/crypto/arm_arch.h
|
||||
@@ -71,6 +71,7 @@
|
||||
|
||||
# ifndef __ASSEMBLER__
|
||||
extern unsigned int OPENSSL_armcap_P;
|
||||
+extern unsigned int OPENSSL_arm_midr;
|
||||
# endif
|
||||
|
||||
# define ARMV7_NEON (1<<0)
|
||||
@@ -80,5 +81,48 @@ extern unsigned int OPENSSL_armcap_P;
|
||||
# define ARMV8_SHA256 (1<<4)
|
||||
# define ARMV8_PMULL (1<<5)
|
||||
# define ARMV8_SHA512 (1<<6)
|
||||
+# define ARMV8_CPUID (1<<7)
|
||||
|
||||
+/*
|
||||
+ * MIDR_EL1 system register
|
||||
+ *
|
||||
+ * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0
|
||||
+ * | | | | | | |
|
||||
+ * |RES0 | Implementer | Variant | Arch | PartNum |Revision|
|
||||
+ * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________|
|
||||
+ *
|
||||
+ */
|
||||
+
|
||||
+# define ARM_CPU_IMP_ARM 0x41
|
||||
+
|
||||
+# define ARM_CPU_PART_CORTEX_A72 0xD08
|
||||
+# define ARM_CPU_PART_N1 0xD0C
|
||||
+
|
||||
+# define MIDR_PARTNUM_SHIFT 4
|
||||
+# define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT)
|
||||
+# define MIDR_PARTNUM(midr) \
|
||||
+ (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT)
|
||||
+
|
||||
+# define MIDR_IMPLEMENTER_SHIFT 24
|
||||
+# define MIDR_IMPLEMENTER_MASK (0xff << MIDR_IMPLEMENTER_SHIFT)
|
||||
+# define MIDR_IMPLEMENTER(midr) \
|
||||
+ (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT)
|
||||
+
|
||||
+# define MIDR_ARCHITECTURE_SHIFT 16
|
||||
+# define MIDR_ARCHITECTURE_MASK (0xf << MIDR_ARCHITECTURE_SHIFT)
|
||||
+# define MIDR_ARCHITECTURE(midr) \
|
||||
+ (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT)
|
||||
+
|
||||
+# define MIDR_CPU_MODEL_MASK \
|
||||
+ (MIDR_IMPLEMENTER_MASK | \
|
||||
+ MIDR_PARTNUM_MASK | \
|
||||
+ MIDR_ARCHITECTURE_MASK)
|
||||
+
|
||||
+# define MIDR_CPU_MODEL(imp, partnum) \
|
||||
+ (((imp) << MIDR_IMPLEMENTER_SHIFT) | \
|
||||
+ (0xf << MIDR_ARCHITECTURE_SHIFT) | \
|
||||
+ ((partnum) << MIDR_PARTNUM_SHIFT))
|
||||
+
|
||||
+# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \
|
||||
+ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum))
|
||||
#endif
|
||||
Index: openssl-1.1.1d/crypto/armcap.c
|
||||
===================================================================
|
||||
--- openssl-1.1.1d.orig/crypto/armcap.c
|
||||
+++ openssl-1.1.1d/crypto/armcap.c
|
||||
@@ -18,6 +18,8 @@
|
||||
#include "arm_arch.h"
|
||||
|
||||
unsigned int OPENSSL_armcap_P = 0;
|
||||
+unsigned int OPENSSL_arm_midr = 0;
|
||||
+unsigned int OPENSSL_armv8_rsa_neonized = 0;
|
||||
|
||||
#if __ARM_MAX_ARCH__<7
|
||||
void OPENSSL_cpuid_setup(void)
|
||||
@@ -48,6 +50,7 @@ void _armv8_sha256_probe(void);
|
||||
void _armv8_pmull_probe(void);
|
||||
# ifdef __aarch64__
|
||||
void _armv8_sha512_probe(void);
|
||||
+unsigned int _armv8_cpuid_probe(void);
|
||||
# endif
|
||||
uint32_t _armv7_tick(void);
|
||||
|
||||
@@ -95,6 +98,7 @@ void OPENSSL_cpuid_setup(void) __attribu
|
||||
# define HWCAP_CE_PMULL (1 << 4)
|
||||
# define HWCAP_CE_SHA1 (1 << 5)
|
||||
# define HWCAP_CE_SHA256 (1 << 6)
|
||||
+# define HWCAP_CPUID (1 << 11)
|
||||
# define HWCAP_CE_SHA512 (1 << 21)
|
||||
# endif
|
||||
|
||||
@@ -155,6 +159,9 @@ void OPENSSL_cpuid_setup(void)
|
||||
# ifdef __aarch64__
|
||||
if (hwcap & HWCAP_CE_SHA512)
|
||||
OPENSSL_armcap_P |= ARMV8_SHA512;
|
||||
+
|
||||
+ if (hwcap & HWCAP_CPUID)
|
||||
+ OPENSSL_armcap_P |= ARMV8_CPUID;
|
||||
# endif
|
||||
}
|
||||
# endif
|
||||
@@ -210,5 +217,16 @@ void OPENSSL_cpuid_setup(void)
|
||||
|
||||
sigaction(SIGILL, &ill_oact, NULL);
|
||||
sigprocmask(SIG_SETMASK, &oset, NULL);
|
||||
+
|
||||
+# ifdef __aarch64__
|
||||
+ if (OPENSSL_armcap_P & ARMV8_CPUID)
|
||||
+ OPENSSL_arm_midr = _armv8_cpuid_probe();
|
||||
+
|
||||
+ if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) ||
|
||||
+ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) &&
|
||||
+ (OPENSSL_armcap_P & ARMV7_NEON)) {
|
||||
+ OPENSSL_armv8_rsa_neonized = 1;
|
||||
+ }
|
||||
+# endif
|
||||
}
|
||||
#endif
|
||||
Index: openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl
|
||||
===================================================================
|
||||
--- openssl-1.1.1d.orig/crypto/bn/asm/armv8-mont.pl
|
||||
+++ openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl
|
||||
@@ -64,16 +64,34 @@ $n0="x4"; # const BN_ULONG *n0,
|
||||
$num="x5"; # int num);
|
||||
|
||||
$code.=<<___;
|
||||
+#ifndef __KERNEL__
|
||||
+# include "arm_arch.h"
|
||||
+.extern OPENSSL_armv8_rsa_neonized
|
||||
+.hidden OPENSSL_armv8_rsa_neonized
|
||||
+#endif
|
||||
.text
|
||||
|
||||
.globl bn_mul_mont
|
||||
.type bn_mul_mont,%function
|
||||
.align 5
|
||||
bn_mul_mont:
|
||||
+.Lbn_mul_mont:
|
||||
+ tst $num,#3
|
||||
+ b.ne .Lmul_mont
|
||||
+ cmp $num,#32
|
||||
+ b.le .Lscalar_impl
|
||||
+#ifndef __KERNEL__
|
||||
+ adrp x17,OPENSSL_armv8_rsa_neonized
|
||||
+ ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
|
||||
+ cbnz w17, bn_mul8x_mont_neon
|
||||
+#endif
|
||||
+
|
||||
+.Lscalar_impl:
|
||||
tst $num,#7
|
||||
b.eq __bn_sqr8x_mont
|
||||
tst $num,#3
|
||||
b.eq __bn_mul4x_mont
|
||||
+
|
||||
.Lmul_mont:
|
||||
stp x29,x30,[sp,#-64]!
|
||||
add x29,sp,#0
|
||||
@@ -271,6 +289,369 @@ bn_mul_mont:
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
___
|
||||
{
|
||||
+my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
|
||||
+my ($Z,$Temp)=("v4.16b","v5");
|
||||
+my @ACC=map("v$_",(6..13));
|
||||
+my ($Bi,$Ni,$M0)=map("v$_",(28..30));
|
||||
+my $sBi="s28";
|
||||
+my $sM0="s30";
|
||||
+my $zero="v14";
|
||||
+my $temp="v15";
|
||||
+my $ACCTemp="v16";
|
||||
+
|
||||
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
|
||||
+my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
|
||||
+
|
||||
+$code.=<<___;
|
||||
+.type bn_mul8x_mont_neon,%function
|
||||
+.align 5
|
||||
+bn_mul8x_mont_neon:
|
||||
+ stp x29,x30,[sp,#-80]!
|
||||
+ mov x16,sp
|
||||
+ stp d8,d9,[sp,#16]
|
||||
+ stp d10,d11,[sp,#32]
|
||||
+ stp d12,d13,[sp,#48]
|
||||
+ stp d14,d15,[sp,#64]
|
||||
+ lsl $num,$num,#1
|
||||
+ eor $zero.16b,$zero.16b,$zero.16b
|
||||
+
|
||||
+.align 4
|
||||
+.LNEON_8n:
|
||||
+ eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
|
||||
+ sub $toutptr,sp,#128
|
||||
+ eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
|
||||
+ sub $toutptr,$toutptr,$num,lsl#4
|
||||
+ eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
|
||||
+ and $toutptr,$toutptr,#-64
|
||||
+ eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
|
||||
+ mov sp,$toutptr // alloca
|
||||
+ eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
|
||||
+ add $toutptr,$toutptr,#256
|
||||
+ eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
|
||||
+ sub $inner,$num,#8
|
||||
+ eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
|
||||
+ eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
|
||||
+
|
||||
+.LNEON_8n_init:
|
||||
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
|
||||
+ subs $inner,$inner,#8
|
||||
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
|
||||
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
|
||||
+ st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
|
||||
+ bne .LNEON_8n_init
|
||||
+
|
||||
+ add $tinptr,sp,#256
|
||||
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
|
||||
+ add $bnptr,sp,#8
|
||||
+ ldr $sM0,[$n0],#4
|
||||
+ mov $outer,$num
|
||||
+ b .LNEON_8n_outer
|
||||
+
|
||||
+.align 4
|
||||
+.LNEON_8n_outer:
|
||||
+ ldr $sBi,[$bptr],#4 // *b++
|
||||
+ uxtl $Bi.4s,$Bi.4h
|
||||
+ add $toutptr,sp,#128
|
||||
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
|
||||
+
|
||||
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
|
||||
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
|
||||
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
|
||||
+ shl $Ni.2d,@ACC[0].2d,#16
|
||||
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
|
||||
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
|
||||
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
|
||||
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
|
||||
+ mul $Ni.2s,$Ni.2s,$M0.2s
|
||||
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
|
||||
+ st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
|
||||
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
|
||||
+ uxtl $Ni.4s,$Ni.4h
|
||||
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
|
||||
+___
|
||||
+for ($i=0; $i<7;) {
|
||||
+$code.=<<___;
|
||||
+ ldr $sBi,[$bptr],#4 // *b++
|
||||
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
|
||||
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
|
||||
+ uxtl $Bi.4s,$Bi.4h
|
||||
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
|
||||
+ ushr $temp.2d,@ACC[0].2d,#16
|
||||
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
|
||||
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
|
||||
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
|
||||
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
|
||||
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
|
||||
+ ushr @ACC[0].2d,@ACC[0].2d,#16
|
||||
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
|
||||
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
|
||||
+ add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
|
||||
+ ins @ACC[1].d[0],$ACCTemp.d[0]
|
||||
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
|
||||
+___
|
||||
+ push(@ACC,shift(@ACC)); $i++;
|
||||
+$code.=<<___;
|
||||
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
|
||||
+ ld1 {@ACC[7].2d},[$tinptr],#16
|
||||
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
|
||||
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
|
||||
+ shl $Ni.2d,@ACC[0].2d,#16
|
||||
+ ext $Ni.16b,$Ni.16b,$Ni.16b,#8
|
||||
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
|
||||
+ add $Ni.2d,$Ni.2d,@ACC[0].2d
|
||||
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
|
||||
+ mul $Ni.2s,$Ni.2s,$M0.2s
|
||||
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
|
||||
+ st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
|
||||
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
|
||||
+ uxtl $Ni.4s,$Ni.4h
|
||||
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
|
||||
+___
|
||||
+}
|
||||
+$code.=<<___;
|
||||
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
|
||||
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
|
||||
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
|
||||
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
|
||||
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
|
||||
+ mov $Temp.16b,@ACC[0].16b
|
||||
+ ushr $Temp.2d,$Temp.2d,#16
|
||||
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
|
||||
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
|
||||
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
|
||||
+ add @ACC[0].2d,@ACC[0].2d,$Temp.2d
|
||||
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
|
||||
+ ushr @ACC[0].2d,@ACC[0].2d,#16
|
||||
+ eor $temp.16b,$temp.16b,$temp.16b
|
||||
+ ins @ACC[0].d[1],$temp.d[0]
|
||||
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
|
||||
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
|
||||
+ add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
|
||||
+ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
|
||||
+ add $bnptr,sp,#8 // rewind
|
||||
+___
|
||||
+ push(@ACC,shift(@ACC));
|
||||
+$code.=<<___;
|
||||
+ sub $inner,$num,#8
|
||||
+ b .LNEON_8n_inner
|
||||
+
|
||||
+.align 4
|
||||
+.LNEON_8n_inner:
|
||||
+ subs $inner,$inner,#8
|
||||
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
|
||||
+ ld1 {@ACC[7].2d},[$tinptr]
|
||||
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
|
||||
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
|
||||
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
|
||||
+ ld1 {$N0.4s,$N1.4s},[$nptr],#32
|
||||
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
|
||||
+ b.eq .LInner_jump
|
||||
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
|
||||
+.LInner_jump:
|
||||
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
|
||||
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
|
||||
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
|
||||
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
|
||||
+___
|
||||
+for ($i=1; $i<8; $i++) {
|
||||
+$code.=<<___;
|
||||
+ ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
|
||||
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
|
||||
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
|
||||
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
|
||||
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
|
||||
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
|
||||
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
|
||||
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
|
||||
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
|
||||
+ st1 {@ACC[0].2d},[$toutptr],#16
|
||||
+___
|
||||
+ push(@ACC,shift(@ACC));
|
||||
+$code.=<<___;
|
||||
+ umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
|
||||
+ ld1 {@ACC[7].2d},[$tinptr]
|
||||
+ umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
|
||||
+ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
|
||||
+ umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
|
||||
+ b.eq .LInner_jump$i
|
||||
+ add $tinptr,$tinptr,#16 // don't advance in last iteration
|
||||
+.LInner_jump$i:
|
||||
+ umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
|
||||
+ umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
|
||||
+ umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
|
||||
+ umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
|
||||
+ umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
|
||||
+___
|
||||
+}
|
||||
+$code.=<<___;
|
||||
+ b.ne .LInner_after_rewind$i
|
||||
+ sub $aptr,$aptr,$num,lsl#2 // rewind
|
||||
+.LInner_after_rewind$i:
|
||||
+ umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
|
||||
+ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
|
||||
+ umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
|
||||
+ ld1 {$A0.4s,$A1.4s},[$aptr],#32
|
||||
+ umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
|
||||
+ add $bnptr,sp,#8 // rewind
|
||||
+ umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
|
||||
+ umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
|
||||
+ umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
|
||||
+ umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
|
||||
+ st1 {@ACC[0].2d},[$toutptr],#16
|
||||
+ umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
|
||||
+
|
||||
+ bne .LNEON_8n_inner
|
||||
+___
|
||||
+ push(@ACC,shift(@ACC));
|
||||
+$code.=<<___;
|
||||
+ add $tinptr,sp,#128
|
||||
+ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
|
||||
+ eor $N0.16b,$N0.16b,$N0.16b // $N0
|
||||
+ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
|
||||
+ eor $N1.16b,$N1.16b,$N1.16b // $N1
|
||||
+ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
|
||||
+ st1 {@ACC[6].2d},[$toutptr]
|
||||
+
|
||||
+ subs $outer,$outer,#8
|
||||
+ ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
|
||||
+ ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
|
||||
+ ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
|
||||
+ ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
|
||||
+
|
||||
+ b.eq .LInner_8n_jump_2steps
|
||||
+ sub $nptr,$nptr,$num,lsl#2 // rewind
|
||||
+ b .LNEON_8n_outer
|
||||
+
|
||||
+.LInner_8n_jump_2steps:
|
||||
+ add $toutptr,sp,#128
|
||||
+ st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
|
||||
+ mov $Temp.16b,@ACC[0].16b
|
||||
+ ushr $temp.2d,@ACC[0].2d,#16
|
||||
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
|
||||
+ st1 {$N0.2d,$N1.2d}, [sp],#32
|
||||
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
|
||||
+ st1 {$N0.2d,$N1.2d}, [sp],#32
|
||||
+ ushr $temp.2d,@ACC[0].2d,#16
|
||||
+ st1 {$N0.2d,$N1.2d}, [sp],#32
|
||||
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
|
||||
+ ins $temp.d[1],$zero.d[0]
|
||||
+
|
||||
+ mov $inner,$num
|
||||
+ b .LNEON_tail_entry
|
||||
+
|
||||
+.align 4
|
||||
+.LNEON_tail:
|
||||
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
|
||||
+ mov $Temp.16b,@ACC[0].16b
|
||||
+ ushr $temp.2d,@ACC[0].2d,#16
|
||||
+ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
|
||||
+ ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
|
||||
+ add @ACC[0].2d,@ACC[0].2d,$temp.2d
|
||||
+ ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
|
||||
+ ushr $temp.2d,@ACC[0].2d,#16
|
||||
+ ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
|
||||
+ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
|
||||
+ ins $temp.d[1],$zero.d[0]
|
||||
+
|
||||
+.LNEON_tail_entry:
|
||||
+___
|
||||
+for ($i=1; $i<8; $i++) {
|
||||
+$code.=<<___;
|
||||
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
|
||||
+ st1 {@ACC[0].s}[0], [$toutptr],#4
|
||||
+ ushr $temp.2d,@ACC[1].2d,#16
|
||||
+ mov $Temp.16b,@ACC[1].16b
|
||||
+ ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
|
||||
+ add @ACC[1].2d,@ACC[1].2d,$temp.2d
|
||||
+ ushr $temp.2d,@ACC[1].2d,#16
|
||||
+ zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
|
||||
+ ins $temp.d[1],$zero.d[0]
|
||||
+___
|
||||
+ push(@ACC,shift(@ACC));
|
||||
+}
|
||||
+ push(@ACC,shift(@ACC));
|
||||
+$code.=<<___;
|
||||
+ ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
|
||||
+ subs $inner,$inner,#8
|
||||
+ st1 {@ACC[7].s}[0], [$toutptr],#4
|
||||
+ bne .LNEON_tail
|
||||
+
|
||||
+ st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
|
||||
+ sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
|
||||
+ subs $aptr,sp,#0 // clear carry flag
|
||||
+ add $bptr,sp,$num,lsl#2
|
||||
+
|
||||
+.LNEON_sub:
|
||||
+ ldp w4,w5,[$aptr],#8
|
||||
+ ldp w6,w7,[$aptr],#8
|
||||
+ ldp w8,w9,[$nptr],#8
|
||||
+ ldp w10,w11,[$nptr],#8
|
||||
+ sbcs w8,w4,w8
|
||||
+ sbcs w9,w5,w9
|
||||
+ sbcs w10,w6,w10
|
||||
+ sbcs w11,w7,w11
|
||||
+ sub x17,$bptr,$aptr
|
||||
+ stp w8,w9,[$rptr],#8
|
||||
+ stp w10,w11,[$rptr],#8
|
||||
+ cbnz x17,.LNEON_sub
|
||||
+
|
||||
+ ldr w10, [$aptr] // load top-most bit
|
||||
+ mov x11,sp
|
||||
+ eor v0.16b,v0.16b,v0.16b
|
||||
+ sub x11,$bptr,x11 // this is num*4
|
||||
+ eor v1.16b,v1.16b,v1.16b
|
||||
+ mov $aptr,sp
|
||||
+ sub $rptr,$rptr,x11 // rewind $rptr
|
||||
+ mov $nptr,$bptr // second 3/4th of frame
|
||||
+ sbcs w10,w10,wzr // result is carry flag
|
||||
+
|
||||
+.LNEON_copy_n_zap:
|
||||
+ ldp w4,w5,[$aptr],#8
|
||||
+ ldp w6,w7,[$aptr],#8
|
||||
+ ldp w8,w9,[$rptr],#8
|
||||
+ ldp w10,w11,[$rptr]
|
||||
+ sub $rptr,$rptr,#8
|
||||
+ b.cs .LCopy_1
|
||||
+ mov w8,w4
|
||||
+ mov w9,w5
|
||||
+ mov w10,w6
|
||||
+ mov w11,w7
|
||||
+.LCopy_1:
|
||||
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
|
||||
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
|
||||
+ ldp w4,w5,[$aptr],#8
|
||||
+ ldp w6,w7,[$aptr],#8
|
||||
+ stp w8,w9,[$rptr],#8
|
||||
+ stp w10,w11,[$rptr],#8
|
||||
+ sub $aptr,$aptr,#32
|
||||
+ ldp w8,w9,[$rptr],#8
|
||||
+ ldp w10,w11,[$rptr]
|
||||
+ sub $rptr,$rptr,#8
|
||||
+ b.cs .LCopy_2
|
||||
+ mov w8, w4
|
||||
+ mov w9, w5
|
||||
+ mov w10, w6
|
||||
+ mov w11, w7
|
||||
+.LCopy_2:
|
||||
+ st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
|
||||
+ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
|
||||
+ sub x17,$bptr,$aptr // preserves carry
|
||||
+ stp w8,w9,[$rptr],#8
|
||||
+ stp w10,w11,[$rptr],#8
|
||||
+ cbnz x17,.LNEON_copy_n_zap
|
||||
+
|
||||
+ mov sp,x16
|
||||
+ ldp d14,d15,[sp,#64]
|
||||
+ ldp d12,d13,[sp,#48]
|
||||
+ ldp d10,d11,[sp,#32]
|
||||
+ ldp d8,d9,[sp,#16]
|
||||
+ ldr x29,[sp],#80
|
||||
+ ret // bx lr
|
||||
+
|
||||
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
|
||||
+___
|
||||
+}
|
||||
+{
|
||||
########################################################################
|
||||
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
|
||||
|
||||
Index: openssl-1.1.1d/crypto/bn/build.info
|
||||
===================================================================
|
||||
--- openssl-1.1.1d.orig/crypto/bn/build.info
|
||||
+++ openssl-1.1.1d/crypto/bn/build.info
|
||||
@@ -65,3 +65,4 @@ INCLUDE[armv4-mont.o]=..
|
||||
GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl $(PERLASM_SCHEME)
|
||||
INCLUDE[armv4-gf2m.o]=..
|
||||
GENERATE[armv8-mont.S]=asm/armv8-mont.pl $(PERLASM_SCHEME)
|
||||
+INCLUDE[armv8-mont.o]=..
|
2308
openssl-1_1-Optimize-ppc64.patch
Normal file
2308
openssl-1_1-Optimize-ppc64.patch
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,19 @@
|
||||
-------------------------------------------------------------------
|
||||
Fri Jan 28 17:33:51 UTC 2022 - Pedro Monreal <pmonreal@suse.com>
|
||||
|
||||
- Backport cryptographic improvements from OpenSSL 3 [jsc#SLE-19766]
|
||||
* Optimize RSA on armv8: openssl-1_1-Optimize-RSA-armv8.patch
|
||||
* Optimize AES-XTS mode for aarch64:
|
||||
openssl-1_1-Optimize-AES-XTS-aarch64.patch
|
||||
* Optimize AES-GCM for uarchs with unroll and new instructions:
|
||||
openssl-1_1-Optimize-AES-GCM-uarchs.patch
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Fri Jan 28 17:33:23 UTC 2022 - Pedro Monreal <pmonreal@suse.com>
|
||||
|
||||
- POWER10 performance enhancements for cryptography [jsc#SLE-19409]
|
||||
* openssl-1_1-Optimize-ppc64.patch
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Tue Dec 28 12:34:04 UTC 2021 - Pedro Monreal <pmonreal@suse.com>
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
#
|
||||
# spec file for package openssl-1_1
|
||||
#
|
||||
# Copyright (c) 2021 SUSE LLC
|
||||
# Copyright (c) 2022 SUSE LLC
|
||||
#
|
||||
# All modifications and additions to the file contributed by third parties
|
||||
# remain the property of their copyright owners, unless otherwise agreed
|
||||
@ -112,6 +112,12 @@ Patch54: openssl-1_1-use-seclevel2-in-tests.patch
|
||||
Patch55: openssl-1_1-disable-test_srp-sslapi.patch
|
||||
Patch56: openssl-add_rfc3526_rfc7919.patch
|
||||
Patch57: openssl-1_1-use-include-directive.patch
|
||||
#PATCH-FIX-UPSTREAM jsc#SLE-19409 POWER10 performance enhancements for cryptography
|
||||
Patch69: openssl-1_1-Optimize-ppc64.patch
|
||||
#PATCH-FIX-UPSTREAM jsc#SLE-19766 Backport Arm improvements from OpenSSL 3
|
||||
Patch70: openssl-1_1-Optimize-RSA-armv8.patch
|
||||
Patch71: openssl-1_1-Optimize-AES-XTS-aarch64.patch
|
||||
Patch72: openssl-1_1-Optimize-AES-GCM-uarchs.patch
|
||||
BuildRequires: pkgconfig
|
||||
%if 0%{?sle_version} >= 150400 || 0%{?suse_version} >= 1550
|
||||
Requires: crypto-policies
|
||||
@ -245,6 +251,7 @@ make all %{?_smp_mflags}
|
||||
export MALLOC_CHECK_=3
|
||||
export MALLOC_PERTURB_=$(($RANDOM % 255 + 1))
|
||||
#export HARNESS_VERBOSE=1
|
||||
#export OPENSSL_FORCE_FIPS_MODE=1
|
||||
LD_LIBRARY_PATH=`pwd` make test -j1
|
||||
|
||||
# show ciphers
|
||||
@ -330,7 +337,7 @@ cp %{SOURCE5} .
|
||||
# invalidates a HMAC that may have been created earlier.
|
||||
# solution: create the hashes _after_ the macro runs.
|
||||
#
|
||||
# this shows up earlier because otherwise the %%expand of
|
||||
# this shows up earlier because otherwise the expand of
|
||||
# the macro is too late.
|
||||
# remark: This is the same as running
|
||||
# openssl dgst -sha256 -hmac 'ppaksykemnsecgtsttplmamstKMEs'
|
||||
|
Loading…
Reference in New Issue
Block a user