Pedro Monreal Gonzalez
8903999f6a
- Backport cryptographic improvements from OpenSSL 3 [jsc#SLE-19766] * Optimize RSA on armv8: openssl-1_1-Optimize-RSA-armv8.patch * Optimize AES-XTS mode for aarch64: openssl-1_1-Optimize-AES-XTS-aarch64.patch * Optimize AES-GCM for uarchs with unroll and new instructions: openssl-1_1-Optimize-AES-GCM-uarchs.patch - POWER10 performance enhancements for cryptography [jsc#SLE-19409] * openssl-1_1-Optimize-ppc64.patch OBS-URL: https://build.opensuse.org/request/show/949750 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=102
2309 lines
59 KiB
Diff
2309 lines
59 KiB
Diff
From 4dba53694bf633c272075e62acdc5a5ca3003ce6 Mon Sep 17 00:00:00 2001
|
|
From: Amitay Isaacs <amitay@ozlabs.org>
|
|
Date: Mon, 29 Mar 2021 18:06:13 +1100
|
|
Subject: [PATCH 01/29] numbers: Define 128-bit integers if compiler supports
|
|
|
|
Signed-off-by: Amitay Isaacs <amitay@ozlabs.org>
|
|
|
|
Reviewed-by: Tomas Mraz <tomas@openssl.org>
|
|
Reviewed-by: Matt Caswell <matt@openssl.org>
|
|
(Merged from https://github.com/openssl/openssl/pull/14784)
|
|
|
|
(cherry picked from commit bbed0d1cbd436af6797d7837e270bff4ca4d5a10)
|
|
---
|
|
include/internal/numbers.h | 10 ++++++++++
|
|
1 file changed, 10 insertions(+)
|
|
|
|
Index: openssl-1.1.1l/include/internal/numbers.h
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/include/internal/numbers.h
|
|
+++ openssl-1.1.1l/include/internal/numbers.h
|
|
@@ -60,6 +60,16 @@
|
|
# define UINT64_MAX __MAXUINT__(uint64_t)
|
|
# endif
|
|
|
|
+# ifndef INT128_MAX
|
|
+# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16
|
|
+typedef __int128_t int128_t;
|
|
+typedef __uint128_t uint128_t;
|
|
+# define INT128_MIN __MININT__(int128_t)
|
|
+# define INT128_MAX __MAXINT__(int128_t)
|
|
+# define UINT128_MAX __MAXUINT__(uint128_t)
|
|
+# endif
|
|
+# endif
|
|
+
|
|
# ifndef SIZE_MAX
|
|
# define SIZE_MAX __MAXUINT__(size_t)
|
|
# endif
|
|
Index: openssl-1.1.1l/crypto/bn/bn_div.c
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/bn/bn_div.c
|
|
+++ openssl-1.1.1l/crypto/bn/bn_div.c
|
|
@@ -97,7 +97,7 @@ BN_ULONG bn_div_3_words(const BN_ULONG *
|
|
*/
|
|
# if BN_BITS2 == 64 && defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
|
|
# undef BN_ULLONG
|
|
-# define BN_ULLONG __uint128_t
|
|
+# define BN_ULLONG uint128_t
|
|
# define BN_LLONG
|
|
# endif
|
|
|
|
Index: openssl-1.1.1l/crypto/bn/bn_local.h
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/bn/bn_local.h
|
|
+++ openssl-1.1.1l/crypto/bn/bn_local.h
|
|
@@ -22,6 +22,7 @@
|
|
# endif
|
|
|
|
# include "crypto/bn.h"
|
|
+# include "internal/numbers.h"
|
|
|
|
/*
|
|
* These preprocessor symbols control various aspects of the bignum headers
|
|
@@ -374,9 +375,9 @@ struct bn_gencb_st {
|
|
*/
|
|
# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 && \
|
|
(defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
|
|
-# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64)
|
|
+# define BN_UMULT_HIGH(a,b) (((uint128_t)(a)*(b))>>64)
|
|
# define BN_UMULT_LOHI(low,high,a,b) ({ \
|
|
- __uint128_t ret=(__uint128_t)(a)*(b); \
|
|
+ uint128_t ret=(uint128_t)(a)*(b); \
|
|
(high)=ret>>64; (low)=ret; })
|
|
# elif defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
|
|
# if defined(__DECC)
|
|
Index: openssl-1.1.1l/crypto/ec/curve25519.c
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/curve25519.c
|
|
+++ openssl-1.1.1l/crypto/ec/curve25519.c
|
|
@@ -11,6 +11,8 @@
|
|
#include "ec_local.h"
|
|
#include <openssl/sha.h>
|
|
|
|
+#include "internal/numbers.h"
|
|
+
|
|
#if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
|
|
defined(_M_AMD64) || defined(_M_X64))
|
|
|
|
@@ -252,7 +254,7 @@ static void x25519_scalar_mulx(uint8_t o
|
|
#endif
|
|
|
|
#if defined(X25519_ASM) \
|
|
- || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
|
|
+ || ( defined(INT128_MAX) \
|
|
&& !defined(__sparc__) \
|
|
&& (!defined(__SIZEOF_LONG__) || (__SIZEOF_LONG__ == 8)) \
|
|
&& !(defined(__ANDROID__) && !defined(__clang__)) )
|
|
@@ -385,7 +387,7 @@ void x25519_fe51_mul121666(fe51 h, fe51
|
|
# define fe51_mul121666 x25519_fe51_mul121666
|
|
# else
|
|
|
|
-typedef __uint128_t u128;
|
|
+typedef uint128_t u128;
|
|
|
|
static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
|
|
{
|
|
Index: openssl-1.1.1l/crypto/ec/curve448/curve448utils.h
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/curve448/curve448utils.h
|
|
+++ openssl-1.1.1l/crypto/ec/curve448/curve448utils.h
|
|
@@ -15,6 +15,8 @@
|
|
|
|
# include <openssl/e_os2.h>
|
|
|
|
+# include "internal/numbers.h"
|
|
+
|
|
/*
|
|
* Internal word types. Somewhat tricky. This could be decided separately per
|
|
* platform. However, the structs do need to be all the same size and
|
|
@@ -41,9 +43,9 @@ typedef int64_t c448_sword_t;
|
|
/* "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */
|
|
typedef uint64_t c448_bool_t;
|
|
/* Double-word size for internal computations */
|
|
-typedef __uint128_t c448_dword_t;
|
|
+typedef uint128_t c448_dword_t;
|
|
/* Signed double-word size for internal computations */
|
|
-typedef __int128_t c448_dsword_t;
|
|
+typedef int128_t c448_dsword_t;
|
|
# elif C448_WORD_BITS == 32
|
|
/* Word size for internal computations */
|
|
typedef uint32_t c448_word_t;
|
|
Index: openssl-1.1.1l/crypto/ec/curve448/word.h
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/curve448/word.h
|
|
+++ openssl-1.1.1l/crypto/ec/curve448/word.h
|
|
@@ -17,15 +17,20 @@
|
|
# include <assert.h>
|
|
# include <stdlib.h>
|
|
# include <openssl/e_os2.h>
|
|
-# include "arch_intrinsics.h"
|
|
# include "curve448utils.h"
|
|
|
|
+# ifdef INT128_MAX
|
|
+# include "arch_64/arch_intrinsics.h"
|
|
+# else
|
|
+# include "arch_32/arch_intrinsics.h"
|
|
+# endif
|
|
+
|
|
# if (ARCH_WORD_BITS == 64)
|
|
typedef uint64_t word_t, mask_t;
|
|
-typedef __uint128_t dword_t;
|
|
+typedef uint128_t dword_t;
|
|
typedef int32_t hsword_t;
|
|
typedef int64_t sword_t;
|
|
-typedef __int128_t dsword_t;
|
|
+typedef int128_t dsword_t;
|
|
# elif (ARCH_WORD_BITS == 32)
|
|
typedef uint32_t word_t, mask_t;
|
|
typedef uint64_t dword_t;
|
|
Index: openssl-1.1.1l/crypto/ec/ecp_nistp224.c
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp224.c
|
|
+++ openssl-1.1.1l/crypto/ec/ecp_nistp224.c
|
|
@@ -40,11 +40,9 @@ NON_EMPTY_TRANSLATION_UNIT
|
|
# include <openssl/err.h>
|
|
# include "ec_local.h"
|
|
|
|
-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
|
|
- /* even with gcc, the typedef won't work for 32-bit platforms */
|
|
-typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
|
|
- * platforms */
|
|
-# else
|
|
+#include "internal/numbers.h"
|
|
+
|
|
+#ifndef INT128_MAX
|
|
# error "Your compiler doesn't appear to support 128-bit integer types"
|
|
# endif
|
|
|
|
Index: openssl-1.1.1l/crypto/ec/ecp_nistp256.c
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp256.c
|
|
+++ openssl-1.1.1l/crypto/ec/ecp_nistp256.c
|
|
@@ -41,14 +41,11 @@ NON_EMPTY_TRANSLATION_UNIT
|
|
# include <openssl/err.h>
|
|
# include "ec_local.h"
|
|
|
|
-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
|
|
- /* even with gcc, the typedef won't work for 32-bit platforms */
|
|
-typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
|
|
- * platforms */
|
|
-typedef __int128_t int128_t;
|
|
-# else
|
|
-# error "Your compiler doesn't appear to support 128-bit integer types"
|
|
-# endif
|
|
+#include "internal/numbers.h"
|
|
+
|
|
+#ifndef INT128_MAX
|
|
+# error "Your compiler doesn't appear to support 128-bit integer types"
|
|
+#endif
|
|
|
|
typedef uint8_t u8;
|
|
typedef uint32_t u32;
|
|
Index: openssl-1.1.1l/crypto/ec/ecp_nistp521.c
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp521.c
|
|
+++ openssl-1.1.1l/crypto/ec/ecp_nistp521.c
|
|
@@ -40,13 +40,11 @@ NON_EMPTY_TRANSLATION_UNIT
|
|
# include <openssl/err.h>
|
|
# include "ec_local.h"
|
|
|
|
-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
|
|
- /* even with gcc, the typedef won't work for 32-bit platforms */
|
|
-typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
|
|
- * platforms */
|
|
-# else
|
|
-# error "Your compiler doesn't appear to support 128-bit integer types"
|
|
-# endif
|
|
+#include "internal/numbers.h"
|
|
+
|
|
+#ifndef INT128_MAX
|
|
+# error "Your compiler doesn't appear to support 128-bit integer types"
|
|
+#endif
|
|
|
|
typedef uint8_t u8;
|
|
typedef uint64_t u64;
|
|
@@ -400,7 +398,7 @@ static void felem_diff128(largefelem out
|
|
* On exit:
|
|
* out[i] < 17 * max(in[i]) * max(in[i])
|
|
*/
|
|
-static void felem_square(largefelem out, const felem in)
|
|
+static void felem_square_ref(largefelem out, const felem in)
|
|
{
|
|
felem inx2, inx4;
|
|
felem_scalar(inx2, in, 2);
|
|
@@ -484,7 +482,7 @@ static void felem_square(largefelem out,
|
|
* On exit:
|
|
* out[i] < 17 * max(in1[i]) * max(in2[i])
|
|
*/
|
|
-static void felem_mul(largefelem out, const felem in1, const felem in2)
|
|
+static void felem_mul_ref(largefelem out, const felem in1, const felem in2)
|
|
{
|
|
felem in2x2;
|
|
felem_scalar(in2x2, in2, 2);
|
|
@@ -674,6 +672,57 @@ static void felem_reduce(felem out, cons
|
|
*/
|
|
}
|
|
|
|
+#if defined(ECP_NISTP521_ASM)
|
|
+static void felem_square_wrapper(largefelem out, const felem in);
|
|
+static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2);
|
|
+/* Dispatch pointers start at the wrappers, which pick an implementation on first use. */
|
|
+static void (*felem_square_p)(largefelem out, const felem in) =
|
|
+ felem_square_wrapper;
|
|
+static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) =
|
|
+ felem_mul_wrapper;
|
|
+/* Defined outside this file (presumably the ppc64 assembly); must keep external linkage. */
|
|
+void p521_felem_square(largefelem out, const felem in);
|
|
+void p521_felem_mul(largefelem out, const felem in1, const felem in2);
|
|
+
|
|
+# if defined(_ARCH_PPC64)
|
|
+# include "../ppc_arch.h"
|
|
+# endif
|
|
+/* Point both dispatch pointers at the accelerated or the reference routines. */
|
|
+static void felem_select(void)
|
|
+{
|
|
+# if defined(_ARCH_PPC64)
|
|
+ if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
|
|
+ felem_square_p = p521_felem_square;
|
|
+ felem_mul_p = p521_felem_mul;
|
|
+
|
|
+ return;
|
|
+ }
|
|
+# endif
|
|
+
|
|
+ /* Default */
|
|
+ felem_square_p = felem_square_ref;
|
|
+ felem_mul_p = felem_mul_ref;
|
|
+}
|
|
+/* Trampolines: run the selection, then forward the call through the updated pointer. */
|
|
+static void felem_square_wrapper(largefelem out, const felem in)
|
|
+{
|
|
+ felem_select();
|
|
+ felem_square_p(out, in);
|
|
+}
|
|
+
|
|
+static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2)
|
|
+{
|
|
+ felem_select();
|
|
+ felem_mul_p(out, in1, in2);
|
|
+}
|
|
+
|
|
+# define felem_square felem_square_p
|
|
+# define felem_mul felem_mul_p
|
|
+#else
|
|
+# define felem_square felem_square_ref
|
|
+# define felem_mul felem_mul_ref
|
|
+#endif
|
|
+
|
|
static void felem_square_reduce(felem out, const felem in)
|
|
{
|
|
largefelem tmp;
|
|
Index: openssl-1.1.1l/crypto/poly1305/poly1305.c
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/poly1305/poly1305.c
|
|
+++ openssl-1.1.1l/crypto/poly1305/poly1305.c
|
|
@@ -95,11 +95,10 @@ poly1305_blocks(void *ctx, const unsigne
|
|
(a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \
|
|
)
|
|
|
|
-# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \
|
|
- (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8)
|
|
+# if defined(INT64_MAX) && defined(INT128_MAX)
|
|
|
|
typedef unsigned long u64;
|
|
-typedef __uint128_t u128;
|
|
+typedef uint128_t u128;
|
|
|
|
typedef struct {
|
|
u64 h[3];
|
|
Index: openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/poly1305/poly1305_base2_44.c
|
|
+++ openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c
|
|
@@ -18,7 +18,7 @@
|
|
typedef unsigned char u8;
|
|
typedef unsigned int u32;
|
|
typedef unsigned long u64;
|
|
-typedef unsigned __int128 u128;
|
|
+typedef uint128_t u128;
|
|
|
|
typedef struct {
|
|
u64 h[3];
|
|
Index: openssl-1.1.1l/crypto/ec/build.info
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/build.info
|
|
+++ openssl-1.1.1l/crypto/ec/build.info
|
|
@@ -6,8 +13,9 @@ SOURCE[../../libcrypto]=\
|
|
ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \
|
|
ecp_oct.c ec2_oct.c ec_oct.c ec_kmeth.c ecdh_ossl.c ecdh_kdf.c \
|
|
ecdsa_ossl.c ecdsa_sign.c ecdsa_vrf.c curve25519.c ecx_meth.c \
|
|
- curve448/arch_32/f_impl.c curve448/f_generic.c curve448/scalar.c \
|
|
+ curve448/f_generic.c curve448/scalar.c \
|
|
curve448/curve448_tables.c curve448/eddsa.c curve448/curve448.c \
|
|
+ curve448/arch_64/f_impl64.c curve448/arch_32/f_impl32.c \
|
|
{- $target{ec_asm_src} -}
|
|
|
|
GENERATE[ecp_nistz256-x86.s]=asm/ecp_nistz256-x86.pl \
|
|
@@ -29,6 +38,8 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_n
|
|
INCLUDE[ecp_nistz256-armv8.o]=..
|
|
GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl $(PERLASM_SCHEME)
|
|
|
|
+GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl $(PERLASM_SCHEME)
|
|
+
|
|
GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl $(PERLASM_SCHEME)
|
|
GENERATE[x25519-ppc64.s]=asm/x25519-ppc64.pl $(PERLASM_SCHEME)
|
|
|
|
@@ -36,10 +47,3 @@ BEGINRAW[Makefile]
|
|
{- $builddir -}/ecp_nistz256-%.S: {- $sourcedir -}/asm/ecp_nistz256-%.pl
|
|
CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
|
|
ENDRAW[Makefile]
|
|
-
|
|
-INCLUDE[curve448/arch_32/f_impl.o]=curve448/arch_32 curve448
|
|
-INCLUDE[curve448/f_generic.o]=curve448/arch_32 curve448
|
|
-INCLUDE[curve448/scalar.o]=curve448/arch_32 curve448
|
|
-INCLUDE[curve448/curve448_tables.o]=curve448/arch_32 curve448
|
|
-INCLUDE[curve448/eddsa.o]=curve448/arch_32 curve448
|
|
-INCLUDE[curve448/curve448.o]=curve448/arch_32 curve448
|
|
Index: openssl-1.1.1l/crypto/ec/curve448/field.h
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/curve448/field.h
|
|
+++ openssl-1.1.1l/crypto/ec/curve448/field.h
|
|
@@ -66,10 +66,15 @@ void gf_serialize(uint8_t *serial, const
|
|
mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit,
|
|
uint8_t hi_nmask);
|
|
|
|
-# include "f_impl.h" /* Bring in the inline implementations */
|
|
|
|
# define LIMBPERM(i) (i)
|
|
-# define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
|
|
+# if (ARCH_WORD_BITS == 32)
|
|
+# include "arch_32/f_impl.h" /* Bring in the inline implementations */
|
|
+# define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
|
|
+# elif (ARCH_WORD_BITS == 64)
|
|
+# include "arch_64/f_impl.h" /* Bring in the inline implementations */
|
|
+# define LIMB_MASK(i) (((1ULL)<<LIMB_PLACE_VALUE(i))-1)
|
|
+# endif
|
|
|
|
static const gf ZERO = {{{0}}}, ONE = {{{1}}};
|
|
|
|
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/arch_intrinsics.h
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/arch_intrinsics.h
|
|
@@ -0,0 +1,27 @@
|
|
+/*
|
|
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
|
|
+ * Copyright 2016 Cryptography Research, Inc.
|
|
+ *
|
|
+ * Licensed under the OpenSSL license (the "License"). You may not use
|
|
+ * this file except in compliance with the License. You can obtain a copy
|
|
+ * in the file LICENSE in the source distribution or at
|
|
+ * https://www.openssl.org/source/license.html
|
|
+ *
|
|
+ * Originally written by Mike Hamburg
|
|
+ */
|
|
+
|
|
+# include "internal/constant_time.h"
|
|
+
|
|
+#ifndef OSSL_CRYPTO_EC_CURVE448_ARCH_64_INTRINSICS_H
|
|
+# define OSSL_CRYPTO_EC_CURVE448_ARCH_64_INTRINSICS_H
|
|
+
|
|
+# define ARCH_WORD_BITS 64
|
|
+
|
|
+# define word_is_zero(a) constant_time_is_zero_64(a)
|
|
+
|
|
+static ossl_inline uint128_t widemul(uint64_t a, uint64_t b) /* full 64x64 -> 128-bit product */
|
|
+{
|
|
+ return ((uint128_t) a) * b; /* widen a first so the multiplication is carried out in 128 bits */
|
|
+}
|
|
+
|
|
+#endif /* OSSL_CRYPTO_EC_CURVE448_ARCH_64_INTRINSICS_H */
|
|
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.h
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.h
|
|
@@ -0,0 +1,58 @@
|
|
+/*
|
|
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
|
|
+ * Copyright 2014-2016 Cryptography Research, Inc.
|
|
+ *
|
|
+ * Licensed under the OpenSSL license (the "License"). You may not use
|
|
+ * this file except in compliance with the License. You can obtain a copy
|
|
+ * in the file LICENSE in the source distribution or at
|
|
+ * https://www.openssl.org/source/license.html
|
|
+ *
|
|
+ * Originally written by Mike Hamburg
|
|
+ */
|
|
+
|
|
+#ifndef OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H
|
|
+# define OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H
|
|
+
|
|
+# define GF_HEADROOM 9999 /* Everything is reduced anyway */
|
|
+# define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
|
|
+
|
|
+# define LIMB_PLACE_VALUE(i) 56
|
|
+
|
|
+void gf_add_RAW(gf out, const gf a, const gf b) /* out = a + b, limb by limb, then weak-reduced */
|
|
+{
|
|
+ unsigned int i;
|
|
+
|
|
+ for (i = 0; i < NLIMBS; i++)
|
|
+ out->limb[i] = a->limb[i] + b->limb[i];
|
|
+
|
|
+ gf_weak_reduce(out); /* push the bits above bit 55 of each limb into the neighbouring limb */
|
|
+}
|
|
+
|
|
+void gf_sub_RAW(gf out, const gf a, const gf b) /* out = a - b plus a per-limb bias, then weak-reduced */
|
|
+{
|
|
+ uint64_t co1 = ((1ULL << 56) - 1) * 2, co2 = co1 - 2; /* per-limb bias; presumably the limbs of 2*p - TODO confirm against the modulus */
|
|
+ unsigned int i;
|
|
+
|
|
+ for (i = 0; i < NLIMBS; i++)
|
|
+ out->limb[i] = a->limb[i] - b->limb[i] + ((i == NLIMBS / 2) ? co2 : co1); /* middle limb gets co2, all others co1 */
|
|
+
|
|
+ gf_weak_reduce(out); /* push the bits above bit 55 of each limb into the neighbouring limb */
|
|
+}
|
|
+
|
|
+void gf_bias(gf a, int amt) /* no-op in this variant; both arguments are ignored */
|
|
+{ /* intentionally empty: gf_add_RAW and gf_sub_RAW in this header already call gf_weak_reduce() */
|
|
+}
|
|
+
|
|
+void gf_weak_reduce(gf a) /* one carry pass: each limb keeps its low 56 bits plus the carry from the limb below */
|
|
+{
|
|
+ uint64_t mask = (1ULL << 56) - 1; /* low 56 bits of a limb */
|
|
+ uint64_t tmp = a->limb[NLIMBS - 1] >> 56; /* carry out of the top limb */
|
|
+ unsigned int i;
|
|
+
|
|
+ a->limb[NLIMBS / 2] += tmp; /* the top carry is added to the middle limb here and to limb 0 below */
|
|
+ for (i = NLIMBS - 1; i > 0; i--) /* top-down, so limb[i - 1] is read before this loop rewrites it */
|
|
+ a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
|
|
+ a->limb[0] = (a->limb[0] & mask) + tmp;
|
|
+}
|
|
+
|
|
+#endif /* OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H */
|
|
Index: openssl-1.1.1l/include/internal/constant_time.h
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/include/internal/constant_time.h
|
|
+++ openssl-1.1.1l/include/internal/constant_time.h
|
|
@@ -181,6 +181,11 @@ static ossl_inline uint32_t constant_tim
|
|
return constant_time_msb_32(~a & (a - 1));
|
|
}
|
|
|
|
+static ossl_inline uint64_t constant_time_is_zero_64(uint64_t a) /* 64-bit counterpart of the 32-bit helper just above */
|
|
+{
|
|
+ return constant_time_msb_64(~a & (a - 1)); /* top bit of ~a & (a - 1) is set only when a == 0; presumably expanded to a full mask by constant_time_msb_64 */
|
|
+}
|
|
+
|
|
static ossl_inline unsigned int constant_time_eq(unsigned int a,
|
|
unsigned int b)
|
|
{
|
|
Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c
|
|
@@ -0,0 +1,104 @@
|
|
+/*
|
|
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
|
|
+ * Copyright 2014 Cryptography Research, Inc.
|
|
+ *
|
|
+ * Licensed under the OpenSSL license (the "License"). You may not use
|
|
+ * this file except in compliance with the License. You can obtain a copy
|
|
+ * in the file LICENSE in the source distribution or at
|
|
+ * https://www.openssl.org/source/license.html
|
|
+ *
|
|
+ * Originally written by Mike Hamburg
|
|
+ */
|
|
+
|
|
+#include <openssl/opensslconf.h>
|
|
+#include "internal/numbers.h"
|
|
+
|
|
+#ifdef UINT128_MAX
|
|
+/* We have support for 128 bit ints, so do nothing here */
|
|
+NON_EMPTY_TRANSLATION_UNIT
|
|
+#else
|
|
+
|
|
+# include "../field.h"
|
|
+
|
|
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) /* cs = as * bs on 16 limbs of 28 bits; not fully carried (see the tail) */
|
|
+{
|
|
+ const uint32_t *a = as->limb, *b = bs->limb;
|
|
+ uint32_t *c = cs->limb;
|
|
+ uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
|
|
+ uint32_t mask = (1 << 28) - 1; /* low 28 bits of a limb */
|
|
+ uint32_t aa[8], bb[8]; /* low half + high half of each operand */
|
|
+ int i, j;
|
|
+
|
|
+ for (i = 0; i < 8; i++) {
|
|
+ aa[i] = a[i] + a[i + 8];
|
|
+ bb[i] = b[i] + b[i + 8];
|
|
+ }
|
|
+
|
|
+ for (j = 0; j < 8; j++) { /* each pass produces output limbs j and j + 8 */
|
|
+ accum2 = 0;
|
|
+ for (i = 0; i < j + 1; i++) { /* index pairs that sum to j */
|
|
+ accum2 += widemul(a[j - i], b[i]);
|
|
+ accum1 += widemul(aa[j - i], bb[i]);
|
|
+ accum0 += widemul(a[8 + j - i], b[8 + i]);
|
|
+ }
|
|
+ accum1 -= accum2;
|
|
+ accum0 += accum2;
|
|
+ accum2 = 0;
|
|
+ for (i = j + 1; i < 8; i++) { /* index pairs that sum to j + 8 */
|
|
+ accum0 -= widemul(a[8 + j - i], b[i]);
|
|
+ accum2 += widemul(aa[8 + j - i], bb[i]);
|
|
+ accum1 += widemul(a[16 + j - i], b[8 + i]);
|
|
+ }
|
|
+ accum1 += accum2;
|
|
+ accum0 += accum2;
|
|
+ c[j] = ((uint32_t)(accum0)) & mask;
|
|
+ c[j + 8] = ((uint32_t)(accum1)) & mask;
|
|
+ accum0 >>= 28; /* what remains is the carry into the next pass */
|
|
+ accum1 >>= 28;
|
|
+ }
|
|
+
|
|
+ accum0 += accum1; /* fold the two outgoing carries back into limbs 8 and 0 */
|
|
+ accum0 += c[8];
|
|
+ accum1 += c[0];
|
|
+ c[8] = ((uint32_t)(accum0)) & mask;
|
|
+ c[0] = ((uint32_t)(accum1)) & mask;
|
|
+
|
|
+ accum0 >>= 28;
|
|
+ accum1 >>= 28;
|
|
+ c[9] += ((uint32_t)(accum0)); /* last carry is added without masking, so c[9] and c[1] may exceed 28 bits */
|
|
+ c[1] += ((uint32_t)(accum1));
|
|
+}
|
|
+
|
|
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b) /* cs = as * b for a single-word multiplier b */
|
|
+{
|
|
+ const uint32_t *a = as->limb;
|
|
+ uint32_t *c = cs->limb;
|
|
+ uint64_t accum0 = 0, accum8 = 0; /* running sums for the low (0..7) and high (8..15) limbs */
|
|
+ uint32_t mask = (1 << 28) - 1; /* low 28 bits of a limb */
|
|
+ int i;
|
|
+
|
|
+ assert(b <= mask); /* the multiplier must itself fit in 28 bits */
|
|
+
|
|
+ for (i = 0; i < 8; i++) {
|
|
+ accum0 += widemul(b, a[i]);
|
|
+ accum8 += widemul(b, a[i + 8]);
|
|
+ c[i] = accum0 & mask;
|
|
+ accum0 >>= 28; /* keep only the carry for the next limb */
|
|
+ c[i + 8] = accum8 & mask;
|
|
+ accum8 >>= 28;
|
|
+ }
|
|
+
|
|
+ accum0 += accum8 + c[8]; /* carry out of the low half, plus the high-half carry, goes into limb 8 */
|
|
+ c[8] = ((uint32_t)accum0) & mask;
|
|
+ c[9] += (uint32_t)(accum0 >> 28);
|
|
+
|
|
+ accum8 += c[0]; /* carry out of the high half also wraps into limb 0 */
|
|
+ c[0] = ((uint32_t)accum8) & mask;
|
|
+ c[1] += (uint32_t)(accum8 >> 28);
|
|
+}
|
|
+
|
|
+void gf_sqr(gf_s * RESTRICT cs, const gf as) /* cs = as * as */
|
|
+{
|
|
+ gf_mul(cs, as, as); /* A dedicated squaring routine would perform better; this variant simply reuses gf_mul */
|
|
+}
|
|
+#endif
|
|
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c
|
|
@@ -0,0 +1,210 @@
|
|
+/*
|
|
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
|
|
+ * Copyright 2014 Cryptography Research, Inc.
|
|
+ *
|
|
+ * Licensed under the OpenSSL license (the "License"). You may not use
|
|
+ * this file except in compliance with the License. You can obtain a copy
|
|
+ * in the file LICENSE in the source distribution or at
|
|
+ * https://www.openssl.org/source/license.html
|
|
+ *
|
|
+ * Originally written by Mike Hamburg
|
|
+ */
|
|
+
|
|
+#include <openssl/opensslconf.h>
|
|
+#include "internal/numbers.h"
|
|
+
|
|
+#ifndef UINT128_MAX
|
|
+/* No support for 128 bit ints, so do nothing here */
|
|
+NON_EMPTY_TRANSLATION_UNIT
|
|
+#else
|
|
+
|
|
+# include "../field.h"
|
|
+
|
|
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) /* cs = as * bs on 8 limbs of 56 bits; not fully carried (see the tail) */
|
|
+{
|
|
+ const uint64_t *a = as->limb, *b = bs->limb;
|
|
+ uint64_t *c = cs->limb;
|
|
+ uint128_t accum0 = 0, accum1 = 0, accum2;
|
|
+ uint64_t mask = (1ULL << 56) - 1; /* low 56 bits of a limb */
|
|
+ uint64_t aa[4], bb[4], bbb[4]; /* aa, bb: low half + high half; bbb: low half + twice the high half of b */
|
|
+ unsigned int i, j;
|
|
+
|
|
+ for (i = 0; i < 4; i++) {
|
|
+ aa[i] = a[i] + a[i + 4];
|
|
+ bb[i] = b[i] + b[i + 4];
|
|
+ bbb[i] = bb[i] + b[i + 4];
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < 4; i++) { /* each pass produces output limbs i and i + 4 */
|
|
+ accum2 = 0;
|
|
+
|
|
+ for (j = 0; j <= i; j++) {
|
|
+ accum2 += widemul(a[j], b[i - j]);
|
|
+ accum1 += widemul(aa[j], bb[i - j]);
|
|
+ accum0 += widemul(a[j + 4], b[i - j + 4]);
|
|
+ }
|
|
+ for (; j < 4; j++) { /* here j > i: i - j wraps as unsigned, and adding 8 or 4 brings the index back in range */
|
|
+ accum2 += widemul(a[j], b[i - j + 8]);
|
|
+ accum1 += widemul(aa[j], bbb[i - j + 4]);
|
|
+ accum0 += widemul(a[j + 4], bb[i - j + 4]);
|
|
+ }
|
|
+
|
|
+ accum1 -= accum2;
|
|
+ accum0 += accum2;
|
|
+
|
|
+ c[i] = ((uint64_t)(accum0)) & mask;
|
|
+ c[i + 4] = ((uint64_t)(accum1)) & mask;
|
|
+
|
|
+ accum0 >>= 56; /* what remains is the carry into the next pass */
|
|
+ accum1 >>= 56;
|
|
+ }
|
|
+
|
|
+ accum0 += accum1; /* fold the two outgoing carries back into limbs 4 and 0 */
|
|
+ accum0 += c[4];
|
|
+ accum1 += c[0];
|
|
+ c[4] = ((uint64_t)(accum0)) & mask;
|
|
+ c[0] = ((uint64_t)(accum1)) & mask;
|
|
+
|
|
+ accum0 >>= 56;
|
|
+ accum1 >>= 56;
|
|
+
|
|
+ c[5] += ((uint64_t)(accum0)); /* last carry is added without masking, so c[5] and c[1] may exceed 56 bits */
|
|
+ c[1] += ((uint64_t)(accum1));
|
|
+}
|
|
+
|
|
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b) /* cs = as * b for a single-word multiplier b */
|
|
+{
|
|
+ const uint64_t *a = as->limb;
|
|
+ uint64_t *c = cs->limb;
|
|
+ uint128_t accum0 = 0, accum4 = 0; /* running sums for the low (0..3) and high (4..7) limbs */
|
|
+ uint64_t mask = (1ULL << 56) - 1; /* low 56 bits of a limb */
|
|
+ int i;
|
|
+
|
|
+ for (i = 0; i < 4; i++) {
|
|
+ accum0 += widemul(b, a[i]);
|
|
+ accum4 += widemul(b, a[i + 4]);
|
|
+ c[i] = accum0 & mask;
|
|
+ accum0 >>= 56; /* keep only the carry for the next limb */
|
|
+ c[i + 4] = accum4 & mask;
|
|
+ accum4 >>= 56;
|
|
+ }
|
|
+
|
|
+ accum0 += accum4 + c[4]; /* carry out of the low half, plus the high-half carry, goes into limb 4 */
|
|
+ c[4] = accum0 & mask;
|
|
+ c[5] += accum0 >> 56;
|
|
+
|
|
+ accum4 += c[0]; /* carry out of the high half also wraps into limb 0 */
|
|
+ c[0] = accum4 & mask;
|
|
+ c[1] += accum4 >> 56;
|
|
+}
|
|
+
|
|
+void gf_sqr(gf_s * RESTRICT cs, const gf as) /* cs = as * as, fully unrolled on 8 limbs of 56 bits */
|
|
+{
|
|
+ const uint64_t *a = as->limb;
|
|
+ uint64_t *c = cs->limb;
|
|
+ uint128_t accum0 = 0, accum1 = 0, accum2;
|
|
+ uint64_t mask = (1ULL << 56) - 1; /* low 56 bits of a limb */
|
|
+ uint64_t aa[4]; /* low half + high half of the operand */
|
|
+
|
|
+ /* For some reason clang doesn't vectorize this without prompting? */
|
|
+ unsigned int i;
|
|
+ for (i = 0; i < 4; i++) {
|
|
+ aa[i] = a[i] + a[i + 4];
|
|
+ }
|
|
+
|
|
+ accum2 = widemul(a[0], a[3]); /* cross products for limbs 3 and 7, each taken once */
|
|
+ accum0 = widemul(aa[0], aa[3]);
|
|
+ accum1 = widemul(a[4], a[7]);
|
|
+
|
|
+ accum2 += widemul(a[1], a[2]);
|
|
+ accum0 += widemul(aa[1], aa[2]);
|
|
+ accum1 += widemul(a[5], a[6]);
|
|
+
|
|
+ accum0 -= accum2;
|
|
+ accum1 += accum2;
|
|
+
|
|
+ c[3] = ((uint64_t)(accum1)) << 1 & mask; /* "<< 1" doubles the once-counted cross products */
|
|
+ c[7] = ((uint64_t)(accum0)) << 1 & mask;
|
|
+
|
|
+ accum0 >>= 55; /* 55 rather than 56: the carry is doubled as well */
|
|
+ accum1 >>= 55;
|
|
+
|
|
+ accum0 += widemul(2 * aa[1], aa[3]);
|
|
+ accum1 += widemul(2 * a[5], a[7]);
|
|
+ accum0 += widemul(aa[2], aa[2]);
|
|
+ accum1 += accum0;
|
|
+
|
|
+ accum0 -= widemul(2 * a[1], a[3]);
|
|
+ accum1 += widemul(a[6], a[6]);
|
|
+
|
|
+ accum2 = widemul(a[0], a[0]);
|
|
+ accum1 -= accum2;
|
|
+ accum0 += accum2;
|
|
+
|
|
+ accum0 -= widemul(a[2], a[2]);
|
|
+ accum1 += widemul(aa[0], aa[0]);
|
|
+ accum0 += widemul(a[4], a[4]);
|
|
+
|
|
+ c[0] = ((uint64_t)(accum0)) & mask;
|
|
+ c[4] = ((uint64_t)(accum1)) & mask;
|
|
+
|
|
+ accum0 >>= 56; /* carry into limbs 1 and 5 */
|
|
+ accum1 >>= 56;
|
|
+
|
|
+ accum2 = widemul(2 * aa[2], aa[3]);
|
|
+ accum0 -= widemul(2 * a[2], a[3]);
|
|
+ accum1 += widemul(2 * a[6], a[7]);
|
|
+
|
|
+ accum1 += accum2;
|
|
+ accum0 += accum2;
|
|
+
|
|
+ accum2 = widemul(2 * a[0], a[1]);
|
|
+ accum1 += widemul(2 * aa[0], aa[1]);
|
|
+ accum0 += widemul(2 * a[4], a[5]);
|
|
+
|
|
+ accum1 -= accum2;
|
|
+ accum0 += accum2;
|
|
+
|
|
+ c[1] = ((uint64_t)(accum0)) & mask;
|
|
+ c[5] = ((uint64_t)(accum1)) & mask;
|
|
+
|
|
+ accum0 >>= 56; /* carry into limbs 2 and 6 */
|
|
+ accum1 >>= 56;
|
|
+
|
|
+ accum2 = widemul(aa[3], aa[3]);
|
|
+ accum0 -= widemul(a[3], a[3]);
|
|
+ accum1 += widemul(a[7], a[7]);
|
|
+
|
|
+ accum1 += accum2;
|
|
+ accum0 += accum2;
|
|
+
|
|
+ accum2 = widemul(2 * a[0], a[2]);
|
|
+ accum1 += widemul(2 * aa[0], aa[2]);
|
|
+ accum0 += widemul(2 * a[4], a[6]);
|
|
+
|
|
+ accum2 += widemul(a[1], a[1]);
|
|
+ accum1 += widemul(aa[1], aa[1]);
|
|
+ accum0 += widemul(a[5], a[5]);
|
|
+
|
|
+ accum1 -= accum2;
|
|
+ accum0 += accum2;
|
|
+
|
|
+ c[2] = ((uint64_t)(accum0)) & mask;
|
|
+ c[6] = ((uint64_t)(accum1)) & mask;
|
|
+
|
|
+ accum0 >>= 56; /* carry into the limbs 3 and 7 stored at the start */
|
|
+ accum1 >>= 56;
|
|
+
|
|
+ accum0 += c[3];
|
|
+ accum1 += c[7];
|
|
+ c[3] = ((uint64_t)(accum0)) & mask;
|
|
+ c[7] = ((uint64_t)(accum1)) & mask;
|
|
+
|
|
+ /* we could almost stop here, but it wouldn't be stable, so... */
|
|
+
|
|
+ accum0 >>= 56;
|
|
+ accum1 >>= 56;
|
|
+ c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); /* final carries are added without masking */
|
|
+ c[0] += ((uint64_t)(accum1));
|
|
+}
|
|
+#endif
|
|
Index: openssl-1.1.1l/Configurations/00-base-templates.conf
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/Configurations/00-base-templates.conf
|
|
+++ openssl-1.1.1l/Configurations/00-base-templates.conf
|
|
@@ -351,7 +351,8 @@ my %targets=(
|
|
ppc64_asm => {
|
|
inherit_from => [ "ppc32_asm" ],
|
|
template => 1,
|
|
- ec_asm_src => "ecp_nistz256.c ecp_nistz256-ppc64.s x25519-ppc64.s",
|
|
+ bn_asm_src => add("ppc64-mont-fixed.s"),
|
|
+ ec_asm_src => "ecp_nistz256.c ecp_nistz256-ppc64.s ecp_nistp521-ppc64.s x25519-ppc64.s",
|
|
keccak1600_asm_src => "keccak1600-ppc64.s",
|
|
},
|
|
);
|
|
Index: openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl
|
|
@@ -0,0 +1,581 @@
|
|
+#! /usr/bin/env perl
|
|
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
|
|
+#
|
|
+# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
+# this file except in compliance with the License. You can obtain a copy
|
|
+# in the file LICENSE in the source distribution or at
|
|
+# https://www.openssl.org/source/license.html
|
|
+
|
|
+# ====================================================================
|
|
+# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
|
|
+# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
|
|
+# the OpenSSL project.
|
|
+# ====================================================================
|
|
+
|
|
+#
|
|
+# Fixed length (n=6), unrolled PPC Montgomery Multiplication
|
|
+#
|
|
+
|
|
+# 2021
|
|
+#
|
|
+# Although this is a generic implementation for unrolling Montgomery
|
|
+# Multiplication for arbitrary values of n, this is currently only
|
|
+# used for n = 6 to improve the performance of ECC p384.
|
|
+#
|
|
+# Unrolling allows intermediate results to be stored in registers,
|
|
+# rather than on the stack, improving performance by ~7% compared to
|
|
+# the existing PPC assembly code.
|
|
+#
|
|
+# The ISA 3.0 implementation uses combination multiply/add
|
|
+# instructions (maddld, maddhdu) to improve performance by an
|
|
+# additional ~10% on Power 9.
|
|
+#
|
|
+# Finally, saving non-volatile registers into volatile vector
|
|
+# registers instead of onto the stack saves a little more.
|
|
+#
|
|
+# On a Power 9 machine we see an overall improvement of ~18%.
|
|
+#
|
|
+
|
|
+use strict;
|
|
+use warnings;
|
|
+
|
|
+my ($flavour, $output, $dir, $xlate);
|
|
+
|
|
+# $output is the last argument if it looks like a file (it has an extension)
|
|
+# $flavour is the first argument if it doesn't look like a file
|
|
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
|
|
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
|
|
+
|
|
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
|
|
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
|
|
+die "can't locate ppc-xlate.pl";
|
|
+
|
|
+open STDOUT,"| $^X $xlate $flavour \"$output\""
|
|
+ or die "can't call $xlate: $!";
|
|
+
|
|
+if ($flavour !~ /64/) {
|
|
+ die "bad flavour ($flavour) - only ppc64 permitted";
|
|
+}
|
|
+
|
|
+my $SIZE_T= 8;
|
|
+
|
|
+# Registers are global so the code is remotely readable
|
|
+
|
|
+# Parameters for Montgomery multiplication
|
|
+my $sp = "r1";
|
|
+my $toc = "r2";
|
|
+my $rp = "r3";
|
|
+my $ap = "r4";
|
|
+my $bp = "r5";
|
|
+my $np = "r6";
|
|
+my $n0 = "r7";
|
|
+my $num = "r8";
|
|
+
|
|
+my $i = "r9";
|
|
+my $c0 = "r10";
|
|
+my $bp0 = "r11";
|
|
+my $bpi = "r11";
|
|
+my $bpj = "r11";
|
|
+my $tj = "r12";
|
|
+my $apj = "r12";
|
|
+my $npj = "r12";
|
|
+my $lo = "r14";
|
|
+my $c1 = "r14";
|
|
+
|
|
+# Non-volatile registers used for tp[i]
|
|
+#
|
|
+# 12 registers are available but the limit on unrolling is 10,
|
|
+# since registers from $tp[0] to $tp[$n+1] are used.
|
|
+my @tp = ("r20" .. "r31");
|
|
+
|
|
+# volatile VSRs for saving non-volatile GPRs - faster than stack
|
|
+my @vsrs = ("v32" .. "v46");
|
|
+
|
|
+package Mont;
|
|
+
|
|
+sub new($$)
|
|
+{
|
|
+ my ($class, $n) = @_;
|
|
+
|
|
+ if ($n > 10) {
|
|
+ die "Can't unroll for BN length ${n} (maximum 10)"
|
|
+ }
|
|
+
|
|
+ my $self = {
|
|
+ code => "",
|
|
+ n => $n,
|
|
+ };
|
|
+ bless $self, $class;
|
|
+
|
|
+ return $self;
|
|
+}
|
|
+
|
|
+sub add_code($$)
|
|
+{
|
|
+ my ($self, $c) = @_;
|
|
+
|
|
+ $self->{code} .= $c;
|
|
+}
|
|
+
|
|
+sub get_code($)
|
|
+{
|
|
+ my ($self) = @_;
|
|
+
|
|
+ return $self->{code};
|
|
+}
|
|
+
|
|
+sub get_function_name($)
|
|
+{
|
|
+ my ($self) = @_;
|
|
+
|
|
+ return "bn_mul_mont_fixed_n" . $self->{n};
|
|
+}
|
|
+
|
|
+sub get_label($$)
|
|
+{
|
|
+ my ($self, $l) = @_;
|
|
+
|
|
+ return "L" . $l . "_" . $self->{n};
|
|
+}
|
|
+
|
|
+sub get_labels($@)
|
|
+{
|
|
+ my ($self, @labels) = @_;
|
|
+
|
|
+ my %out = ();
|
|
+
|
|
+ foreach my $l (@labels) {
|
|
+ $out{"$l"} = $self->get_label("$l");
|
|
+ }
|
|
+
|
|
+ return \%out;
|
|
+}
|
|
+
|
|
+sub nl($)
|
|
+{
|
|
+ my ($self) = @_;
|
|
+
|
|
+ $self->add_code("\n");
|
|
+}
|
|
+
|
|
+sub copy_result($)
|
|
+{
|
|
+ my ($self) = @_;
|
|
+
|
|
+ my ($n) = $self->{n};
|
|
+
|
|
+ for (my $j = 0; $j < $n; $j++) {
|
|
+ $self->add_code(<<___);
|
|
+ std $tp[$j],`$j*$SIZE_T`($rp)
|
|
+___
|
|
+ }
|
|
+
|
|
+}
|
|
+
|
|
+sub mul_mont_fixed($)
|
|
+{
|
|
+ my ($self) = @_;
|
|
+
|
|
+ my ($n) = $self->{n};
|
|
+ my $fname = $self->get_function_name();
|
|
+ my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+
|
|
+.globl .${fname}
|
|
+.align 5
|
|
+.${fname}:
|
|
+
|
|
+___
|
|
+
|
|
+ $self->save_registers();
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ ld $n0,0($n0)
|
|
+
|
|
+ ld $bp0,0($bp)
|
|
+
|
|
+ ld $apj,0($ap)
|
|
+___
|
|
+
|
|
+ $self->mul_c_0($tp[0], $apj, $bp0, $c0);
|
|
+
|
|
+ for (my $j = 1; $j < $n - 1; $j++) {
|
|
+ $self->add_code(<<___);
|
|
+ ld $apj,`$j*$SIZE_T`($ap)
|
|
+___
|
|
+ $self->mul($tp[$j], $apj, $bp0, $c0);
|
|
+ }
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ ld $apj,`($n-1)*$SIZE_T`($ap)
|
|
+___
|
|
+
|
|
+ $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ li $tp[$n+1],0
|
|
+
|
|
+___
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ li $i,0
|
|
+ mtctr $num
|
|
+ b $label->{"enter"}
|
|
+
|
|
+.align 4
|
|
+$label->{"outer"}:
|
|
+ ldx $bpi,$bp,$i
|
|
+
|
|
+ ld $apj,0($ap)
|
|
+___
|
|
+
|
|
+ $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
|
|
+
|
|
+ for (my $j = 1; $j < $n; $j++) {
|
|
+ $self->add_code(<<___);
|
|
+ ld $apj,`$j*$SIZE_T`($ap)
|
|
+___
|
|
+ $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
|
|
+ }
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ addc $tp[$n],$tp[$n],$c0
|
|
+ addze $tp[$n+1],$tp[$n+1]
|
|
+___
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+.align 4
|
|
+$label->{"enter"}:
|
|
+ mulld $bpi,$tp[0],$n0
|
|
+
|
|
+ ld $npj,0($np)
|
|
+___
|
|
+
|
|
+ $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
|
|
+
|
|
+ for (my $j = 1; $j < $n; $j++) {
|
|
+ $self->add_code(<<___);
|
|
+ ld $npj,`$j*$SIZE_T`($np)
|
|
+___
|
|
+ $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
|
|
+ }
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ addc $tp[$n-1],$tp[$n],$c0
|
|
+ addze $tp[$n],$tp[$n+1]
|
|
+
|
|
+ addi $i,$i,$SIZE_T
|
|
+ bdnz $label->{"outer"}
|
|
+
|
|
+ and. $tp[$n],$tp[$n],$tp[$n]
|
|
+ bne $label->{"sub"}
|
|
+
|
|
+ cmpld $tp[$n-1],$npj
|
|
+ blt $label->{"copy"}
|
|
+
|
|
+$label->{"sub"}:
|
|
+___
|
|
+
|
|
+ #
|
|
+ # Reduction
|
|
+ #
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ ld $bpj,`0*$SIZE_T`($np)
|
|
+ subfc $c1,$bpj,$tp[0]
|
|
+ std $c1,`0*$SIZE_T`($rp)
|
|
+
|
|
+___
|
|
+ for (my $j = 1; $j < $n - 1; $j++) {
|
|
+ $self->add_code(<<___);
|
|
+ ld $bpj,`$j*$SIZE_T`($np)
|
|
+ subfe $c1,$bpj,$tp[$j]
|
|
+ std $c1,`$j*$SIZE_T`($rp)
|
|
+
|
|
+___
|
|
+ }
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ subfe $c1,$npj,$tp[$n-1]
|
|
+ std $c1,`($n-1)*$SIZE_T`($rp)
|
|
+
|
|
+___
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ addme. $tp[$n],$tp[$n]
|
|
+ beq $label->{"end"}
|
|
+
|
|
+$label->{"copy"}:
|
|
+___
|
|
+
|
|
+ $self->copy_result();
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+
|
|
+$label->{"end"}:
|
|
+___
|
|
+
|
|
+ $self->restore_registers();
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ li r3,1
|
|
+ blr
|
|
+.size .${fname},.-.${fname}
|
|
+___
|
|
+
|
|
+}
|
|
+
|
|
+package Mont::GPR;
|
|
+
|
|
+our @ISA = ('Mont');
|
|
+
|
|
+sub new($$)
|
|
+{
|
|
+ my ($class, $n) = @_;
|
|
+
|
|
+ return $class->SUPER::new($n);
|
|
+}
|
|
+
|
|
+sub save_registers($)
|
|
+{
|
|
+ my ($self) = @_;
|
|
+
|
|
+ my $n = $self->{n};
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ std $lo,-8($sp)
|
|
+___
|
|
+
|
|
+ for (my $j = 0; $j <= $n+1; $j++) {
|
|
+ $self->{code}.=<<___;
|
|
+ std $tp[$j],-`($j+2)*8`($sp)
|
|
+___
|
|
+ }
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+sub restore_registers($)
|
|
+{
|
|
+ my ($self) = @_;
|
|
+
|
|
+ my $n = $self->{n};
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ ld $lo,-8($sp)
|
|
+___
|
|
+
|
|
+ for (my $j = 0; $j <= $n+1; $j++) {
|
|
+ $self->{code}.=<<___;
|
|
+ ld $tp[$j],-`($j+2)*8`($sp)
|
|
+___
|
|
+ }
|
|
+
|
|
+ $self->{code} .=<<___;
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+# Direct translation of C mul()
|
|
+sub mul($$$$$)
|
|
+{
|
|
+ my ($self, $r, $a, $w, $c) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ mulld $lo,$a,$w
|
|
+ addc $r,$lo,$c
|
|
+ mulhdu $c,$a,$w
|
|
+ addze $c,$c
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+# Like mul() but $c is ignored as an input - an optimisation to save a
|
|
+# preliminary instruction that would set input $c to 0
|
|
+sub mul_c_0($$$$$)
|
|
+{
|
|
+ my ($self, $r, $a, $w, $c) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ mulld $r,$a,$w
|
|
+ mulhdu $c,$a,$w
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+# Like mul() but does not do the final addition of CA into $c - an
|
|
+# optimisation to save an instruction
|
|
+sub mul_last($$$$$$)
|
|
+{
|
|
+ my ($self, $r1, $r2, $a, $w, $c) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ mulld $lo,$a,$w
|
|
+ addc $r1,$lo,$c
|
|
+ mulhdu $c,$a,$w
|
|
+
|
|
+ addze $r2,$c
|
|
+___
|
|
+}
|
|
+
|
|
+# Like C mul_add() but allow $r_out and $r_in to be different
|
|
+sub mul_add($$$$$$)
|
|
+{
|
|
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ mulld $lo,$a,$w
|
|
+ addc $lo,$lo,$c
|
|
+ mulhdu $c,$a,$w
|
|
+ addze $c,$c
|
|
+ addc $r_out,$r_in,$lo
|
|
+ addze $c,$c
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
|
|
+# preliminary instruction that would set input $c to 0
|
|
+sub mul_add_c_0($$$$$$)
|
|
+{
|
|
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ mulld $lo,$a,$w
|
|
+ addc $r_out,$r_in,$lo
|
|
+ mulhdu $c,$a,$w
|
|
+ addze $c,$c
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+package Mont::GPR_300;
|
|
+
|
|
+our @ISA = ('Mont::GPR');
|
|
+
|
|
+sub new($$)
|
|
+{
|
|
+ my ($class, $n) = @_;
|
|
+
|
|
+ my $mont = $class->SUPER::new($n);
|
|
+
|
|
+ return $mont;
|
|
+}
|
|
+
|
|
+sub get_function_name($)
|
|
+{
|
|
+ my ($self) = @_;
|
|
+
|
|
+ return "bn_mul_mont_300_fixed_n" . $self->{n};
|
|
+}
|
|
+
|
|
+sub get_label($$)
|
|
+{
|
|
+ my ($self, $l) = @_;
|
|
+
|
|
+ return "L" . $l . "_300_" . $self->{n};
|
|
+}
|
|
+
|
|
+# Direct translation of C mul()
|
|
+sub mul($$$$$)
|
|
+{
|
|
+ my ($self, $r, $a, $w, $c, $last) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ maddld $r,$a,$w,$c
|
|
+ maddhdu $c,$a,$w,$c
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+# Save the last carry as the final entry
|
|
+sub mul_last($$$$$)
|
|
+{
|
|
+ my ($self, $r1, $r2, $a, $w, $c) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ maddld $r1,$a,$w,$c
|
|
+ maddhdu $r2,$a,$w,$c
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+# Like mul() but $c is ignored as an input - an optimisation to save a
|
|
+# preliminary instruction that would set input $c to 0
|
|
+sub mul_c_0($$$$$)
|
|
+{
|
|
+ my ($self, $r, $a, $w, $c) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ mulld $r,$a,$w
|
|
+ mulhdu $c,$a,$w
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+# Like C mul_add() but allow $r_out and $r_in to be different
|
|
+sub mul_add($$$$$$)
|
|
+{
|
|
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ maddld $lo,$a,$w,$c
|
|
+ maddhdu $c,$a,$w,$c
|
|
+ addc $r_out,$r_in,$lo
|
|
+ addze $c,$c
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
|
|
+# preliminary instruction that would set input $c to 0
|
|
+sub mul_add_c_0($$$$$$)
|
|
+{
|
|
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
|
|
+
|
|
+ $self->add_code(<<___);
|
|
+ maddld $lo,$a,$w,$r_in
|
|
+ maddhdu $c,$a,$w,$r_in
|
|
+___
|
|
+
|
|
+ if ($r_out ne $lo) {
|
|
+ $self->add_code(<<___);
|
|
+ mr $r_out,$lo
|
|
+___
|
|
+ }
|
|
+
|
|
+ $self->nl();
|
|
+}
|
|
+
|
|
+
|
|
+package main;
|
|
+
|
|
+my $code;
|
|
+
|
|
+$code.=<<___;
|
|
+.machine "any"
|
|
+.text
|
|
+___
|
|
+
|
|
+my $mont;
|
|
+
|
|
+$mont = new Mont::GPR(6);
|
|
+$mont->mul_mont_fixed();
|
|
+$code .= $mont->get_code();
|
|
+
|
|
+$mont = new Mont::GPR_300(6);
|
|
+$mont->mul_mont_fixed();
|
|
+$code .= $mont->get_code();
|
|
+
|
|
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
|
+
|
|
+$code.=<<___;
|
|
+.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
|
|
+___
|
|
+
|
|
+print $code;
|
|
+close STDOUT or die "error closing STDOUT: $!";
|
|
Index: openssl-1.1.1l/crypto/bn/build.info
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/bn/build.info
|
|
+++ openssl-1.1.1l/crypto/bn/build.info
|
|
@@ -56,6 +56,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.
|
|
GENERATE[bn-ppc.s]=asm/ppc.pl $(PERLASM_SCHEME)
|
|
GENERATE[ppc-mont.s]=asm/ppc-mont.pl $(PERLASM_SCHEME)
|
|
GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl $(PERLASM_SCHEME)
|
|
+GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl $(PERLASM_SCHEME)
|
|
|
|
GENERATE[alpha-mont.S]=asm/alpha-mont.pl $(PERLASM_SCHEME)
|
|
|
|
Index: openssl-1.1.1l/crypto/ppccap.c
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ppccap.c
|
|
+++ openssl-1.1.1l/crypto/ppccap.c
|
|
@@ -46,6 +46,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U
|
|
const BN_ULONG *np, const BN_ULONG *n0, int num);
|
|
int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
|
const BN_ULONG *np, const BN_ULONG *n0, int num);
|
|
+ int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
|
|
+ const BN_ULONG *bp, const BN_ULONG *np,
|
|
+ const BN_ULONG *n0, int num);
|
|
+ int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
|
|
+ const BN_ULONG *bp, const BN_ULONG *np,
|
|
+ const BN_ULONG *n0, int num);
|
|
|
|
if (num < 4)
|
|
return 0;
|
|
@@ -61,6 +67,15 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U
|
|
* no opportunity to figure it out...
|
|
*/
|
|
|
|
+#if defined(_ARCH_PPC64)
|
|
+ if (num == 6) {
|
|
+ if (OPENSSL_ppccap_P & PPC_MADD300)
|
|
+ return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
|
|
+ else
|
|
+ return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
|
|
+ }
|
|
+#endif
|
|
+
|
|
return bn_mul_mont_int(rp, ap, bp, np, n0, num);
|
|
}
|
|
#endif
|
|
Index: openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/perlasm/ppc-xlate.pl
|
|
+++ openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl
|
|
@@ -136,6 +136,71 @@ my $quad = sub {
|
|
};
|
|
|
|
################################################################
|
|
+# vector register number hacking
|
|
+################################################################
|
|
+
|
|
+# It is convenient to be able to set a variable like:
|
|
+# my $foo = "v33";
|
|
+# and use this in different contexts where:
|
|
+# * a VSR (Vector-Scalar Register) number (i.e. "v33") is required
|
|
+# * a VR (Vector Register) number (i.e. "v1") is required
|
|
+# Map VSR numbering to VR number for certain vector instructions.
|
|
+
|
|
+# vs<N> -> v<N-32> if N >= 32
|
|
+sub vsr2vr1 {
|
|
+ my $in = shift;
|
|
+
|
|
+ my $n = int($in);
|
|
+ if ($n >= 32) {
|
|
+ $n -= 32;
|
|
+ }
|
|
+
|
|
+ return "$n";
|
|
+}
|
|
+# As above for first $num register args, returns list
|
|
+sub _vsr2vr {
|
|
+ my $num = shift;
|
|
+ my @rest = @_;
|
|
+ my @subst = splice(@rest, 0, $num);
|
|
+
|
|
+ @subst = map { vsr2vr1($_); } @subst;
|
|
+
|
|
+ return (@subst, @rest);
|
|
+}
|
|
+# As above but 1st arg ($f) is extracted and reinserted after
|
|
+# processing so that it can be ignored by a code generation function
|
|
+# that consumes the result
|
|
+sub vsr2vr_args {
|
|
+ my $num = shift;
|
|
+ my $f = shift;
|
|
+
|
|
+ my @out = _vsr2vr($num, @_);
|
|
+
|
|
+ return ($f, @out);
|
|
+}
|
|
+# As above but 1st arg is mnemonic, return formatted instruction
|
|
+sub vsr2vr {
|
|
+ my $mnemonic = shift;
|
|
+ my $num = shift;
|
|
+ my $f = shift;
|
|
+
|
|
+ my @out = _vsr2vr($num, @_);
|
|
+
|
|
+ " ${mnemonic}${f} " . join(",", @out);
|
|
+}
|
|
+
|
|
+# ISA 2.03
|
|
+my $vsel = sub { vsr2vr("vsel", 4, @_); };
|
|
+my $vsl = sub { vsr2vr("vsl", 3, @_); };
|
|
+my $vspltisb = sub { vsr2vr("vspltisb", 1, @_); };
|
|
+my $vspltisw = sub { vsr2vr("vspltisw", 1, @_); };
|
|
+my $vsr = sub { vsr2vr("vsr", 3, @_); };
|
|
+my $vsro = sub { vsr2vr("vsro", 3, @_); };
|
|
+
|
|
+# ISA 3.0
|
|
+my $lxsd = sub { vsr2vr("lxsd", 1, @_); };
|
|
+
|
|
+################################################################
|
|
# simplified mnemonics not handled by at least one assembler
|
|
################################################################
|
|
my $cmplw = sub {
|
|
@@ -226,13 +291,18 @@ my $vpermdi = sub { # xxpermdi
|
|
|
|
# PowerISA 2.07 stuff
|
|
sub vcrypto_op {
|
|
- my ($f, $vrt, $vra, $vrb, $op) = @_;
|
|
+ my ($f, $vrt, $vra, $vrb, $op) = vsr2vr_args(3, @_);
|
|
" .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
|
|
}
|
|
sub vfour {
|
|
my ($f, $vrt, $vra, $vrb, $vrc, $op) = @_;
|
|
" .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op;
|
|
};
|
|
+sub vfour_vsr {
|
|
+ my ($f, $vrt, $vra, $vrb, $vrc, $op) = vsr2vr_args(4, @_);
|
|
+ " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op;
|
|
+};
|
|
+
|
|
my $vcipher = sub { vcrypto_op(@_, 1288); };
|
|
my $vcipherlast = sub { vcrypto_op(@_, 1289); };
|
|
my $vncipher = sub { vcrypto_op(@_, 1352); };
|
|
@@ -254,10 +324,10 @@ my $vsld = sub { vcrypto_op(@_, 1476); }
|
|
my $vsrd = sub { vcrypto_op(@_, 1732); };
|
|
my $vsubudm = sub { vcrypto_op(@_, 1216); };
|
|
my $vaddcuq = sub { vcrypto_op(@_, 320); };
|
|
-my $vaddeuqm = sub { vfour(@_,60); };
|
|
-my $vaddecuq = sub { vfour(@_,61); };
|
|
-my $vmrgew = sub { vfour(@_,0,1932); };
|
|
-my $vmrgow = sub { vfour(@_,0,1676); };
|
|
+my $vaddeuqm = sub { vfour_vsr(@_,60); };
|
|
+my $vaddecuq = sub { vfour_vsr(@_,61); };
|
|
+my $vmrgew = sub { vfour_vsr(@_,0,1932); };
|
|
+my $vmrgow = sub { vfour_vsr(@_,0,1676); };
|
|
|
|
my $mtsle = sub {
|
|
my ($f, $arg) = @_;
|
|
@@ -298,7 +368,7 @@ my $addex = sub {
|
|
my ($f, $rt, $ra, $rb, $cy) = @_; # only cy==0 is specified in 3.0B
|
|
" .long ".sprintf "0x%X",(31<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($cy<<9)|(170<<1);
|
|
};
|
|
-my $vmsumudm = sub { vfour(@_,35); };
|
|
+my $vmsumudm = sub { vfour_vsr(@_, 35); };
|
|
|
|
while($line=<>) {
|
|
|
|
Index: openssl-1.1.1l/Configurations/10-main.conf
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/Configurations/10-main.conf
|
|
+++ openssl-1.1.1l/Configurations/10-main.conf
|
|
@@ -669,7 +669,7 @@ my %targets = (
|
|
inherit_from => [ "linux-generic64", asm("ppc64_asm") ],
|
|
cflags => add("-m64"),
|
|
cxxflags => add("-m64"),
|
|
- lib_cppflags => add("-DB_ENDIAN"),
|
|
+ lib_cppflags => add("-DB_ENDIAN -DECP_NISTP521_ASM"),
|
|
perlasm_scheme => "linux64",
|
|
multilib => "64",
|
|
},
|
|
@@ -677,7 +677,7 @@ my %targets = (
|
|
inherit_from => [ "linux-generic64", asm("ppc64_asm") ],
|
|
cflags => add("-m64"),
|
|
cxxflags => add("-m64"),
|
|
- lib_cppflags => add("-DL_ENDIAN"),
|
|
+ lib_cppflags => add("-DL_ENDIAN -DECP_NISTP521_ASM"),
|
|
perlasm_scheme => "linux64le",
|
|
},
|
|
|
|
Index: openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl
|
|
@@ -0,0 +1,435 @@
|
|
+#! /usr/bin/env perl
|
|
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
|
|
+#
|
|
+# Licensed under the OpenSSL license (the "License"). You may not use
|
|
+# this file except in compliance with the License. You can obtain a copy
|
|
+# in the file LICENSE in the source distribution or at
|
|
+# https://www.openssl.org/source/license.html
|
|
+#
|
|
+# ====================================================================
|
|
+# Written by Amitay Isaacs <amitay@ozlabs.org> and Martin Schwenke
|
|
+# <martin@meltin.net> for the OpenSSL project.
|
|
+# ====================================================================
|
|
+#
|
|
+# p521 lower-level primitives for PPC64 using vector instructions.
|
|
+#
|
|
+
|
|
+use strict;
|
|
+use warnings;
|
|
+
|
|
+my $flavour = shift;
|
|
+my $output = "";
|
|
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
|
+if (!$output) {
|
|
+ $output = "-";
|
|
+}
|
|
+
|
|
+my ($xlate, $dir);
|
|
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
|
|
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
|
|
+die "can't locate ppc-xlate.pl";
|
|
+
|
|
+open OUT,"| \"$^X\" $xlate $flavour $output";
|
|
+*STDOUT=*OUT;
|
|
+
|
|
+my $code = "";
|
|
+
|
|
+my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12");
|
|
+
|
|
+my $vzero = "v32";
|
|
+
|
|
+sub startproc($)
|
|
+{
|
|
+ my ($name) = @_;
|
|
+
|
|
+ $code.=<<___;
|
|
+ .globl ${name}
|
|
+ .align 5
|
|
+${name}:
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+sub endproc($)
|
|
+{
|
|
+ my ($name) = @_;
|
|
+
|
|
+ $code.=<<___;
|
|
+ blr
|
|
+ .size ${name},.-${name}
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+
|
|
+sub push_vrs($$)
|
|
+{
|
|
+ my ($min, $max) = @_;
|
|
+
|
|
+ my $count = $max - $min + 1;
|
|
+
|
|
+ $code.=<<___;
|
|
+ mr $savesp,$sp
|
|
+ stdu $sp,-16*`$count+1`($sp)
|
|
+
|
|
+___
|
|
+ for (my $i = $min; $i <= $max; $i++) {
|
|
+ my $mult = $max - $i + 1;
|
|
+ $code.=<<___;
|
|
+ stxv $i,-16*$mult($savesp)
|
|
+___
|
|
+
|
|
+ }
|
|
+
|
|
+ $code.=<<___;
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+sub pop_vrs($$)
|
|
+{
|
|
+ my ($min, $max) = @_;
|
|
+
|
|
+ $code.=<<___;
|
|
+ ld $savesp,0($sp)
|
|
+___
|
|
+ for (my $i = $min; $i <= $max; $i++) {
|
|
+ my $mult = $max - $i + 1;
|
|
+ $code.=<<___;
|
|
+ lxv $i,-16*$mult($savesp)
|
|
+___
|
|
+ }
|
|
+
|
|
+ $code.=<<___;
|
|
+ mr $sp,$savesp
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+sub load_vrs($$)
|
|
+{
|
|
+ my ($pointer, $reg_list) = @_;
|
|
+
|
|
+ for (my $i = 0; $i <= 8; $i++) {
|
|
+ my $offset = $i * 8;
|
|
+ $code.=<<___;
|
|
+ lxsd $reg_list->[$i],$offset($pointer)
|
|
+___
|
|
+ }
|
|
+
|
|
+ $code.=<<___;
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+sub store_vrs($$)
|
|
+{
|
|
+ my ($pointer, $reg_list) = @_;
|
|
+
|
|
+ for (my $i = 0; $i <= 8; $i++) {
|
|
+ my $offset = $i * 16;
|
|
+ $code.=<<___;
|
|
+ stxv $reg_list->[$i],$offset($pointer)
|
|
+___
|
|
+ }
|
|
+
|
|
+ $code.=<<___;
|
|
+
|
|
+___
|
|
+}
|
|
+
|
|
+$code.=<<___;
|
|
+.text
|
|
+
|
|
+___
|
|
+
|
|
+{
|
|
+ # mul/square common
|
|
+ my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54");
|
|
+ my ($zero, $one) = ("r8", "r9");
|
|
+ my @out = map("v$_",(55..63));
|
|
+
|
|
+ {
|
|
+ #
|
|
+ # p521_felem_mul
|
|
+ #
|
|
+
|
|
+ my ($in1p, $in2p) = ("r4", "r5");
|
|
+ my @in1 = map("v$_",(45..53));
|
|
+ my @in2 = map("v$_",(35..43));
|
|
+
|
|
+ startproc("p521_felem_mul");
|
|
+
|
|
+ push_vrs(52, 63);
|
|
+
|
|
+ $code.=<<___;
|
|
+ vspltisw $vzero,0
|
|
+
|
|
+___
|
|
+
|
|
+ load_vrs($in1p, \@in1);
|
|
+ load_vrs($in2p, \@in2);
|
|
+
|
|
+ $code.=<<___;
|
|
+ vmsumudm $out[0],$in1[0],$in2[0],$vzero
|
|
+
|
|
+ xxpermdi $t1,$in1[0],$in1[1],0b00
|
|
+ xxpermdi $t2,$in2[1],$in2[0],0b00
|
|
+ vmsumudm $out[1],$t1,$t2,$vzero
|
|
+
|
|
+ xxpermdi $t2,$in2[2],$in2[1],0b00
|
|
+ vmsumudm $out[2],$t1,$t2,$vzero
|
|
+ vmsumudm $out[2],$in1[2],$in2[0],$out[2]
|
|
+
|
|
+ xxpermdi $t2,$in2[3],$in2[2],0b00
|
|
+ vmsumudm $out[3],$t1,$t2,$vzero
|
|
+ xxpermdi $t3,$in1[2],$in1[3],0b00
|
|
+ xxpermdi $t4,$in2[1],$in2[0],0b00
|
|
+ vmsumudm $out[3],$t3,$t4,$out[3]
|
|
+
|
|
+ xxpermdi $t2,$in2[4],$in2[3],0b00
|
|
+ vmsumudm $out[4],$t1,$t2,$vzero
|
|
+ xxpermdi $t4,$in2[2],$in2[1],0b00
|
|
+ vmsumudm $out[4],$t3,$t4,$out[4]
|
|
+ vmsumudm $out[4],$in1[4],$in2[0],$out[4]
|
|
+
|
|
+ xxpermdi $t2,$in2[5],$in2[4],0b00
|
|
+ vmsumudm $out[5],$t1,$t2,$vzero
|
|
+ xxpermdi $t4,$in2[3],$in2[2],0b00
|
|
+ vmsumudm $out[5],$t3,$t4,$out[5]
|
|
+
|
|
+ xxpermdi $t2,$in2[6],$in2[5],0b00
|
|
+ vmsumudm $out[6],$t1,$t2,$vzero
|
|
+ xxpermdi $t4,$in2[4],$in2[3],0b00
|
|
+ vmsumudm $out[6],$t3,$t4,$out[6]
|
|
+
|
|
+ xxpermdi $t2,$in2[7],$in2[6],0b00
|
|
+ vmsumudm $out[7],$t1,$t2,$vzero
|
|
+ xxpermdi $t4,$in2[5],$in2[4],0b00
|
|
+ vmsumudm $out[7],$t3,$t4,$out[7]
|
|
+
|
|
+ xxpermdi $t2,$in2[8],$in2[7],0b00
|
|
+ vmsumudm $out[8],$t1,$t2,$vzero
|
|
+ xxpermdi $t4,$in2[6],$in2[5],0b00
|
|
+ vmsumudm $out[8],$t3,$t4,$out[8]
|
|
+
|
|
+ xxpermdi $t1,$in1[4],$in1[5],0b00
|
|
+ xxpermdi $t2,$in2[1],$in2[0],0b00
|
|
+ vmsumudm $out[5],$t1,$t2,$out[5]
|
|
+
|
|
+ xxpermdi $t2,$in2[2],$in2[1],0b00
|
|
+ vmsumudm $out[6],$t1,$t2,$out[6]
|
|
+ vmsumudm $out[6],$in1[6],$in2[0],$out[6]
|
|
+
|
|
+ xxpermdi $t2,$in2[3],$in2[2],0b00
|
|
+ vmsumudm $out[7],$t1,$t2,$out[7]
|
|
+ xxpermdi $t3,$in1[6],$in1[7],0b00
|
|
+ xxpermdi $t4,$in2[1],$in2[0],0b00
|
|
+ vmsumudm $out[7],$t3,$t4,$out[7]
|
|
+
|
|
+ xxpermdi $t2,$in2[4],$in2[3],0b00
|
|
+ vmsumudm $out[8],$t1,$t2,$out[8]
|
|
+ xxpermdi $t4,$in2[2],$in2[1],0b00
|
|
+ vmsumudm $out[8],$t3,$t4,$out[8]
|
|
+ vmsumudm $out[8],$in1[8],$in2[0],$out[8]
|
|
+
|
|
+ li $zero,0
|
|
+ li $one,1
|
|
+ mtvsrdd $t1,$one,$zero
|
|
+___
|
|
+
|
|
+ for (my $i = 0; $i <= 8; $i++) {
|
|
+ $code.=<<___;
|
|
+ vsld $in2[$i],$in2[$i],$t1
|
|
+___
|
|
+ }
|
|
+
|
|
+ $code.=<<___;
|
|
+
|
|
+ vmsumudm $out[7],$in1[8],$in2[8],$out[7]
|
|
+
|
|
+ xxpermdi $t2,$in2[8],$in2[7],0b00
|
|
+ xxpermdi $t1,$in1[7],$in1[8],0b00
|
|
+ vmsumudm $out[6],$t1,$t2,$out[6]
|
|
+
|
|
+ xxpermdi $t1,$in1[6],$in1[7],0b00
|
|
+ vmsumudm $out[5],$t1,$t2,$out[5]
|
|
+ vmsumudm $out[5],$in1[8],$in2[6],$out[5]
|
|
+
|
|
+ xxpermdi $t1,$in1[5],$in1[6],0b00
|
|
+ vmsumudm $out[4],$t1,$t2,$out[4]
|
|
+ xxpermdi $t4,$in2[6],$in2[5],0b00
|
|
+ xxpermdi $t3,$in1[7],$in1[8],0b00
|
|
+ vmsumudm $out[4],$t3,$t4,$out[4]
|
|
+
|
|
+ xxpermdi $t1,$in1[4],$in1[5],0b00
|
|
+ vmsumudm $out[3],$t1,$t2,$out[3]
|
|
+ xxpermdi $t3,$in1[6],$in1[7],0b00
|
|
+ vmsumudm $out[3],$t3,$t4,$out[3]
|
|
+ vmsumudm $out[3],$in1[8],$in2[4],$out[3]
|
|
+
|
|
+ xxpermdi $t1,$in1[3],$in1[4],0b00
|
|
+ vmsumudm $out[2],$t1,$t2,$out[2]
|
|
+ xxpermdi $t3,$in1[5],$in1[6],0b00
|
|
+ vmsumudm $out[2],$t3,$t4,$out[2]
|
|
+
|
|
+ xxpermdi $t1,$in1[2],$in1[3],0b00
|
|
+ vmsumudm $out[1],$t1,$t2,$out[1]
|
|
+ xxpermdi $t3,$in1[4],$in1[5],0b00
|
|
+ vmsumudm $out[1],$t3,$t4,$out[1]
|
|
+
|
|
+ xxpermdi $t1,$in1[1],$in1[2],0b00
|
|
+ vmsumudm $out[0],$t1,$t2,$out[0]
|
|
+ xxpermdi $t3,$in1[3],$in1[4],0b00
|
|
+ vmsumudm $out[0],$t3,$t4,$out[0]
|
|
+
|
|
+ xxpermdi $t2,$in2[4],$in2[3],0b00
|
|
+ xxpermdi $t1,$in1[7],$in1[8],0b00
|
|
+ vmsumudm $out[2],$t1,$t2,$out[2]
|
|
+
|
|
+ xxpermdi $t1,$in1[6],$in1[7],0b00
|
|
+ vmsumudm $out[1],$t1,$t2,$out[1]
|
|
+ vmsumudm $out[1],$in1[8],$in2[2],$out[1]
|
|
+
|
|
+ xxpermdi $t1,$in1[5],$in1[6],0b00
|
|
+ vmsumudm $out[0],$t1,$t2,$out[0]
|
|
+ xxpermdi $t4,$in2[2],$in2[1],0b00
|
|
+ xxpermdi $t3,$in1[7],$in1[8],0b00
|
|
+ vmsumudm $out[0],$t3,$t4,$out[0]
|
|
+
|
|
+___
|
|
+
|
|
+ store_vrs($outp, \@out);
|
|
+
|
|
+ pop_vrs(52, 63);
|
|
+
|
|
+ endproc("p521_felem_mul");
|
|
+ }
|
|
+
|
|
+ {
|
|
+ #
|
|
+ # p521_felem_square
|
|
+ #
|
|
+
|
|
+ my ($inp) = ("r4");
|
|
+ my @in = map("v$_",(45..53));
|
|
+ my @inx2 = map("v$_",(35..43));
|
|
+
|
|
+ startproc("p521_felem_square");
|
|
+
|
|
+ push_vrs(52, 63);
|
|
+
|
|
+ $code.=<<___;
|
|
+ vspltisw $vzero,0
|
|
+
|
|
+___
|
|
+
|
|
+ load_vrs($inp, \@in);
|
|
+
|
|
+ $code.=<<___;
|
|
+ li $zero,0
|
|
+ li $one,1
|
|
+ mtvsrdd $t1,$one,$zero
|
|
+___
|
|
+
|
|
+ for (my $i = 0; $i <= 8; $i++) {
|
|
+ $code.=<<___;
|
|
+ vsld $inx2[$i],$in[$i],$t1
|
|
+___
|
|
+ }
|
|
+
|
|
+ $code.=<<___;
|
|
+ vmsumudm $out[0],$in[0],$in[0],$vzero
|
|
+
|
|
+ vmsumudm $out[1],$in[0],$inx2[1],$vzero
|
|
+
|
|
+ xxpermdi $t1,$in[0],$in[1],0b00
|
|
+ xxpermdi $t2,$inx2[2],$in[1],0b00
|
|
+ vmsumudm $out[2],$t1,$t2,$vzero
|
|
+
|
|
+ xxpermdi $t2,$inx2[3],$inx2[2],0b00
|
|
+ vmsumudm $out[3],$t1,$t2,$vzero
|
|
+
|
|
+ xxpermdi $t2,$inx2[4],$inx2[3],0b00
|
|
+ vmsumudm $out[4],$t1,$t2,$vzero
|
|
+ vmsumudm $out[4],$in[2],$in[2],$out[4]
|
|
+
|
|
+ xxpermdi $t2,$inx2[5],$inx2[4],0b00
|
|
+ vmsumudm $out[5],$t1,$t2,$vzero
|
|
+ vmsumudm $out[5],$in[2],$inx2[3],$out[5]
|
|
+
|
|
+ xxpermdi $t2,$inx2[6],$inx2[5],0b00
|
|
+ vmsumudm $out[6],$t1,$t2,$vzero
|
|
+ xxpermdi $t3,$in[2],$in[3],0b00
|
|
+ xxpermdi $t4,$inx2[4],$in[3],0b00
|
|
+ vmsumudm $out[6],$t3,$t4,$out[6]
|
|
+
|
|
+ xxpermdi $t2,$inx2[7],$inx2[6],0b00
|
|
+ vmsumudm $out[7],$t1,$t2,$vzero
|
|
+ xxpermdi $t4,$inx2[5],$inx2[4],0b00
|
|
+ vmsumudm $out[7],$t3,$t4,$out[7]
|
|
+
|
|
+ xxpermdi $t2,$inx2[8],$inx2[7],0b00
|
|
+ vmsumudm $out[8],$t1,$t2,$vzero
|
|
+ xxpermdi $t4,$inx2[6],$inx2[5],0b00
|
|
+ vmsumudm $out[8],$t3,$t4,$out[8]
|
|
+ vmsumudm $out[8],$in[4],$in[4],$out[8]
|
|
+
|
|
+ vmsumudm $out[1],$in[5],$inx2[5],$out[1]
|
|
+
|
|
+ vmsumudm $out[3],$in[6],$inx2[6],$out[3]
|
|
+
|
|
+ vmsumudm $out[5],$in[7],$inx2[7],$out[5]
|
|
+
|
|
+ vmsumudm $out[7],$in[8],$inx2[8],$out[7]
|
|
+
|
|
+ mtvsrdd $t1,$one,$zero
|
|
+___
|
|
+
|
|
+ for (my $i = 5; $i <= 8; $i++) {
|
|
+ $code.=<<___;
|
|
+ vsld $inx2[$i],$inx2[$i],$t1
|
|
+___
|
|
+ }
|
|
+
|
|
+ $code.=<<___;
|
|
+
|
|
+ vmsumudm $out[6],$in[7],$inx2[8],$out[6]
|
|
+
|
|
+ vmsumudm $out[5],$in[6],$inx2[8],$out[5]
|
|
+
|
|
+ xxpermdi $t2,$inx2[8],$inx2[7],0b00
|
|
+ xxpermdi $t1,$in[5],$in[6],0b00
|
|
+ vmsumudm $out[4],$t1,$t2,$out[4]
|
|
+
|
|
+ xxpermdi $t1,$in[4],$in[5],0b00
|
|
+ vmsumudm $out[3],$t1,$t2,$out[3]
|
|
+
|
|
+ xxpermdi $t1,$in[3],$in[4],0b00
|
|
+ vmsumudm $out[2],$t1,$t2,$out[2]
|
|
+ vmsumudm $out[2],$in[5],$inx2[6],$out[2]
|
|
+
|
|
+ xxpermdi $t1,$in[2],$in[3],0b00
|
|
+ vmsumudm $out[1],$t1,$t2,$out[1]
|
|
+ vmsumudm $out[1],$in[4],$inx2[6],$out[1]
|
|
+
|
|
+ xxpermdi $t1,$in[1],$in[2],0b00
|
|
+ vmsumudm $out[0],$t1,$t2,$out[0]
|
|
+ xxpermdi $t2,$inx2[6],$inx2[5],0b00
|
|
+ xxpermdi $t1,$in[3],$in[4],0b00
|
|
+ vmsumudm $out[0],$t1,$t2,$out[0]
|
|
+
|
|
+___
|
|
+
|
|
+ store_vrs($outp, \@out);
|
|
+
|
|
+ pop_vrs(52, 63);
|
|
+
|
|
+ endproc("p521_felem_square");
|
|
+ }
|
|
+}
|
|
+
|
|
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
|
+print $code;
|
|
+close STDOUT or die "error closing STDOUT: $!";
|
|
Index: openssl-1.1.1l/crypto/ec/ec_local.h
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/ec_local.h
|
|
+++ openssl-1.1.1l/crypto/ec/ec_local.h
|
|
@@ -499,6 +499,10 @@ int ec_GF2m_simple_field_div(const EC_GR
|
|
const BIGNUM *b, BN_CTX *);
|
|
|
|
#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
|
|
+# ifdef B_ENDIAN
|
|
+# error "Can not enable ec_nistp_64_gcc_128 on big-endian systems"
|
|
+# endif
|
|
+
|
|
/* method functions in ecp_nistp224.c */
|
|
int ec_GFp_nistp224_group_init(EC_GROUP *group);
|
|
int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
|
|
Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/crypto/ec/curve448/arch_32/f_impl.c
|
|
+++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c
|
|
@@ -10,7 +10,7 @@
|
|
* Originally written by Mike Hamburg
|
|
*/
|
|
|
|
-#include "field.h"
|
|
+#include "../field.h"
|
|
|
|
void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
|
|
{
|
|
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c
|
|
@@ -0,0 +1,200 @@
|
|
+/*
|
|
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
|
|
+ * Copyright 2014 Cryptography Research, Inc.
|
|
+ *
|
|
+ * Licensed under the OpenSSL license (the "License"). You may not use
|
|
+ * this file except in compliance with the License. You can obtain a copy
|
|
+ * in the file LICENSE in the source distribution or at
|
|
+ * https://www.openssl.org/source/license.html
|
|
+ *
|
|
+ * Originally written by Mike Hamburg
|
|
+ */
|
|
+
|
|
+#include "../field.h"
|
|
+
|
|
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) /* cs = as * bs */
|

+{
|

+ const uint64_t *a = as->limb, *b = bs->limb; /* limbs 0..7 of each are read */
|

+ uint64_t *c = cs->limb; /* limbs 0..7 are written */
|

+ uint128_t accum0 = 0, accum1 = 0, accum2; /* running column sums */
|

+ uint64_t mask = (1ULL << 56) - 1; /* selects the low 56 bits */
|

+ uint64_t aa[4], bb[4], bbb[4]; /* half sums, filled in by the first loop */
|

+ unsigned int i, j;
|

+
|

+ for (i = 0; i < 4; i++) {
|

+ aa[i] = a[i] + a[i + 4]; /* low limb + high limb of a */
|

+ bb[i] = b[i] + b[i + 4]; /* low limb + high limb of b */
|

+ bbb[i] = bb[i] + b[i + 4]; /* b[i] + 2 * b[i + 4] */
|

+ }
|

+
|

+ for (i = 0; i < 4; i++) { /* one pass per output pair c[i], c[i + 4] */
|

+ accum2 = 0; /* accum0/accum1 are not reset: they hold the previous carry */
|

+
|

+ for (j = 0; j <= i; j++) { /* index pairs (j, i - j) with j <= i */
|

+ accum2 += widemul(a[j], b[i - j]); /* low a, low b */
|

+ accum1 += widemul(aa[j], bb[i - j]); /* half sum a, half sum b */
|

+ accum0 += widemul(a[j + 4], b[i - j + 4]); /* high a, high b */
|

+ }
|

+ for (; j < 4; j++) { /* j > i: unsigned i - j wraps, i - j + 4 is 1..3 */
|

+ accum2 += widemul(a[j], b[i - j + 8]); /* low a, high b (index 5..7) */
|

+ accum1 += widemul(aa[j], bbb[i - j + 4]);
|

+ accum0 += widemul(a[j + 4], bb[i - j + 4]);
|

+ }
|

+
|

+ accum1 -= accum2; /* NOTE(review): presumably part of the reduction -- confirm */
|

+ accum0 += accum2;
|

+
|

+ c[i] = ((uint64_t)(accum0)) & mask; /* low 56 bits become limb i */
|

+ c[i + 4] = ((uint64_t)(accum1)) & mask; /* low 56 bits become limb i + 4 */
|

+
|

+ accum0 >>= 56; /* the rest carries into the next pass */
|

+ accum1 >>= 56;
|

+ }
|

+
|

+ accum0 += accum1; /* on entry: accum0/accum1 are the carries out of c[3]/c[7] */
|

+ accum0 += c[4]; /* both carries are added into limb 4 */
|

+ accum1 += c[0]; /* the carry out of c[7] is also added into limb 0 */
|

+ c[4] = ((uint64_t)(accum0)) & mask;
|

+ c[0] = ((uint64_t)(accum1)) & mask;
|

+
|

+ accum0 >>= 56;
|

+ accum1 >>= 56;
|

+
|

+ c[5] += ((uint64_t)(accum0)); /* final carries are added without masking */
|

+ c[1] += ((uint64_t)(accum1));
|

+}
|
|
+
|
|
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b) /* cs = as * b */
|

+{
|

+ const uint64_t *a = as->limb; /* limbs 0..7 are read */
|

+ uint64_t *c = cs->limb; /* limbs 0..7 are written */
|

+ uint128_t accum0 = 0, accum4 = 0; /* carry chains for limbs 0..3 and 4..7 */
|

+ uint64_t mask = (1ULL << 56) - 1; /* selects the low 56 bits */
|

+ int i;
|

+
|

+ for (i = 0; i < 4; i++) {
|

+ accum0 += widemul(b, a[i]);
|

+ accum4 += widemul(b, a[i + 4]);
|

+ c[i] = accum0 & mask; /* low 56 bits become limb i */
|

+ accum0 >>= 56; /* the rest carries to limb i + 1 */
|

+ c[i + 4] = accum4 & mask; /* low 56 bits become limb i + 4 */
|

+ accum4 >>= 56; /* the rest carries to limb i + 5 */
|

+ }
|

+
|

+ accum0 += accum4 + c[4]; /* carries out of limbs 3 and 7 both go into limb 4 */
|

+ c[4] = accum0 & mask;
|

+ c[5] += accum0 >> 56; /* added without masking */
|

+
|

+ accum4 += c[0]; /* the carry out of limb 7 is also added into limb 0 */
|

+ c[0] = accum4 & mask;
|

+ c[1] += accum4 >> 56; /* added without masking */
|

+}
|
|
+
|
|
+void gf_sqr(gf_s * RESTRICT cs, const gf as) /* cs = as * as */
|

+{
|

+ const uint64_t *a = as->limb; /* limbs 0..7 are read */
|

+ uint64_t *c = cs->limb; /* limbs 0..7 are written */
|

+ uint128_t accum0 = 0, accum1 = 0, accum2; /* running column sums */
|

+ uint64_t mask = (1ULL << 56) - 1; /* selects the low 56 bits */
|

+ uint64_t aa[4]; /* aa[i] = a[i] + a[i + 4], filled in below */
|

+ unsigned int i;
|

+
|

+ /* For some reason clang doesn't vectorize this without prompting? */
|

+ for (i = 0; i < 4; i++)
|

+ aa[i] = a[i] + a[i + 4];
|

+
|

+ accum2 = widemul(a[0], a[3]); /* first: terms stored to c[3] and c[7] */
|

+ accum0 = widemul(aa[0], aa[3]);
|

+ accum1 = widemul(a[4], a[7]);
|

+
|

+ accum2 += widemul(a[1], a[2]);
|

+ accum0 += widemul(aa[1], aa[2]);
|

+ accum1 += widemul(a[5], a[6]);
|

+
|

+ accum0 -= accum2;
|

+ accum1 += accum2;
|

+
|

+ c[3] = ((uint64_t)(accum1)) << 1 & mask; /* parsed as (x << 1) & mask: sum is doubled */
|

+ c[7] = ((uint64_t)(accum0)) << 1 & mask; /* same doubling for limb 7 */
|

+
|

+ accum0 >>= 55; /* 55 rather than 56: the carry of the doubled sum */
|

+ accum1 >>= 55; /* accum1 (out of c[3]) feeds c[4]; accum0 (out of c[7]) feeds c[0] */
|

+
|

+ accum0 += widemul(2 * aa[1], aa[3]); /* next: terms stored to c[0] and c[4] */
|

+ accum1 += widemul(2 * a[5], a[7]);
|

+ accum0 += widemul(aa[2], aa[2]);
|

+ accum1 += accum0;
|

+
|

+ accum0 -= widemul(2 * a[1], a[3]);
|

+ accum1 += widemul(a[6], a[6]);
|

+
|

+ accum2 = widemul(a[0], a[0]);
|

+ accum1 -= accum2;
|

+ accum0 += accum2;
|

+
|

+ accum0 -= widemul(a[2], a[2]);
|

+ accum1 += widemul(aa[0], aa[0]);
|

+ accum0 += widemul(a[4], a[4]);
|

+
|

+ c[0] = ((uint64_t)(accum0)) & mask;
|

+ c[4] = ((uint64_t)(accum1)) & mask;
|

+
|

+ accum0 >>= 56; /* carry continues into c[1] */
|

+ accum1 >>= 56; /* carry continues into c[5] */
|

+
|

+ accum2 = widemul(2 * aa[2], aa[3]); /* next: terms stored to c[1] and c[5] */
|

+ accum0 -= widemul(2 * a[2], a[3]);
|

+ accum1 += widemul(2 * a[6], a[7]);
|

+
|

+ accum1 += accum2;
|

+ accum0 += accum2;
|

+
|

+ accum2 = widemul(2 * a[0], a[1]);
|

+ accum1 += widemul(2 * aa[0], aa[1]);
|

+ accum0 += widemul(2 * a[4], a[5]);
|

+
|

+ accum1 -= accum2;
|

+ accum0 += accum2;
|

+
|

+ c[1] = ((uint64_t)(accum0)) & mask;
|

+ c[5] = ((uint64_t)(accum1)) & mask;
|

+
|

+ accum0 >>= 56; /* carry continues into c[2] */
|

+ accum1 >>= 56; /* carry continues into c[6] */
|

+
|

+ accum2 = widemul(aa[3], aa[3]); /* next: terms stored to c[2] and c[6] */
|

+ accum0 -= widemul(a[3], a[3]);
|

+ accum1 += widemul(a[7], a[7]);
|

+
|

+ accum1 += accum2;
|

+ accum0 += accum2;
|

+
|

+ accum2 = widemul(2 * a[0], a[2]);
|

+ accum1 += widemul(2 * aa[0], aa[2]);
|

+ accum0 += widemul(2 * a[4], a[6]);
|

+
|

+ accum2 += widemul(a[1], a[1]);
|

+ accum1 += widemul(aa[1], aa[1]);
|

+ accum0 += widemul(a[5], a[5]);
|

+
|

+ accum1 -= accum2;
|

+ accum0 += accum2;
|

+
|

+ c[2] = ((uint64_t)(accum0)) & mask;
|

+ c[6] = ((uint64_t)(accum1)) & mask;
|

+
|

+ accum0 >>= 56;
|

+ accum1 >>= 56;
|

+
|

+ accum0 += c[3]; /* carry out of c[2] joins the c[3] stored at the start */
|

+ accum1 += c[7]; /* carry out of c[6] joins the c[7] stored at the start */
|

+ c[3] = ((uint64_t)(accum0)) & mask;
|

+ c[7] = ((uint64_t)(accum1)) & mask;
|

+
|

+ /* we could almost stop here, but it wouldn't be stable, so... */
|

+
|

+ accum0 >>= 56;
|

+ accum1 >>= 56;
|

+ c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); /* carries out of c[3] and c[7] */
|

+ c[0] += ((uint64_t)(accum1)); /* carry out of c[7]; added without masking */
|

+}
|
|
Index: openssl-1.1.1l/Configure
|
|
===================================================================
|
|
--- openssl-1.1.1l.orig/Configure
|
|
+++ openssl-1.1.1l/Configure
|
|
@@ -1476,6 +1476,20 @@ if (!$disabled{asm} && !$predefined_C{__
|
|
}
|
|
}
|
|
|
|
+# Check if __SIZEOF_INT128__ is defined by compiler
|

+$config{use_int128} = 0; # stays 0 unless the macro name is seen below
|

+{
|

+ my $cc = $config{CROSS_COMPILE}.$config{CC}; # cross prefix + compiler command
|

+ open(PIPE, "$cc -E -dM - </dev/null 2>&1 |"); # NOTE(review): open() result unchecked; stderr is merged in
|

+ while(<PIPE>) {
|

+ if (m/__SIZEOF_INT128__/) { # name match anywhere on the line; the value is not inspected
|

+ $config{use_int128} = 1;
|

+ last; # first match is enough
|

+ }
|

+ }
|

+ close(PIPE);
|

+}
|
|
+
|
|
# Deal with bn_ops ###################################################
|
|
|
|
$config{bn_ll} =0;
|