From 4dba53694bf633c272075e62acdc5a5ca3003ce6 Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Mon, 29 Mar 2021 18:06:13 +1100 Subject: [PATCH 01/29] numbers: Define 128-bit integers if compiler supports Signed-off-by: Amitay Isaacs Reviewed-by: Tomas Mraz Reviewed-by: Matt Caswell (Merged from https://github.com/openssl/openssl/pull/14784) (cherry picked from commit bbed0d1cbd436af6797d7837e270bff4ca4d5a10) --- include/internal/numbers.h | 10 ++++++++++ 1 file changed, 10 insertions(+) Index: openssl-1.1.1l/include/internal/numbers.h =================================================================== --- openssl-1.1.1l.orig/include/internal/numbers.h +++ openssl-1.1.1l/include/internal/numbers.h @@ -60,6 +60,16 @@ # define UINT64_MAX __MAXUINT__(uint64_t) # endif +# ifndef INT128_MAX +# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16 +typedef __int128_t int128_t; +typedef __uint128_t uint128_t; +# define INT128_MIN __MININT__(int128_t) +# define INT128_MAX __MAXINT__(int128_t) +# define UINT128_MAX __MAXUINT__(uint128_t) +# endif +# endif + # ifndef SIZE_MAX # define SIZE_MAX __MAXUINT__(size_t) # endif Index: openssl-1.1.1l/crypto/bn/bn_div.c =================================================================== --- openssl-1.1.1l.orig/crypto/bn/bn_div.c +++ openssl-1.1.1l/crypto/bn/bn_div.c @@ -97,7 +97,7 @@ BN_ULONG bn_div_3_words(const BN_ULONG * */ # if BN_BITS2 == 64 && defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 # undef BN_ULLONG -# define BN_ULLONG __uint128_t +# define BN_ULLONG uint128_t # define BN_LLONG # endif Index: openssl-1.1.1l/crypto/bn/bn_local.h =================================================================== --- openssl-1.1.1l.orig/crypto/bn/bn_local.h +++ openssl-1.1.1l/crypto/bn/bn_local.h @@ -22,6 +22,7 @@ # endif # include "crypto/bn.h" +# include "internal/numbers.h" /* * These preprocessor symbols control various aspects of the bignum headers @@ -374,9 +375,9 @@ struct bn_gencb_st { */ # if defined(__SIZEOF_INT128__) 
&& __SIZEOF_INT128__==16 && \ (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)) -# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64) +# define BN_UMULT_HIGH(a,b) (((uint128_t)(a)*(b))>>64) # define BN_UMULT_LOHI(low,high,a,b) ({ \ - __uint128_t ret=(__uint128_t)(a)*(b); \ + uint128_t ret=(uint128_t)(a)*(b); \ (high)=ret>>64; (low)=ret; }) # elif defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT)) # if defined(__DECC) Index: openssl-1.1.1l/crypto/ec/curve25519.c =================================================================== --- openssl-1.1.1l.orig/crypto/ec/curve25519.c +++ openssl-1.1.1l/crypto/ec/curve25519.c @@ -11,6 +11,8 @@ #include "ec_local.h" #include +#include "internal/numbers.h" + #if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \ defined(_M_AMD64) || defined(_M_X64)) @@ -252,7 +254,7 @@ static void x25519_scalar_mulx(uint8_t o #endif #if defined(X25519_ASM) \ - || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \ + || ( defined(INT128_MAX) \ && !defined(__sparc__) \ && (!defined(__SIZEOF_LONG__) || (__SIZEOF_LONG__ == 8)) \ && !(defined(__ANDROID__) && !defined(__clang__)) ) @@ -385,7 +387,7 @@ void x25519_fe51_mul121666(fe51 h, fe51 # define fe51_mul121666 x25519_fe51_mul121666 # else -typedef __uint128_t u128; +typedef uint128_t u128; static void fe51_mul(fe51 h, const fe51 f, const fe51 g) { Index: openssl-1.1.1l/crypto/ec/curve448/curve448utils.h =================================================================== --- openssl-1.1.1l.orig/crypto/ec/curve448/curve448utils.h +++ openssl-1.1.1l/crypto/ec/curve448/curve448utils.h @@ -15,6 +15,8 @@ # include +# include "internal/numbers.h" + /* * Internal word types. Somewhat tricky. This could be decided separately per * platform. However, the structs do need to be all the same size and @@ -41,9 +43,9 @@ typedef int64_t c448_sword_t; /* "Boolean" type, will be set to all-zero or all-one (i.e. 
-1u) */ typedef uint64_t c448_bool_t; /* Double-word size for internal computations */ -typedef __uint128_t c448_dword_t; +typedef uint128_t c448_dword_t; /* Signed double-word size for internal computations */ -typedef __int128_t c448_dsword_t; +typedef int128_t c448_dsword_t; # elif C448_WORD_BITS == 32 /* Word size for internal computations */ typedef uint32_t c448_word_t; Index: openssl-1.1.1l/crypto/ec/curve448/word.h =================================================================== --- openssl-1.1.1l.orig/crypto/ec/curve448/word.h +++ openssl-1.1.1l/crypto/ec/curve448/word.h @@ -17,15 +17,20 @@ # include # include # include -# include "arch_intrinsics.h" # include "curve448utils.h" +# ifdef INT128_MAX +# include "arch_64/arch_intrinsics.h" +# else +# include "arch_32/arch_intrinsics.h" +# endif + # if (ARCH_WORD_BITS == 64) typedef uint64_t word_t, mask_t; -typedef __uint128_t dword_t; +typedef uint128_t dword_t; typedef int32_t hsword_t; typedef int64_t sword_t; -typedef __int128_t dsword_t; +typedef int128_t dsword_t; # elif (ARCH_WORD_BITS == 32) typedef uint32_t word_t, mask_t; typedef uint64_t dword_t; Index: openssl-1.1.1l/crypto/ec/ecp_nistp224.c =================================================================== --- openssl-1.1.1l.orig/crypto/ec/ecp_nistp224.c +++ openssl-1.1.1l/crypto/ec/ecp_nistp224.c @@ -40,11 +40,9 @@ NON_EMPTY_TRANSLATION_UNIT # include # include "ec_local.h" -# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 - /* even with gcc, the typedef won't work for 32-bit platforms */ -typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit - * platforms */ -# else +#include "internal/numbers.h" + +#ifndef INT128_MAX # error "Your compiler doesn't appear to support 128-bit integer types" # endif Index: openssl-1.1.1l/crypto/ec/ecp_nistp256.c =================================================================== --- openssl-1.1.1l.orig/crypto/ec/ecp_nistp256.c +++ openssl-1.1.1l/crypto/ec/ecp_nistp256.c @@ -41,14 
+41,11 @@ NON_EMPTY_TRANSLATION_UNIT # include # include "ec_local.h" -# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 - /* even with gcc, the typedef won't work for 32-bit platforms */ -typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit - * platforms */ -typedef __int128_t int128_t; -# else -# error "Your compiler doesn't appear to support 128-bit integer types" -# endif +#include "internal/numbers.h" + +#ifndef INT128_MAX +# error "Your compiler doesn't appear to support 128-bit integer types" +#endif typedef uint8_t u8; typedef uint32_t u32; Index: openssl-1.1.1l/crypto/ec/ecp_nistp521.c =================================================================== --- openssl-1.1.1l.orig/crypto/ec/ecp_nistp521.c +++ openssl-1.1.1l/crypto/ec/ecp_nistp521.c @@ -40,13 +40,11 @@ NON_EMPTY_TRANSLATION_UNIT # include # include "ec_local.h" -# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 - /* even with gcc, the typedef won't work for 32-bit platforms */ -typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit - * platforms */ -# else -# error "Your compiler doesn't appear to support 128-bit integer types" -# endif +#include "internal/numbers.h" + +#ifndef INT128_MAX +# error "Your compiler doesn't appear to support 128-bit integer types" +#endif typedef uint8_t u8; typedef uint64_t u64; @@ -400,7 +398,7 @@ static void felem_diff128(largefelem out * On exit: * out[i] < 17 * max(in[i]) * max(in[i]) */ -static void felem_square(largefelem out, const felem in) +static void felem_square_ref(largefelem out, const felem in) { felem inx2, inx4; felem_scalar(inx2, in, 2); @@ -484,7 +482,7 @@ static void felem_square(largefelem out, * On exit: * out[i] < 17 * max(in1[i]) * max(in2[i]) */ -static void felem_mul(largefelem out, const felem in1, const felem in2) +static void felem_mul_ref(largefelem out, const felem in1, const felem in2) { felem in2x2; felem_scalar(in2x2, in2, 2); @@ -674,6 +672,57 @@ static void 
felem_reduce(felem out, cons */ } +#if defined(ECP_NISTP521_ASM) +void felem_square_wrapper(largefelem out, const felem in); +void felem_mul_wrapper(largefelem out, const felem in1, const felem in2); + +static void (*felem_square_p)(largefelem out, const felem in) = + felem_square_wrapper; +static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) = + felem_mul_wrapper; + +void p521_felem_square(largefelem out, const felem in); +void p521_felem_mul(largefelem out, const felem in1, const felem in2); + +# if defined(_ARCH_PPC64) +# include "../ppc_arch.h" +# endif + +void felem_select(void) +{ +# if defined(_ARCH_PPC64) + if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { + felem_square_p = p521_felem_square; + felem_mul_p = p521_felem_mul; + + return; + } +# endif + + /* Default */ + felem_square_p = felem_square_ref; + felem_mul_p = felem_mul_ref; +} + +void felem_square_wrapper(largefelem out, const felem in) +{ + felem_select(); + felem_square_p(out, in); +} + +void felem_mul_wrapper(largefelem out, const felem in1, const felem in2) +{ + felem_select(); + felem_mul_p(out, in1, in2); +} + +# define felem_square felem_square_p +# define felem_mul felem_mul_p +#else +# define felem_square felem_square_ref +# define felem_mul felem_mul_ref +#endif + static void felem_square_reduce(felem out, const felem in) { largefelem tmp; Index: openssl-1.1.1l/crypto/poly1305/poly1305.c =================================================================== --- openssl-1.1.1l.orig/crypto/poly1305/poly1305.c +++ openssl-1.1.1l/crypto/poly1305/poly1305.c @@ -95,11 +95,10 @@ poly1305_blocks(void *ctx, const unsigne (a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \ ) -# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \ - (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8) +# if defined(INT64_MAX) && defined(INT128_MAX) typedef unsigned long u64; -typedef __uint128_t u128; +typedef uint128_t u128; typedef struct { u64 h[3]; Index: 
openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c =================================================================== --- openssl-1.1.1l.orig/crypto/poly1305/poly1305_base2_44.c +++ openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c @@ -18,7 +18,7 @@ typedef unsigned char u8; typedef unsigned int u32; typedef unsigned long u64; -typedef unsigned __int128 u128; +typedef uint128_t u128; typedef struct { u64 h[3]; Index: openssl-1.1.1l/crypto/ec/build.info =================================================================== --- openssl-1.1.1l.orig/crypto/ec/build.info +++ openssl-1.1.1l/crypto/ec/build.info @@ -6,8 +13,9 @@ SOURCE[../../libcrypto]=\ ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \ ecp_oct.c ec2_oct.c ec_oct.c ec_kmeth.c ecdh_ossl.c ecdh_kdf.c \ ecdsa_ossl.c ecdsa_sign.c ecdsa_vrf.c curve25519.c ecx_meth.c \ - curve448/arch_32/f_impl.c curve448/f_generic.c curve448/scalar.c \ + curve448/f_generic.c curve448/scalar.c \ curve448/curve448_tables.c curve448/eddsa.c curve448/curve448.c \ + curve448/arch_64/f_impl64.c curve448/arch_32/f_impl32.c \ {- $target{ec_asm_src} -} GENERATE[ecp_nistz256-x86.s]=asm/ecp_nistz256-x86.pl \ @@ -29,6 +38,8 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_n INCLUDE[ecp_nistz256-armv8.o]=.. 
GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl $(PERLASM_SCHEME) +GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl $(PERLASM_SCHEME) + GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl $(PERLASM_SCHEME) GENERATE[x25519-ppc64.s]=asm/x25519-ppc64.pl $(PERLASM_SCHEME) @@ -36,10 +47,3 @@ BEGINRAW[Makefile] {- $builddir -}/ecp_nistz256-%.S: {- $sourcedir -}/asm/ecp_nistz256-%.pl CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ ENDRAW[Makefile] - -INCLUDE[curve448/arch_32/f_impl.o]=curve448/arch_32 curve448 -INCLUDE[curve448/f_generic.o]=curve448/arch_32 curve448 -INCLUDE[curve448/scalar.o]=curve448/arch_32 curve448 -INCLUDE[curve448/curve448_tables.o]=curve448/arch_32 curve448 -INCLUDE[curve448/eddsa.o]=curve448/arch_32 curve448 -INCLUDE[curve448/curve448.o]=curve448/arch_32 curve448 Index: openssl-1.1.1l/crypto/ec/curve448/field.h =================================================================== --- openssl-1.1.1l.orig/crypto/ec/curve448/field.h +++ openssl-1.1.1l/crypto/ec/curve448/field.h @@ -66,10 +66,15 @@ void gf_serialize(uint8_t *serial, const mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit, uint8_t hi_nmask); -# include "f_impl.h" /* Bring in the inline implementations */ # define LIMBPERM(i) (i) -# define LIMB_MASK(i) (((1)<limb[i] = a->limb[i] + b->limb[i]; + + gf_weak_reduce(out); +} + +void gf_sub_RAW(gf out, const gf a, const gf b) +{ + uint64_t co1 = ((1ULL << 56) - 1) * 2, co2 = co1 - 2; + unsigned int i; + + for (i = 0; i < NLIMBS; i++) + out->limb[i] = a->limb[i] - b->limb[i] + ((i == NLIMBS / 2) ? 
co2 : co1); + + gf_weak_reduce(out); +} + +void gf_bias(gf a, int amt) +{ +} + +void gf_weak_reduce(gf a) +{ + uint64_t mask = (1ULL << 56) - 1; + uint64_t tmp = a->limb[NLIMBS - 1] >> 56; + unsigned int i; + + a->limb[NLIMBS / 2] += tmp; + for (i = NLIMBS - 1; i > 0; i--) + a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56); + a->limb[0] = (a->limb[0] & mask) + tmp; +} + +#endif /* OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H */ Index: openssl-1.1.1l/include/internal/constant_time.h =================================================================== --- openssl-1.1.1l.orig/include/internal/constant_time.h +++ openssl-1.1.1l/include/internal/constant_time.h @@ -181,6 +181,11 @@ static ossl_inline uint32_t constant_tim return constant_time_msb_32(~a & (a - 1)); } +static ossl_inline uint64_t constant_time_is_zero_64(uint64_t a) +{ + return constant_time_msb_64(~a & (a - 1)); +} + static ossl_inline unsigned int constant_time_eq(unsigned int a, unsigned int b) { Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c =================================================================== --- /dev/null +++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c @@ -0,0 +1,104 @@ +/* + * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2014 Cryptography Research, Inc. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + * + * Originally written by Mike Hamburg + */ + +#include +#include "internal/numbers.h" + +#ifdef UINT128_MAX +/* We have support for 128 bit ints, so do nothing here */ +NON_EMPTY_TRANSLATION_UNIT +#else + +# include "../field.h" + +void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) +{ + const uint32_t *a = as->limb, *b = bs->limb; + uint32_t *c = cs->limb; + uint64_t accum0 = 0, accum1 = 0, accum2 = 0; + uint32_t mask = (1 << 28) - 1; + uint32_t aa[8], bb[8]; + int i, j; + + for (i = 0; i < 8; i++) { + aa[i] = a[i] + a[i + 8]; + bb[i] = b[i] + b[i + 8]; + } + + for (j = 0; j < 8; j++) { + accum2 = 0; + for (i = 0; i < j + 1; i++) { + accum2 += widemul(a[j - i], b[i]); + accum1 += widemul(aa[j - i], bb[i]); + accum0 += widemul(a[8 + j - i], b[8 + i]); + } + accum1 -= accum2; + accum0 += accum2; + accum2 = 0; + for (i = j + 1; i < 8; i++) { + accum0 -= widemul(a[8 + j - i], b[i]); + accum2 += widemul(aa[8 + j - i], bb[i]); + accum1 += widemul(a[16 + j - i], b[8 + i]); + } + accum1 += accum2; + accum0 += accum2; + c[j] = ((uint32_t)(accum0)) & mask; + c[j + 8] = ((uint32_t)(accum1)) & mask; + accum0 >>= 28; + accum1 >>= 28; + } + + accum0 += accum1; + accum0 += c[8]; + accum1 += c[0]; + c[8] = ((uint32_t)(accum0)) & mask; + c[0] = ((uint32_t)(accum1)) & mask; + + accum0 >>= 28; + accum1 >>= 28; + c[9] += ((uint32_t)(accum0)); + c[1] += ((uint32_t)(accum1)); +} + +void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b) +{ + const uint32_t *a = as->limb; + uint32_t *c = cs->limb; + uint64_t accum0 = 0, accum8 = 0; + uint32_t mask = (1 << 28) - 1; + int i; + + assert(b <= mask); + + for (i = 0; i < 8; i++) { + accum0 += widemul(b, a[i]); + accum8 += widemul(b, a[i + 8]); + c[i] = accum0 & mask; + accum0 >>= 28; + c[i + 8] = accum8 & mask; + accum8 >>= 28; + } + + accum0 += accum8 + c[8]; + c[8] = ((uint32_t)accum0) & mask; 
+ c[9] += (uint32_t)(accum0 >> 28); + + accum8 += c[0]; + c[0] = ((uint32_t)accum8) & mask; + c[1] += (uint32_t)(accum8 >> 28); +} + +void gf_sqr(gf_s * RESTRICT cs, const gf as) +{ + gf_mul(cs, as, as); /* Performs better with a dedicated square */ +} +#endif Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c =================================================================== --- /dev/null +++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c @@ -0,0 +1,210 @@ +/* + * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2014 Cryptography Research, Inc. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + * + * Originally written by Mike Hamburg + */ + +#include +#include "internal/numbers.h" + +#ifndef UINT128_MAX +/* No support for 128 bit ints, so do nothing here */ +NON_EMPTY_TRANSLATION_UNIT +#else + +# include "../field.h" + +void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) +{ + const uint64_t *a = as->limb, *b = bs->limb; + uint64_t *c = cs->limb; + uint128_t accum0 = 0, accum1 = 0, accum2; + uint64_t mask = (1ULL << 56) - 1; + uint64_t aa[4], bb[4], bbb[4]; + unsigned int i, j; + + for (i = 0; i < 4; i++) { + aa[i] = a[i] + a[i + 4]; + bb[i] = b[i] + b[i + 4]; + bbb[i] = bb[i] + b[i + 4]; + } + + for (i = 0; i < 4; i++) { + accum2 = 0; + + for (j = 0; j <= i; j++) { + accum2 += widemul(a[j], b[i - j]); + accum1 += widemul(aa[j], bb[i - j]); + accum0 += widemul(a[j + 4], b[i - j + 4]); + } + for (; j < 4; j++) { + accum2 += widemul(a[j], b[i - j + 8]); + accum1 += widemul(aa[j], bbb[i - j + 4]); + accum0 += widemul(a[j + 4], bb[i - j + 4]); + } + + accum1 -= accum2; + accum0 += accum2; + + c[i] = ((uint64_t)(accum0)) & mask; + c[i + 4] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + } 
+ + accum0 += accum1; + accum0 += c[4]; + accum1 += c[0]; + c[4] = ((uint64_t)(accum0)) & mask; + c[0] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + c[5] += ((uint64_t)(accum0)); + c[1] += ((uint64_t)(accum1)); +} + +void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b) +{ + const uint64_t *a = as->limb; + uint64_t *c = cs->limb; + uint128_t accum0 = 0, accum4 = 0; + uint64_t mask = (1ULL << 56) - 1; + int i; + + for (i = 0; i < 4; i++) { + accum0 += widemul(b, a[i]); + accum4 += widemul(b, a[i + 4]); + c[i] = accum0 & mask; + accum0 >>= 56; + c[i + 4] = accum4 & mask; + accum4 >>= 56; + } + + accum0 += accum4 + c[4]; + c[4] = accum0 & mask; + c[5] += accum0 >> 56; + + accum4 += c[0]; + c[0] = accum4 & mask; + c[1] += accum4 >> 56; +} + +void gf_sqr(gf_s * __restrict__ cs, const gf as) +{ + const uint64_t *a = as->limb; + uint64_t *c = cs->limb; + uint128_t accum0 = 0, accum1 = 0, accum2; + uint64_t mask = (1ULL << 56) - 1; + uint64_t aa[4]; + + /* For some reason clang doesn't vectorize this without prompting? 
*/ + unsigned int i; + for (i = 0; i < 4; i++) { + aa[i] = a[i] + a[i + 4]; + } + + accum2 = widemul(a[0], a[3]); + accum0 = widemul(aa[0], aa[3]); + accum1 = widemul(a[4], a[7]); + + accum2 += widemul(a[1], a[2]); + accum0 += widemul(aa[1], aa[2]); + accum1 += widemul(a[5], a[6]); + + accum0 -= accum2; + accum1 += accum2; + + c[3] = ((uint64_t)(accum1)) << 1 & mask; + c[7] = ((uint64_t)(accum0)) << 1 & mask; + + accum0 >>= 55; + accum1 >>= 55; + + accum0 += widemul(2 * aa[1], aa[3]); + accum1 += widemul(2 * a[5], a[7]); + accum0 += widemul(aa[2], aa[2]); + accum1 += accum0; + + accum0 -= widemul(2 * a[1], a[3]); + accum1 += widemul(a[6], a[6]); + + accum2 = widemul(a[0], a[0]); + accum1 -= accum2; + accum0 += accum2; + + accum0 -= widemul(a[2], a[2]); + accum1 += widemul(aa[0], aa[0]); + accum0 += widemul(a[4], a[4]); + + c[0] = ((uint64_t)(accum0)) & mask; + c[4] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(2 * aa[2], aa[3]); + accum0 -= widemul(2 * a[2], a[3]); + accum1 += widemul(2 * a[6], a[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul(2 * a[0], a[1]); + accum1 += widemul(2 * aa[0], aa[1]); + accum0 += widemul(2 * a[4], a[5]); + + accum1 -= accum2; + accum0 += accum2; + + c[1] = ((uint64_t)(accum0)) & mask; + c[5] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(aa[3], aa[3]); + accum0 -= widemul(a[3], a[3]); + accum1 += widemul(a[7], a[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul(2 * a[0], a[2]); + accum1 += widemul(2 * aa[0], aa[2]); + accum0 += widemul(2 * a[4], a[6]); + + accum2 += widemul(a[1], a[1]); + accum1 += widemul(aa[1], aa[1]); + accum0 += widemul(a[5], a[5]); + + accum1 -= accum2; + accum0 += accum2; + + c[2] = ((uint64_t)(accum0)) & mask; + c[6] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum0 += c[3]; + accum1 += c[7]; + c[3] = ((uint64_t)(accum0)) & mask; + c[7] = ((uint64_t)(accum1)) & 
mask; + + /* we could almost stop here, but it wouldn't be stable, so... */ + + accum0 >>= 56; + accum1 >>= 56; + c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); + c[0] += ((uint64_t)(accum1)); +} +#endif Index: openssl-1.1.1l/Configurations/00-base-templates.conf =================================================================== --- openssl-1.1.1l.orig/Configurations/00-base-templates.conf +++ openssl-1.1.1l/Configurations/00-base-templates.conf @@ -351,7 +351,8 @@ my %targets=( ppc64_asm => { inherit_from => [ "ppc32_asm" ], template => 1, - ec_asm_src => "ecp_nistz256.c ecp_nistz256-ppc64.s x25519-ppc64.s", + bn_asm_src => add("ppc64-mont-fixed.s"), + ec_asm_src => "ecp_nistz256.c ecp_nistz256-ppc64.s ecp_nistp521-ppc64.s x25519-ppc64.s", keccak1600_asm_src => "keccak1600-ppc64.s", }, ); Index: openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl =================================================================== --- /dev/null +++ openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl @@ -0,0 +1,581 @@ +#! /usr/bin/env perl +# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Amitay Isaacs , Martin Schwenke +# & Alastair D'Silva for +# the OpenSSL project. +# ==================================================================== + +# +# Fixed length (n=6), unrolled PPC Montgomery Multiplication +# + +# 2021 +# +# Although this is a generic implementation for unrolling Montgomery +# Multiplication for arbitrary values of n, this is currently only +# used for n = 6 to improve the performance of ECC p384. 
+# +# Unrolling allows intermediate results to be stored in registers, +# rather than on the stack, improving performance by ~7% compared to +# the existing PPC assembly code. +# +# The ISA 3.0 implementation uses combination multiply/add +# instructions (maddld, maddhdu) to improve performance by an +# additional ~10% on Power 9. +# +# Finally, saving non-volatile registers into volatile vector +# registers instead of onto the stack saves a little more. +# +# On a Power 9 machine we see an overall improvement of ~18%. +# + +use strict; +use warnings; + +my ($flavour, $output, $dir, $xlate); + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; + +if ($flavour !~ /64/) { + die "bad flavour ($flavour) - only ppc64 permitted"; +} + +my $SIZE_T= 8; + +# Registers are global so the code is remotely readable + +# Parameters for Montgomery multiplication +my $sp = "r1"; +my $toc = "r2"; +my $rp = "r3"; +my $ap = "r4"; +my $bp = "r5"; +my $np = "r6"; +my $n0 = "r7"; +my $num = "r8"; + +my $i = "r9"; +my $c0 = "r10"; +my $bp0 = "r11"; +my $bpi = "r11"; +my $bpj = "r11"; +my $tj = "r12"; +my $apj = "r12"; +my $npj = "r12"; +my $lo = "r14"; +my $c1 = "r14"; + +# Non-volatile registers used for tp[i] +# +# 12 registers are available but the limit on unrolling is 10, +# since registers from $tp[0] to $tp[$n+1] are used. +my @tp = ("r20" .. "r31"); + +# volatile VSRs for saving non-volatile GPRs - faster than stack +my @vsrs = ("v32" .. 
"v46"); + +package Mont; + +sub new($$) +{ + my ($class, $n) = @_; + + if ($n > 10) { + die "Can't unroll for BN length ${n} (maximum 10)" + } + + my $self = { + code => "", + n => $n, + }; + bless $self, $class; + + return $self; +} + +sub add_code($$) +{ + my ($self, $c) = @_; + + $self->{code} .= $c; +} + +sub get_code($) +{ + my ($self) = @_; + + return $self->{code}; +} + +sub get_function_name($) +{ + my ($self) = @_; + + return "bn_mul_mont_fixed_n" . $self->{n}; +} + +sub get_label($$) +{ + my ($self, $l) = @_; + + return "L" . $l . "_" . $self->{n}; +} + +sub get_labels($@) +{ + my ($self, @labels) = @_; + + my %out = (); + + foreach my $l (@labels) { + $out{"$l"} = $self->get_label("$l"); + } + + return \%out; +} + +sub nl($) +{ + my ($self) = @_; + + $self->add_code("\n"); +} + +sub copy_result($) +{ + my ($self) = @_; + + my ($n) = $self->{n}; + + for (my $j = 0; $j < $n; $j++) { + $self->add_code(<<___); + std $tp[$j],`$j*$SIZE_T`($rp) +___ + } + +} + +sub mul_mont_fixed($) +{ + my ($self) = @_; + + my ($n) = $self->{n}; + my $fname = $self->get_function_name(); + my $label = $self->get_labels("outer", "enter", "sub", "copy", "end"); + + $self->add_code(<<___); + +.globl .${fname} +.align 5 +.${fname}: + +___ + + $self->save_registers(); + + $self->add_code(<<___); + ld $n0,0($n0) + + ld $bp0,0($bp) + + ld $apj,0($ap) +___ + + $self->mul_c_0($tp[0], $apj, $bp0, $c0); + + for (my $j = 1; $j < $n - 1; $j++) { + $self->add_code(<<___); + ld $apj,`$j*$SIZE_T`($ap) +___ + $self->mul($tp[$j], $apj, $bp0, $c0); + } + + $self->add_code(<<___); + ld $apj,`($n-1)*$SIZE_T`($ap) +___ + + $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0); + + $self->add_code(<<___); + li $tp[$n+1],0 + +___ + + $self->add_code(<<___); + li $i,0 + mtctr $num + b $label->{"enter"} + +.align 4 +$label->{"outer"}: + ldx $bpi,$bp,$i + + ld $apj,0($ap) +___ + + $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0); + + for (my $j = 1; $j < $n; $j++) { + $self->add_code(<<___); + ld 
$apj,`$j*$SIZE_T`($ap) +___ + $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0); + } + + $self->add_code(<<___); + addc $tp[$n],$tp[$n],$c0 + addze $tp[$n+1],$tp[$n+1] +___ + + $self->add_code(<<___); +.align 4 +$label->{"enter"}: + mulld $bpi,$tp[0],$n0 + + ld $npj,0($np) +___ + + $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0); + + for (my $j = 1; $j < $n; $j++) { + $self->add_code(<<___); + ld $npj,`$j*$SIZE_T`($np) +___ + $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0); + } + + $self->add_code(<<___); + addc $tp[$n-1],$tp[$n],$c0 + addze $tp[$n],$tp[$n+1] + + addi $i,$i,$SIZE_T + bdnz $label->{"outer"} + + and. $tp[$n],$tp[$n],$tp[$n] + bne $label->{"sub"} + + cmpld $tp[$n-1],$npj + blt $label->{"copy"} + +$label->{"sub"}: +___ + + # + # Reduction + # + + $self->add_code(<<___); + ld $bpj,`0*$SIZE_T`($np) + subfc $c1,$bpj,$tp[0] + std $c1,`0*$SIZE_T`($rp) + +___ + for (my $j = 1; $j < $n - 1; $j++) { + $self->add_code(<<___); + ld $bpj,`$j*$SIZE_T`($np) + subfe $c1,$bpj,$tp[$j] + std $c1,`$j*$SIZE_T`($rp) + +___ + } + + $self->add_code(<<___); + subfe $c1,$npj,$tp[$n-1] + std $c1,`($n-1)*$SIZE_T`($rp) + +___ + + $self->add_code(<<___); + addme. 
$tp[$n],$tp[$n] + beq $label->{"end"} + +$label->{"copy"}: +___ + + $self->copy_result(); + + $self->add_code(<<___); + +$label->{"end"}: +___ + + $self->restore_registers(); + + $self->add_code(<<___); + li r3,1 + blr +.size .${fname},.-.${fname} +___ + +} + +package Mont::GPR; + +our @ISA = ('Mont'); + +sub new($$) +{ + my ($class, $n) = @_; + + return $class->SUPER::new($n); +} + +sub save_registers($) +{ + my ($self) = @_; + + my $n = $self->{n}; + + $self->add_code(<<___); + std $lo,-8($sp) +___ + + for (my $j = 0; $j <= $n+1; $j++) { + $self->{code}.=<<___; + std $tp[$j],-`($j+2)*8`($sp) +___ + } + + $self->add_code(<<___); + +___ +} + +sub restore_registers($) +{ + my ($self) = @_; + + my $n = $self->{n}; + + $self->add_code(<<___); + ld $lo,-8($sp) +___ + + for (my $j = 0; $j <= $n+1; $j++) { + $self->{code}.=<<___; + ld $tp[$j],-`($j+2)*8`($sp) +___ + } + + $self->{code} .=<<___; + +___ +} + +# Direct translation of C mul() +sub mul($$$$$) +{ + my ($self, $r, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $lo,$a,$w + addc $r,$lo,$c + mulhdu $c,$a,$w + addze $c,$c + +___ +} + +# Like mul() but $c is ignored as an input - an optimisation to save a +# preliminary instruction that would set input $c to 0 +sub mul_c_0($$$$$) +{ + my ($self, $r, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $r,$a,$w + mulhdu $c,$a,$w + +___ +} + +# Like mul() but does not do the final addition of CA into $c - an +# optimisation to save an instruction +sub mul_last($$$$$$) +{ + my ($self, $r1, $r2, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $lo,$a,$w + addc $r1,$lo,$c + mulhdu $c,$a,$w + + addze $r2,$c +___ +} + +# Like C mul_add() but allow $r_out and $r_in to be different +sub mul_add($$$$$$) +{ + my ($self, $r_out, $r_in, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $lo,$a,$w + addc $lo,$lo,$c + mulhdu $c,$a,$w + addze $c,$c + addc $r_out,$r_in,$lo + addze $c,$c + +___ +} + +# Like mul_add() but $c is ignored as an input - an optimisation to
save a +# preliminary instruction that would set input $c to 0 +sub mul_add_c_0($$$$$$) +{ + my ($self, $r_out, $r_in, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $lo,$a,$w + addc $r_out,$r_in,$lo + mulhdu $c,$a,$w + addze $c,$c + +___ +} + +package Mont::GPR_300; + +our @ISA = ('Mont::GPR'); + +sub new($$) +{ + my ($class, $n) = @_; + + my $mont = $class->SUPER::new($n); + + return $mont; +} + +sub get_function_name($) +{ + my ($self) = @_; + + return "bn_mul_mont_300_fixed_n" . $self->{n}; +} + +sub get_label($$) +{ + my ($self, $l) = @_; + + return "L" . $l . "_300_" . $self->{n}; +} + +# Direct translation of C mul() +sub mul($$$$$) +{ + my ($self, $r, $a, $w, $c, $last) = @_; + + $self->add_code(<<___); + maddld $r,$a,$w,$c + maddhdu $c,$a,$w,$c + +___ +} + +# Save the last carry as the final entry +sub mul_last($$$$$) +{ + my ($self, $r1, $r2, $a, $w, $c) = @_; + + $self->add_code(<<___); + maddld $r1,$a,$w,$c + maddhdu $r2,$a,$w,$c + +___ +} + +# Like mul() but $c is ignored as an input - an optimisation to save a +# preliminary instruction that would set input $c to 0 +sub mul_c_0($$$$$) +{ + my ($self, $r, $a, $w, $c) = @_; + + $self->add_code(<<___); + mulld $r,$a,$w + mulhdu $c,$a,$w + +___ +} + +# Like C mul_add() but allow $r_out and $r_in to be different +sub mul_add($$$$$$) +{ + my ($self, $r_out, $r_in, $a, $w, $c) = @_; + + $self->add_code(<<___); + maddld $lo,$a,$w,$c + maddhdu $c,$a,$w,$c + addc $r_out,$r_in,$lo + addze $c,$c + +___ +} + +# Like mul_add() but $c is ignored as an input - an optimisation to save a +# preliminary instruction that would set input $c to 0 +sub mul_add_c_0($$$$$$) +{ + my ($self, $r_out, $r_in, $a, $w, $c) = @_; + + $self->add_code(<<___); + maddld $lo,$a,$w,$r_in + maddhdu $c,$a,$w,$r_in +___ + + if ($r_out ne $lo) { + $self->add_code(<<___); + mr $r_out,$lo +___ + } + + $self->nl(); +} + + +package main; + +my $code; + +$code.=<<___; +.machine "any" +.text +___ + +my $mont; + +$mont = new Mont::GPR(6); 
+$mont->mul_mont_fixed(); +$code .= $mont->get_code(); + +$mont = new Mont::GPR_300(6); +$mont->mul_mont_fixed(); +$code .= $mont->get_code(); + +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +$code.=<<___; +.asciz "Montgomery Multiplication for PPC by , " +___ + +print $code; +close STDOUT or die "error closing STDOUT: $!"; Index: openssl-1.1.1l/crypto/bn/build.info =================================================================== --- openssl-1.1.1l.orig/crypto/bn/build.info +++ openssl-1.1.1l/crypto/bn/build.info @@ -56,6 +56,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont. GENERATE[bn-ppc.s]=asm/ppc.pl $(PERLASM_SCHEME) GENERATE[ppc-mont.s]=asm/ppc-mont.pl $(PERLASM_SCHEME) GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl $(PERLASM_SCHEME) +GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl $(PERLASM_SCHEME) GENERATE[alpha-mont.S]=asm/alpha-mont.pl $(PERLASM_SCHEME) Index: openssl-1.1.1l/crypto/ppccap.c =================================================================== --- openssl-1.1.1l.orig/crypto/ppccap.c +++ openssl-1.1.1l/crypto/ppccap.c @@ -46,6 +46,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U const BN_ULONG *np, const BN_ULONG *n0, int num); int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); + int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG *n0, int num); + int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG *n0, int num); if (num < 4) return 0; @@ -61,6 +67,15 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U * no opportunity to figure it out... 
*/ +#if defined(_ARCH_PPC64) + if (num == 6) { + if (OPENSSL_ppccap_P & PPC_MADD300) + return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num); + else + return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num); + } +#endif + return bn_mul_mont_int(rp, ap, bp, np, n0, num); } #endif Index: openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl =================================================================== --- openssl-1.1.1l.orig/crypto/perlasm/ppc-xlate.pl +++ openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl @@ -136,6 +136,71 @@ my $quad = sub { }; ################################################################ +# vector register number hacking +################################################################ + +# It is convenient to be able to set a variable like: +# my $foo = "v33"; +# and use this in different contexts where: +# * a VSR (Vector-Scalar Register) number (i.e. "v33") is required +# * a VR (Vector Register) number (i.e. "v1") is required +# Map VSR numbering to VR number for certain vector instructions. + +# vs -> v if N >= 32 +sub vsr2vr1 { + my $in = shift; + + my $n = int($in); + if ($n >= 32) { + $n -= 32; + } + + return "$n"; +} +# As above for first $num register args, returns list +sub _vsr2vr { + my $num = shift; + my @rest = @_; + my @subst = splice(@rest, 0, $num); + + @subst = map { vsr2vr1($_); } @subst; + + return (@subst, @rest); +} +# As above but 1st arg ($f) is extracted and reinserted after +# processing so that it can be ignored by a code generation function +# that consumes the result +sub vsr2vr_args { + my $num = shift; + my $f = shift; + + my @out = _vsr2vr($num, @_); + + return ($f, @out); +} +# As above but 1st arg is mnemonic, return formatted instruction +sub vsr2vr { + my $mnemonic = shift; + my $num = shift; + my $f = shift; + + my @out = _vsr2vr($num, @_); + + " ${mnemonic}${f} " .
join(",", @out); +} + +# ISA 2.03 +my $vsel = sub { vsr2vr("vsel", 4, @_); }; +my $vsl = sub { vsr2vr("vsl", 3, @_); }; +my $vspltisb = sub { vsr2vr("vspltisb", 1, @_); }; +my $vspltisw = sub { vsr2vr("vspltisw", 1, @_); }; +my $vsr = sub { vsr2vr("vsr", 3, @_); }; +my $vsro = sub { vsr2vr("vsro", 3, @_); }; + +# ISA 3.0 +my $lxsd = sub { vsr2vr("lxsd", 1, @_); }; + +################################################################ # simplified mnemonics not handled by at least one assembler ################################################################ my $cmplw = sub { @@ -226,13 +291,18 @@ my $vpermdi = sub { # xxpermdi # PowerISA 2.07 stuff sub vcrypto_op { - my ($f, $vrt, $vra, $vrb, $op) = @_; + my ($f, $vrt, $vra, $vrb, $op) = vsr2vr_args(3, @_); " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op; } sub vfour { my ($f, $vrt, $vra, $vrb, $vrc, $op) = @_; " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op; }; +sub vfour_vsr { + my ($f, $vrt, $vra, $vrb, $vrc, $op) = vsr2vr_args(4, @_); + " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op; +}; + my $vcipher = sub { vcrypto_op(@_, 1288); }; my $vcipherlast = sub { vcrypto_op(@_, 1289); }; my $vncipher = sub { vcrypto_op(@_, 1352); }; @@ -254,10 +324,10 @@ my $vsld = sub { vcrypto_op(@_, 1476); } my $vsrd = sub { vcrypto_op(@_, 1732); }; my $vsubudm = sub { vcrypto_op(@_, 1216); }; my $vaddcuq = sub { vcrypto_op(@_, 320); }; -my $vaddeuqm = sub { vfour(@_,60); }; -my $vaddecuq = sub { vfour(@_,61); }; -my $vmrgew = sub { vfour(@_,0,1932); }; -my $vmrgow = sub { vfour(@_,0,1676); }; +my $vaddeuqm = sub { vfour_vsr(@_,60); }; +my $vaddecuq = sub { vfour_vsr(@_,61); }; +my $vmrgew = sub { vfour_vsr(@_,0,1932); }; +my $vmrgow = sub { vfour_vsr(@_,0,1676); }; my $mtsle = sub { my ($f, $arg) = @_; @@ -298,7 +368,7 @@ my $addex = sub { my ($f, $rt, $ra, $rb, $cy) = @_; # only cy==0 is specified in 3.0B " .long ".sprintf 
"0x%X",(31<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($cy<<9)|(170<<1); }; -my $vmsumudm = sub { vfour(@_,35); }; +my $vmsumudm = sub { vfour_vsr(@_, 35); }; while($line=<>) { Index: openssl-1.1.1l/Configurations/10-main.conf =================================================================== --- openssl-1.1.1l.orig/Configurations/10-main.conf +++ openssl-1.1.1l/Configurations/10-main.conf @@ -669,7 +669,7 @@ my %targets = ( inherit_from => [ "linux-generic64", asm("ppc64_asm") ], cflags => add("-m64"), cxxflags => add("-m64"), - lib_cppflags => add("-DB_ENDIAN"), + lib_cppflags => add("-DB_ENDIAN -DECP_NISTP521_ASM"), perlasm_scheme => "linux64", multilib => "64", }, @@ -677,7 +677,7 @@ my %targets = ( inherit_from => [ "linux-generic64", asm("ppc64_asm") ], cflags => add("-m64"), cxxflags => add("-m64"), - lib_cppflags => add("-DL_ENDIAN"), + lib_cppflags => add("-DL_ENDIAN -DECP_NISTP521_ASM"), perlasm_scheme => "linux64le", }, Index: openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl =================================================================== --- /dev/null +++ openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl @@ -0,0 +1,435 @@ +#! /usr/bin/env perl +# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Amitay Isaacs and Martin Schwenke +# for the OpenSSL project. +# ==================================================================== +# +# p521 lower-level primitives for PPC64 using vector instructions. 
+# + +use strict; +use warnings; + +my $flavour = shift; +my $output = ""; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +if (!$output) { + $output = "-"; +} + +my ($xlate, $dir); +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my $code = ""; + +my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12"); + +my $vzero = "v32"; + +sub startproc($) +{ + my ($name) = @_; + + $code.=<<___; + .globl ${name} + .align 5 +${name}: + +___ +} + +sub endproc($) +{ + my ($name) = @_; + + $code.=<<___; + blr + .size ${name},.-${name} + +___ +} + + +sub push_vrs($$) +{ + my ($min, $max) = @_; + + my $count = $max - $min + 1; + + $code.=<<___; + mr $savesp,$sp + stdu $sp,-16*`$count+1`($sp) + +___ + for (my $i = $min; $i <= $max; $i++) { + my $mult = $max - $i + 1; + $code.=<<___; + stxv $i,-16*$mult($savesp) +___ + + } + + $code.=<<___; + +___ +} + +sub pop_vrs($$) +{ + my ($min, $max) = @_; + + $code.=<<___; + ld $savesp,0($sp) +___ + for (my $i = $min; $i <= $max; $i++) { + my $mult = $max - $i + 1; + $code.=<<___; + lxv $i,-16*$mult($savesp) +___ + } + + $code.=<<___; + mr $sp,$savesp + +___ +} + +sub load_vrs($$) +{ + my ($pointer, $reg_list) = @_; + + for (my $i = 0; $i <= 8; $i++) { + my $offset = $i * 8; + $code.=<<___; + lxsd $reg_list->[$i],$offset($pointer) +___ + } + + $code.=<<___; + +___ +} + +sub store_vrs($$) +{ + my ($pointer, $reg_list) = @_; + + for (my $i = 0; $i <= 8; $i++) { + my $offset = $i * 16; + $code.=<<___; + stxv $reg_list->[$i],$offset($pointer) +___ + } + + $code.=<<___; + +___ +} + +$code.=<<___; +.text + +___ + +{ + # mul/square common + my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54"); + my ($zero, $one) = ("r8", "r9"); + my @out = map("v$_",(55..63)); + + { + # + # p521_felem_mul + # + + my ($in1p, $in2p) = ("r4", 
"r5"); + my @in1 = map("v$_",(45..53)); + my @in2 = map("v$_",(35..43)); + + startproc("p521_felem_mul"); + + push_vrs(52, 63); + + $code.=<<___; + vspltisw $vzero,0 + +___ + + load_vrs($in1p, \@in1); + load_vrs($in2p, \@in2); + + $code.=<<___; + vmsumudm $out[0],$in1[0],$in2[0],$vzero + + xxpermdi $t1,$in1[0],$in1[1],0b00 + xxpermdi $t2,$in2[1],$in2[0],0b00 + vmsumudm $out[1],$t1,$t2,$vzero + + xxpermdi $t2,$in2[2],$in2[1],0b00 + vmsumudm $out[2],$t1,$t2,$vzero + vmsumudm $out[2],$in1[2],$in2[0],$out[2] + + xxpermdi $t2,$in2[3],$in2[2],0b00 + vmsumudm $out[3],$t1,$t2,$vzero + xxpermdi $t3,$in1[2],$in1[3],0b00 + xxpermdi $t4,$in2[1],$in2[0],0b00 + vmsumudm $out[3],$t3,$t4,$out[3] + + xxpermdi $t2,$in2[4],$in2[3],0b00 + vmsumudm $out[4],$t1,$t2,$vzero + xxpermdi $t4,$in2[2],$in2[1],0b00 + vmsumudm $out[4],$t3,$t4,$out[4] + vmsumudm $out[4],$in1[4],$in2[0],$out[4] + + xxpermdi $t2,$in2[5],$in2[4],0b00 + vmsumudm $out[5],$t1,$t2,$vzero + xxpermdi $t4,$in2[3],$in2[2],0b00 + vmsumudm $out[5],$t3,$t4,$out[5] + + xxpermdi $t2,$in2[6],$in2[5],0b00 + vmsumudm $out[6],$t1,$t2,$vzero + xxpermdi $t4,$in2[4],$in2[3],0b00 + vmsumudm $out[6],$t3,$t4,$out[6] + + xxpermdi $t2,$in2[7],$in2[6],0b00 + vmsumudm $out[7],$t1,$t2,$vzero + xxpermdi $t4,$in2[5],$in2[4],0b00 + vmsumudm $out[7],$t3,$t4,$out[7] + + xxpermdi $t2,$in2[8],$in2[7],0b00 + vmsumudm $out[8],$t1,$t2,$vzero + xxpermdi $t4,$in2[6],$in2[5],0b00 + vmsumudm $out[8],$t3,$t4,$out[8] + + xxpermdi $t1,$in1[4],$in1[5],0b00 + xxpermdi $t2,$in2[1],$in2[0],0b00 + vmsumudm $out[5],$t1,$t2,$out[5] + + xxpermdi $t2,$in2[2],$in2[1],0b00 + vmsumudm $out[6],$t1,$t2,$out[6] + vmsumudm $out[6],$in1[6],$in2[0],$out[6] + + xxpermdi $t2,$in2[3],$in2[2],0b00 + vmsumudm $out[7],$t1,$t2,$out[7] + xxpermdi $t3,$in1[6],$in1[7],0b00 + xxpermdi $t4,$in2[1],$in2[0],0b00 + vmsumudm $out[7],$t3,$t4,$out[7] + + xxpermdi $t2,$in2[4],$in2[3],0b00 + vmsumudm $out[8],$t1,$t2,$out[8] + xxpermdi $t4,$in2[2],$in2[1],0b00 + vmsumudm $out[8],$t3,$t4,$out[8] + 
vmsumudm $out[8],$in1[8],$in2[0],$out[8] + + li $zero,0 + li $one,1 + mtvsrdd $t1,$one,$zero +___ + + for (my $i = 0; $i <= 8; $i++) { + $code.=<<___; + vsld $in2[$i],$in2[$i],$t1 +___ + } + + $code.=<<___; + + vmsumudm $out[7],$in1[8],$in2[8],$out[7] + + xxpermdi $t2,$in2[8],$in2[7],0b00 + xxpermdi $t1,$in1[7],$in1[8],0b00 + vmsumudm $out[6],$t1,$t2,$out[6] + + xxpermdi $t1,$in1[6],$in1[7],0b00 + vmsumudm $out[5],$t1,$t2,$out[5] + vmsumudm $out[5],$in1[8],$in2[6],$out[5] + + xxpermdi $t1,$in1[5],$in1[6],0b00 + vmsumudm $out[4],$t1,$t2,$out[4] + xxpermdi $t4,$in2[6],$in2[5],0b00 + xxpermdi $t3,$in1[7],$in1[8],0b00 + vmsumudm $out[4],$t3,$t4,$out[4] + + xxpermdi $t1,$in1[4],$in1[5],0b00 + vmsumudm $out[3],$t1,$t2,$out[3] + xxpermdi $t3,$in1[6],$in1[7],0b00 + vmsumudm $out[3],$t3,$t4,$out[3] + vmsumudm $out[3],$in1[8],$in2[4],$out[3] + + xxpermdi $t1,$in1[3],$in1[4],0b00 + vmsumudm $out[2],$t1,$t2,$out[2] + xxpermdi $t3,$in1[5],$in1[6],0b00 + vmsumudm $out[2],$t3,$t4,$out[2] + + xxpermdi $t1,$in1[2],$in1[3],0b00 + vmsumudm $out[1],$t1,$t2,$out[1] + xxpermdi $t3,$in1[4],$in1[5],0b00 + vmsumudm $out[1],$t3,$t4,$out[1] + + xxpermdi $t1,$in1[1],$in1[2],0b00 + vmsumudm $out[0],$t1,$t2,$out[0] + xxpermdi $t3,$in1[3],$in1[4],0b00 + vmsumudm $out[0],$t3,$t4,$out[0] + + xxpermdi $t2,$in2[4],$in2[3],0b00 + xxpermdi $t1,$in1[7],$in1[8],0b00 + vmsumudm $out[2],$t1,$t2,$out[2] + + xxpermdi $t1,$in1[6],$in1[7],0b00 + vmsumudm $out[1],$t1,$t2,$out[1] + vmsumudm $out[1],$in1[8],$in2[2],$out[1] + + xxpermdi $t1,$in1[5],$in1[6],0b00 + vmsumudm $out[0],$t1,$t2,$out[0] + xxpermdi $t4,$in2[2],$in2[1],0b00 + xxpermdi $t3,$in1[7],$in1[8],0b00 + vmsumudm $out[0],$t3,$t4,$out[0] + +___ + + store_vrs($outp, \@out); + + pop_vrs(52, 63); + + endproc("p521_felem_mul"); + } + + { + # + # p521_felem_square + # + + my ($inp) = ("r4"); + my @in = map("v$_",(45..53)); + my @inx2 = map("v$_",(35..43)); + + startproc("p521_felem_square"); + + push_vrs(52, 63); + + $code.=<<___; + vspltisw $vzero,0 +
+___ + + load_vrs($inp, \@in); + + $code.=<<___; + li $zero,0 + li $one,1 + mtvsrdd $t1,$one,$zero +___ + + for (my $i = 0; $i <= 8; $i++) { + $code.=<<___; + vsld $inx2[$i],$in[$i],$t1 +___ + } + + $code.=<<___; + vmsumudm $out[0],$in[0],$in[0],$vzero + + vmsumudm $out[1],$in[0],$inx2[1],$vzero + + xxpermdi $t1,$in[0],$in[1],0b00 + xxpermdi $t2,$inx2[2],$in[1],0b00 + vmsumudm $out[2],$t1,$t2,$vzero + + xxpermdi $t2,$inx2[3],$inx2[2],0b00 + vmsumudm $out[3],$t1,$t2,$vzero + + xxpermdi $t2,$inx2[4],$inx2[3],0b00 + vmsumudm $out[4],$t1,$t2,$vzero + vmsumudm $out[4],$in[2],$in[2],$out[4] + + xxpermdi $t2,$inx2[5],$inx2[4],0b00 + vmsumudm $out[5],$t1,$t2,$vzero + vmsumudm $out[5],$in[2],$inx2[3],$out[5] + + xxpermdi $t2,$inx2[6],$inx2[5],0b00 + vmsumudm $out[6],$t1,$t2,$vzero + xxpermdi $t3,$in[2],$in[3],0b00 + xxpermdi $t4,$inx2[4],$in[3],0b00 + vmsumudm $out[6],$t3,$t4,$out[6] + + xxpermdi $t2,$inx2[7],$inx2[6],0b00 + vmsumudm $out[7],$t1,$t2,$vzero + xxpermdi $t4,$inx2[5],$inx2[4],0b00 + vmsumudm $out[7],$t3,$t4,$out[7] + + xxpermdi $t2,$inx2[8],$inx2[7],0b00 + vmsumudm $out[8],$t1,$t2,$vzero + xxpermdi $t4,$inx2[6],$inx2[5],0b00 + vmsumudm $out[8],$t3,$t4,$out[8] + vmsumudm $out[8],$in[4],$in[4],$out[8] + + vmsumudm $out[1],$in[5],$inx2[5],$out[1] + + vmsumudm $out[3],$in[6],$inx2[6],$out[3] + + vmsumudm $out[5],$in[7],$inx2[7],$out[5] + + vmsumudm $out[7],$in[8],$inx2[8],$out[7] + + mtvsrdd $t1,$one,$zero +___ + + for (my $i = 5; $i <= 8; $i++) { + $code.=<<___; + vsld $inx2[$i],$inx2[$i],$t1 +___ + } + + $code.=<<___; + + vmsumudm $out[6],$in[7],$inx2[8],$out[6] + + vmsumudm $out[5],$in[6],$inx2[8],$out[5] + + xxpermdi $t2,$inx2[8],$inx2[7],0b00 + xxpermdi $t1,$in[5],$in[6],0b00 + vmsumudm $out[4],$t1,$t2,$out[4] + + xxpermdi $t1,$in[4],$in[5],0b00 + vmsumudm $out[3],$t1,$t2,$out[3] + + xxpermdi $t1,$in[3],$in[4],0b00 + vmsumudm $out[2],$t1,$t2,$out[2] + vmsumudm $out[2],$in[5],$inx2[6],$out[2] + + xxpermdi $t1,$in[2],$in[3],0b00 + vmsumudm 
$out[1],$t1,$t2,$out[1] + vmsumudm $out[1],$in[4],$inx2[6],$out[1] + + xxpermdi $t1,$in[1],$in[2],0b00 + vmsumudm $out[0],$t1,$t2,$out[0] + xxpermdi $t2,$inx2[6],$inx2[5],0b00 + xxpermdi $t1,$in[3],$in[4],0b00 + vmsumudm $out[0],$t1,$t2,$out[0] + +___ + + store_vrs($outp, \@out); + + pop_vrs(52, 63); + + endproc("p521_felem_square"); + } +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT or die "error closing STDOUT: $!"; Index: openssl-1.1.1l/crypto/ec/ec_local.h =================================================================== --- openssl-1.1.1l.orig/crypto/ec/ec_local.h +++ openssl-1.1.1l/crypto/ec/ec_local.h @@ -499,6 +499,10 @@ int ec_GF2m_simple_field_div(const EC_GR const BIGNUM *b, BN_CTX *); #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 +# ifdef B_ENDIAN +# error "Can not enable ec_nistp_64_gcc_128 on big-endian systems" +# endif + /* method functions in ecp_nistp224.c */ int ec_GFp_nistp224_group_init(EC_GROUP *group); int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c =================================================================== --- openssl-1.1.1l.orig/crypto/ec/curve448/arch_32/f_impl.c +++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c @@ -10,7 +10,7 @@ * Originally written by Mike Hamburg */ -#include "field.h" +#include "../field.h" void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) { Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c =================================================================== --- /dev/null +++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c @@ -0,0 +1,200 @@ +/* + * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2014 Cryptography Research, Inc. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + * + * Originally written by Mike Hamburg + */ + +#include "../field.h" + +void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) +{ + const uint64_t *a = as->limb, *b = bs->limb; + uint64_t *c = cs->limb; + uint128_t accum0 = 0, accum1 = 0, accum2; + uint64_t mask = (1ULL << 56) - 1; + uint64_t aa[4], bb[4], bbb[4]; + unsigned int i, j; + + for (i = 0; i < 4; i++) { + aa[i] = a[i] + a[i + 4]; + bb[i] = b[i] + b[i + 4]; + bbb[i] = bb[i] + b[i + 4]; + } + + for (i = 0; i < 4; i++) { + accum2 = 0; + + for (j = 0; j <= i; j++) { + accum2 += widemul(a[j], b[i - j]); + accum1 += widemul(aa[j], bb[i - j]); + accum0 += widemul(a[j + 4], b[i - j + 4]); + } + for (; j < 4; j++) { + accum2 += widemul(a[j], b[i - j + 8]); + accum1 += widemul(aa[j], bbb[i - j + 4]); + accum0 += widemul(a[j + 4], bb[i - j + 4]); + } + + accum1 -= accum2; + accum0 += accum2; + + c[i] = ((uint64_t)(accum0)) & mask; + c[i + 4] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + } + + accum0 += accum1; + accum0 += c[4]; + accum1 += c[0]; + c[4] = ((uint64_t)(accum0)) & mask; + c[0] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + c[5] += ((uint64_t)(accum0)); + c[1] += ((uint64_t)(accum1)); +} + +void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b) +{ + const uint64_t *a = as->limb; + uint64_t *c = cs->limb; + uint128_t accum0 = 0, accum4 = 0; + uint64_t mask = (1ULL << 56) - 1; + int i; + + for (i = 0; i < 4; i++) { + accum0 += widemul(b, a[i]); + accum4 += widemul(b, a[i + 4]); + c[i] = accum0 & mask; + accum0 >>= 56; + c[i + 4] = accum4 & mask; + accum4 >>= 56; + } + + accum0 += accum4 + c[4]; + c[4] = accum0 & mask; + c[5] += accum0 >> 56; + + accum4 += c[0]; + c[0] = accum4 & mask; + c[1] += accum4 >> 56; +} + +void gf_sqr(gf_s * RESTRICT cs, const gf as) +{ + const uint64_t *a = as->limb; + uint64_t *c = 
cs->limb; + uint128_t accum0 = 0, accum1 = 0, accum2; + uint64_t mask = (1ULL << 56) - 1; + uint64_t aa[4]; + unsigned int i; + + /* For some reason clang doesn't vectorize this without prompting? */ + for (i = 0; i < 4; i++) + aa[i] = a[i] + a[i + 4]; + + accum2 = widemul(a[0], a[3]); + accum0 = widemul(aa[0], aa[3]); + accum1 = widemul(a[4], a[7]); + + accum2 += widemul(a[1], a[2]); + accum0 += widemul(aa[1], aa[2]); + accum1 += widemul(a[5], a[6]); + + accum0 -= accum2; + accum1 += accum2; + + c[3] = ((uint64_t)(accum1)) << 1 & mask; + c[7] = ((uint64_t)(accum0)) << 1 & mask; + + accum0 >>= 55; + accum1 >>= 55; + + accum0 += widemul(2 * aa[1], aa[3]); + accum1 += widemul(2 * a[5], a[7]); + accum0 += widemul(aa[2], aa[2]); + accum1 += accum0; + + accum0 -= widemul(2 * a[1], a[3]); + accum1 += widemul(a[6], a[6]); + + accum2 = widemul(a[0], a[0]); + accum1 -= accum2; + accum0 += accum2; + + accum0 -= widemul(a[2], a[2]); + accum1 += widemul(aa[0], aa[0]); + accum0 += widemul(a[4], a[4]); + + c[0] = ((uint64_t)(accum0)) & mask; + c[4] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(2 * aa[2], aa[3]); + accum0 -= widemul(2 * a[2], a[3]); + accum1 += widemul(2 * a[6], a[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul(2 * a[0], a[1]); + accum1 += widemul(2 * aa[0], aa[1]); + accum0 += widemul(2 * a[4], a[5]); + + accum1 -= accum2; + accum0 += accum2; + + c[1] = ((uint64_t)(accum0)) & mask; + c[5] = ((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum2 = widemul(aa[3], aa[3]); + accum0 -= widemul(a[3], a[3]); + accum1 += widemul(a[7], a[7]); + + accum1 += accum2; + accum0 += accum2; + + accum2 = widemul(2 * a[0], a[2]); + accum1 += widemul(2 * aa[0], aa[2]); + accum0 += widemul(2 * a[4], a[6]); + + accum2 += widemul(a[1], a[1]); + accum1 += widemul(aa[1], aa[1]); + accum0 += widemul(a[5], a[5]); + + accum1 -= accum2; + accum0 += accum2; + + c[2] = ((uint64_t)(accum0)) & mask; + c[6] = 
((uint64_t)(accum1)) & mask; + + accum0 >>= 56; + accum1 >>= 56; + + accum0 += c[3]; + accum1 += c[7]; + c[3] = ((uint64_t)(accum0)) & mask; + c[7] = ((uint64_t)(accum1)) & mask; + + /* we could almost stop here, but it wouldn't be stable, so... */ + + accum0 >>= 56; + accum1 >>= 56; + c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); + c[0] += ((uint64_t)(accum1)); +} Index: openssl-1.1.1l/Configure =================================================================== --- openssl-1.1.1l.orig/Configure +++ openssl-1.1.1l/Configure @@ -1476,6 +1476,20 @@ if (!$disabled{asm} && !$predefined_C{__ } } +# Check if __SIZEOF_INT128__ is defined by compiler +$config{use_int128} = 0; +{ + my $cc = $config{CROSS_COMPILE}.$config{CC}; + open(PIPE, "$cc -E -dM - </dev/null 2>&1 |"); + while(<PIPE>) { + if (m/__SIZEOF_INT128__/) { + $config{use_int128} = 1; + last; + } + } + close(PIPE); +} + # Deal with bn_ops ################################################### $config{bn_ll} =0;