openssl-1_1/openssl-1_1-Optimize-ppc64.patch

From 4dba53694bf633c272075e62acdc5a5ca3003ce6 Mon Sep 17 00:00:00 2001
From: Amitay Isaacs <amitay@ozlabs.org>
Date: Mon, 29 Mar 2021 18:06:13 +1100
Subject: [PATCH 01/29] numbers: Define 128-bit integers if compiler supports

Signed-off-by: Amitay Isaacs <amitay@ozlabs.org>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Matt Caswell <matt@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14784)

(cherry picked from commit bbed0d1cbd436af6797d7837e270bff4ca4d5a10)
---
 include/internal/numbers.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

Index: openssl-1.1.1l/include/internal/numbers.h
===================================================================
--- openssl-1.1.1l.orig/include/internal/numbers.h
+++ openssl-1.1.1l/include/internal/numbers.h
@@ -60,6 +60,16 @@
 #  define UINT64_MAX __MAXUINT__(uint64_t)
 # endif

+# ifndef INT128_MAX
+#  if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16
+typedef __int128_t int128_t;
+typedef __uint128_t uint128_t;
+#   define INT128_MIN __MININT__(int128_t)
+#   define INT128_MAX __MAXINT__(int128_t)
+#   define UINT128_MAX __MAXUINT__(uint128_t)
+#  endif
+# endif
+
 # ifndef SIZE_MAX
 #  define SIZE_MAX __MAXUINT__(size_t)
 # endif
Index: openssl-1.1.1l/crypto/bn/bn_div.c
===================================================================
--- openssl-1.1.1l.orig/crypto/bn/bn_div.c
+++ openssl-1.1.1l/crypto/bn/bn_div.c
@@ -97,7 +97,7 @@ BN_ULONG bn_div_3_words(const BN_ULONG *
  */
 #  if BN_BITS2 == 64 && defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
 #   undef BN_ULLONG
-#   define BN_ULLONG __uint128_t
+#   define BN_ULLONG uint128_t
 #   define BN_LLONG
 #  endif

Index: openssl-1.1.1l/crypto/bn/bn_local.h
===================================================================
--- openssl-1.1.1l.orig/crypto/bn/bn_local.h
+++ openssl-1.1.1l/crypto/bn/bn_local.h
@@ -22,6 +22,7 @@
 # endif

 # include "crypto/bn.h"
+# include "internal/numbers.h"

 /*
  * These preprocessor symbols control various aspects of the bignum headers
@@ -374,9 +375,9 @@ struct bn_gencb_st {
  */
 #  if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 && \
       (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
-#   define BN_UMULT_HIGH(a,b)          (((__uint128_t)(a)*(b))>>64)
+#   define BN_UMULT_HIGH(a,b)          (((uint128_t)(a)*(b))>>64)
 #   define BN_UMULT_LOHI(low,high,a,b) ({       \
-        __uint128_t ret=(__uint128_t)(a)*(b);   \
+        uint128_t ret=(uint128_t)(a)*(b);   \
         (high)=ret>>64; (low)=ret;      })
 #  elif defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
 #   if defined(__DECC)
Index: openssl-1.1.1l/crypto/ec/curve25519.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve25519.c
+++ openssl-1.1.1l/crypto/ec/curve25519.c
@@ -11,6 +11,8 @@
 #include "ec_local.h"
 #include <openssl/sha.h>

+#include "internal/numbers.h"
+
 #if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
                             defined(_M_AMD64) || defined(_M_X64))

@@ -252,7 +254,7 @@ static void x25519_scalar_mulx(uint8_t o
 #endif

 #if defined(X25519_ASM) \
-    || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
+    || ( defined(INT128_MAX) \
          && !defined(__sparc__) \
          && (!defined(__SIZEOF_LONG__) || (__SIZEOF_LONG__ == 8)) \
          && !(defined(__ANDROID__) && !defined(__clang__)) )
@@ -385,7 +387,7 @@ void x25519_fe51_mul121666(fe51 h, fe51
 #  define fe51_mul121666 x25519_fe51_mul121666
 # else

-typedef __uint128_t u128;
+typedef uint128_t u128;

 static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
 {
Index: openssl-1.1.1l/crypto/ec/curve448/curve448utils.h
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve448/curve448utils.h
+++ openssl-1.1.1l/crypto/ec/curve448/curve448utils.h
@@ -15,6 +15,8 @@

 # include <openssl/e_os2.h>

+# include "internal/numbers.h"
+
 /*
  * Internal word types. Somewhat tricky.  This could be decided separately per
  * platform.  However, the structs do need to be all the same size and
@@ -41,9 +43,9 @@ typedef int64_t c448_sword_t;
 /* "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */
 typedef uint64_t c448_bool_t;
 /* Double-word size for internal computations */
-typedef __uint128_t c448_dword_t;
+typedef uint128_t c448_dword_t;
 /* Signed double-word size for internal computations */
-typedef __int128_t c448_dsword_t;
+typedef int128_t c448_dsword_t;
 # elif C448_WORD_BITS == 32
 /* Word size for internal computations */
 typedef uint32_t c448_word_t;
Index: openssl-1.1.1l/crypto/ec/curve448/word.h
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve448/word.h
+++ openssl-1.1.1l/crypto/ec/curve448/word.h
@@ -17,15 +17,20 @@
 # include <assert.h>
 # include <stdlib.h>
 # include <openssl/e_os2.h>
-# include "arch_intrinsics.h"
 # include "curve448utils.h"

+# ifdef INT128_MAX
+#  include "arch_64/arch_intrinsics.h"
+# else
+#  include "arch_32/arch_intrinsics.h"
+# endif
+
 # if (ARCH_WORD_BITS == 64)
 typedef uint64_t word_t, mask_t;
-typedef __uint128_t dword_t;
+typedef uint128_t dword_t;
 typedef int32_t hsword_t;
 typedef int64_t sword_t;
-typedef __int128_t dsword_t;
+typedef int128_t dsword_t;
 # elif (ARCH_WORD_BITS == 32)
 typedef uint32_t word_t, mask_t;
 typedef uint64_t dword_t;
Index: openssl-1.1.1l/crypto/ec/ecp_nistp224.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp224.c
+++ openssl-1.1.1l/crypto/ec/ecp_nistp224.c
@@ -40,11 +40,9 @@ NON_EMPTY_TRANSLATION_UNIT
 # include <openssl/err.h>
 # include "ec_local.h"

-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
-  /* even with gcc, the typedef won't work for 32-bit platforms */
-typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
-                                 * platforms */
-# else
+#include "internal/numbers.h"
+
+#ifndef INT128_MAX
 #  error "Your compiler doesn't appear to support 128-bit integer types"
 # endif

Index: openssl-1.1.1l/crypto/ec/ecp_nistp256.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp256.c
+++ openssl-1.1.1l/crypto/ec/ecp_nistp256.c
@@ -41,14 +41,11 @@ NON_EMPTY_TRANSLATION_UNIT
 # include <openssl/err.h>
 # include "ec_local.h"

-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
-  /* even with gcc, the typedef won't work for 32-bit platforms */
-typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
-                                 * platforms */
-typedef __int128_t int128_t;
-# else
-#  error "Your compiler doesn't appear to support 128-bit integer types"
-# endif
+#include "internal/numbers.h"
+
+#ifndef INT128_MAX
+# error "Your compiler doesn't appear to support 128-bit integer types"
+#endif

 typedef uint8_t u8;
 typedef uint32_t u32;
Index: openssl-1.1.1l/crypto/ec/ecp_nistp521.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp521.c
+++ openssl-1.1.1l/crypto/ec/ecp_nistp521.c
@@ -40,13 +40,11 @@ NON_EMPTY_TRANSLATION_UNIT
 # include <openssl/err.h>
 # include "ec_local.h"

-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
-  /* even with gcc, the typedef won't work for 32-bit platforms */
-typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
-                                 * platforms */
-# else
-#  error "Your compiler doesn't appear to support 128-bit integer types"
-# endif
+#include "internal/numbers.h"
+
+#ifndef INT128_MAX
+# error "Your compiler doesn't appear to support 128-bit integer types"
+#endif

 typedef uint8_t u8;
 typedef uint64_t u64;
@@ -400,7 +398,7 @@ static void felem_diff128(largefelem out
  * On exit:
  *   out[i] < 17 * max(in[i]) * max(in[i])
  */
-static void felem_square(largefelem out, const felem in)
+static void felem_square_ref(largefelem out, const felem in)
 {
     felem inx2, inx4;
     felem_scalar(inx2, in, 2);
@@ -484,7 +482,7 @@ static void felem_square(largefelem out,
  * On exit:
  *   out[i] < 17 * max(in1[i]) * max(in2[i])
  */
-static void felem_mul(largefelem out, const felem in1, const felem in2)
+static void felem_mul_ref(largefelem out, const felem in1, const felem in2)
 {
     felem in2x2;
     felem_scalar(in2x2, in2, 2);
@@ -674,6 +672,71 @@ static void felem_reduce(felem out, cons
      */
 }

+#if defined(ECP_NISTP521_ASM)
+/*
+ * Run-time selection of the felem_square/felem_mul implementation.  Both
+ * function pointers initially refer to a wrapper that calls felem_select()
+ * and then forwards the call through the freshly set pointer.  They are
+ * implementation details of this file, hence static.
+ */
+static void felem_square_wrapper(largefelem out, const felem in);
+static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2);
+
+static void (*felem_square_p)(largefelem out, const felem in) =
+    felem_square_wrapper;
+static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) =
+    felem_mul_wrapper;
+
+/* Defined outside this file; presumably by the ecp_nistp521 assembly module */
+void p521_felem_square(largefelem out, const felem in);
+void p521_felem_mul(largefelem out, const felem in1, const felem in2);
+
+# if defined(_ARCH_PPC64)
+#  include "../ppc_arch.h"
+# endif
+
+/*
+ * Set felem_square_p/felem_mul_p: the p521_felem_* routines on PPC64 when
+ * OPENSSL_ppccap_P has both PPC_MADD300 and PPC_ALTIVEC set, otherwise the
+ * C reference implementations.
+ */
+static void felem_select(void)
+{
+# if defined(_ARCH_PPC64)
+    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
+        felem_square_p = p521_felem_square;
+        felem_mul_p = p521_felem_mul;
+
+        return;
+    }
+# endif
+
+    /* Default */
+    felem_square_p = felem_square_ref;
+    felem_mul_p = felem_mul_ref;
+}
+
+/* First-call trampoline for felem_square: select, then dispatch */
+static void felem_square_wrapper(largefelem out, const felem in)
+{
+    felem_select();
+    felem_square_p(out, in);
+}
+
+/* First-call trampoline for felem_mul: select, then dispatch */
+static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2)
+{
+    felem_select();
+    felem_mul_p(out, in1, in2);
+}
+
+# define felem_square felem_square_p
+# define felem_mul felem_mul_p
+#else
+# define felem_square felem_square_ref
+# define felem_mul felem_mul_ref
+#endif
+
 static void felem_square_reduce(felem out, const felem in)
 {
     largefelem tmp;
Index: openssl-1.1.1l/crypto/poly1305/poly1305.c
===================================================================
--- openssl-1.1.1l.orig/crypto/poly1305/poly1305.c
+++ openssl-1.1.1l/crypto/poly1305/poly1305.c
@@ -95,11 +95,14 @@ poly1305_blocks(void *ctx, const unsigne
         (a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \
         )

-# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \
-     (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8)
+/*
+ * The u64 type below is |unsigned long|, so this path needs a 64-bit long in
+ * addition to 128-bit integers; keep the __SIZEOF_LONG__ test for that.
+ */
+# if defined(INT128_MAX) && defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8

 typedef unsigned long u64;
-typedef __uint128_t u128;
+typedef uint128_t u128;

 typedef struct {
     u64 h[3];
Index: openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c
===================================================================
--- openssl-1.1.1l.orig/crypto/poly1305/poly1305_base2_44.c
+++ openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c
@@ -18,7 +18,7 @@
 typedef unsigned char u8;
 typedef unsigned int u32;
 typedef unsigned long u64;
-typedef unsigned __int128 u128;
+typedef uint128_t u128;

 typedef struct {
     u64 h[3];
Index: openssl-1.1.1l/crypto/ec/build.info
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/build.info
+++ openssl-1.1.1l/crypto/ec/build.info
@@ -6,8 +13,9 @@ SOURCE[../../libcrypto]=\
         ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \
         ecp_oct.c ec2_oct.c ec_oct.c ec_kmeth.c ecdh_ossl.c ecdh_kdf.c \
         ecdsa_ossl.c ecdsa_sign.c ecdsa_vrf.c curve25519.c ecx_meth.c \
-        curve448/arch_32/f_impl.c curve448/f_generic.c curve448/scalar.c \
+        curve448/f_generic.c curve448/scalar.c \
         curve448/curve448_tables.c curve448/eddsa.c curve448/curve448.c \
+        curve448/arch_64/f_impl64.c curve448/arch_32/f_impl32.c \
         {- $target{ec_asm_src} -}

 GENERATE[ecp_nistz256-x86.s]=asm/ecp_nistz256-x86.pl \
@@ -29,6 +38,8 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_n
 INCLUDE[ecp_nistz256-armv8.o]=..
 GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl $(PERLASM_SCHEME)

+GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl $(PERLASM_SCHEME)
+
 GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl $(PERLASM_SCHEME)
 GENERATE[x25519-ppc64.s]=asm/x25519-ppc64.pl $(PERLASM_SCHEME)

@@ -36,10 +47,3 @@ BEGINRAW[Makefile]
 {- $builddir -}/ecp_nistz256-%.S:	{- $sourcedir -}/asm/ecp_nistz256-%.pl
 	CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
 ENDRAW[Makefile]
-
-INCLUDE[curve448/arch_32/f_impl.o]=curve448/arch_32 curve448
-INCLUDE[curve448/f_generic.o]=curve448/arch_32 curve448
-INCLUDE[curve448/scalar.o]=curve448/arch_32 curve448
-INCLUDE[curve448/curve448_tables.o]=curve448/arch_32 curve448
-INCLUDE[curve448/eddsa.o]=curve448/arch_32 curve448
-INCLUDE[curve448/curve448.o]=curve448/arch_32 curve448
Index: openssl-1.1.1l/crypto/ec/curve448/field.h
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve448/field.h
+++ openssl-1.1.1l/crypto/ec/curve448/field.h
@@ -66,10 +66,15 @@ void gf_serialize(uint8_t *serial, const
 mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit,
                       uint8_t hi_nmask);

-# include "f_impl.h"            /* Bring in the inline implementations */

 # define LIMBPERM(i) (i)
-# define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
+# if (ARCH_WORD_BITS == 32)
+#  include "arch_32/f_impl.h"    /* Bring in the inline implementations */
+#  define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
+# elif (ARCH_WORD_BITS == 64)
+#  include "arch_64/f_impl.h"    /* Bring in the inline implementations */
+#  define LIMB_MASK(i) (((1ULL)<<LIMB_PLACE_VALUE(i))-1)
+# endif

 static const gf ZERO = {{{0}}}, ONE = {{{1}}};

Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/arch_intrinsics.h
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/arch_intrinsics.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2016 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+#ifndef OSSL_CRYPTO_EC_CURVE448_ARCH_64_INTRINSICS_H
+# define OSSL_CRYPTO_EC_CURVE448_ARCH_64_INTRINSICS_H
+
+# include "internal/constant_time.h"
+# include "internal/numbers.h"   /* uint128_t */
+
+# define ARCH_WORD_BITS 64
+
+# define word_is_zero(a)     constant_time_is_zero_64(a)
+
+/* Full 64x64 -> 128 bit product of |a| and |b| */
+static ossl_inline uint128_t widemul(uint64_t a, uint64_t b)
+{
+    return ((uint128_t) a) * b;
+}
+
+#endif                          /* OSSL_CRYPTO_EC_CURVE448_ARCH_64_INTRINSICS_H */
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.h
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2014-2016 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+#ifndef OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H
+# define OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H
+
+# define GF_HEADROOM 9999        /* Everything is reduced anyway */
+# define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
+
+# define LIMB_PLACE_VALUE(i) 56
+
+/* Limb-wise sum of |a| and |b| into |out|, followed by gf_weak_reduce() */
+void gf_add_RAW(gf out, const gf a, const gf b)
+{
+    unsigned int i;
+
+    for (i = 0; i < NLIMBS; i++)
+        out->limb[i] = a->limb[i] + b->limb[i];
+
+    gf_weak_reduce(out);
+}
+
+/*
+ * Limb-wise a - b into |out|, followed by gf_weak_reduce().  A per-limb
+ * constant (co1, or co2 for limb NLIMBS / 2) is added to every difference;
+ * presumably twice the field prime, so that limbs do not underflow.
+ */
+void gf_sub_RAW(gf out, const gf a, const gf b)
+{
+    uint64_t co1 = ((1ULL << 56) - 1) * 2, co2 = co1 - 2;
+    unsigned int i;
+
+    for (i = 0; i < NLIMBS; i++)
+        out->limb[i] = a->limb[i] - b->limb[i] + ((i == NLIMBS / 2) ? co2 : co1);
+
+    gf_weak_reduce(out);
+}
+
+/* Intentionally empty: |a| and |amt| are ignored by this implementation */
+void gf_bias(gf a, int amt)
+{
+}
+
+/*
+ * Carry propagation: every limb keeps its low 56 bits and receives the bits
+ * above bit 55 of the limb below it.  The overflow of the top limb is added
+ * to both limb 0 and limb NLIMBS / 2.
+ */
+void gf_weak_reduce(gf a)
+{
+    uint64_t mask = (1ULL << 56) - 1;
+    uint64_t tmp = a->limb[NLIMBS - 1] >> 56;
+    unsigned int i;
+
+    a->limb[NLIMBS / 2] += tmp;
+    for (i = NLIMBS - 1; i > 0; i--)
+        a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
+    a->limb[0] = (a->limb[0] & mask) + tmp;
+}
+
+#endif                  /* OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H */
Index: openssl-1.1.1l/include/internal/constant_time.h
===================================================================
--- openssl-1.1.1l.orig/include/internal/constant_time.h
+++ openssl-1.1.1l/include/internal/constant_time.h
@@ -181,6 +181,11 @@ static ossl_inline uint32_t constant_tim
     return constant_time_msb_32(~a & (a - 1));
 }

+static ossl_inline uint64_t constant_time_is_zero_64(uint64_t a)
+{
+    return constant_time_msb_64(~a & (a - 1));
+}
+
 static ossl_inline unsigned int constant_time_eq(unsigned int a,
                                                  unsigned int b)
 {
Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2014 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+#include <openssl/opensslconf.h>
+#include "internal/numbers.h"
+
+#ifdef UINT128_MAX
+/* We have support for 128 bit ints, so do nothing here */
+NON_EMPTY_TRANSLATION_UNIT
+#else
+
+# include "../field.h"
+
+/*
+ * Multiply |as| by |bs| into |cs|.  Operands are read as sixteen limbs; the
+ * widemul() products are summed in 64-bit accumulators and every result limb
+ * is masked to 28 bits, with the last carries added to c[1] and c[9].
+ */
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
+{
+    const uint32_t *a = as->limb, *b = bs->limb;
+    uint32_t *c = cs->limb;
+    uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
+    uint32_t mask = (1 << 28) - 1;
+    uint32_t aa[8], bb[8];
+    int i, j;
+
+    for (i = 0; i < 8; i++) {
+        aa[i] = a[i] + a[i + 8];
+        bb[i] = b[i] + b[i + 8];
+    }
+
+    for (j = 0; j < 8; j++) {
+        accum2 = 0;
+        for (i = 0; i < j + 1; i++) {
+            accum2 += widemul(a[j - i], b[i]);
+            accum1 += widemul(aa[j - i], bb[i]);
+            accum0 += widemul(a[8 + j - i], b[8 + i]);
+        }
+        accum1 -= accum2;
+        accum0 += accum2;
+        accum2 = 0;
+        for (i = j + 1; i < 8; i++) {
+            accum0 -= widemul(a[8 + j - i], b[i]);
+            accum2 += widemul(aa[8 + j - i], bb[i]);
+            accum1 += widemul(a[16 + j - i], b[8 + i]);
+        }
+        accum1 += accum2;
+        accum0 += accum2;
+        c[j] = ((uint32_t)(accum0)) & mask;
+        c[j + 8] = ((uint32_t)(accum1)) & mask;
+        accum0 >>= 28;
+        accum1 >>= 28;
+    }
+
+    accum0 += accum1;
+    accum0 += c[8];
+    accum1 += c[0];
+    c[8] = ((uint32_t)(accum0)) & mask;
+    c[0] = ((uint32_t)(accum1)) & mask;
+
+    accum0 >>= 28;
+    accum1 >>= 28;
+    c[9] += ((uint32_t)(accum0));
+    c[1] += ((uint32_t)(accum1));
+}
+
+/*
+ * Multiply |as| by the single word |b| into |cs|.  |b| must not exceed the
+ * 28-bit limb mask (asserted below).
+ */
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
+{
+    const uint32_t *a = as->limb;
+    uint32_t *c = cs->limb;
+    uint64_t accum0 = 0, accum8 = 0;
+    uint32_t mask = (1 << 28) - 1;
+    int i;
+
+    assert(b <= mask);
+
+    for (i = 0; i < 8; i++) {
+        accum0 += widemul(b, a[i]);
+        accum8 += widemul(b, a[i + 8]);
+        c[i] = accum0 & mask;
+        accum0 >>= 28;
+        c[i + 8] = accum8 & mask;
+        accum8 >>= 28;
+    }
+
+    accum0 += accum8 + c[8];
+    c[8] = ((uint32_t)accum0) & mask;
+    c[9] += (uint32_t)(accum0 >> 28);
+
+    accum8 += c[0];
+    c[0] = ((uint32_t)accum8) & mask;
+    c[1] += (uint32_t)(accum8 >> 28);
+}
+
+/* Square |as| into |cs| by calling gf_mul() with both operands equal */
+void gf_sqr(gf_s * RESTRICT cs, const gf as)
+{
+    gf_mul(cs, as, as);         /* Performs better with a dedicated square */
+}
+#endif
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2014 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+#include <openssl/opensslconf.h>
+#include "internal/numbers.h"
+
+#ifndef UINT128_MAX
+/* No support for 128 bit ints, so do nothing here */
+NON_EMPTY_TRANSLATION_UNIT
+#else
+
+# include "../field.h"
+
+/*
+ * Multiply |as| by |bs| into |cs|.  Operands are read as eight limbs; the
+ * widemul() products are summed in 128-bit accumulators and every result limb
+ * is masked to 56 bits, with the last carries added to c[1] and c[5].
+ */
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
+{
+    const uint64_t *a = as->limb, *b = bs->limb;
+    uint64_t *c = cs->limb;
+    uint128_t accum0 = 0, accum1 = 0, accum2;
+    uint64_t mask = (1ULL << 56) - 1;
+    uint64_t aa[4], bb[4], bbb[4];
+    unsigned int i, j;
+
+    for (i = 0; i < 4; i++) {
+        aa[i] = a[i] + a[i + 4];
+        bb[i] = b[i] + b[i + 4];
+        bbb[i] = bb[i] + b[i + 4];
+    }
+
+    for (i = 0; i < 4; i++) {
+        accum2 = 0;
+
+        for (j = 0; j <= i; j++) {
+            accum2 += widemul(a[j], b[i - j]);
+            accum1 += widemul(aa[j], bb[i - j]);
+            accum0 += widemul(a[j + 4], b[i - j + 4]);
+        }
+        for (; j < 4; j++) {
+            accum2 += widemul(a[j], b[i - j + 8]);
+            accum1 += widemul(aa[j], bbb[i - j + 4]);
+            accum0 += widemul(a[j + 4], bb[i - j + 4]);
+        }
+
+        accum1 -= accum2;
+        accum0 += accum2;
+
+        c[i] = ((uint64_t)(accum0)) & mask;
+        c[i + 4] = ((uint64_t)(accum1)) & mask;
+
+        accum0 >>= 56;
+        accum1 >>= 56;
+    }
+
+    accum0 += accum1;
+    accum0 += c[4];
+    accum1 += c[0];
+    c[4] = ((uint64_t)(accum0)) & mask;
+    c[0] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    c[5] += ((uint64_t)(accum0));
+    c[1] += ((uint64_t)(accum1));
+}
+
+/*
+ * Multiply |as| by the 32-bit word |b| into |cs|, masking result limbs to
+ * 56 bits and adding the last carries to c[1] and c[5].
+ */
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
+{
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+    uint128_t accum0 = 0, accum4 = 0;
+    uint64_t mask = (1ULL << 56) - 1;
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        accum0 += widemul(b, a[i]);
+        accum4 += widemul(b, a[i + 4]);
+        c[i] = accum0 & mask;
+        accum0 >>= 56;
+        c[i + 4] = accum4 & mask;
+        accum4 >>= 56;
+    }
+
+    accum0 += accum4 + c[4];
+    c[4] = accum0 & mask;
+    c[5] += accum0 >> 56;
+
+    accum4 += c[0];
+    c[0] = accum4 & mask;
+    c[1] += accum4 >> 56;
+}
+
+/*
+ * Square |as| into |cs| with an unrolled sequence of widemul() calls.  Result
+ * limbs are masked to 56 bits; the last carries are added to c[0] and c[4].
+ */
+void gf_sqr(gf_s * RESTRICT cs, const gf as)
+{
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+    uint128_t accum0 = 0, accum1 = 0, accum2;
+    uint64_t mask = (1ULL << 56) - 1;
+    uint64_t aa[4];
+
+    /* For some reason clang doesn't vectorize this without prompting? */
+    unsigned int i;
+    for (i = 0; i < 4; i++) {
+        aa[i] = a[i] + a[i + 4];
+    }
+
+    accum2 = widemul(a[0], a[3]);
+    accum0 = widemul(aa[0], aa[3]);
+    accum1 = widemul(a[4], a[7]);
+
+    accum2 += widemul(a[1], a[2]);
+    accum0 += widemul(aa[1], aa[2]);
+    accum1 += widemul(a[5], a[6]);
+
+    accum0 -= accum2;
+    accum1 += accum2;
+
+    c[3] = ((uint64_t)(accum1)) << 1 & mask;
+    c[7] = ((uint64_t)(accum0)) << 1 & mask;
+
+    accum0 >>= 55;
+    accum1 >>= 55;
+
+    accum0 += widemul(2 * aa[1], aa[3]);
+    accum1 += widemul(2 * a[5], a[7]);
+    accum0 += widemul(aa[2], aa[2]);
+    accum1 += accum0;
+
+    accum0 -= widemul(2 * a[1], a[3]);
+    accum1 += widemul(a[6], a[6]);
+
+    accum2 = widemul(a[0], a[0]);
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    accum0 -= widemul(a[2], a[2]);
+    accum1 += widemul(aa[0], aa[0]);
+    accum0 += widemul(a[4], a[4]);
+
+    c[0] = ((uint64_t)(accum0)) & mask;
+    c[4] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    accum2 = widemul(2 * aa[2], aa[3]);
+    accum0 -= widemul(2 * a[2], a[3]);
+    accum1 += widemul(2 * a[6], a[7]);
+
+    accum1 += accum2;
+    accum0 += accum2;
+
+    accum2 = widemul(2 * a[0], a[1]);
+    accum1 += widemul(2 * aa[0], aa[1]);
+    accum0 += widemul(2 * a[4], a[5]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[1] = ((uint64_t)(accum0)) & mask;
+    c[5] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    accum2 = widemul(aa[3], aa[3]);
+    accum0 -= widemul(a[3], a[3]);
+    accum1 += widemul(a[7], a[7]);
+
+    accum1 += accum2;
+    accum0 += accum2;
+
+    accum2 = widemul(2 * a[0], a[2]);
+    accum1 += widemul(2 * aa[0], aa[2]);
+    accum0 += widemul(2 * a[4], a[6]);
+
+    accum2 += widemul(a[1], a[1]);
+    accum1 += widemul(aa[1], aa[1]);
+    accum0 += widemul(a[5], a[5]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[2] = ((uint64_t)(accum0)) & mask;
+    c[6] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    accum0 += c[3];
+    accum1 += c[7];
+    c[3] = ((uint64_t)(accum0)) & mask;
+    c[7] = ((uint64_t)(accum1)) & mask;
+
+    /* we could almost stop here, but it wouldn't be stable, so... */
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
+    c[0] += ((uint64_t)(accum1));
+}
+#endif
Index: openssl-1.1.1l/Configurations/00-base-templates.conf
===================================================================
--- openssl-1.1.1l.orig/Configurations/00-base-templates.conf
+++ openssl-1.1.1l/Configurations/00-base-templates.conf
@@ -351,7 +351,8 @@ my %targets=(
     ppc64_asm => {
 	inherit_from	=> [ "ppc32_asm" ],
 	template	=> 1,
-	ec_asm_src	=> "ecp_nistz256.c ecp_nistz256-ppc64.s x25519-ppc64.s",
+	bn_asm_src      => add("ppc64-mont-fixed.s"),
+	ec_asm_src	=> "ecp_nistz256.c ecp_nistz256-ppc64.s ecp_nistp521-ppc64.s x25519-ppc64.s",
 	keccak1600_asm_src	=> "keccak1600-ppc64.s",
     },
 );
Index: openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl
@@ -0,0 +1,581 @@
+#! /usr/bin/env perl
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
+# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
+# the OpenSSL project.
+# ====================================================================
+
+#
+# Fixed length (n=6), unrolled PPC Montgomery Multiplication
+#
+
+# 2021
+#
+# Although this is a generic implementation for unrolling Montgomery
+# Multiplication for arbitrary values of n, this is currently only
+# used for n = 6 to improve the performance of ECC p384.
+#
+# Unrolling allows intermediate results to be stored in registers,
+# rather than on the stack, improving performance by ~7% compared to
+# the existing PPC assembly code.
+#
+# The ISA 3.0 implementation uses combination multiply/add
+# instructions (maddld, maddhdu) to improve performance by an
+# additional ~10% on Power 9.
+#
+# Finally, saving non-volatile registers into volatile vector
+# registers instead of onto the stack saves a little more.
+#
+# On a Power 9 machine we see an overall improvement of ~18%.
+#
+
+use strict;
+use warnings;
+
+my ($flavour, $output, $dir, $xlate);
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+
+if ($flavour !~ /64/) {
+	die "bad flavour ($flavour) - only ppc64 permitted";
+}
+
+my $SIZE_T= 8;
+
+# Registers are global so the code is remotely readable
+
+# Parameters for Montgomery multiplication
+my $sp	= "r1";
+my $toc	= "r2";
+my $rp	= "r3";
+my $ap	= "r4";
+my $bp	= "r5";
+my $np	= "r6";
+my $n0	= "r7";
+my $num	= "r8";
+
+my $i	= "r9";
+my $c0	= "r10";
+my $bp0	= "r11";
+my $bpi	= "r11";
+my $bpj	= "r11";
+my $tj	= "r12";
+my $apj	= "r12";
+my $npj	= "r12";
+my $lo	= "r14";
+my $c1	= "r14";
+
+# Non-volatile registers used for tp[i]
+#
+# 12 registers are available but the limit on unrolling is 10,
+# since registers from $tp[0] to $tp[$n+1] are used.
+my @tp = ("r20" .. "r31");
+
+# volatile VSRs for saving non-volatile GPRs - faster than stack
+my @vsrs = ("v32" .. "v46");
+
+package Mont;
+
+sub new($$)	# Mont->new($n): code generator object for BN length $n
+{
+	my ($class, $n) = @_;
+
+	if ($n > 10) {	# @tp only provides registers for $tp[0] .. $tp[$n+1]
+		die "Can't unroll for BN length ${n} (maximum 10)"
+	}
+
+	my $self = {
+		code => "",	# generated text, appended to by add_code()
+		n => $n,
+	};
+	bless $self, $class;
+
+	return $self;
+}
+
+sub add_code($$)	# append the text $c to the accumulated code
+{
+	my ($self, $c) = @_;
+
+	$self->{code} .= $c;
+}
+
+sub get_code($)	# return the code text accumulated so far
+{
+	my ($self) = @_;
+
+	return $self->{code};
+}
+
+sub get_function_name($)	# "bn_mul_mont_fixed_n" followed by the length n
+{
+	my ($self) = @_;
+
+	return "bn_mul_mont_fixed_n" . $self->{n};
+}
+
+sub get_label($$)	# label name for $l: "L", then $l, then "_" and n
+{
+	my ($self, $l) = @_;
+
+	return "L" . $l . "_" . $self->{n};
+}
+
+sub get_labels($@)	# hash ref: each given name => its get_label() value
+{
+	my ($self, @labels) = @_;
+
+	my %out = ();
+
+	foreach my $l (@labels) {
+		$out{"$l"} = $self->get_label("$l");
+	}
+
+	return \%out;
+}
+
+sub nl($)	# append a single newline to the accumulated code
+{
+	my ($self) = @_;
+
+	$self->add_code("\n");
+}
+
+sub copy_result($)	# emit stores of $tp[0 .. n-1] to consecutive words at $rp
+{
+	my ($self) = @_;
+
+	my ($n) = $self->{n};
+
+	for (my $j = 0; $j < $n; $j++) {	# one std per result word
+		$self->add_code(<<___);
+	std		$tp[$j],`$j*$SIZE_T`($rp)
+___
+	}
+
+}
+
+sub mul_mont_fixed($)	# emit the complete bn_mul_mont_fixed_n<n> routine
+{
+	my ($self) = @_;
+
+	my ($n) = $self->{n};
+	my $fname = $self->get_function_name();
+	my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
+
+	$self->add_code(<<___);
+
+.globl	.${fname}
+.align	5
+.${fname}:
+
+___
+
+	$self->save_registers();	# provided by the subclass
+
+	$self->add_code(<<___);
+	ld		$n0,0($n0)
+
+	ld		$bp0,0($bp)
+
+	ld		$apj,0($ap)
+___
+
+	$self->mul_c_0($tp[0], $apj, $bp0, $c0);
+
+	for (my $j = 1; $j < $n - 1; $j++) {
+		$self->add_code(<<___);
+	ld		$apj,`$j*$SIZE_T`($ap)
+___
+		$self->mul($tp[$j], $apj, $bp0, $c0);
+	}
+
+	$self->add_code(<<___);
+	ld		$apj,`($n-1)*$SIZE_T`($ap)
+___
+
+	$self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
+
+	$self->add_code(<<___);
+	li		$tp[$n+1],0
+
+___
+
+	$self->add_code(<<___);
+	li		$i,0
+	mtctr		$num
+	b		$label->{"enter"}
+
+.align	4
+$label->{"outer"}:
+	ldx		$bpi,$bp,$i
+
+	ld		$apj,0($ap)
+___
+
+	$self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
+
+	for (my $j = 1; $j < $n; $j++) {
+		$self->add_code(<<___);
+	ld		$apj,`$j*$SIZE_T`($ap)
+___
+		$self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
+	}
+
+	$self->add_code(<<___);
+	addc		$tp[$n],$tp[$n],$c0
+	addze		$tp[$n+1],$tp[$n+1]
+___
+
+	$self->add_code(<<___);
+.align	4
+$label->{"enter"}:
+	mulld		$bpi,$tp[0],$n0
+
+	ld		$npj,0($np)
+___
+
+	$self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
+
+	for (my $j = 1; $j < $n; $j++) {
+		$self->add_code(<<___);
+	ld		$npj,`$j*$SIZE_T`($np)
+___
+		$self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
+	}
+
+	$self->add_code(<<___);
+	addc		$tp[$n-1],$tp[$n],$c0
+	addze		$tp[$n],$tp[$n+1]
+
+	addi		$i,$i,$SIZE_T
+	bdnz		$label->{"outer"}
+
+	and.		$tp[$n],$tp[$n],$tp[$n]
+	bne		$label->{"sub"}
+
+	cmpld	$tp[$n-1],$npj
+	blt		$label->{"copy"}
+
+$label->{"sub"}:
+___
+
+	# Reduction: store tp[] - np[] to rp[]; unless the final addme. gives
+	# zero, the copy code then overwrites rp[] with tp[].  NOTE(review): the
+	# emitted bne/blt/beq branch on tp[] values - confirm that is acceptable.
+
+		$self->add_code(<<___);
+	ld		$bpj,`0*$SIZE_T`($np)
+	subfc		$c1,$bpj,$tp[0]
+	std		$c1,`0*$SIZE_T`($rp)
+
+___
+	for (my $j = 1; $j < $n - 1; $j++) {
+		$self->add_code(<<___);
+	ld		$bpj,`$j*$SIZE_T`($np)
+	subfe		$c1,$bpj,$tp[$j]
+	std		$c1,`$j*$SIZE_T`($rp)
+
+___
+	}
+
+		$self->add_code(<<___);
+	subfe		$c1,$npj,$tp[$n-1]
+	std		$c1,`($n-1)*$SIZE_T`($rp)
+
+___
+
+	$self->add_code(<<___);
+	addme.		$tp[$n],$tp[$n]
+	beq		$label->{"end"}
+
+$label->{"copy"}:
+___
+
+	$self->copy_result();
+
+	$self->add_code(<<___);
+
+$label->{"end"}:
+___
+
+	$self->restore_registers();	# provided by the subclass
+
+	$self->add_code(<<___);
+	li		r3,1
+	blr
+.size .${fname},.-.${fname}
+___
+
+}
+
+package Mont::GPR;
+
+our @ISA = ('Mont');
+
+sub new($$)
+{
+    my ($class, $n) = @_;
+
+    return $class->SUPER::new($n);
+}
+
+sub save_registers($)
+{
+	my ($self) = @_;
+
+	my $n = $self->{n};
+
+	$self->add_code(<<___);
+	std	$lo,-8($sp)
+___
+
+	for (my $j = 0; $j <= $n+1; $j++) {
+		$self->{code}.=<<___;
+	std	$tp[$j],-`($j+2)*8`($sp)
+___
+	}
+
+	$self->add_code(<<___);
+
+___
+}
+
+sub restore_registers($)
+{
+	my ($self) = @_;
+
+	my $n = $self->{n};
+
+	$self->add_code(<<___);
+	ld	$lo,-8($sp)
+___
+
+	for (my $j = 0; $j <= $n+1; $j++) {
+		$self->{code}.=<<___;
+	ld	$tp[$j],-`($j+2)*8`($sp)
+___
+	}
+
+	$self->{code} .=<<___;
+
+___
+}
+
+# Direct translation of C mul()
+sub mul($$$$$)
+{
+	my ($self, $r, $a, $w, $c) = @_;
+
+	$self->add_code(<<___);
+	mulld		$lo,$a,$w
+	addc		$r,$lo,$c
+	mulhdu		$c,$a,$w
+	addze		$c,$c
+
+___
+}
+
+# Like mul() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_c_0($$$$$)
+{
+	my ($self, $r, $a, $w, $c) = @_;
+
+	$self->add_code(<<___);
+	mulld		$r,$a,$w
+	mulhdu		$c,$a,$w
+
+___
+}
+
+# Like mul() but adds the final CA into $r2 instead of back into $c - an
+# optimisation to save an instruction
+sub mul_last($$$$$$)
+{
+	my ($self, $r1, $r2, $a, $w, $c) = @_;
+
+	$self->add_code(<<___);
+	mulld		$lo,$a,$w
+	addc		$r1,$lo,$c
+	mulhdu		$c,$a,$w
+
+	addze		$r2,$c
+___
+}
+
+# Like C mul_add() but allow $r_out and $r_in to be different
+sub mul_add($$$$$$)
+{
+	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+	$self->add_code(<<___);
+	mulld		$lo,$a,$w
+	addc		$lo,$lo,$c
+	mulhdu		$c,$a,$w
+	addze		$c,$c
+	addc		$r_out,$r_in,$lo
+	addze		$c,$c
+
+___
+}
+
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_add_c_0($$$$$$)
+{
+	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+	$self->add_code(<<___);
+	mulld		$lo,$a,$w
+	addc		$r_out,$r_in,$lo
+	mulhdu		$c,$a,$w
+	addze		$c,$c
+
+___
+}
+
+package Mont::GPR_300;
+
+our @ISA = ('Mont::GPR');
+
+sub new($$)
+{
+	my ($class, $n) = @_;
+
+	my $mont = $class->SUPER::new($n);
+
+	return $mont;
+}
+
+sub get_function_name($)
+{
+	my ($self) = @_;
+
+	return "bn_mul_mont_300_fixed_n" . $self->{n};
+}
+
+sub get_label($$)
+{
+	my ($self, $l) = @_;
+
+	return "L" . $l . "_300_" . $self->{n};
+}
+
+# Direct translation of C mul(): ($c:$r) = $a * $w + $c
+sub mul($$$$$)
+{
+	my ($self, $r, $a, $w, $c) = @_;
+	# maddld/maddhdu yield the low/high doubleword of $a * $w + $c
+	$self->add_code(<<___);
+	maddld		$r,$a,$w,$c
+	maddhdu		$c,$a,$w,$c
+
+___
+}
+
+# Like mul() but the high doubleword (last carry) goes to $r2, not $c
+sub mul_last($$$$$$)
+{
+	my ($self, $r1, $r2, $a, $w, $c) = @_;
+	# $r1 = low, $r2 = high doubleword of $a * $w + $c; $c is left unchanged
+	$self->add_code(<<___);
+	maddld		$r1,$a,$w,$c
+	maddhdu		$r2,$a,$w,$c
+
+___
+}
+
+# Like mul() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_c_0($$$$$)
+{
+	my ($self, $r, $a, $w, $c) = @_;
+
+	$self->add_code(<<___);
+	mulld          $r,$a,$w
+	mulhdu          $c,$a,$w
+
+___
+}
+
+# Like C mul_add() but allow $r_out and $r_in to be different
+sub mul_add($$$$$$)
+{
+	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+	$self->add_code(<<___);
+	maddld		$lo,$a,$w,$c
+	maddhdu		$c,$a,$w,$c
+	addc		$r_out,$r_in,$lo
+	addze		$c,$c
+
+___
+}
+
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_add_c_0($$$$$$)
+{
+	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+	$self->add_code(<<___);
+	maddld		$lo,$a,$w,$r_in
+	maddhdu		$c,$a,$w,$r_in
+___
+
+	if ($r_out ne $lo) {
+		$self->add_code(<<___);
+	mr			$r_out,$lo
+___
+	}
+
+	$self->nl();
+}
+
+
+package main;
+
+my $code;
+
+$code.=<<___;
+.machine "any"
+.text
+___
+
+my $mont;
+
+$mont = new Mont::GPR(6);
+$mont->mul_mont_fixed();
+$code .= $mont->get_code();
+
+$mont = new Mont::GPR_300(6);
+$mont->mul_mont_fixed();
+$code .= $mont->get_code();
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+$code.=<<___;
+.asciz  "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
+___
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
Index: openssl-1.1.1l/crypto/bn/build.info
===================================================================
--- openssl-1.1.1l.orig/crypto/bn/build.info
+++ openssl-1.1.1l/crypto/bn/build.info
@@ -56,6 +56,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.
 GENERATE[bn-ppc.s]=asm/ppc.pl $(PERLASM_SCHEME)
 GENERATE[ppc-mont.s]=asm/ppc-mont.pl $(PERLASM_SCHEME)
 GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl $(PERLASM_SCHEME)
+GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl $(PERLASM_SCHEME)

 GENERATE[alpha-mont.S]=asm/alpha-mont.pl $(PERLASM_SCHEME)

Index: openssl-1.1.1l/crypto/ppccap.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ppccap.c
+++ openssl-1.1.1l/crypto/ppccap.c
@@ -46,6 +46,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U
                         const BN_ULONG *np, const BN_ULONG *n0, int num);
     int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                           const BN_ULONG *np, const BN_ULONG *n0, int num);
+    int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
+                             const BN_ULONG *bp, const BN_ULONG *np,
+                             const BN_ULONG *n0, int num);
+    int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
+                                 const BN_ULONG *bp, const BN_ULONG *np,
+                                 const BN_ULONG *n0, int num);

     if (num < 4)
         return 0;
@@ -61,6 +67,15 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U
      * no opportunity to figure it out...
      */

+#if defined(_ARCH_PPC64)
+    if (num == 6) {
+        if (OPENSSL_ppccap_P & PPC_MADD300)
+            return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
+        else
+            return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
+    }
+#endif
+
     return bn_mul_mont_int(rp, ap, bp, np, n0, num);
 }
 #endif
Index: openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl
===================================================================
--- openssl-1.1.1l.orig/crypto/perlasm/ppc-xlate.pl
+++ openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl
@@ -136,6 +136,71 @@ my $quad = sub {
 };

 ################################################################
+# vector register number hacking
+################################################################
+
+# It is convenient to be able to set a variable like:
+#   my $foo = "v33";
+# and use this in different contexts where:
+# * a VSR (Vector-Scalar Register) number (i.e. "v33") is required
+# * a VR (Vector Register) number (i.e. "v1") is required
+# Map VSR numbering to VR number for certain vector instructions.
+
+# vs<N> -> v<N-32> if N >= 32
+sub vsr2vr1 {
+    my $in = shift;
+
+    my $n = int($in);
+    if ($n >= 32) {
+	    $n -= 32;
+    }
+
+    return "$n";
+}
+# As above for first $num register args, returns list
+sub _vsr2vr {
+    my $num = shift;
+    my @rest = @_;
+    my @subst = splice(@rest, 0, $num);
+
+    @subst = map { vsr2vr1($_); } @subst;
+
+    return (@subst, @rest);
+}
+# As above but 1st arg ($f) is extracted and reinserted after
+# processing so that it can be ignored by a code generation function
+# that consumes the result
+sub vsr2vr_args {
+    my $num = shift;
+    my $f = shift;
+
+    my @out = _vsr2vr($num, @_);
+
+    return ($f, @out);
+}
+# As above but 1st arg is mnemonic, return formatted instruction
+sub vsr2vr {
+    my $mnemonic = shift;
+    my $num = shift;
+    my $f = shift;
+
+    my @out = _vsr2vr($num, @_);
+
+    "	${mnemonic}${f}	" . join(",", @out);
+}
+
+# ISA 2.03
+my $vsel	= sub { vsr2vr("vsel",		4, @_); };
+my $vsl		= sub { vsr2vr("vsl",		3, @_); };
+my $vspltisb	= sub { vsr2vr("vspltisb",	1, @_); };
+my $vspltisw	= sub { vsr2vr("vspltisw",	1, @_); };
+my $vsr		= sub { vsr2vr("vsr",		3, @_); };
+my $vsro	= sub { vsr2vr("vsro",		3, @_); };
+
+# ISA 3.0
+my $lxsd	= sub { vsr2vr("lxsd",		1, @_); };
+
+################################################################
 # simplified mnemonics not handled by at least one assembler
 ################################################################
 my $cmplw = sub {
@@ -226,13 +291,18 @@ my $vpermdi	= sub {				# xxpermdi

 # PowerISA 2.07 stuff
 sub vcrypto_op {
-    my ($f, $vrt, $vra, $vrb, $op) = @_;
+    my ($f, $vrt, $vra, $vrb, $op) = vsr2vr_args(3, @_);
     "	.long	".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
 }
 sub vfour {
     my ($f, $vrt, $vra, $vrb, $vrc, $op) = @_;
     "	.long	".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op;
 };
+sub vfour_vsr {
+    my ($f, $vrt, $vra, $vrb, $vrc, $op) = vsr2vr_args(4, @_);
+    "	.long	".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op;
+};
+
 my $vcipher	= sub { vcrypto_op(@_, 1288); };
 my $vcipherlast	= sub { vcrypto_op(@_, 1289); };
 my $vncipher	= sub { vcrypto_op(@_, 1352); };
@@ -254,10 +324,10 @@ my $vsld	= sub { vcrypto_op(@_, 1476); }
 my $vsrd	= sub { vcrypto_op(@_, 1732); };
 my $vsubudm	= sub { vcrypto_op(@_, 1216); };
 my $vaddcuq	= sub { vcrypto_op(@_, 320);  };
-my $vaddeuqm	= sub { vfour(@_,60); };
-my $vaddecuq	= sub { vfour(@_,61); };
-my $vmrgew	= sub { vfour(@_,0,1932); };
-my $vmrgow	= sub { vfour(@_,0,1676); };
+my $vaddeuqm	= sub { vfour_vsr(@_,60); };
+my $vaddecuq	= sub { vfour_vsr(@_,61); };
+my $vmrgew	= sub { vfour_vsr(@_,0,1932); };
+my $vmrgow	= sub { vfour_vsr(@_,0,1676); };

 my $mtsle	= sub {
     my ($f, $arg) = @_;
@@ -298,7 +368,7 @@ my $addex = sub {
     my ($f, $rt, $ra, $rb, $cy) = @_;	# only cy==0 is specified in 3.0B
     "	.long	".sprintf "0x%X",(31<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($cy<<9)|(170<<1);
 };
-my $vmsumudm	= sub { vfour(@_,35); };
+my $vmsumudm	= sub { vfour_vsr(@_, 35); };

 while($line=<>) {

Index: openssl-1.1.1l/Configurations/10-main.conf
===================================================================
--- openssl-1.1.1l.orig/Configurations/10-main.conf
+++ openssl-1.1.1l/Configurations/10-main.conf
@@ -669,7 +669,7 @@ my %targets = (
         inherit_from     => [ "linux-generic64", asm("ppc64_asm") ],
         cflags           => add("-m64"),
         cxxflags         => add("-m64"),
-        lib_cppflags     => add("-DB_ENDIAN"),
+        lib_cppflags     => add("-DB_ENDIAN -DECP_NISTP521_ASM"),
         perlasm_scheme   => "linux64",
         multilib         => "64",
     },
@@ -677,7 +677,7 @@ my %targets = (
         inherit_from     => [ "linux-generic64", asm("ppc64_asm") ],
         cflags           => add("-m64"),
         cxxflags         => add("-m64"),
-        lib_cppflags     => add("-DL_ENDIAN"),
+        lib_cppflags     => add("-DL_ENDIAN -DECP_NISTP521_ASM"),
         perlasm_scheme   => "linux64le",
     },

Index: openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl
@@ -0,0 +1,435 @@
+#! /usr/bin/env perl
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Amitay Isaacs <amitay@ozlabs.org> and Martin Schwenke
+# <martin@meltin.net> for the OpenSSL project.
+# ====================================================================
+#
+# p521 lower-level primitives for PPC64 using vector instructions.
+#
+
+use strict;
+use warnings;
+
+my $flavour = shift;
+my $output = "";
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+if (!$output) {
+	$output = "-";
+}
+
+my ($xlate, $dir);
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+# Pipe everything printed to STDOUT through the translator; abort if the pipe cannot be opened
+open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+my $code = "";
+
+my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12");
+
+my $vzero = "v32";
+
+sub startproc($)
+{
+    my ($name) = @_;
+
+    $code.=<<___;
+    .globl ${name}
+    .align 5
+${name}:
+
+___
+}
+
+sub endproc($)
+{
+    my ($name) = @_;
+
+    $code.=<<___;
+	blr
+	    .size	${name},.-${name}
+
+___
+}
+
+
+sub push_vrs($$)
+{
+	my ($min, $max) = @_;
+
+	my $count = $max - $min + 1;
+
+	$code.=<<___;
+	mr		$savesp,$sp
+	stdu		$sp,-16*`$count+1`($sp)
+
+___
+	    for (my $i = $min; $i <= $max; $i++) {
+		    my $mult = $max - $i + 1;
+		    $code.=<<___;
+	stxv		$i,-16*$mult($savesp)
+___
+
+	}
+
+	$code.=<<___;
+
+___
+}
+
+sub pop_vrs($$)
+{
+	my ($min, $max) = @_;
+
+	$code.=<<___;
+	ld		$savesp,0($sp)
+___
+	for (my $i = $min; $i <= $max; $i++) {
+		my $mult = $max - $i + 1;
+		$code.=<<___;
+	lxv		$i,-16*$mult($savesp)
+___
+	}
+
+	$code.=<<___;
+	mr		$sp,$savesp
+
+___
+}
+
+sub load_vrs($$)
+{
+	my ($pointer, $reg_list) = @_;
+
+	for (my $i = 0; $i <= 8; $i++) {
+		my $offset = $i * 8;
+		$code.=<<___;
+	lxsd		$reg_list->[$i],$offset($pointer)
+___
+	}
+
+	$code.=<<___;
+
+___
+}
+
+sub store_vrs($$)
+{
+	my ($pointer, $reg_list) = @_;
+
+	for (my $i = 0; $i <= 8; $i++) {
+		my $offset = $i * 16;
+		$code.=<<___;
+	stxv		$reg_list->[$i],$offset($pointer)
+___
+	}
+
+	$code.=<<___;
+
+___
+}
+
+$code.=<<___;
+.text
+
+___
+
+{
+	# mul/square common
+	my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54");
+	my ($zero, $one) = ("r8", "r9");
+	my @out = map("v$_",(55..63));
+
+	{
+		#
+		# p521_felem_mul
+		#
+
+		my ($in1p, $in2p) = ("r4", "r5");
+		my @in1 = map("v$_",(45..53));
+		my @in2 = map("v$_",(35..43));
+
+		startproc("p521_felem_mul");
+
+		push_vrs(52, 63);
+
+		$code.=<<___;
+	vspltisw	$vzero,0
+
+___
+
+		load_vrs($in1p, \@in1);
+		load_vrs($in2p, \@in2);
+
+		$code.=<<___;
+	vmsumudm	$out[0],$in1[0],$in2[0],$vzero
+
+	xxpermdi	$t1,$in1[0],$in1[1],0b00
+	xxpermdi	$t2,$in2[1],$in2[0],0b00
+	vmsumudm	$out[1],$t1,$t2,$vzero
+
+	xxpermdi	$t2,$in2[2],$in2[1],0b00
+	vmsumudm	$out[2],$t1,$t2,$vzero
+	vmsumudm	$out[2],$in1[2],$in2[0],$out[2]
+
+	xxpermdi	$t2,$in2[3],$in2[2],0b00
+	vmsumudm	$out[3],$t1,$t2,$vzero
+	xxpermdi	$t3,$in1[2],$in1[3],0b00
+	xxpermdi	$t4,$in2[1],$in2[0],0b00
+	vmsumudm	$out[3],$t3,$t4,$out[3]
+
+	xxpermdi	$t2,$in2[4],$in2[3],0b00
+	vmsumudm	$out[4],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[2],$in2[1],0b00
+	vmsumudm	$out[4],$t3,$t4,$out[4]
+	vmsumudm	$out[4],$in1[4],$in2[0],$out[4]
+
+	xxpermdi	$t2,$in2[5],$in2[4],0b00
+	vmsumudm	$out[5],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[3],$in2[2],0b00
+	vmsumudm	$out[5],$t3,$t4,$out[5]
+
+	xxpermdi	$t2,$in2[6],$in2[5],0b00
+	vmsumudm	$out[6],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[4],$in2[3],0b00
+	vmsumudm	$out[6],$t3,$t4,$out[6]
+
+	xxpermdi	$t2,$in2[7],$in2[6],0b00
+	vmsumudm	$out[7],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[5],$in2[4],0b00
+	vmsumudm	$out[7],$t3,$t4,$out[7]
+
+	xxpermdi	$t2,$in2[8],$in2[7],0b00
+	vmsumudm	$out[8],$t1,$t2,$vzero
+	xxpermdi	$t4,$in2[6],$in2[5],0b00
+	vmsumudm	$out[8],$t3,$t4,$out[8]
+
+	xxpermdi	$t1,$in1[4],$in1[5],0b00
+	xxpermdi	$t2,$in2[1],$in2[0],0b00
+	vmsumudm	$out[5],$t1,$t2,$out[5]
+
+	xxpermdi	$t2,$in2[2],$in2[1],0b00
+	vmsumudm	$out[6],$t1,$t2,$out[6]
+	vmsumudm	$out[6],$in1[6],$in2[0],$out[6]
+
+	xxpermdi	$t2,$in2[3],$in2[2],0b00
+	vmsumudm	$out[7],$t1,$t2,$out[7]
+	xxpermdi	$t3,$in1[6],$in1[7],0b00
+	xxpermdi	$t4,$in2[1],$in2[0],0b00
+	vmsumudm	$out[7],$t3,$t4,$out[7]
+
+	xxpermdi	$t2,$in2[4],$in2[3],0b00
+	vmsumudm	$out[8],$t1,$t2,$out[8]
+	xxpermdi	$t4,$in2[2],$in2[1],0b00
+	vmsumudm	$out[8],$t3,$t4,$out[8]
+	vmsumudm	$out[8],$in1[8],$in2[0],$out[8]
+
+	li		$zero,0
+	li		$one,1
+	mtvsrdd		$t1,$one,$zero
+___
+
+		for (my $i = 0; $i <= 8; $i++) {
+			$code.=<<___;
+	vsld		$in2[$i],$in2[$i],$t1
+___
+		}
+
+		$code.=<<___;
+
+	vmsumudm	$out[7],$in1[8],$in2[8],$out[7]
+
+	xxpermdi	$t2,$in2[8],$in2[7],0b00
+	xxpermdi	$t1,$in1[7],$in1[8],0b00
+	vmsumudm	$out[6],$t1,$t2,$out[6]
+
+	xxpermdi	$t1,$in1[6],$in1[7],0b00
+	vmsumudm	$out[5],$t1,$t2,$out[5]
+	vmsumudm	$out[5],$in1[8],$in2[6],$out[5]
+
+	xxpermdi	$t1,$in1[5],$in1[6],0b00
+	vmsumudm	$out[4],$t1,$t2,$out[4]
+	xxpermdi	$t4,$in2[6],$in2[5],0b00
+	xxpermdi	$t3,$in1[7],$in1[8],0b00
+	vmsumudm	$out[4],$t3,$t4,$out[4]
+
+	xxpermdi	$t1,$in1[4],$in1[5],0b00
+	vmsumudm	$out[3],$t1,$t2,$out[3]
+	xxpermdi	$t3,$in1[6],$in1[7],0b00
+	vmsumudm	$out[3],$t3,$t4,$out[3]
+	vmsumudm	$out[3],$in1[8],$in2[4],$out[3]
+
+	xxpermdi	$t1,$in1[3],$in1[4],0b00
+	vmsumudm	$out[2],$t1,$t2,$out[2]
+	xxpermdi	$t3,$in1[5],$in1[6],0b00
+	vmsumudm	$out[2],$t3,$t4,$out[2]
+
+	xxpermdi	$t1,$in1[2],$in1[3],0b00
+	vmsumudm	$out[1],$t1,$t2,$out[1]
+	xxpermdi	$t3,$in1[4],$in1[5],0b00
+	vmsumudm	$out[1],$t3,$t4,$out[1]
+
+	xxpermdi	$t1,$in1[1],$in1[2],0b00
+	vmsumudm	$out[0],$t1,$t2,$out[0]
+	xxpermdi	$t3,$in1[3],$in1[4],0b00
+	vmsumudm	$out[0],$t3,$t4,$out[0]
+
+	xxpermdi	$t2,$in2[4],$in2[3],0b00
+	xxpermdi	$t1,$in1[7],$in1[8],0b00
+	vmsumudm	$out[2],$t1,$t2,$out[2]
+
+	xxpermdi	$t1,$in1[6],$in1[7],0b00
+	vmsumudm	$out[1],$t1,$t2,$out[1]
+	vmsumudm	$out[1],$in1[8],$in2[2],$out[1]
+
+	xxpermdi	$t1,$in1[5],$in1[6],0b00
+	vmsumudm	$out[0],$t1,$t2,$out[0]
+	xxpermdi	$t4,$in2[2],$in2[1],0b00
+	xxpermdi	$t3,$in1[7],$in1[8],0b00
+	vmsumudm	$out[0],$t3,$t4,$out[0]
+
+___
+
+		store_vrs($outp, \@out);
+
+		pop_vrs(52, 63);
+
+		endproc("p521_felem_mul");
+	}
+
+	{
+		#
+		# p521_felem_square
+		#
+
+		my ($inp) = ("r4");
+		my @in = map("v$_",(45..53));
+		my @inx2 = map("v$_",(35..43));
+
+		startproc("p521_felem_square");
+
+		push_vrs(52, 63);
+
+		$code.=<<___;
+	vspltisw	$vzero,0
+
+___
+
+		load_vrs($inp, \@in);
+
+		$code.=<<___;
+	li		$zero,0
+	li		$one,1
+	mtvsrdd		$t1,$one,$zero
+___
+
+		for (my $i = 0; $i <= 8; $i++) {
+			$code.=<<___;
+	vsld		$inx2[$i],$in[$i],$t1
+___
+		}
+
+		$code.=<<___;
+	vmsumudm	$out[0],$in[0],$in[0],$vzero
+
+	vmsumudm	$out[1],$in[0],$inx2[1],$vzero
+
+	xxpermdi	$t1,$in[0],$in[1],0b00
+	xxpermdi	$t2,$inx2[2],$in[1],0b00
+	vmsumudm	$out[2],$t1,$t2,$vzero
+
+	xxpermdi	$t2,$inx2[3],$inx2[2],0b00
+	vmsumudm	$out[3],$t1,$t2,$vzero
+
+	xxpermdi	$t2,$inx2[4],$inx2[3],0b00
+	vmsumudm	$out[4],$t1,$t2,$vzero
+	vmsumudm	$out[4],$in[2],$in[2],$out[4]
+
+	xxpermdi	$t2,$inx2[5],$inx2[4],0b00
+	vmsumudm	$out[5],$t1,$t2,$vzero
+	vmsumudm	$out[5],$in[2],$inx2[3],$out[5]
+
+	xxpermdi	$t2,$inx2[6],$inx2[5],0b00
+	vmsumudm	$out[6],$t1,$t2,$vzero
+	xxpermdi	$t3,$in[2],$in[3],0b00
+	xxpermdi	$t4,$inx2[4],$in[3],0b00
+	vmsumudm	$out[6],$t3,$t4,$out[6]
+
+	xxpermdi	$t2,$inx2[7],$inx2[6],0b00
+	vmsumudm	$out[7],$t1,$t2,$vzero
+	xxpermdi	$t4,$inx2[5],$inx2[4],0b00
+	vmsumudm	$out[7],$t3,$t4,$out[7]
+
+	xxpermdi	$t2,$inx2[8],$inx2[7],0b00
+	vmsumudm	$out[8],$t1,$t2,$vzero
+	xxpermdi	$t4,$inx2[6],$inx2[5],0b00
+	vmsumudm	$out[8],$t3,$t4,$out[8]
+	vmsumudm	$out[8],$in[4],$in[4],$out[8]
+
+	vmsumudm	$out[1],$in[5],$inx2[5],$out[1]
+
+	vmsumudm	$out[3],$in[6],$inx2[6],$out[3]
+
+	vmsumudm	$out[5],$in[7],$inx2[7],$out[5]
+
+	vmsumudm	$out[7],$in[8],$inx2[8],$out[7]
+
+	mtvsrdd		$t1,$one,$zero
+___
+
+		for (my $i = 5; $i <= 8; $i++) {
+			$code.=<<___;
+	vsld		$inx2[$i],$inx2[$i],$t1
+___
+		}
+
+		$code.=<<___;
+
+	vmsumudm	$out[6],$in[7],$inx2[8],$out[6]
+
+	vmsumudm	$out[5],$in[6],$inx2[8],$out[5]
+
+	xxpermdi	$t2,$inx2[8],$inx2[7],0b00
+	xxpermdi	$t1,$in[5],$in[6],0b00
+	vmsumudm	$out[4],$t1,$t2,$out[4]
+
+	xxpermdi	$t1,$in[4],$in[5],0b00
+	vmsumudm	$out[3],$t1,$t2,$out[3]
+
+	xxpermdi	$t1,$in[3],$in[4],0b00
+	vmsumudm	$out[2],$t1,$t2,$out[2]
+	vmsumudm	$out[2],$in[5],$inx2[6],$out[2]
+
+	xxpermdi	$t1,$in[2],$in[3],0b00
+	vmsumudm	$out[1],$t1,$t2,$out[1]
+	vmsumudm	$out[1],$in[4],$inx2[6],$out[1]
+
+	xxpermdi	$t1,$in[1],$in[2],0b00
+	vmsumudm	$out[0],$t1,$t2,$out[0]
+	xxpermdi	$t2,$inx2[6],$inx2[5],0b00
+	xxpermdi	$t1,$in[3],$in[4],0b00
+	vmsumudm	$out[0],$t1,$t2,$out[0]
+
+___
+
+		store_vrs($outp, \@out);
+
+		pop_vrs(52, 63);
+
+		endproc("p521_felem_square");
+	}
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
Index: openssl-1.1.1l/crypto/ec/ec_local.h
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/ec_local.h
+++ openssl-1.1.1l/crypto/ec/ec_local.h
@@ -499,6 +499,10 @@ int ec_GF2m_simple_field_div(const EC_GR
                              const BIGNUM *b, BN_CTX *);

 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# ifdef B_ENDIAN
+#  error "Can not enable ec_nistp_64_gcc_128 on big-endian systems"
+# endif
+
 /* method functions in ecp_nistp224.c */
 int ec_GFp_nistp224_group_init(EC_GROUP *group);
 int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve448/arch_32/f_impl.c
+++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c
@@ -10,7 +10,7 @@
  * Originally written by Mike Hamburg
  */

-#include "field.h"
+#include "../field.h"

 void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
 {
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2014 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+#include "../field.h"
+
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
+{
+    const uint64_t *a = as->limb, *b = bs->limb;
+    uint64_t *c = cs->limb;
+    uint128_t accum0 = 0, accum1 = 0, accum2;
+    uint64_t mask = (1ULL << 56) - 1;
+    uint64_t aa[4], bb[4], bbb[4];
+    unsigned int i, j;
+
+    for (i = 0; i < 4; i++) {
+        aa[i] = a[i] + a[i + 4];
+        bb[i] = b[i] + b[i + 4];
+        bbb[i] = bb[i] + b[i + 4];
+    }
+
+    for (i = 0; i < 4; i++) {
+        accum2 = 0;
+
+        for (j = 0; j <= i; j++) {
+            accum2 += widemul(a[j], b[i - j]);
+            accum1 += widemul(aa[j], bb[i - j]);
+            accum0 += widemul(a[j + 4], b[i - j + 4]);
+        }
+        for (; j < 4; j++) {
+            accum2 += widemul(a[j], b[i - j + 8]);
+            accum1 += widemul(aa[j], bbb[i - j + 4]);
+            accum0 += widemul(a[j + 4], bb[i - j + 4]);
+        }
+
+        accum1 -= accum2;
+        accum0 += accum2;
+
+        c[i] = ((uint64_t)(accum0)) & mask;
+        c[i + 4] = ((uint64_t)(accum1)) & mask;
+
+        accum0 >>= 56;
+        accum1 >>= 56;
+    }
+
+    accum0 += accum1;
+    accum0 += c[4];
+    accum1 += c[0];
+    c[4] = ((uint64_t)(accum0)) & mask;
+    c[0] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    c[5] += ((uint64_t)(accum0));
+    c[1] += ((uint64_t)(accum1));
+}
+
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
+{
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+    uint128_t accum0 = 0, accum4 = 0;
+    uint64_t mask = (1ULL << 56) - 1;
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        accum0 += widemul(b, a[i]);
+        accum4 += widemul(b, a[i + 4]);
+        c[i] = accum0 & mask;
+        accum0 >>= 56;
+        c[i + 4] = accum4 & mask;
+        accum4 >>= 56;
+    }
+
+    accum0 += accum4 + c[4];
+    c[4] = accum0 & mask;
+    c[5] += accum0 >> 56;
+
+    accum4 += c[0];
+    c[0] = accum4 & mask;
+    c[1] += accum4 >> 56;
+}
+
+void gf_sqr(gf_s * RESTRICT cs, const gf as)
+{
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+    uint128_t accum0 = 0, accum1 = 0, accum2;
+    uint64_t mask = (1ULL << 56) - 1;
+    uint64_t aa[4];
+    unsigned int i;
+
+    /* For some reason clang doesn't vectorize this without prompting? */
+    for (i = 0; i < 4; i++)
+        aa[i] = a[i] + a[i + 4];
+
+    accum2 = widemul(a[0], a[3]);
+    accum0 = widemul(aa[0], aa[3]);
+    accum1 = widemul(a[4], a[7]);
+
+    accum2 += widemul(a[1], a[2]);
+    accum0 += widemul(aa[1], aa[2]);
+    accum1 += widemul(a[5], a[6]);
+
+    accum0 -= accum2;
+    accum1 += accum2;
+
+    c[3] = ((uint64_t)(accum1)) << 1 & mask;
+    c[7] = ((uint64_t)(accum0)) << 1 & mask;
+
+    accum0 >>= 55;
+    accum1 >>= 55;
+
+    accum0 += widemul(2 * aa[1], aa[3]);
+    accum1 += widemul(2 * a[5], a[7]);
+    accum0 += widemul(aa[2], aa[2]);
+    accum1 += accum0;
+
+    accum0 -= widemul(2 * a[1], a[3]);
+    accum1 += widemul(a[6], a[6]);
+
+    accum2 = widemul(a[0], a[0]);
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    accum0 -= widemul(a[2], a[2]);
+    accum1 += widemul(aa[0], aa[0]);
+    accum0 += widemul(a[4], a[4]);
+
+    c[0] = ((uint64_t)(accum0)) & mask;
+    c[4] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    accum2 = widemul(2 * aa[2], aa[3]);
+    accum0 -= widemul(2 * a[2], a[3]);
+    accum1 += widemul(2 * a[6], a[7]);
+
+    accum1 += accum2;
+    accum0 += accum2;
+
+    accum2 = widemul(2 * a[0], a[1]);
+    accum1 += widemul(2 * aa[0], aa[1]);
+    accum0 += widemul(2 * a[4], a[5]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[1] = ((uint64_t)(accum0)) & mask;
+    c[5] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    accum2 = widemul(aa[3], aa[3]);
+    accum0 -= widemul(a[3], a[3]);
+    accum1 += widemul(a[7], a[7]);
+
+    accum1 += accum2;
+    accum0 += accum2;
+
+    accum2 = widemul(2 * a[0], a[2]);
+    accum1 += widemul(2 * aa[0], aa[2]);
+    accum0 += widemul(2 * a[4], a[6]);
+
+    accum2 += widemul(a[1], a[1]);
+    accum1 += widemul(aa[1], aa[1]);
+    accum0 += widemul(a[5], a[5]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[2] = ((uint64_t)(accum0)) & mask;
+    c[6] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    accum0 += c[3];
+    accum1 += c[7];
+    c[3] = ((uint64_t)(accum0)) & mask;
+    c[7] = ((uint64_t)(accum1)) & mask;
+
+    /* we could almost stop here, but it wouldn't be stable, so... */
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
+    c[0] += ((uint64_t)(accum1));
+}
Index: openssl-1.1.1l/Configure
===================================================================
--- openssl-1.1.1l.orig/Configure
+++ openssl-1.1.1l/Configure
@@ -1476,6 +1476,20 @@ if (!$disabled{asm} && !$predefined_C{__
     }
 }

+# Check if __SIZEOF_INT128__ is defined by compiler
+$config{use_int128} = 0;
+{
+    my $cc = $config{CROSS_COMPILE}.$config{CC};
+    open(PIPE, "$cc -E -dM - </dev/null 2>&1 |");
+    while(<PIPE>) {
+        if (m/__SIZEOF_INT128__/) {
+            $config{use_int128} = 1;
+            last;
+        }
+    }
+    close(PIPE);
+}
+
 # Deal with bn_ops ###################################################

 $config{bn_ll}                  =0;