From 0ce74d253bcb2cbc6696e0cea7bd18c9494b8ca025a62aefe6539d7310ec3b97 Mon Sep 17 00:00:00 2001 From: Otto Hollmann Date: Wed, 14 Dec 2022 09:46:30 +0000 Subject: [PATCH 1/2] Accepting request 1042846 from home:ohollmann:branches:security:tls - POWER10 performance enhancements for cryptography [jsc#PED-512] * openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch * openssl-1_1-Fixed-counter-overflow.patch * openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch * openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch * openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch OBS-URL: https://build.opensuse.org/request/show/1042846 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=123 --- ...nce-optimzation-with-stitched-method.patch | 1588 +++++++++++++++++ openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch | 230 +++ ...l-statement-testing-64-and-256-bytes.patch | 103 ++ openssl-1_1-Fixed-counter-overflow.patch | 136 ++ ...ance-optimizations-for-ppc64le-with-.patch | 1535 ++++++++++++++++ openssl-1_1.changes | 10 + openssl-1_1.spec | 8 + 7 files changed, 3610 insertions(+) create mode 100644 openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch create mode 100644 openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch create mode 100644 openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch create mode 100644 openssl-1_1-Fixed-counter-overflow.patch create mode 100644 openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch diff --git a/openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch b/openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch new file mode 100644 index 0000000..b6cfe60 --- /dev/null +++ b/openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch @@ -0,0 +1,1588 @@ +From 44a563dde1584cd9284e80b6e45ee5019be8d36c Mon Sep 17 00:00:00 2001 +From: Danny Tsen +Date: Mon, 18 Oct 2021 10:51:42 -0400 +Subject: [PATCH] AES-GCM performance optimzation with stitched method for p9+ + ppc64le + +Assembly code reviewed by Shricharan Srivatsan + +Reviewed-by: Tomas Mraz +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/16854) +--- + Configurations/00-base-templates.conf | 2 + crypto/evp/e_aes.c | 33 + crypto/modes/asm/aes-gcm-ppc.pl | 1439 ++++++++++++++++++++++++++++++++++ + crypto/modes/build.info | 1 + 4 files changed, 1466 insertions(+), 9 deletions(-) + create mode 100644 crypto/modes/asm/aes-gcm-ppc.pl + create mode 100644 providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc + +--- a/Configurations/00-base-templates.conf ++++ b/Configurations/00-base-templates.conf +@@ -344,7 +344,7 @@ my %targets=( + bn_asm_src => "bn-ppc.s ppc-mont.s", + aes_asm_src => "aes_core.c aes_cbc.c aes-ppc.s vpaes-ppc.s aesp8-ppc.s", + sha1_asm_src => "sha1-ppc.s sha256-ppc.s sha512-ppc.s sha256p8-ppc.s sha512p8-ppc.s", +- modes_asm_src => "ghashp8-ppc.s", ++ modes_asm_src => "ghashp8-ppc.s aes-gcm-ppc.s", + chacha_asm_src => "chacha-ppc.s", + poly1305_asm_src=> "poly1305-ppc.s poly1305-ppcfp.s", + }, +--- a/crypto/evp/e_aes.c ++++ b/crypto/evp/e_aes.c +@@ -178,6 +178,20 @@ static void ctr64_inc(unsigned char *cou + # define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks + # define HWAES_xts_encrypt aes_p8_xts_encrypt + # define HWAES_xts_decrypt aes_p8_xts_decrypt ++# define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300) ++# define AES_GCM_ENC_BYTES 128 ++# define AES_GCM_DEC_BYTES 128 ++size_t 
ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); ++# if PPC_AES_GCM_CAPABLE ++# define AES_gcm_encrypt ppc_aes_gcm_encrypt ++# define AES_gcm_decrypt ppc_aes_gcm_decrypt ++# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \ ++ (gctx)->gcm.ghash==gcm_ghash_p8) ++# endif + #endif + + #if defined(OPENSSL_CPUID_OBJ) && ( \ +@@ -199,6 +213,9 @@ extern unsigned int OPENSSL_ia32cap_P[]; + */ + # define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32))) + ++# define AES_GCM_ENC_BYTES 32 ++# define AES_GCM_DEC_BYTES 16 ++ + int aesni_set_encrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); + int aesni_set_decrypt_key(const unsigned char *userKey, int bits, +@@ -3101,7 +3118,7 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + if (gctx->ctr) { + size_t bulk = 0; + #if defined(AES_GCM_ASM) +- if (len >= 32 && AES_GCM_ASM(gctx)) { ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM(gctx)) { + if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0)) + return -1; + +@@ -3119,7 +3136,7 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + } else { + size_t bulk = 0; + #if defined(AES_GCM_ASM2) +- if (len >= 32 && AES_GCM_ASM2(gctx)) { ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM2(gctx)) { + if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0)) + return -1; + +@@ -3142,7 +3159,7 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + if (gctx->ctr) { + size_t bulk = 0; + #if defined(AES_GCM_ASM) +- if (len >= 16 && AES_GCM_ASM(gctx)) { ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM(gctx)) { + if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0)) + return -1; + +@@ -3160,7 +3177,7 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + } else { + size_t bulk = 0; + #if defined(AES_GCM_ASM2) +- if (len >= 16 && AES_GCM_ASM2(gctx)) { ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM2(gctx)) { + if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0)) + return -1; + +@@ -3211,7 +3228,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + if (gctx->ctr) { + size_t bulk = 0; + #if defined(AES_GCM_ASM) +- if (len >= 32 && AES_GCM_ASM(gctx)) { ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM(gctx)) { + size_t res = (16 - gctx->gcm.mres) % 16; + + if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) +@@ -3233,7 +3250,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + } else { + size_t bulk = 0; + #if defined(AES_GCM_ASM2) +- if (len >= 32 && AES_GCM_ASM2(gctx)) { ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM2(gctx)) { + size_t res = (16 - gctx->gcm.mres) % 16; + + if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) +@@ -3255,7 +3272,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + if (gctx->ctr) { + size_t bulk = 0; + #if defined(AES_GCM_ASM) +- if (len >= 16 && AES_GCM_ASM(gctx)) { ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM(gctx)) { + size_t res = (16 - gctx->gcm.mres) % 16; + + if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) +@@ -3277,7 +3294,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + } else { + size_t bulk = 0; + #if defined(AES_GCM_ASM2) +- if (len >= 16 && AES_GCM_ASM2(gctx)) { ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM2(gctx)) { + size_t res = (16 - gctx->gcm.mres) % 16; + + if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) +--- /dev/null ++++ b/crypto/modes/asm/aes-gcm-ppc.pl +@@ -0,0 +1,1439 @@ ++#! 
/usr/bin/env perl ++# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. ++# Copyright 2021- IBM Inc. All rights reserved ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++#=================================================================================== ++# Written by Danny Tsen for OpenSSL Project, ++# ++# GHASH is based on the Karatsuba multiplication method. ++# ++# Xi xor X1 ++# ++# X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H = ++# (X1.h * H4.h + xX.l * H4.l + X1 * H4) + ++# (X2.h * H3.h + X2.l * H3.l + X2 * H3) + ++# (X3.h * H2.h + X3.l * H2.l + X3 * H2) + ++# (X4.h * H.h + X4.l * H.l + X4 * H) ++# ++# Xi = v0 ++# H Poly = v2 ++# Hash keys = v3 - v14 ++# ( H.l, H, H.h) ++# ( H^2.l, H^2, H^2.h) ++# ( H^3.l, H^3, H^3.h) ++# ( H^4.l, H^4, H^4.h) ++# ++# v30 is IV ++# v31 - counter 1 ++# ++# AES used, ++# vs0 - vs14 for round keys ++# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) ++# ++# This implementation uses stitched AES-GCM approach to improve overall performance. ++# AES is implemented with 8x blocks and GHASH is using 2 4x blocks. ++# ++# Current large block (16384 bytes) performance per second with 128 bit key -- ++# ++# Encrypt Decrypt ++# Power10[le] (3.5GHz) 5.32G 5.26G ++# ++# =================================================================================== ++# ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++if ($flavour =~ /64/) { ++ $SIZE_T=8; ++ $LRSAVE=2*$SIZE_T; ++ $STU="stdu"; ++ $POP="ld"; ++ $PUSH="std"; ++ $UCMP="cmpld"; ++ $SHRI="srdi"; ++} elsif ($flavour =~ /32/) { ++ $SIZE_T=4; ++ $LRSAVE=$SIZE_T; ++ $STU="stwu"; ++ $POP="lwz"; ++ $PUSH="stw"; ++ $UCMP="cmplw"; ++ $SHRI="srwi"; ++} else { die "nonsense $flavour"; } ++ ++$sp="r1"; ++$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open STDOUT,"| $^X $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++ ++$code=<<___; ++.machine "any" ++.abiversion 2 ++.text ++ ++# 4x loops ++# v15 - v18 - input states ++# vs1 - vs9 - round keys ++# ++.macro Loop_aes_middle4x ++ xxlor 19+32, 1, 1 ++ xxlor 20+32, 2, 2 ++ xxlor 21+32, 3, 3 ++ xxlor 22+32, 4, 4 ++ ++ vcipher 15, 15, 19 ++ vcipher 16, 16, 19 ++ vcipher 17, 17, 19 ++ vcipher 18, 18, 19 ++ ++ vcipher 15, 15, 20 ++ vcipher 16, 16, 20 ++ vcipher 17, 17, 20 ++ vcipher 18, 18, 20 ++ ++ vcipher 15, 15, 21 ++ vcipher 16, 16, 21 ++ vcipher 17, 17, 21 ++ vcipher 18, 18, 21 ++ ++ vcipher 15, 15, 22 ++ vcipher 16, 16, 22 ++ vcipher 17, 17, 22 ++ vcipher 18, 18, 22 ++ ++ xxlor 19+32, 5, 5 ++ xxlor 20+32, 6, 6 ++ xxlor 21+32, 7, 7 ++ xxlor 22+32, 8, 8 ++ ++ vcipher 15, 15, 19 ++ vcipher 16, 16, 19 ++ vcipher 17, 17, 19 ++ vcipher 18, 18, 19 ++ ++ vcipher 15, 15, 20 ++ vcipher 16, 16, 20 ++ vcipher 17, 17, 20 ++ vcipher 18, 18, 20 ++ ++ vcipher 15, 15, 21 ++ vcipher 16, 16, 21 ++ vcipher 17, 17, 21 ++ vcipher 18, 18, 21 ++ ++ vcipher 15, 15, 22 ++ vcipher 16, 16, 22 ++ vcipher 17, 17, 22 ++ vcipher 18, 18, 22 ++ ++ xxlor 23+32, 9, 9 ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++.endm ++ ++# 8x loops ++# v15 - v22 - input states ++# vs1 - vs9 - round keys ++# ++.macro Loop_aes_middle8x ++ xxlor 23+32, 1, 1 ++ xxlor 24+32, 2, 2 ++ xxlor 25+32, 3, 3 ++ xxlor 26+32, 4, 4 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ vcipher 15, 15, 25 ++ vcipher 16, 16, 25 ++ vcipher 17, 17, 25 ++ vcipher 18, 18, 25 ++ vcipher 19, 19, 25 ++ vcipher 20, 20, 25 ++ vcipher 21, 21, 25 ++ vcipher 22, 22, 25 ++ ++ vcipher 15, 15, 26 ++ vcipher 16, 16, 26 ++ vcipher 17, 17, 26 ++ vcipher 18, 18, 26 ++ vcipher 19, 19, 26 ++ vcipher 20, 20, 26 ++ vcipher 21, 21, 26 ++ vcipher 22, 22, 26 ++ ++ xxlor 23+32, 5, 5 ++ xxlor 24+32, 6, 6 ++ xxlor 25+32, 7, 7 ++ xxlor 26+32, 8, 8 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ vcipher 15, 15, 25 ++ vcipher 16, 16, 25 ++ vcipher 17, 17, 25 ++ vcipher 18, 18, 25 ++ vcipher 19, 19, 25 ++ vcipher 20, 20, 25 ++ vcipher 21, 21, 25 ++ vcipher 22, 22, 25 ++ ++ vcipher 15, 15, 26 ++ vcipher 16, 16, 26 ++ vcipher 17, 17, 26 ++ vcipher 18, 18, 26 ++ vcipher 19, 19, 26 ++ vcipher 20, 20, 26 ++ vcipher 
21, 21, 26 ++ vcipher 22, 22, 26 ++ ++ xxlor 23+32, 9, 9 ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++.endm ++ ++# ++# Compute 4x hash values based on Karatsuba method. ++# ++ppc_aes_gcm_ghash: ++ vxor 15, 15, 0 ++ ++ xxlxor 29, 29, 29 ++ ++ vpmsumd 23, 12, 15 # H4.L * X.L ++ vpmsumd 24, 9, 16 ++ vpmsumd 25, 6, 17 ++ vpmsumd 26, 3, 18 ++ ++ vxor 23, 23, 24 ++ vxor 23, 23, 25 ++ vxor 23, 23, 26 # L ++ ++ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L ++ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L ++ vpmsumd 26, 7, 17 ++ vpmsumd 27, 4, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 # M ++ ++ # sum hash and reduction with H Poly ++ vpmsumd 28, 23, 2 # reduction ++ ++ xxlor 29+32, 29, 29 ++ vsldoi 26, 24, 29, 8 # mL ++ vsldoi 29, 29, 24, 8 # mH ++ vxor 23, 23, 26 # mL + L ++ ++ vsldoi 23, 23, 23, 8 # swap ++ vxor 23, 23, 28 ++ ++ vpmsumd 24, 14, 15 # H4.H * X.H ++ vpmsumd 25, 11, 16 ++ vpmsumd 26, 8, 17 ++ vpmsumd 27, 5, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 ++ ++ vxor 24, 24, 29 ++ ++ # sum hash and reduction with H Poly ++ vsldoi 27, 23, 23, 8 # swap ++ vpmsumd 23, 23, 2 ++ vxor 27, 27, 24 ++ vxor 23, 23, 27 ++ ++ xxlor 32, 23+32, 23+32 # update hash ++ ++ blr ++ ++# ++# Combine two 4x ghash ++# v15 - v22 - input blocks ++# ++.macro ppc_aes_gcm_ghash2_4x ++ # first 4x hash ++ vxor 15, 15, 0 # Xi + X ++ ++ xxlxor 29, 29, 29 ++ ++ vpmsumd 23, 12, 15 # H4.L * X.L ++ vpmsumd 24, 9, 16 ++ vpmsumd 25, 6, 17 ++ vpmsumd 26, 3, 18 ++ ++ vxor 23, 23, 24 ++ vxor 23, 23, 25 ++ vxor 23, 23, 26 # L ++ ++ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L ++ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L ++ vpmsumd 26, 7, 17 ++ vpmsumd 27, 4, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ ++ # sum hash and reduction with H Poly ++ vpmsumd 28, 23, 2 # reduction ++ ++ xxlor 29+32, 29, 29 ++ ++ vxor 24, 24, 27 # M ++ vsldoi 26, 24, 29, 8 # mL ++ vsldoi 29, 29, 24, 8 # mH ++ vxor 23, 23, 26 # mL + L ++ ++ vsldoi 23, 23, 23, 8 # swap ++ vxor 23, 23, 28 ++ ++ vpmsumd 24, 14, 15 # H4.H * X.H ++ vpmsumd 25, 11, 16 ++ vpmsumd 26, 8, 17 ++ vpmsumd 27, 5, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 # H ++ ++ vxor 24, 24, 29 # H + mH ++ ++ # sum hash and reduction with H Poly ++ vsldoi 27, 23, 23, 8 # swap ++ vpmsumd 23, 23, 2 ++ vxor 27, 27, 24 ++ vxor 27, 23, 27 # 1st Xi ++ ++ # 2nd 4x hash ++ vpmsumd 24, 9, 20 ++ vpmsumd 25, 6, 21 ++ vpmsumd 26, 3, 22 ++ vxor 19, 19, 27 # Xi + X ++ vpmsumd 23, 12, 19 # H4.L * X.L ++ ++ vxor 23, 23, 24 ++ vxor 23, 23, 25 ++ vxor 23, 23, 26 # L ++ ++ vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L ++ vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L ++ vpmsumd 26, 7, 21 ++ vpmsumd 27, 4, 22 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ ++ # sum hash and reduction with H Poly ++ vpmsumd 28, 23, 2 # reduction ++ ++ xxlor 29+32, 29, 29 ++ ++ vxor 24, 24, 27 # M ++ vsldoi 26, 24, 29, 8 # mL ++ vsldoi 29, 29, 24, 8 # mH ++ vxor 23, 23, 26 # mL + L ++ ++ vsldoi 23, 23, 23, 8 # swap ++ vxor 23, 23, 28 ++ ++ vpmsumd 24, 14, 19 # H4.H * X.H ++ vpmsumd 25, 11, 20 ++ vpmsumd 26, 8, 21 ++ vpmsumd 27, 5, 22 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 # H ++ ++ vxor 24, 24, 29 # H + mH ++ ++ # sum hash and reduction with H Poly ++ vsldoi 27, 23, 23, 8 # swap ++ vpmsumd 23, 23, 2 ++ vxor 27, 27, 24 ++ vxor 23, 23, 27 ++ ++ xxlor 32, 23+32, 23+32 # update hash ++ ++.endm ++ ++# ++# Compute update single 
hash ++# ++.macro ppc_update_hash_1x ++ vxor 28, 28, 0 ++ ++ vxor 19, 19, 19 ++ ++ vpmsumd 22, 3, 28 # L ++ vpmsumd 23, 4, 28 # M ++ vpmsumd 24, 5, 28 # H ++ ++ vpmsumd 27, 22, 2 # reduction ++ ++ vsldoi 25, 23, 19, 8 # mL ++ vsldoi 26, 19, 23, 8 # mH ++ vxor 22, 22, 25 # LL + LL ++ vxor 24, 24, 26 # HH + HH ++ ++ vsldoi 22, 22, 22, 8 # swap ++ vxor 22, 22, 27 ++ ++ vsldoi 20, 22, 22, 8 # swap ++ vpmsumd 22, 22, 2 # reduction ++ vxor 20, 20, 24 ++ vxor 22, 22, 20 ++ ++ vmr 0, 22 # update hash ++ ++.endm ++ ++# ++# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len, ++# const AES_KEY *key, unsigned char iv[16], ++# void *Xip); ++# ++# r3 - inp ++# r4 - out ++# r5 - len ++# r6 - AES round keys ++# r7 - iv ++# r8 - Xi, HPoli, hash keys ++# ++.global ppc_aes_gcm_encrypt ++.align 5 ++ppc_aes_gcm_encrypt: ++_ppc_aes_gcm_encrypt: ++ ++ stdu 1,-512(1) ++ mflr 0 ++ ++ std 14,112(1) ++ std 15,120(1) ++ std 16,128(1) ++ std 17,136(1) ++ std 18,144(1) ++ std 19,152(1) ++ std 20,160(1) ++ std 21,168(1) ++ li 9, 256 ++ stvx 20, 9, 1 ++ addi 9, 9, 16 ++ stvx 21, 9, 1 ++ addi 9, 9, 16 ++ stvx 22, 9, 1 ++ addi 9, 9, 16 ++ stvx 23, 9, 1 ++ addi 9, 9, 16 ++ stvx 24, 9, 1 ++ addi 9, 9, 16 ++ stvx 25, 9, 1 ++ addi 9, 9, 16 ++ stvx 26, 9, 1 ++ addi 9, 9, 16 ++ stvx 27, 9, 1 ++ addi 9, 9, 16 ++ stvx 28, 9, 1 ++ addi 9, 9, 16 ++ stvx 29, 9, 1 ++ addi 9, 9, 16 ++ stvx 30, 9, 1 ++ addi 9, 9, 16 ++ stvx 31, 9, 1 ++ std 0, 528(1) ++ ++ # Load Xi ++ lxvb16x 32, 0, 8 # load Xi ++ ++ # load Hash - h^4, h^3, h^2, h ++ li 10, 32 ++ lxvd2x 2+32, 10, 8 # H Poli ++ li 10, 48 ++ lxvd2x 3+32, 10, 8 # Hl ++ li 10, 64 ++ lxvd2x 4+32, 10, 8 # H ++ li 10, 80 ++ lxvd2x 5+32, 10, 8 # Hh ++ ++ li 10, 96 ++ lxvd2x 6+32, 10, 8 # H^2l ++ li 10, 112 ++ lxvd2x 7+32, 10, 8 # H^2 ++ li 10, 128 ++ lxvd2x 8+32, 10, 8 # H^2h ++ ++ li 10, 144 ++ lxvd2x 9+32, 10, 8 # H^3l ++ li 10, 160 ++ lxvd2x 10+32, 10, 8 # H^3 ++ li 10, 176 ++ lxvd2x 11+32, 10, 8 # H^3h ++ ++ li 10, 192 ++ lxvd2x 12+32, 10, 8 # H^4l ++ li 10, 208 ++ lxvd2x 13+32, 10, 8 # H^4 ++ li 10, 224 ++ lxvd2x 14+32, 10, 8 # H^4h ++ ++ # initialize ICB: GHASH( IV ), IV - r7 ++ lxvb16x 30+32, 0, 7 # load IV - v30 ++ ++ mr 12, 5 # length ++ li 11, 0 # block index ++ ++ # counter 1 ++ vxor 31, 31, 31 ++ vspltisb 22, 1 ++ vsldoi 31, 31, 22,1 # counter 1 ++ ++ # load round key to VSR ++ lxv 0, 0(6) ++ lxv 1, 0x10(6) ++ lxv 2, 0x20(6) ++ lxv 3, 0x30(6) ++ lxv 4, 0x40(6) ++ lxv 5, 0x50(6) ++ lxv 6, 0x60(6) ++ lxv 7, 0x70(6) ++ lxv 8, 0x80(6) ++ lxv 9, 0x90(6) ++ lxv 10, 0xa0(6) ++ ++ # load rounds - 10 (128), 12 (192), 14 (256) ++ lwz 9,240(6) ++ ++ # ++ # vxor state, state, w # addroundkey ++ xxlor 32+29, 0, 0 ++ vxor 15, 30, 29 # IV + round key - add round key 0 ++ ++ cmpdi 9, 10 ++ beq Loop_aes_gcm_8x ++ ++ # load 2 more round keys (v11, v12) ++ lxv 11, 0xb0(6) ++ lxv 12, 0xc0(6) ++ ++ cmpdi 9, 12 ++ beq Loop_aes_gcm_8x ++ ++ # load 2 more round keys (v11, v12, v13, v14) ++ lxv 13, 0xd0(6) ++ lxv 14, 0xe0(6) ++ cmpdi 9, 14 ++ beq Loop_aes_gcm_8x ++ ++ b aes_gcm_out ++ ++.align 5 ++Loop_aes_gcm_8x: ++ mr 14, 3 ++ mr 9, 4 ++ ++ # n blcoks ++ li 10, 128 ++ divdu 10, 5, 10 # n 128 bytes-blocks ++ cmpdi 10, 0 ++ beq Loop_last_block ++ ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 16, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 29 ++ ++ mtctr 10 ++ ++ li 15, 16 ++ li 16, 32 ++ li 17, 48 
++ li 18, 64 ++ li 19, 80 ++ li 20, 96 ++ li 21, 112 ++ ++ lwz 10, 240(6) ++ ++Loop_8x_block: ++ ++ lxvb16x 15, 0, 14 # load block ++ lxvb16x 16, 15, 14 # load block ++ lxvb16x 17, 16, 14 # load block ++ lxvb16x 18, 17, 14 # load block ++ lxvb16x 19, 18, 14 # load block ++ lxvb16x 20, 19, 14 # load block ++ lxvb16x 21, 20, 14 # load block ++ lxvb16x 22, 21, 14 # load block ++ addi 14, 14, 128 ++ ++ Loop_aes_middle8x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_next_ghash ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_next_ghash ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_next_ghash ++ b aes_gcm_out ++ ++Do_next_ghash: ++ ++ # ++ # last round ++ vcipherlast 15, 15, 23 ++ vcipherlast 16, 16, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ xxlxor 48, 48, 16 ++ stxvb16x 48, 15, 9 # store output ++ ++ vcipherlast 17, 17, 23 ++ vcipherlast 18, 18, 23 ++ ++ xxlxor 49, 49, 17 ++ stxvb16x 49, 16, 9 # store output ++ xxlxor 50, 50, 18 ++ stxvb16x 50, 17, 9 # store output ++ ++ vcipherlast 19, 19, 23 ++ vcipherlast 20, 20, 23 ++ ++ xxlxor 51, 51, 19 ++ stxvb16x 51, 18, 9 # store output ++ xxlxor 52, 52, 20 ++ stxvb16x 52, 19, 9 # store output ++ ++ vcipherlast 21, 21, 23 ++ vcipherlast 22, 22, 23 ++ ++ xxlxor 53, 53, 21 ++ stxvb16x 53, 20, 9 # store output ++ xxlxor 54, 54, 22 ++ stxvb16x 54, 21, 9 # store output ++ ++ addi 9, 9, 128 ++ ++ # ghash here ++ ppc_aes_gcm_ghash2_4x ++ ++ xxlor 27+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vmr 29, 30 ++ vxor 15, 30, 27 # add round key ++ vaddudm 30, 30, 31 ++ vxor 16, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 27 ++ ++ addi 12, 12, -128 ++ addi 11, 11, 128 ++ ++ bdnz Loop_8x_block ++ ++ vmr 30, 29 ++ ++Loop_last_block: ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++ # loop last few blocks ++ li 10, 16 ++ divdu 10, 12, 10 ++ ++ mtctr 10 ++ ++ lwz 10, 240(6) ++ ++ cmpdi 12, 16 ++ blt Final_block ++ ++.macro Loop_aes_middle_1x ++ xxlor 19+32, 1, 1 ++ xxlor 20+32, 2, 2 ++ xxlor 21+32, 3, 3 ++ xxlor 22+32, 4, 4 ++ ++ vcipher 15, 15, 19 ++ vcipher 15, 15, 20 ++ vcipher 15, 15, 21 ++ vcipher 15, 15, 22 ++ ++ xxlor 19+32, 5, 5 ++ xxlor 20+32, 6, 6 ++ xxlor 21+32, 7, 7 ++ xxlor 22+32, 8, 8 ++ ++ vcipher 15, 15, 19 ++ vcipher 15, 15, 20 ++ vcipher 15, 15, 21 ++ vcipher 15, 15, 22 ++ ++ xxlor 19+32, 9, 9 ++ vcipher 15, 15, 19 ++.endm ++ ++Next_rem_block: ++ lxvb16x 15, 0, 14 # load block ++ ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_next_1x ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 
++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_next_1x ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_next_1x ++ ++Do_next_1x: ++ vcipherlast 15, 15, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ addi 14, 14, 16 ++ addi 9, 9, 16 ++ ++ vmr 28, 15 ++ ppc_update_hash_1x ++ ++ addi 12, 12, -16 ++ addi 11, 11, 16 ++ xxlor 19+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 15, 30, 19 # add round key ++ ++ bdnz Next_rem_block ++ ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++Final_block: ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_final_1x ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_final_1x ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_final_1x ++ ++Do_final_1x: ++ vcipherlast 15, 15, 23 ++ ++ lxvb16x 15, 0, 14 # load last block ++ xxlxor 47, 47, 15 ++ ++ # create partial block mask ++ li 15, 16 ++ sub 15, 15, 12 # index to the mask ++ ++ vspltisb 16, -1 # first 16 bytes - 0xffff...ff ++ vspltisb 17, 0 # second 16 bytes - 0x0000...00 ++ li 10, 192 ++ stvx 16, 10, 1 ++ addi 10, 10, 16 ++ stvx 17, 10, 1 ++ ++ addi 10, 1, 192 ++ lxvb16x 16, 15, 10 # load partial block mask ++ xxland 47, 47, 16 ++ ++ vmr 28, 15 ++ ppc_update_hash_1x ++ ++ # * should store only the remaining bytes. ++ bl Write_partial_block ++ ++ b aes_gcm_out ++ ++# ++# Write partial block ++# r9 - output ++# r12 - remaining bytes ++# v15 - partial input data ++# ++Write_partial_block: ++ li 10, 192 ++ stxvb16x 15+32, 10, 1 # last block ++ ++ #add 10, 9, 11 # Output ++ addi 10, 9, -1 ++ addi 16, 1, 191 ++ ++ mtctr 12 # remaining bytes ++ li 15, 0 ++ ++Write_last_byte: ++ lbzu 14, 1(16) ++ stbu 14, 1(10) ++ bdnz Write_last_byte ++ blr ++ ++aes_gcm_out: ++ # out = state ++ stxvb16x 32, 0, 8 # write out Xi ++ add 3, 11, 12 # return count ++ ++ li 9, 256 ++ lvx 20, 9, 1 ++ addi 9, 9, 16 ++ lvx 21, 9, 1 ++ addi 9, 9, 16 ++ lvx 22, 9, 1 ++ addi 9, 9, 16 ++ lvx 23, 9, 1 ++ addi 9, 9, 16 ++ lvx 24, 9, 1 ++ addi 9, 9, 16 ++ lvx 25, 9, 1 ++ addi 9, 9, 16 ++ lvx 26, 9, 1 ++ addi 9, 9, 16 ++ lvx 27, 9, 1 ++ addi 9, 9, 16 ++ lvx 28, 9, 1 ++ addi 9, 9, 16 ++ lvx 29, 9, 1 ++ addi 9, 9, 16 ++ lvx 30, 9, 1 ++ addi 9, 9, 16 ++ lvx 31, 9, 1 ++ ++ ld 0, 528(1) ++ ld 14,112(1) ++ ld 15,120(1) ++ ld 16,128(1) ++ ld 17,136(1) ++ ld 18,144(1) ++ ld 19,152(1) ++ ld 20,160(1) ++ ld 21,168(1) ++ ++ mtlr 0 ++ addi 1, 1, 512 ++ blr ++ ++# ++# 8x Decrypt ++# ++.global ppc_aes_gcm_decrypt ++.align 5 ++ppc_aes_gcm_decrypt: ++_ppc_aes_gcm_decrypt: ++ ++ stdu 1,-512(1) ++ mflr 0 ++ ++ std 14,112(1) ++ std 15,120(1) ++ std 16,128(1) ++ std 17,136(1) ++ std 18,144(1) ++ std 19,152(1) ++ std 20,160(1) ++ std 21,168(1) ++ li 9, 256 ++ stvx 20, 9, 1 ++ addi 9, 9, 16 ++ stvx 21, 9, 1 ++ addi 9, 9, 16 ++ stvx 22, 9, 1 ++ addi 9, 9, 16 ++ stvx 23, 9, 1 ++ addi 9, 9, 16 ++ stvx 24, 9, 1 ++ addi 9, 9, 16 ++ stvx 25, 9, 1 ++ addi 9, 9, 16 ++ stvx 26, 9, 1 ++ addi 9, 9, 16 ++ stvx 27, 9, 1 ++ addi 9, 9, 16 ++ stvx 28, 9, 1 ++ addi 9, 9, 16 ++ stvx 29, 9, 1 ++ addi 9, 9, 16 ++ stvx 30, 9, 1 ++ addi 9, 9, 16 ++ stvx 31, 9, 1 ++ std 0, 528(1) ++ ++ # Load Xi ++ lxvb16x 32, 0, 8 # load Xi ++ ++ # load Hash - h^4, h^3, h^2, h ++ li 10, 32 ++ lxvd2x 2+32, 10, 8 # H Poli ++ li 10, 48 ++ lxvd2x 3+32, 10, 8 # Hl ++ li 10, 
64 ++ lxvd2x 4+32, 10, 8 # H ++ li 10, 80 ++ lxvd2x 5+32, 10, 8 # Hh ++ ++ li 10, 96 ++ lxvd2x 6+32, 10, 8 # H^2l ++ li 10, 112 ++ lxvd2x 7+32, 10, 8 # H^2 ++ li 10, 128 ++ lxvd2x 8+32, 10, 8 # H^2h ++ ++ li 10, 144 ++ lxvd2x 9+32, 10, 8 # H^3l ++ li 10, 160 ++ lxvd2x 10+32, 10, 8 # H^3 ++ li 10, 176 ++ lxvd2x 11+32, 10, 8 # H^3h ++ ++ li 10, 192 ++ lxvd2x 12+32, 10, 8 # H^4l ++ li 10, 208 ++ lxvd2x 13+32, 10, 8 # H^4 ++ li 10, 224 ++ lxvd2x 14+32, 10, 8 # H^4h ++ ++ # initialize ICB: GHASH( IV ), IV - r7 ++ lxvb16x 30+32, 0, 7 # load IV - v30 ++ ++ mr 12, 5 # length ++ li 11, 0 # block index ++ ++ # counter 1 ++ vxor 31, 31, 31 ++ vspltisb 22, 1 ++ vsldoi 31, 31, 22,1 # counter 1 ++ ++ # load round key to VSR ++ lxv 0, 0(6) ++ lxv 1, 0x10(6) ++ lxv 2, 0x20(6) ++ lxv 3, 0x30(6) ++ lxv 4, 0x40(6) ++ lxv 5, 0x50(6) ++ lxv 6, 0x60(6) ++ lxv 7, 0x70(6) ++ lxv 8, 0x80(6) ++ lxv 9, 0x90(6) ++ lxv 10, 0xa0(6) ++ ++ # load rounds - 10 (128), 12 (192), 14 (256) ++ lwz 9,240(6) ++ ++ # ++ # vxor state, state, w # addroundkey ++ xxlor 32+29, 0, 0 ++ vxor 15, 30, 29 # IV + round key - add round key 0 ++ ++ cmpdi 9, 10 ++ beq Loop_aes_gcm_8x_dec ++ ++ # load 2 more round keys (v11, v12) ++ lxv 11, 0xb0(6) ++ lxv 12, 0xc0(6) ++ ++ cmpdi 9, 12 ++ beq Loop_aes_gcm_8x_dec ++ ++ # load 2 more round keys (v11, v12, v13, v14) ++ lxv 13, 0xd0(6) ++ lxv 14, 0xe0(6) ++ cmpdi 9, 14 ++ beq Loop_aes_gcm_8x_dec ++ ++ b aes_gcm_out ++ ++.align 5 ++Loop_aes_gcm_8x_dec: ++ mr 14, 3 ++ mr 9, 4 ++ ++ # n blcoks ++ li 10, 128 ++ divdu 10, 5, 10 # n 128 bytes-blocks ++ cmpdi 10, 0 ++ beq Loop_last_block_dec ++ ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 16, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 29 ++ ++ mtctr 10 ++ ++ li 15, 16 ++ li 16, 32 ++ li 17, 48 ++ li 18, 64 ++ li 19, 80 ++ li 20, 96 ++ li 21, 112 ++ ++ lwz 10, 240(6) ++ ++Loop_8x_block_dec: ++ ++ lxvb16x 15, 0, 14 # load block ++ lxvb16x 16, 15, 14 # load block ++ lxvb16x 17, 16, 14 # load block ++ lxvb16x 18, 17, 14 # load block ++ lxvb16x 19, 18, 14 # load block ++ lxvb16x 20, 19, 14 # load block ++ lxvb16x 21, 20, 14 # load block ++ lxvb16x 22, 21, 14 # load block ++ addi 14, 14, 128 ++ ++ Loop_aes_middle8x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_last_aes_dec ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_last_aes_dec ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_last_aes_dec ++ b aes_gcm_out ++ ++Do_last_aes_dec: ++ ++ # ++ # last round ++ vcipherlast 15, 15, 23 ++ vcipherlast 16, 16, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store 
output ++ xxlxor 48, 48, 16 ++ stxvb16x 48, 15, 9 # store output ++ ++ vcipherlast 17, 17, 23 ++ vcipherlast 18, 18, 23 ++ ++ xxlxor 49, 49, 17 ++ stxvb16x 49, 16, 9 # store output ++ xxlxor 50, 50, 18 ++ stxvb16x 50, 17, 9 # store output ++ ++ vcipherlast 19, 19, 23 ++ vcipherlast 20, 20, 23 ++ ++ xxlxor 51, 51, 19 ++ stxvb16x 51, 18, 9 # store output ++ xxlxor 52, 52, 20 ++ stxvb16x 52, 19, 9 # store output ++ ++ vcipherlast 21, 21, 23 ++ vcipherlast 22, 22, 23 ++ ++ xxlxor 53, 53, 21 ++ stxvb16x 53, 20, 9 # store output ++ xxlxor 54, 54, 22 ++ stxvb16x 54, 21, 9 # store output ++ ++ addi 9, 9, 128 ++ ++ xxlor 15+32, 15, 15 ++ xxlor 16+32, 16, 16 ++ xxlor 17+32, 17, 17 ++ xxlor 18+32, 18, 18 ++ xxlor 19+32, 19, 19 ++ xxlor 20+32, 20, 20 ++ xxlor 21+32, 21, 21 ++ xxlor 22+32, 22, 22 ++ ++ # ghash here ++ ppc_aes_gcm_ghash2_4x ++ ++ xxlor 27+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vmr 29, 30 ++ vxor 15, 30, 27 # add round key ++ vaddudm 30, 30, 31 ++ vxor 16, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 27 ++ addi 12, 12, -128 ++ addi 11, 11, 128 ++ ++ bdnz Loop_8x_block_dec ++ ++ vmr 30, 29 ++ ++Loop_last_block_dec: ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++ # loop last few blocks ++ li 10, 16 ++ divdu 10, 12, 10 ++ ++ mtctr 10 ++ ++ lwz 10,240(6) ++ ++ cmpdi 12, 16 ++ blt Final_block_dec ++ ++Next_rem_block_dec: ++ lxvb16x 15, 0, 14 # load block ++ ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_next_1x_dec ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_next_1x_dec ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_next_1x_dec ++ ++Do_next_1x_dec: ++ vcipherlast 15, 15, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ addi 14, 14, 16 ++ addi 9, 9, 16 ++ ++ xxlor 28+32, 15, 15 ++ ppc_update_hash_1x ++ ++ addi 12, 12, -16 ++ addi 11, 11, 16 ++ xxlor 19+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 15, 30, 19 # add round key ++ ++ bdnz Next_rem_block_dec ++ ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++Final_block_dec: ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_final_1x_dec ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_final_1x_dec ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_final_1x_dec ++ ++Do_final_1x_dec: ++ vcipherlast 15, 15, 23 ++ ++ lxvb16x 15, 0, 14 # load block ++ xxlxor 47, 47, 15 ++ ++ # create partial block mask ++ li 15, 16 ++ sub 15, 15, 12 # index to the mask ++ ++ vspltisb 16, -1 # first 16 bytes - 0xffff...ff ++ vspltisb 17, 0 # second 16 bytes - 0x0000...00 ++ li 10, 192 ++ stvx 16, 10, 1 ++ addi 10, 10, 16 ++ stvx 17, 10, 1 ++ ++ addi 10, 1, 192 ++ lxvb16x 16, 15, 10 # load block mask ++ xxland 47, 47, 16 ++ ++ xxlor 28+32, 15, 15 ++ ppc_update_hash_1x ++ ++ # * should store only the remaining bytes. 
++ bl Write_partial_block ++ ++ b aes_gcm_out ++ ++ ++___ ++ ++foreach (split("\n",$code)) { ++ s/\`([^\`]*)\`/eval $1/geo; ++ ++ if ($flavour =~ /le$/o) { # little-endian ++ s/le\?//o or ++ s/be\?/#be#/o; ++ } else { ++ s/le\?/#le#/o or ++ s/be\?//o; ++ } ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; # enforce flush +--- a/crypto/modes/build.info ++++ b/crypto/modes/build.info +@@ -16,6 +16,7 @@ INCLUDE[ghash-sparcv9.o]=.. + GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl $(PERLASM_SCHEME) + GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl $(PERLASM_SCHEME) + GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl $(PERLASM_SCHEME) ++GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl $(PERLASM_SCHEME) + GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl $(PERLASM_SCHEME) + INCLUDE[ghash-armv4.o]=.. + GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME) diff --git a/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch b/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch new file mode 100644 index 0000000..bba48b6 --- /dev/null +++ b/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch @@ -0,0 +1,230 @@ +From 9ab6b64ac856157a31a54c0d12207c2338bfa8e2 Mon Sep 17 00:00:00 2001 +From: Tomas Mraz +Date: Fri, 9 Sep 2022 14:46:24 +0200 +Subject: [PATCH] Fix AES-GCM on Power 8 CPUs + +Properly fallback to the default implementation on CPUs +missing necessary instructions. + +Fixes #19163 + +Reviewed-by: Dmitry Belyavskiy +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/19182) +--- + crypto/evp/e_aes.c | 179 +++++++++++++++++++++++++++++++---------------------- + 1 file changed, 107 insertions(+), 72 deletions(-) + +--- a/crypto/evp/e_aes.c ++++ b/crypto/evp/e_aes.c +@@ -181,30 +181,16 @@ static void ctr64_inc(unsigned char *cou + # define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300) + # define AES_GCM_ENC_BYTES 128 + # define AES_GCM_DEC_BYTES 128 +-# if PPC_AES_GCM_CAPABLE + size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, unsigned char ivec[16], + u64 *Xi); + size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, unsigned char ivec[16], + u64 *Xi); +-size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, +- size_t len, const void *key, +- unsigned char ivec[16], u64 *Xi); +-size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, +- size_t len, const void *key, +- unsigned char ivec[16], u64 *Xi); +-# define AES_gcm_encrypt ppc_aes_gcm_encrypt_wrap +-# define AES_gcm_decrypt ppc_aes_gcm_decrypt_wrap +-# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \ +- (gctx)->gcm.ghash==gcm_ghash_p8) ++# define AES_GCM_ASM_PPC(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \ ++ (gctx)->gcm.ghash==gcm_ghash_p8) + void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); + +-extern size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi); +-extern size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi); +- + static inline u32 UTO32(unsigned char *buf) + { + return ((u32) buf[0] << 24) | ((u32) buf[1] << 16) | ((u32) buf[2] << 8) | ((u32) buf[3]); +@@ -223,62 +209,6 @@ static inline u32 add32TOU(unsigned char + return r; + } + +-static size_t aes_p10_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char 
ivec[16], u64 *Xi, int encrypt) +-{ +- int s = 0; +- int ndone = 0; +- int ctr_reset = 0; +- u64 blocks_unused; +- u64 nb = len / 16; +- u64 next_ctr = 0; +- unsigned char ctr_saved[12]; +- +- memcpy(ctr_saved, ivec, 12); +- +- while (nb) { +- blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12); +- if (nb > blocks_unused) { +- len = blocks_unused * 16; +- nb -= blocks_unused; +- next_ctr = blocks_unused; +- ctr_reset = 1; +- } else { +- len = nb * 16; +- next_ctr = nb; +- nb = 0; +- } +- +- s = encrypt ? ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi) +- : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi); +- +- /* add counter to ivec */ +- add32TOU(ivec + 12, (u32) next_ctr); +- if (ctr_reset) { +- ctr_reset = 0; +- in += len; +- out += len; +- } +- memcpy(ivec, ctr_saved, 12); +- ndone += s; +- } +- +- return ndone; +-} +- +-size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi) +-{ +- return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 1); +-} +- +-size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi) +-{ +- return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 0); +-} +- +-# endif + #endif + + #if defined(OPENSSL_CPUID_OBJ) && ( \ +@@ -3294,9 +3224,114 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + return rv; + } + ++static size_t ppc_aes_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi, int encrypt) ++{ ++ int s = 0; ++ int ndone = 0; ++ int ctr_reset = 0; ++ u64 blocks_unused; ++ u64 nb = len / 16; ++ u64 next_ctr = 0; ++ unsigned char ctr_saved[12]; ++ ++ memcpy(ctr_saved, ivec, 12); ++ ++ while (nb) { ++ blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12); ++ if (nb > blocks_unused) { ++ len = blocks_unused * 16; ++ nb -= blocks_unused; ++ next_ctr = blocks_unused; ++ ctr_reset = 1; ++ } else { ++ len = nb * 16; ++ next_ctr = nb; ++ nb = 0; ++ } ++ ++ s = encrypt ? 
ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi) ++ : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi); ++ ++ /* add counter to ivec */ ++ add32TOU(ivec + 12, (u32) next_ctr); ++ if (ctr_reset) { ++ ctr_reset = 0; ++ in += len; ++ out += len; ++ } ++ memcpy(ivec, ctr_saved, 12); ++ ndone += s; ++ } ++ ++ return ndone; ++} ++ ++#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) ++static int ppc_aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) ++{ ++ EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,ctx); ++ if (ctx->encrypt) { ++ if (gctx->ctr != NULL) { ++ size_t bulk = 0; ++ ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM_PPC(gctx)) { ++ size_t res = (16 - gctx->gcm.mres) % 16; ++ ++ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) ++ return 0; ++ ++ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, ++ gctx->gcm.key, ++ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 1); ++ ++ gctx->gcm.len.u[1] += bulk; ++ bulk += res; ++ } ++ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, in + bulk, out + bulk, ++ len - bulk, gctx->ctr)) ++ return 0; ++ } else { ++ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len)) ++ return 0; ++ } ++ } else { ++ if (gctx->ctr != NULL) { ++ size_t bulk = 0; ++ ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM_PPC(gctx)) { ++ size_t res = (16 - gctx->gcm.mres) % 16; ++ ++ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) ++ return -1; ++ ++ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, ++ gctx->gcm.key, ++ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 0); ++ ++ gctx->gcm.len.u[1] += bulk; ++ bulk += res; ++ } ++ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, in + bulk, out + bulk, ++ len - bulk, gctx->ctr)) ++ return 0; ++ } else { ++ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len)) ++ return 0; ++ } ++ } ++ return 1; ++} ++#endif ++ + static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { ++#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) ++ if (PPC_AES_GCM_CAPABLE) ++ return ppc_aes_gcm_cipher(ctx, out, in, len); ++#endif + EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,ctx); + /* If not set up, return error */ + if (!gctx->key_set) diff --git a/openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch b/openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch new file mode 100644 index 0000000..a727083 --- /dev/null +++ b/openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch @@ -0,0 +1,103 @@ +From 7e1f3ffcc5bc15fb9a12b9e3bb202f544c6ed5aa Mon Sep 17 00:00:00 2001 +From: Danny Tsen +Date: Wed, 23 Feb 2022 13:18:35 -0600 +Subject: [PATCH] Fixed conditional statement testing 64 and 256 bytes + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/17760) +--- + crypto/chacha/asm/chachap10-ppc.pl | 68 ------------------------------------- + 1 file changed, 1 insertion(+), 67 deletions(-) + +--- a/crypto/chacha/asm/chachap10-ppc.pl ++++ b/crypto/chacha/asm/chachap10-ppc.pl +@@ -101,29 +101,6 @@ my ($x00,$x10,$x20,$x30) = (0, map("r$_" + + my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload + +-sub VSX_lane_ROUND_1x { +-my $a=@_[0]; +-my $b=@_[1]; +-my $c=@_[2]; +-my $d=@_[3]; +-my $odd=@_[4]; +- vadduwm ($a,$a,$b); +- vxor ($d,$d,$a); +- vrlw ($d,$d,$sixteen); +- vadduwm ($c,$c,$d); +- vxor ($b,$b,$c); +- vrlw ($b,$b,$twelve); +- vadduwm ($a,$a,$b); +- vxor ($d,$d,$a); +- vrlw 
($d,$d,$eight); +- vadduwm ($c,$c,$d); +- vxor ($b,$b,$c); +- vrlw ($b,$b,$seven); +- xxsldwi ($c,$c,$c,2); +- xxsldwi ($b,$b,$b,$odd?3:1); +- xxsldwi ($d,$d,$d,$odd?1:3); +-} +- + + sub VSX_lane_ROUND_4x { + my ($a0,$b0,$c0,$d0)=@_; +@@ -192,7 +169,7 @@ $code.=<<___; + .globl .ChaCha20_ctr32_vsx_p10 + .align 5 + .ChaCha20_ctr32_vsx_p10: +- ${UCMP}i $len,256 ++ ${UCMP}i $len,255 + bgt ChaCha20_ctr32_vsx_8x + $STU $sp,-$FRAME($sp) + mflr r0 +@@ -268,49 +245,6 @@ Loop_outer_vsx: + vspltisw $eight,8 + vspltisw $seven,7 + +- ${UCMP}i $len,64 +- bgt Loop_vsx_4x +- +- vmr $xa0,@K[0] +- vmr $xb0,@K[1] +- vmr $xc0,@K[2] +- vmr $xd0,@K[3] +- +-Loop_vsx_1x: +-___ +- VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,0); +- VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,1); +- +-$code.=<<___; +- +- bdnz Loop_vsx_1x +- +- vadduwm $xa0, $xa0, @K[0] +- vadduwm $xb0, $xb0, @K[1] +- vadduwm $xc0, $xc0, @K[2] +- vadduwm $xd0, $xd0, @K[3] +- ${UCMP}i $len,0x40 +- blt Ltail_vsx +- +- lvx_4w $xt0,$x00, $inp +- lvx_4w $xt1,$x10, $inp +- lvx_4w $xt2,$x20, $inp +- lvx_4w $xt3,$x30, $inp +- +- vxor $xa0,$xa0,$xt0 +- vxor $xb0,$xb0,$xt1 +- vxor $xc0,$xc0,$xt2 +- vxor $xd0,$xd0,$xt3 +- +- stvx_4w $xa0,$x00,$out +- stvx_4w $xb0,$x10,$out +- addi $inp,$inp,0x40 +- stvx_4w $xc0,$x20,$out +- subi $len,$len,0x40 +- stvx_4w $xd0,$x30,$out +- addi $out,$out,0x40 +- beq Ldone_vsx +- + Loop_vsx_4x: + ___ + foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; } diff --git a/openssl-1_1-Fixed-counter-overflow.patch b/openssl-1_1-Fixed-counter-overflow.patch new file mode 100644 index 0000000..40d8213 --- /dev/null +++ b/openssl-1_1-Fixed-counter-overflow.patch @@ -0,0 +1,136 @@ +From 345c99b6654b8313c792d54f829943068911ddbd Mon Sep 17 00:00:00 2001 +From: Danny Tsen +Date: Thu, 27 Jan 2022 18:49:59 -0600 +Subject: [PATCH] Fixed counter overflow + +Reviewed-by: Tomas Mraz +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/17607) +--- + crypto/evp/e_aes.c | 101 +++++++++++++++++++++++++++++++++++++--- + crypto/modes/asm/aes-gcm-ppc.pl | 1 + 2 files changed, 94 insertions(+), 8 deletions(-) + +--- a/crypto/evp/e_aes.c ++++ b/crypto/evp/e_aes.c +@@ -181,16 +181,103 @@ static void ctr64_inc(unsigned char *cou + # define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300) + # define AES_GCM_ENC_BYTES 128 + # define AES_GCM_DEC_BYTES 128 +-size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi); +-size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi); +-void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); + # if PPC_AES_GCM_CAPABLE +-# define AES_gcm_encrypt ppc_aes_gcm_encrypt +-# define AES_gcm_decrypt ppc_aes_gcm_decrypt ++size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, unsigned char ivec[16], ++ u64 *Xi); ++size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, unsigned char ivec[16], ++ u64 *Xi); ++size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ unsigned char ivec[16], u64 *Xi); ++size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ unsigned char ivec[16], u64 *Xi); ++# define AES_gcm_encrypt ppc_aes_gcm_encrypt_wrap ++# define AES_gcm_decrypt ppc_aes_gcm_decrypt_wrap + # define AES_GCM_ASM(gctx) 
((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \ + (gctx)->gcm.ghash==gcm_ghash_p8) ++void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); ++ ++extern size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++extern size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++ ++static inline u32 UTO32(unsigned char *buf) ++{ ++ return ((u32) buf[0] << 24) | ((u32) buf[1] << 16) | ((u32) buf[2] << 8) | ((u32) buf[3]); ++} ++ ++static inline u32 add32TOU(unsigned char buf[4], u32 n) ++{ ++ u32 r; ++ ++ r = UTO32(buf); ++ r += n; ++ buf[0] = (unsigned char) (r >> 24) & 0xFF; ++ buf[1] = (unsigned char) (r >> 16) & 0xFF; ++ buf[2] = (unsigned char) (r >> 8) & 0xFF; ++ buf[3] = (unsigned char) r & 0xFF; ++ return r; ++} ++ ++static size_t aes_p10_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi, int encrypt) ++{ ++ int s = 0; ++ int ndone = 0; ++ int ctr_reset = 0; ++ u64 blocks_unused; ++ u64 nb = len / 16; ++ u64 next_ctr = 0; ++ unsigned char ctr_saved[12]; ++ ++ memcpy(ctr_saved, ivec, 12); ++ ++ while (nb) { ++ blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12); ++ if (nb > blocks_unused) { ++ len = blocks_unused * 16; ++ nb -= blocks_unused; ++ next_ctr = blocks_unused; ++ ctr_reset = 1; ++ } else { ++ len = nb * 16; ++ next_ctr = nb; ++ nb = 0; ++ } ++ ++ s = encrypt ? ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi) ++ : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi); ++ ++ /* add counter to ivec */ ++ add32TOU(ivec + 12, (u32) next_ctr); ++ if (ctr_reset) { ++ ctr_reset = 0; ++ in += len; ++ out += len; ++ } ++ memcpy(ivec, ctr_saved, 12); ++ ndone += s; ++ } ++ ++ return ndone; ++} ++ ++size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi) ++{ ++ return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 1); ++} ++ ++size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi) ++{ ++ return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 0); ++} ++ + # endif + #endif + +--- a/crypto/modes/asm/aes-gcm-ppc.pl ++++ b/crypto/modes/asm/aes-gcm-ppc.pl +@@ -81,7 +81,6 @@ open STDOUT,"| $^X $xlate $flavour \"$ou + + $code=<<___; + .machine "any" +-.abiversion 2 + .text + + # 4x loops diff --git a/openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch b/openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch new file mode 100644 index 0000000..c28231c --- /dev/null +++ b/openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch @@ -0,0 +1,1535 @@ +From f596bbe4da779b56eea34d96168b557d78e1149a Mon Sep 17 00:00:00 2001 +From: Deepankar Bhattacharjee +Date: Mon, 20 Sep 2021 10:45:15 -0400 +Subject: [PATCH] chacha20 performance optimizations for ppc64le with 8x lanes, + Performance increase around 50%. 
+ +Co-authored-by: Madhusudhanan Duraisamy + +Co-authored-by: Nilamjyoti Goswami + +Co-authored-by: Siva Sundar Anbareeswaran + +Reviewed-by: Danny Tsen +Tested-by: Danny Tsen +Signed-off-by: Danny + +Reviewed-by: Tomas Mraz +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/16637) +--- + Configurations/00-base-templates.conf | 2 + crypto/chacha/asm/chachap10-ppc.pl | 1354 ++++++++++++++++++++++++++++++++++ + crypto/chacha/build.info | 1 + crypto/perlasm/ppc-xlate.pl | 17 + crypto/ppc_arch.h | 1 + crypto/ppccap.c | 24 + crypto/ppccpuid.pl | 11 + 7 files changed, 1404 insertions(+), 6 deletions(-) + create mode 100755 crypto/chacha/asm/chachap10-ppc.pl + +--- a/Configurations/00-base-templates.conf ++++ b/Configurations/00-base-templates.conf +@@ -345,7 +345,7 @@ my %targets=( + aes_asm_src => "aes_core.c aes_cbc.c aes-ppc.s vpaes-ppc.s aesp8-ppc.s", + sha1_asm_src => "sha1-ppc.s sha256-ppc.s sha512-ppc.s sha256p8-ppc.s sha512p8-ppc.s", + modes_asm_src => "ghashp8-ppc.s aes-gcm-ppc.s", +- chacha_asm_src => "chacha-ppc.s", ++ chacha_asm_src => "chacha-ppc.s chachap10-ppc.s", + poly1305_asm_src=> "poly1305-ppc.s poly1305-ppcfp.s", + }, + ppc64_asm => { +--- /dev/null ++++ b/crypto/chacha/asm/chachap10-ppc.pl +@@ -0,0 +1,1354 @@ ++#! /usr/bin/env perl ++# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# ==================================================================== ++# Written by Andy Polyakov for the OpenSSL ++# project. The module is, however, dual licensed under OpenSSL and ++# CRYPTOGAMS licenses depending on where you obtain it. For further ++# details see http://www.openssl.org/~appro/cryptogams/. ++# ==================================================================== ++# ++# October 2015 ++# ++# ChaCha20 for PowerPC/AltiVec. ++# ++# June 2018 ++# ++# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for ++# processors that can't issue more than one vector instruction per ++# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x ++# interleave would perform better. Incidentally PowerISA 2.07 (first ++# implemented by POWER8) defined new usable instructions, hence 4xVSX ++# code path... ++# ++# Performance in cycles per byte out of large buffer. ++# ++# IALU/gcc-4.x 3xAltiVec+1xIALU 4xVSX ++# ++# Freescale e300 13.6/+115% - - ++# PPC74x0/G4e 6.81/+310% 3.81 - ++# PPC970/G5 9.29/+160% ? - ++# POWER7 8.62/+61% 3.35 - ++# POWER8 8.70/+51% 2.91 2.09 ++# POWER9 8.80/+29% 4.44(*) 2.45(**) ++# ++# (*) this is trade-off result, it's possible to improve it, but ++# then it would negatively affect all others; ++# (**) POWER9 seems to be "allergic" to mixing vector and integer ++# instructions, which is why switch to vector-only code pays ++# off that much; ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++if ($flavour =~ /64/) { ++ $SIZE_T =8; ++ $LRSAVE =2*$SIZE_T; ++ $STU ="stdu"; ++ $POP ="ld"; ++ $PUSH ="std"; ++ $UCMP ="cmpld"; ++} elsif ($flavour =~ /32/) { ++ $SIZE_T =4; ++ $LRSAVE =$SIZE_T; ++ $STU ="stwu"; ++ $POP ="lwz"; ++ $PUSH ="stw"; ++ $UCMP ="cmplw"; ++} else { die "nonsense $flavour"; } ++ ++$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open STDOUT,"| $^X $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++ ++$LOCALS=6*$SIZE_T; ++$FRAME=$LOCALS+64+18*$SIZE_T; # 64 is for local variables ++ ++sub AUTOLOAD() # thunk [simplified] x86-style perlasm ++{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; ++ $code .= "\t$opcode\t".join(',',@_)."\n"; ++} ++ ++my $sp = "r1"; ++ ++my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7)); ++ ++ ++{{{ ++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, ++ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15)); ++my @K = map("v$_",(16..19)); ++my $CTR = "v26"; ++my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30)); ++my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3); ++my $beperm = "v31"; ++ ++my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); ++ ++my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload ++ ++sub VSX_lane_ROUND_1x { ++my $a=@_[0]; ++my $b=@_[1]; ++my $c=@_[2]; ++my $d=@_[3]; ++my $odd=@_[4]; ++ vadduwm ($a,$a,$b); ++ vxor ($d,$d,$a); ++ vrlw ($d,$d,$sixteen); ++ vadduwm ($c,$c,$d); ++ vxor ($b,$b,$c); ++ vrlw ($b,$b,$twelve); ++ vadduwm ($a,$a,$b); ++ vxor ($d,$d,$a); ++ vrlw ($d,$d,$eight); ++ vadduwm ($c,$c,$d); ++ vxor ($b,$b,$c); ++ vrlw ($b,$b,$seven); ++ xxsldwi ($c,$c,$c,2); ++ xxsldwi ($b,$b,$b,$odd?3:1); ++ xxsldwi ($d,$d,$d,$odd?1:3); ++} ++ ++ ++sub VSX_lane_ROUND_4x { ++my ($a0,$b0,$c0,$d0)=@_; ++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); ++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); ++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); ++my @x=map("\"v$_\"",(0..15)); ++ ++ ( ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vrlw (@x[$d0],@x[$d0],'$sixteen')", ++ "&vrlw (@x[$d1],@x[$d1],'$sixteen')", ++ "&vrlw (@x[$d2],@x[$d2],'$sixteen')", ++ "&vrlw (@x[$d3],@x[$d3],'$sixteen')", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vrlw (@x[$b0],@x[$b0],'$twelve')", ++ "&vrlw (@x[$b1],@x[$b1],'$twelve')", ++ "&vrlw (@x[$b2],@x[$b2],'$twelve')", ++ "&vrlw (@x[$b3],@x[$b3],'$twelve')", ++ ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vrlw (@x[$d0],@x[$d0],'$eight')", ++ "&vrlw 
(@x[$d1],@x[$d1],'$eight')", ++ "&vrlw (@x[$d2],@x[$d2],'$eight')", ++ "&vrlw (@x[$d3],@x[$d3],'$eight')", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vrlw (@x[$b0],@x[$b0],'$seven')", ++ "&vrlw (@x[$b1],@x[$b1],'$seven')", ++ "&vrlw (@x[$b2],@x[$b2],'$seven')", ++ "&vrlw (@x[$b3],@x[$b3],'$seven')" ++ ); ++} ++ ++$code.=<<___; ++ ++.globl .ChaCha20_ctr32_vsx_p10 ++.align 5 ++.ChaCha20_ctr32_vsx_p10: ++ ${UCMP}i $len,256 ++ bgt ChaCha20_ctr32_vsx_8x ++ $STU $sp,-$FRAME($sp) ++ mflr r0 ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ mfspr r12,256 ++ stvx v26,r10,$sp ++ addi r10,r10,32 ++ stvx v27,r11,$sp ++ addi r11,r11,32 ++ stvx v28,r10,$sp ++ addi r10,r10,32 ++ stvx v29,r11,$sp ++ addi r11,r11,32 ++ stvx v30,r10,$sp ++ stvx v31,r11,$sp ++ stw r12,`$FRAME-4`($sp) # save vrsave ++ li r12,-4096+63 ++ $PUSH r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # preserve 29 AltiVec registers ++ ++ bl Lconsts # returns pointer Lsigma in r12 ++ lvx_4w @K[0],0,r12 # load sigma ++ addi r12,r12,0x70 ++ li $x10,16 ++ li $x20,32 ++ li $x30,48 ++ li r11,64 ++ ++ lvx_4w @K[1],0,$key # load key ++ lvx_4w @K[2],$x10,$key ++ lvx_4w @K[3],0,$ctr # load counter ++ ++ vxor $xt0,$xt0,$xt0 ++ lvx_4w $xt1,r11,r12 ++ vspltw $CTR,@K[3],0 ++ vsldoi @K[3],@K[3],$xt0,4 ++ vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0] ++ vadduwm $CTR,$CTR,$xt1 ++ ++ be?lvsl $beperm,0,$x10 # 0x00..0f ++ be?vspltisb $xt0,3 # 0x03..03 ++ be?vxor $beperm,$beperm,$xt0 # swap bytes within words ++ ++ li r0,10 # inner loop counter ++ mtctr r0 ++ b Loop_outer_vsx ++ ++.align 5 ++Loop_outer_vsx: ++ lvx $xa0,$x00,r12 # load [smashed] sigma ++ lvx $xa1,$x10,r12 ++ lvx $xa2,$x20,r12 ++ lvx $xa3,$x30,r12 ++ ++ vspltw $xb0,@K[1],0 # smash the key ++ vspltw $xb1,@K[1],1 ++ vspltw $xb2,@K[1],2 ++ vspltw $xb3,@K[1],3 ++ ++ vspltw $xc0,@K[2],0 ++ vspltw $xc1,@K[2],1 ++ vspltw $xc2,@K[2],2 ++ vspltw $xc3,@K[2],3 ++ ++ vmr $xd0,$CTR # smash the counter ++ vspltw $xd1,@K[3],1 ++ vspltw $xd2,@K[3],2 ++ vspltw $xd3,@K[3],3 ++ ++ vspltisw $sixteen,-16 # synthesize constants ++ vspltisw $twelve,12 ++ vspltisw $eight,8 ++ vspltisw $seven,7 ++ ++ ${UCMP}i $len,64 ++ bgt Loop_vsx_4x ++ ++ vmr $xa0,@K[0] ++ vmr $xb0,@K[1] ++ vmr $xc0,@K[2] ++ vmr $xd0,@K[3] ++ ++Loop_vsx_1x: ++___ ++ VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,0); ++ VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,1); ++ ++$code.=<<___; ++ ++ bdnz Loop_vsx_1x ++ ++ vadduwm $xa0, $xa0, @K[0] ++ vadduwm $xb0, $xb0, @K[1] ++ vadduwm $xc0, $xc0, @K[2] ++ vadduwm $xd0, $xd0, @K[3] ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00, $inp ++ lvx_4w $xt1,$x10, $inp ++ lvx_4w $xt2,$x20, $inp ++ lvx_4w $xt3,$x30, $inp ++ ++ vxor $xa0,$xa0,$xt0 ++ vxor $xb0,$xb0,$xt1 ++ vxor $xc0,$xc0,$xt2 ++ vxor $xd0,$xd0,$xt3 ++ ++ stvx_4w $xa0,$x00,$out ++ stvx_4w $xb0,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xc0,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xd0,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++Loop_vsx_4x: ++___ ++ foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; } ++ foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; } ++$code.=<<___; ++ ++ bdnz Loop_vsx_4x ++ ++ vadduwm $xd0,$xd0,$CTR ++ ++ vmrgew $xt0,$xa0,$xa1 # transpose data ++ vmrgew $xt1,$xa2,$xa3 ++ vmrgow $xa0,$xa0,$xa1 ++ vmrgow $xa2,$xa2,$xa3 ++ vmrgew 
$xt2,$xb0,$xb1 ++ vmrgew $xt3,$xb2,$xb3 ++ vpermdi $xa1,$xa0,$xa2,0b00 ++ vpermdi $xa3,$xa0,$xa2,0b11 ++ vpermdi $xa0,$xt0,$xt1,0b00 ++ vpermdi $xa2,$xt0,$xt1,0b11 ++ ++ vmrgow $xb0,$xb0,$xb1 ++ vmrgow $xb2,$xb2,$xb3 ++ vmrgew $xt0,$xc0,$xc1 ++ vmrgew $xt1,$xc2,$xc3 ++ vpermdi $xb1,$xb0,$xb2,0b00 ++ vpermdi $xb3,$xb0,$xb2,0b11 ++ vpermdi $xb0,$xt2,$xt3,0b00 ++ vpermdi $xb2,$xt2,$xt3,0b11 ++ ++ vmrgow $xc0,$xc0,$xc1 ++ vmrgow $xc2,$xc2,$xc3 ++ vmrgew $xt2,$xd0,$xd1 ++ vmrgew $xt3,$xd2,$xd3 ++ vpermdi $xc1,$xc0,$xc2,0b00 ++ vpermdi $xc3,$xc0,$xc2,0b11 ++ vpermdi $xc0,$xt0,$xt1,0b00 ++ vpermdi $xc2,$xt0,$xt1,0b11 ++ ++ vmrgow $xd0,$xd0,$xd1 ++ vmrgow $xd2,$xd2,$xd3 ++ vspltisw $xt0,4 ++ vadduwm $CTR,$CTR,$xt0 # next counter value ++ vpermdi $xd1,$xd0,$xd2,0b00 ++ vpermdi $xd3,$xd0,$xd2,0b11 ++ vpermdi $xd0,$xt2,$xt3,0b00 ++ vpermdi $xd2,$xt2,$xt3,0b11 ++ ++ vadduwm $xa0,$xa0,@K[0] ++ vadduwm $xb0,$xb0,@K[1] ++ vadduwm $xc0,$xc0,@K[2] ++ vadduwm $xd0,$xd0,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++ vadduwm $xa0,$xa1,@K[0] ++ vadduwm $xb0,$xb1,@K[1] ++ vadduwm $xc0,$xc1,@K[2] ++ vadduwm $xd0,$xd1,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++ vadduwm $xa0,$xa2,@K[0] ++ vadduwm $xb0,$xb2,@K[1] ++ vadduwm $xc0,$xc2,@K[2] ++ vadduwm $xd0,$xd2,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++ vadduwm $xa0,$xa3,@K[0] ++ vadduwm $xb0,$xb3,@K[1] ++ vadduwm $xc0,$xc3,@K[2] ++ vadduwm $xd0,$xd3,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w 
$xt3,$x30,$out ++ addi $out,$out,0x40 ++ mtctr r0 ++ bne Loop_outer_vsx ++ ++Ldone_vsx: ++ lwz r12,`$FRAME-4`($sp) # pull vrsave ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ $POP r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # restore vrsave ++ lvx v26,r10,$sp ++ addi r10,r10,32 ++ lvx v27,r11,$sp ++ addi r11,r11,32 ++ lvx v28,r10,$sp ++ addi r10,r10,32 ++ lvx v29,r11,$sp ++ addi r11,r11,32 ++ lvx v30,r10,$sp ++ lvx v31,r11,$sp ++ mtlr r0 ++ addi $sp,$sp,$FRAME ++ blr ++ ++.align 4 ++Ltail_vsx: ++ addi r11,$sp,$LOCALS ++ mtctr $len ++ stvx_4w $xa0,$x00,r11 # offload block to stack ++ stvx_4w $xb0,$x10,r11 ++ stvx_4w $xc0,$x20,r11 ++ stvx_4w $xd0,$x30,r11 ++ subi r12,r11,1 # prepare for *++ptr ++ subi $inp,$inp,1 ++ subi $out,$out,1 ++ ++Loop_tail_vsx: ++ lbzu r6,1(r12) ++ lbzu r7,1($inp) ++ xor r6,r6,r7 ++ stbu r6,1($out) ++ bdnz Loop_tail_vsx ++ ++ stvx_4w $K[0],$x00,r11 # wipe copy of the block ++ stvx_4w $K[0],$x10,r11 ++ stvx_4w $K[0],$x20,r11 ++ stvx_4w $K[0],$x30,r11 ++ ++ b Ldone_vsx ++ .long 0 ++ .byte 0,12,0x04,1,0x80,0,5,0 ++ .long 0 ++.size .ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10 ++___ ++}}} ++ ++##This is 8 block in parallel implementation. The heart of chacha round uses vector instruction that has access to ++# vsr[32+X]. To perform the 8 parallel block we tend to use all 32 register to hold the 8 block info. ++# WE need to store few register value on side, so we can use VSR{32+X} for few vector instructions used in round op and hold intermediate value. ++# WE use the VSR[0]-VSR[31] for holding intermediate value and perform 8 block in parallel. ++# ++{{{ ++#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7)); ++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, ++ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3, ++ $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7, ++ $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31)); ++my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15)); ++my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3)); ++my @K = map("v$_",27,(24..26)); ++my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31)); ++my $xr0 = "v4"; ++my $CTR0 = "v22"; ++my $CTR1 = "v5"; ++my $beperm = "v31"; ++my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); ++my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7)); ++my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17)); ++my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21)); ++my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26)); ++ ++my $FRAME=$LOCALS+64+9*16; # 8*16 is for v24-v31 offload ++ ++sub VSX_lane_ROUND_8x { ++my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_; ++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); ++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); ++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); ++my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4)); ++my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5)); ++my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6)); ++my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17)); ++my @x=map("\"v$_\"",(0..31)); ++ ++ ( ++ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", #copy v30 to v13 ++ "&vxxlorc (@x[$c7], $xv9,$xv9)", ++ ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 ++ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", # Q1 ++ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", # Q2 ++ "&vadduwm 
(@x[$a6],@x[$a6],@x[$b6])", # Q3 ++ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", # Q4 ++ ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vxor (@x[$d4],@x[$d4],@x[$a4])", ++ "&vxor (@x[$d5],@x[$d5],@x[$a5])", ++ "&vxor (@x[$d6],@x[$d6],@x[$a6])", ++ "&vxor (@x[$d7],@x[$d7],@x[$a7])", ++ ++ "&vrlw (@x[$d0],@x[$d0],@x[$c7])", ++ "&vrlw (@x[$d1],@x[$d1],@x[$c7])", ++ "&vrlw (@x[$d2],@x[$d2],@x[$c7])", ++ "&vrlw (@x[$d3],@x[$d3],@x[$c7])", ++ "&vrlw (@x[$d4],@x[$d4],@x[$c7])", ++ "&vrlw (@x[$d5],@x[$d5],@x[$c7])", ++ "&vrlw (@x[$d6],@x[$d6],@x[$c7])", ++ "&vrlw (@x[$d7],@x[$d7],@x[$c7])", ++ ++ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])", ++ "&vxxlorc (@x[$c7], $xv15,$xv15)", ++ "&vxxlorc (@x[$a7], $xv10,$xv10)", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])", ++ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])", ++ "&vadduwm (@x[$c6],@x[$c6],@x[$d6])", ++ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])", ++ ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vxor (@x[$b4],@x[$b4],@x[$c4])", ++ "&vxor (@x[$b5],@x[$b5],@x[$c5])", ++ "&vxor (@x[$b6],@x[$b6],@x[$c6])", ++ "&vxor (@x[$b7],@x[$b7],@x[$c7])", ++ ++ "&vrlw (@x[$b0],@x[$b0],@x[$a7])", ++ "&vrlw (@x[$b1],@x[$b1],@x[$a7])", ++ "&vrlw (@x[$b2],@x[$b2],@x[$a7])", ++ "&vrlw (@x[$b3],@x[$b3],@x[$a7])", ++ "&vrlw (@x[$b4],@x[$b4],@x[$a7])", ++ "&vrlw (@x[$b5],@x[$b5],@x[$a7])", ++ "&vrlw (@x[$b6],@x[$b6],@x[$a7])", ++ "&vrlw (@x[$b7],@x[$b7],@x[$a7])", ++ ++ "&vxxlorc (@x[$a7], $xv13,$xv13)", ++ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", ++ "&vxxlorc (@x[$c7], $xv11,$xv11)", ++ ++ ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", ++ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", ++ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", ++ "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", ++ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", ++ ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vxor (@x[$d4],@x[$d4],@x[$a4])", ++ "&vxor (@x[$d5],@x[$d5],@x[$a5])", ++ "&vxor (@x[$d6],@x[$d6],@x[$a6])", ++ "&vxor (@x[$d7],@x[$d7],@x[$a7])", ++ ++ "&vrlw (@x[$d0],@x[$d0],@x[$c7])", ++ "&vrlw (@x[$d1],@x[$d1],@x[$c7])", ++ "&vrlw (@x[$d2],@x[$d2],@x[$c7])", ++ "&vrlw (@x[$d3],@x[$d3],@x[$c7])", ++ "&vrlw (@x[$d4],@x[$d4],@x[$c7])", ++ "&vrlw (@x[$d5],@x[$d5],@x[$c7])", ++ "&vrlw (@x[$d6],@x[$d6],@x[$c7])", ++ "&vrlw (@x[$d7],@x[$d7],@x[$c7])", ++ ++ "&vxxlorc (@x[$c7], $xv15,$xv15)", ++ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])", ++ "&vxxlorc (@x[$a7], $xv12,$xv12)", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])", ++ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])", ++ "&vadduwm (@x[$c6],@x[$c6],@x[$d6])", ++ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])", ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vxor (@x[$b4],@x[$b4],@x[$c4])", ++ "&vxor (@x[$b5],@x[$b5],@x[$c5])", ++ "&vxor (@x[$b6],@x[$b6],@x[$c6])", ++ "&vxor 
(@x[$b7],@x[$b7],@x[$c7])", ++ "&vrlw (@x[$b0],@x[$b0],@x[$a7])", ++ "&vrlw (@x[$b1],@x[$b1],@x[$a7])", ++ "&vrlw (@x[$b2],@x[$b2],@x[$a7])", ++ "&vrlw (@x[$b3],@x[$b3],@x[$a7])", ++ "&vrlw (@x[$b4],@x[$b4],@x[$a7])", ++ "&vrlw (@x[$b5],@x[$b5],@x[$a7])", ++ "&vrlw (@x[$b6],@x[$b6],@x[$a7])", ++ "&vrlw (@x[$b7],@x[$b7],@x[$a7])", ++ ++ "&vxxlorc (@x[$a7], $xv13,$xv13)", ++ ); ++} ++ ++$code.=<<___; ++ ++.globl .ChaCha20_ctr32_vsx_8x ++.align 5 ++.ChaCha20_ctr32_vsx_8x: ++ $STU $sp,-$FRAME($sp) ++ mflr r0 ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ mfspr r12,256 ++ stvx v24,r10,$sp ++ addi r10,r10,32 ++ stvx v25,r11,$sp ++ addi r11,r11,32 ++ stvx v26,r10,$sp ++ addi r10,r10,32 ++ stvx v27,r11,$sp ++ addi r11,r11,32 ++ stvx v28,r10,$sp ++ addi r10,r10,32 ++ stvx v29,r11,$sp ++ addi r11,r11,32 ++ stvx v30,r10,$sp ++ stvx v31,r11,$sp ++ stw r12,`$FRAME-4`($sp) # save vrsave ++ li r12,-4096+63 ++ $PUSH r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # preserve 29 AltiVec registers ++ ++ bl Lconsts # returns pointer Lsigma in r12 ++ ++ lvx_4w @K[0],0,r12 # load sigma ++ addi r12,r12,0x70 ++ li $x10,16 ++ li $x20,32 ++ li $x30,48 ++ li r11,64 ++ ++ vspltisw $xa4,-16 # synthesize constants ++ vspltisw $xb4,12 # synthesize constants ++ vspltisw $xc4,8 # synthesize constants ++ vspltisw $xd4,7 # synthesize constants ++ ++ lvx $xa0,$x00,r12 # load [smashed] sigma ++ lvx $xa1,$x10,r12 ++ lvx $xa2,$x20,r12 ++ lvx $xa3,$x30,r12 ++ ++ vxxlor $xv9 ,$xa4,$xa4 #save shift val in vr9-12 ++ vxxlor $xv10 ,$xb4,$xb4 ++ vxxlor $xv11 ,$xc4,$xc4 ++ vxxlor $xv12 ,$xd4,$xd4 ++ vxxlor $xv22 ,$xa0,$xa0 #save sigma in vr22-25 ++ vxxlor $xv23 ,$xa1,$xa1 ++ vxxlor $xv24 ,$xa2,$xa2 ++ vxxlor $xv25 ,$xa3,$xa3 ++ ++ lvx_4w @K[1],0,$key # load key ++ lvx_4w @K[2],$x10,$key ++ lvx_4w @K[3],0,$ctr # load counter ++ vspltisw $xt3,4 ++ ++ ++ vxor $xt2,$xt2,$xt2 ++ lvx_4w $xt1,r11,r12 ++ vspltw $xa2,@K[3],0 #save the original count after spltw ++ vsldoi @K[3],@K[3],$xt2,4 ++ vsldoi @K[3],$xt2,@K[3],12 # clear @K[3].word[0] ++ vadduwm $xt1,$xa2,$xt1 ++ vadduwm $xt3,$xt1,$xt3 # next counter value ++ vspltw $xa0,@K[2],2 # save the K[2] spltw 2 and save v8. 
++ ++ be?lvsl $beperm,0,$x10 # 0x00..0f ++ be?vspltisb $xt0,3 # 0x03..03 ++ be?vxor $beperm,$beperm,$xt0 # swap bytes within words ++ be?vxxlor $xv26 ,$beperm,$beperm ++ ++ vxxlor $xv0 ,@K[0],@K[0] # K0,k1,k2 to vr0,1,2 ++ vxxlor $xv1 ,@K[1],@K[1] ++ vxxlor $xv2 ,@K[2],@K[2] ++ vxxlor $xv3 ,@K[3],@K[3] ++ vxxlor $xv4 ,$xt1,$xt1 #CTR ->4, CTR+4-> 5 ++ vxxlor $xv5 ,$xt3,$xt3 ++ vxxlor $xv8 ,$xa0,$xa0 ++ ++ li r0,10 # inner loop counter ++ mtctr r0 ++ b Loop_outer_vsx_8x ++ ++.align 5 ++Loop_outer_vsx_8x: ++ vxxlorc $xa0,$xv22,$xv22 # load [smashed] sigma ++ vxxlorc $xa1,$xv23,$xv23 ++ vxxlorc $xa2,$xv24,$xv24 ++ vxxlorc $xa3,$xv25,$xv25 ++ vxxlorc $xa4,$xv22,$xv22 ++ vxxlorc $xa5,$xv23,$xv23 ++ vxxlorc $xa6,$xv24,$xv24 ++ vxxlorc $xa7,$xv25,$xv25 ++ ++ vspltw $xb0,@K[1],0 # smash the key ++ vspltw $xb1,@K[1],1 ++ vspltw $xb2,@K[1],2 ++ vspltw $xb3,@K[1],3 ++ vspltw $xb4,@K[1],0 # smash the key ++ vspltw $xb5,@K[1],1 ++ vspltw $xb6,@K[1],2 ++ vspltw $xb7,@K[1],3 ++ ++ vspltw $xc0,@K[2],0 ++ vspltw $xc1,@K[2],1 ++ vspltw $xc2,@K[2],2 ++ vspltw $xc3,@K[2],3 ++ vspltw $xc4,@K[2],0 ++ vspltw $xc7,@K[2],3 ++ vspltw $xc5,@K[2],1 ++ ++ vxxlorc $xd0,$xv4,$xv4 # smash the counter ++ vspltw $xd1,@K[3],1 ++ vspltw $xd2,@K[3],2 ++ vspltw $xd3,@K[3],3 ++ vxxlorc $xd4,$xv5,$xv5 # smash the counter ++ vspltw $xd5,@K[3],1 ++ vspltw $xd6,@K[3],2 ++ vspltw $xd7,@K[3],3 ++ vxxlorc $xc6,$xv8,$xv8 #copy of vlspt k[2],2 is in v8.v26 ->k[3] so need to wait until k3 is done ++ ++Loop_vsx_8x: ++___ ++ foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; } ++ foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; } ++$code.=<<___; ++ ++ bdnz Loop_vsx_8x ++ vxxlor $xv13 ,$xd4,$xd4 # save the register vr24-31 ++ vxxlor $xv14 ,$xd5,$xd5 # ++ vxxlor $xv15 ,$xd6,$xd6 # ++ vxxlor $xv16 ,$xd7,$xd7 # ++ ++ vxxlor $xv18 ,$xc4,$xc4 # ++ vxxlor $xv19 ,$xc5,$xc5 # ++ vxxlor $xv20 ,$xc6,$xc6 # ++ vxxlor $xv21 ,$xc7,$xc7 # ++ ++ vxxlor $xv6 ,$xb6,$xb6 # save vr23, so we get 8 regs ++ vxxlor $xv7 ,$xb7,$xb7 # save vr23, so we get 8 regs ++ be?vxxlorc $beperm,$xv26,$xv26 # copy back the the beperm. 
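# Editorial note (not part of the patch): each VSX_lane_ROUND_8x expansion in the
# loop above is eight interleaved copies of the standard ChaCha20 quarter-round
# (RFC 8439); the two expansions per iteration are the column and diagonal halves:
#     a += b;  d ^= a;  d = rotl32(d, 16);
#     c += d;  b ^= c;  b = rotl32(b, 12);
#     a += b;  d ^= a;  d = rotl32(d,  8);
#     c += d;  b ^= c;  b = rotl32(b,  7);
# The vrlw rotate amounts (16/12/8/7) are the constants splatted at entry and
# parked in the script's $xv9-$xv12 slots via xxlor, then fetched back with
# xxlorc inside each round, because all 32 VRs are occupied by block state.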
++ ++ vxxlorc @K[0],$xv0,$xv0 #27 ++ vxxlorc @K[1],$xv1,$xv1 #24 ++ vxxlorc @K[2],$xv2,$xv2 #25 ++ vxxlorc @K[3],$xv3,$xv3 #26 ++ vxxlorc $CTR0,$xv4,$xv4 ++###changing to vertical ++ ++ vmrgew $xt0,$xa0,$xa1 # transpose data ++ vmrgew $xt1,$xa2,$xa3 ++ vmrgow $xa0,$xa0,$xa1 ++ vmrgow $xa2,$xa2,$xa3 ++ ++ vmrgew $xt2,$xb0,$xb1 ++ vmrgew $xt3,$xb2,$xb3 ++ vmrgow $xb0,$xb0,$xb1 ++ vmrgow $xb2,$xb2,$xb3 ++ ++ vadduwm $xd0,$xd0,$CTR0 ++ ++ vpermdi $xa1,$xa0,$xa2,0b00 ++ vpermdi $xa3,$xa0,$xa2,0b11 ++ vpermdi $xa0,$xt0,$xt1,0b00 ++ vpermdi $xa2,$xt0,$xt1,0b11 ++ vpermdi $xb1,$xb0,$xb2,0b00 ++ vpermdi $xb3,$xb0,$xb2,0b11 ++ vpermdi $xb0,$xt2,$xt3,0b00 ++ vpermdi $xb2,$xt2,$xt3,0b11 ++ ++ vmrgew $xt0,$xc0,$xc1 ++ vmrgew $xt1,$xc2,$xc3 ++ vmrgow $xc0,$xc0,$xc1 ++ vmrgow $xc2,$xc2,$xc3 ++ vmrgew $xt2,$xd0,$xd1 ++ vmrgew $xt3,$xd2,$xd3 ++ vmrgow $xd0,$xd0,$xd1 ++ vmrgow $xd2,$xd2,$xd3 ++ ++ vpermdi $xc1,$xc0,$xc2,0b00 ++ vpermdi $xc3,$xc0,$xc2,0b11 ++ vpermdi $xc0,$xt0,$xt1,0b00 ++ vpermdi $xc2,$xt0,$xt1,0b11 ++ vpermdi $xd1,$xd0,$xd2,0b00 ++ vpermdi $xd3,$xd0,$xd2,0b11 ++ vpermdi $xd0,$xt2,$xt3,0b00 ++ vpermdi $xd2,$xt2,$xt3,0b11 ++ ++ vspltisw $xt0,8 ++ vadduwm $CTR0,$CTR0,$xt0 # next counter value ++ vxxlor $xv4 ,$CTR0,$CTR0 #CTR+4-> 5 ++ ++ vadduwm $xa0,$xa0,@K[0] ++ vadduwm $xb0,$xb0,@K[1] ++ vadduwm $xc0,$xc0,@K[2] ++ vadduwm $xd0,$xd0,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xa0,$xa1,@K[0] ++ vadduwm $xb0,$xb1,@K[1] ++ vadduwm $xc0,$xc1,@K[2] ++ vadduwm $xd0,$xd1,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xa0,$xa2,@K[0] ++ vadduwm $xb0,$xb2,@K[1] ++ vadduwm $xc0,$xc2,@K[2] ++ vadduwm $xd0,$xd2,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xa0,$xa3,@K[0] ++ vadduwm $xb0,$xb3,@K[1] ++ vadduwm $xc0,$xc3,@K[2] ++ vadduwm $xd0,$xd3,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ 
be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++#blk4-7: 24:31 remain the same as we can use the same logic above . Reg a4-b7 remain same.Load c4,d7--> position 8-15.we can reuse vr24-31. ++#VR0-3 : are used to load temp value, vr4 --> as xr0 instead of xt0. ++ ++ vxxlorc $CTR1 ,$xv5,$xv5 ++ ++ vxxlorc $xcn4 ,$xv18,$xv18 ++ vxxlorc $xcn5 ,$xv19,$xv19 ++ vxxlorc $xcn6 ,$xv20,$xv20 ++ vxxlorc $xcn7 ,$xv21,$xv21 ++ ++ vxxlorc $xdn4 ,$xv13,$xv13 ++ vxxlorc $xdn5 ,$xv14,$xv14 ++ vxxlorc $xdn6 ,$xv15,$xv15 ++ vxxlorc $xdn7 ,$xv16,$xv16 ++ vadduwm $xdn4,$xdn4,$CTR1 ++ ++ vxxlorc $xb6 ,$xv6,$xv6 ++ vxxlorc $xb7 ,$xv7,$xv7 ++#use xa1->xr0, as xt0...in the block 4-7 ++ ++ vmrgew $xr0,$xa4,$xa5 # transpose data ++ vmrgew $xt1,$xa6,$xa7 ++ vmrgow $xa4,$xa4,$xa5 ++ vmrgow $xa6,$xa6,$xa7 ++ vmrgew $xt2,$xb4,$xb5 ++ vmrgew $xt3,$xb6,$xb7 ++ vmrgow $xb4,$xb4,$xb5 ++ vmrgow $xb6,$xb6,$xb7 ++ ++ vpermdi $xa5,$xa4,$xa6,0b00 ++ vpermdi $xa7,$xa4,$xa6,0b11 ++ vpermdi $xa4,$xr0,$xt1,0b00 ++ vpermdi $xa6,$xr0,$xt1,0b11 ++ vpermdi $xb5,$xb4,$xb6,0b00 ++ vpermdi $xb7,$xb4,$xb6,0b11 ++ vpermdi $xb4,$xt2,$xt3,0b00 ++ vpermdi $xb6,$xt2,$xt3,0b11 ++ ++ vmrgew $xr0,$xcn4,$xcn5 ++ vmrgew $xt1,$xcn6,$xcn7 ++ vmrgow $xcn4,$xcn4,$xcn5 ++ vmrgow $xcn6,$xcn6,$xcn7 ++ vmrgew $xt2,$xdn4,$xdn5 ++ vmrgew $xt3,$xdn6,$xdn7 ++ vmrgow $xdn4,$xdn4,$xdn5 ++ vmrgow $xdn6,$xdn6,$xdn7 ++ ++ vpermdi $xcn5,$xcn4,$xcn6,0b00 ++ vpermdi $xcn7,$xcn4,$xcn6,0b11 ++ vpermdi $xcn4,$xr0,$xt1,0b00 ++ vpermdi $xcn6,$xr0,$xt1,0b11 ++ vpermdi $xdn5,$xdn4,$xdn6,0b00 ++ vpermdi $xdn7,$xdn4,$xdn6,0b11 ++ vpermdi $xdn4,$xt2,$xt3,0b00 ++ vpermdi $xdn6,$xt2,$xt3,0b11 ++ ++ vspltisw $xr0,8 ++ vadduwm $CTR1,$CTR1,$xr0 # next counter value ++ vxxlor $xv5 ,$CTR1,$CTR1 #CTR+4-> 5 ++ ++ vadduwm $xan0,$xa4,@K[0] ++ vadduwm $xbn0,$xb4,@K[1] ++ vadduwm $xcn0,$xcn4,@K[2] ++ vadduwm $xdn0,$xdn4,@K[3] ++ ++ be?vperm $xan0,$xa4,$xa4,$beperm ++ be?vperm $xbn0,$xb4,$xb4,$beperm ++ be?vperm $xcn0,$xcn4,$xcn4,$beperm ++ be?vperm $xdn0,$xdn4,$xdn4,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xan0,$xa5,@K[0] ++ vadduwm $xbn0,$xb5,@K[1] ++ vadduwm $xcn0,$xcn5,@K[2] ++ vadduwm $xdn0,$xdn5,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ 
addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xan0,$xa6,@K[0] ++ vadduwm $xbn0,$xb6,@K[1] ++ vadduwm $xcn0,$xcn6,@K[2] ++ vadduwm $xdn0,$xdn6,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xan0,$xa7,@K[0] ++ vadduwm $xbn0,$xb7,@K[1] ++ vadduwm $xcn0,$xcn7,@K[2] ++ vadduwm $xdn0,$xdn7,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ mtctr r0 ++ bne Loop_outer_vsx_8x ++ ++Ldone_vsx_8x: ++ lwz r12,`$FRAME-4`($sp) # pull vrsave ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ $POP r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # restore vrsave ++ lvx v24,r10,$sp ++ addi r10,r10,32 ++ lvx v25,r11,$sp ++ addi r11,r11,32 ++ lvx v26,r10,$sp ++ addi r10,r10,32 ++ lvx v27,r11,$sp ++ addi r11,r11,32 ++ lvx v28,r10,$sp ++ addi r10,r10,32 ++ lvx v29,r11,$sp ++ addi r11,r11,32 ++ lvx v30,r10,$sp ++ lvx v31,r11,$sp ++ mtlr r0 ++ addi $sp,$sp,$FRAME ++ blr ++ ++.align 4 ++Ltail_vsx_8x: ++ addi r11,$sp,$LOCALS ++ mtctr $len ++ stvx_4w $xa0,$x00,r11 # offload block to stack ++ stvx_4w $xb0,$x10,r11 ++ stvx_4w $xc0,$x20,r11 ++ stvx_4w $xd0,$x30,r11 ++ subi r12,r11,1 # prepare for *++ptr ++ subi $inp,$inp,1 ++ subi $out,$out,1 ++ bl Loop_tail_vsx_8x ++Ltail_vsx_8x_1: ++ addi r11,$sp,$LOCALS ++ mtctr $len ++ stvx_4w $xan0,$x00,r11 # offload block to stack ++ stvx_4w $xbn0,$x10,r11 ++ stvx_4w $xcn0,$x20,r11 ++ stvx_4w $xdn0,$x30,r11 ++ subi r12,r11,1 # prepare for *++ptr ++ subi $inp,$inp,1 ++ subi $out,$out,1 ++ bl Loop_tail_vsx_8x ++ ++Loop_tail_vsx_8x: ++ lbzu r6,1(r12) ++ lbzu r7,1($inp) ++ xor r6,r6,r7 ++ stbu r6,1($out) ++ bdnz Loop_tail_vsx_8x ++ ++ stvx_4w $K[0],$x00,r11 # wipe copy of the block ++ stvx_4w $K[0],$x10,r11 ++ stvx_4w $K[0],$x20,r11 ++ stvx_4w $K[0],$x30,r11 ++ ++ b Ldone_vsx_8x ++ .long 0 ++ .byte 0,12,0x04,1,0x80,0,5,0 ++ .long 0 ++.size .ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x ++___ ++}}} ++ ++ ++$code.=<<___; ++.align 5 ++Lconsts: ++ mflr r0 ++ bcl 20,31,\$+4 ++ mflr r12 #vvvvv "distance between . 
and Lsigma ++ addi r12,r12,`64-8` ++ mtlr r0 ++ blr ++ .long 0 ++ .byte 0,12,0x14,0,0,0,0,0 ++ .space `64-9*4` ++Lsigma: ++ .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 ++ .long 1,0,0,0 ++ .long 2,0,0,0 ++ .long 3,0,0,0 ++ .long 4,0,0,0 ++___ ++$code.=<<___ if ($LITTLE_ENDIAN); ++ .long 0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001 ++ .long 0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300 ++___ ++$code.=<<___ if (!$LITTLE_ENDIAN); # flipped words ++ .long 0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d ++ .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c ++___ ++$code.=<<___; ++ .long 0x61707865,0x61707865,0x61707865,0x61707865 ++ .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e ++ .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 ++ .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 ++ .long 0,1,2,3 ++ .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c ++.asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by " ++.align 2 ++___ ++ ++foreach (split("\n",$code)) { ++ s/\`([^\`]*)\`/eval $1/ge; ++ ++ # instructions prefixed with '?' are endian-specific and need ++ # to be adjusted accordingly... ++ if ($flavour !~ /le$/) { # big-endian ++ s/be\?// or ++ s/le\?/#le#/ or ++ s/\?lvsr/lvsl/ or ++ s/\?lvsl/lvsr/ or ++ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or ++ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/; ++ } else { # little-endian ++ s/le\?// or ++ s/be\?/#be#/ or ++ s/\?([a-z]+)/$1/ or ++ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/; ++ } ++ ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +--- a/crypto/chacha/build.info ++++ b/crypto/chacha/build.info +@@ -5,6 +5,7 @@ GENERATE[chacha-x86.s]=asm/chacha-x86.pl + $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR) + GENERATE[chacha-x86_64.s]=asm/chacha-x86_64.pl $(PERLASM_SCHEME) + GENERATE[chacha-ppc.s]=asm/chacha-ppc.pl $(PERLASM_SCHEME) ++GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl $(PERLASM_SCHEME) + GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl $(PERLASM_SCHEME) + INCLUDE[chacha-armv4.o]=.. + GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl $(PERLASM_SCHEME) +--- a/crypto/perlasm/ppc-xlate.pl ++++ b/crypto/perlasm/ppc-xlate.pl +@@ -288,6 +288,14 @@ my $vpermdi = sub { # xxpermdi + $dm = oct($dm) if ($dm =~ /^0/); + " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($dm<<8)|(10<<3)|7; + }; ++my $vxxlor = sub { # xxlor ++ my ($f, $vrt, $vra, $vrb) = @_; ++ " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|6; ++}; ++my $vxxlorc = sub { # xxlor ++ my ($f, $vrt, $vra, $vrb) = @_; ++ " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|1; ++}; + + # PowerISA 2.07 stuff + sub vcrypto_op { +@@ -370,6 +378,15 @@ my $addex = sub { + }; + my $vmsumudm = sub { vfour_vsr(@_, 35); }; + ++# PowerISA 3.1 stuff ++my $brd = sub { ++ my ($f, $ra, $rs) = @_; ++ " .long ".sprintf "0x%X",(31<<26)|($rs<<21)|($ra<<16)|(187<<1); ++}; ++my $vsrq = sub { vcrypto_op(@_, 517); }; ++ ++ ++ + while($line=<>) { + + $line =~ s|[#!;].*$||; # get rid of asm-style comments... 
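The hunks that follow wire the new 8x path into runtime CPU detection: crypto/ppc_arch.h gains a PPC_BRD31 capability bit, crypto/ppccap.c declares ChaCha20_ctr32_vsx_p10() and sets the bit when AT_HWCAP2 advertises Power ISA 3.1 (with an OPENSSL_brd31_probe() helper added for the probe-based path), and ChaCha20_ctr32() then prefers the POWER10 routine over the older VSX/AltiVec/integer fallbacks. The standalone C program below is an editorial sketch of that detect-then-dispatch flow, not part of the patch; bit values other than PPC_BRD31 (1<<7) and HWCAP_ARCH_3_1 (1U<<18) are illustrative, and the cipher routines are reduced to printing stubs.

    /*
     * Editorial sketch -- NOT part of the patch.  Mirrors the capability
     * detection and dispatch added to crypto/ppccap.c (Linux/glibc only).
     */
    #include <stdio.h>
    #include <sys/auxv.h>                   /* getauxval() */

    #define PPC_ALTIVEC    (1 << 1)         /* illustrative bit value */
    #define PPC_CRYPTO207  (1 << 2)         /* illustrative bit value */
    #define PPC_BRD31      (1 << 7)         /* as added to crypto/ppc_arch.h */
    #define HWCAP_ARCH_3_1 (1U << 18)       /* AT_HWCAP2: Power ISA 3.1 (POWER10) */

    static unsigned int ppccap_P;           /* stands in for OPENSSL_ppccap_P */

    static void cpuid_setup(void)
    {
        unsigned long hwcap2 = getauxval(AT_HWCAP2);

        if (hwcap2 & HWCAP_ARCH_3_1)
            ppccap_P |= PPC_BRD31;          /* POWER10 and later */
        /* the real OPENSSL_cpuid_setup() also fills in CRYPTO207, ALTIVEC, ... */
    }

    static void chacha20_ctr32(void)        /* mirrors the patched dispatch */
    {
        if (ppccap_P & PPC_BRD31)
            puts("ChaCha20_ctr32_vsx_p10: 8-way VSX path (POWER10)");
        else if (ppccap_P & PPC_CRYPTO207)
            puts("ChaCha20_ctr32_vsx: 4-way VSX path (POWER8/9)");
        else if (ppccap_P & PPC_ALTIVEC)
            puts("ChaCha20_ctr32_vmx: AltiVec path");
        else
            puts("ChaCha20_ctr32_int: generic integer path");
    }

    int main(void)
    {
        cpuid_setup();
        chacha20_ctr32();
        return 0;
    }

On a POWER10 machine this selects the _vsx_p10 line; detection of the older capability bits is elided here, so the sketch falls through to the generic path elsewhere.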
+--- a/crypto/ppc_arch.h ++++ b/crypto/ppc_arch.h +@@ -24,5 +24,6 @@ extern unsigned int OPENSSL_ppccap_P; + # define PPC_MADD300 (1<<4) + # define PPC_MFTB (1<<5) + # define PPC_MFSPR268 (1<<6) ++# define PPC_BRD31 (1<<7) + + #endif +--- a/crypto/ppccap.c ++++ b/crypto/ppccap.c +@@ -108,15 +108,20 @@ void ChaCha20_ctr32_vmx(unsigned char *o + void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp, + size_t len, const unsigned int key[8], + const unsigned int counter[4]); ++void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp, ++ size_t len, const unsigned int key[8], ++ const unsigned int counter[4]); + void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, + size_t len, const unsigned int key[8], + const unsigned int counter[4]) + { +- OPENSSL_ppccap_P & PPC_CRYPTO207 +- ? ChaCha20_ctr32_vsx(out, inp, len, key, counter) +- : OPENSSL_ppccap_P & PPC_ALTIVEC +- ? ChaCha20_ctr32_vmx(out, inp, len, key, counter) +- : ChaCha20_ctr32_int(out, inp, len, key, counter); ++ OPENSSL_ppccap_P & PPC_BRD31 ++ ? ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter) ++ :OPENSSL_ppccap_P & PPC_CRYPTO207 ++ ? ChaCha20_ctr32_vsx(out, inp, len, key, counter) ++ : OPENSSL_ppccap_P & PPC_ALTIVEC ++ ? ChaCha20_ctr32_vmx(out, inp, len, key, counter) ++ : ChaCha20_ctr32_int(out, inp, len, key, counter); + } + #endif + +@@ -182,6 +187,7 @@ void OPENSSL_ppc64_probe(void); + void OPENSSL_altivec_probe(void); + void OPENSSL_crypto207_probe(void); + void OPENSSL_madd300_probe(void); ++void OPENSSL_brd31_probe(void); + + long OPENSSL_rdtsc_mftb(void); + long OPENSSL_rdtsc_mfspr268(void); +@@ -264,6 +270,7 @@ static unsigned long getauxval(unsigned + #define HWCAP2 26 /* AT_HWCAP2 */ + #define HWCAP_VEC_CRYPTO (1U << 25) + #define HWCAP_ARCH_3_00 (1U << 23) ++#define HWCAP_ARCH_3_1 (1U << 18) + + # if defined(__GNUC__) && __GNUC__>=2 + __attribute__ ((constructor)) +@@ -324,6 +331,9 @@ void OPENSSL_cpuid_setup(void) + if (__power_set(0xffffffffU<<17)) /* POWER9 and later */ + OPENSSL_ppccap_P |= PPC_MADD300; + ++ if (__power_set(0xffffffffU<<18)) /* POWER10 and later */ ++ OPENSSL_ppccap_P |= PPC_BRD31; ++ + return; + # endif + #endif +@@ -379,6 +389,10 @@ void OPENSSL_cpuid_setup(void) + if (hwcap2 & HWCAP_ARCH_3_00) { + OPENSSL_ppccap_P |= PPC_MADD300; + } ++ ++ if (hwcap2 & HWCAP_ARCH_3_1) { ++ OPENSSL_ppccap_P |= PPC_BRD31; ++ } + } + #endif + +--- a/crypto/ppccpuid.pl ++++ b/crypto/ppccpuid.pl +@@ -77,6 +77,17 @@ $code=<<___; + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + ++.globl .OPENSSL_brd31_probe ++.align 4 ++.OPENSSL_brd31_probe: ++ xor r0,r0,r0 ++ brd r3,r0 ++ blr ++ .long 0 ++ .byte 0,12,0x14,0,0,0,0,0 ++.size .OPENSSL_brd31_probe,.-.OPENSSL_brd31_probe ++ ++ + .globl .OPENSSL_wipe_cpu + .align 4 + .OPENSSL_wipe_cpu: diff --git a/openssl-1_1.changes b/openssl-1_1.changes index acf925f..d7bad32 100644 --- a/openssl-1_1.changes +++ b/openssl-1_1.changes @@ -1,3 +1,13 @@ +------------------------------------------------------------------- +Wed Dec 14 09:04:40 UTC 2022 - Otto Hollmann + +- POWER10 performance enhancements for cryptography [jsc#PED-512] + * openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch + * openssl-1_1-Fixed-counter-overflow.patch + * openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch + * openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch + * openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch + ------------------------------------------------------------------- Wed Nov 2 12:00:40 UTC 2022 - Otto 
Hollmann diff --git a/openssl-1_1.spec b/openssl-1_1.spec index 96adb4b..94ae402 100644 --- a/openssl-1_1.spec +++ b/openssl-1_1.spec @@ -123,6 +123,14 @@ Patch72: openssl-1_1-Optimize-AES-GCM-uarchs.patch Patch73: openssl-1_1-FIPS-fix-error-reason-codes.patch #PATCH-FIX-SUSE bsc#1180995 Default to RFC7919 groups in FIPS mode Patch74: openssl-1_1-paramgen-default_to_rfc7919.patch +# PATCH-FIX-UPSTREAM jsc#PED-512 +# POWER10 performance enhancements for cryptography +Patch75: openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch +Patch76: openssl-1_1-Fixed-counter-overflow.patch +Patch77: openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch +Patch78: openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch +Patch79: openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch + Requires: libopenssl1_1 = %{version}-%{release} BuildRequires: pkgconfig BuildRequires: pkgconfig(zlib) From 93c266235b911436405030ac6ada0e9ce88cd86fb14a0eaec79fa58da3ce7901 Mon Sep 17 00:00:00 2001 From: Pedro Monreal Gonzalez Date: Wed, 14 Dec 2022 20:20:45 +0000 Subject: [PATCH 2/2] Accepting request 1042984 from home:ohollmann:branches:security:tls OBS-URL: https://build.opensuse.org/request/show/1042984 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=124 --- openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch | 112 +++++++----------- 1 file changed, 45 insertions(+), 67 deletions(-) diff --git a/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch b/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch index bba48b6..e990782 100644 --- a/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch +++ b/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch @@ -12,8 +12,8 @@ Reviewed-by: Dmitry Belyavskiy Reviewed-by: Paul Dale (Merged from https://github.com/openssl/openssl/pull/19182) --- - crypto/evp/e_aes.c | 179 +++++++++++++++++++++++++++++++---------------------- - 1 file changed, 107 insertions(+), 72 deletions(-) + crypto/evp/e_aes.c | 146 ++++++++++++++++++++++++++--------------------------- + 1 file changed, 74 insertions(+), 72 deletions(-) --- a/crypto/evp/e_aes.c +++ b/crypto/evp/e_aes.c @@ -113,10 +113,11 @@ Reviewed-by: Paul Dale #endif #if defined(OPENSSL_CPUID_OBJ) && ( \ -@@ -3294,9 +3224,114 @@ static int aes_gcm_tls_cipher(EVP_CIPHER +@@ -3294,6 +3224,51 @@ static int aes_gcm_tls_cipher(EVP_CIPHER return rv; } ++#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) +static size_t ppc_aes_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len, + const void *key, unsigned char ivec[16], u64 *Xi, int encrypt) +{ @@ -159,72 +160,49 @@ Reviewed-by: Paul Dale + + return ndone; +} -+ -+#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) -+static int ppc_aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, -+ const unsigned char *in, size_t len) -+{ -+ EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,ctx); -+ if (ctx->encrypt) { -+ if (gctx->ctr != NULL) { -+ size_t bulk = 0; -+ -+ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM_PPC(gctx)) { -+ size_t res = (16 - gctx->gcm.mres) % 16; -+ -+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) -+ return 0; -+ -+ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, -+ gctx->gcm.key, -+ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 1); -+ -+ gctx->gcm.len.u[1] += bulk; -+ bulk += res; -+ } -+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, in + bulk, out + bulk, -+ len - bulk, gctx->ctr)) -+ return 0; -+ } else { -+ if 
(CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len)) -+ return 0; -+ } -+ } else { -+ if (gctx->ctr != NULL) { -+ size_t bulk = 0; -+ -+ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM_PPC(gctx)) { -+ size_t res = (16 - gctx->gcm.mres) % 16; -+ -+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) -+ return -1; -+ -+ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, -+ gctx->gcm.key, -+ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 0); -+ -+ gctx->gcm.len.u[1] += bulk; -+ bulk += res; -+ } -+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, in + bulk, out + bulk, -+ len - bulk, gctx->ctr)) -+ return 0; -+ } else { -+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len)) -+ return 0; -+ } -+ } -+ return 1; -+} +#endif + static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t len) { -+#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) -+ if (PPC_AES_GCM_CAPABLE) -+ return ppc_aes_gcm_cipher(ctx, out, in, len); -+#endif - EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,ctx); - /* If not set up, return error */ - if (!gctx->key_set) +@@ -3325,6 +3300,20 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + out + res, len - res, + gctx->gcm.key, gctx->gcm.Yi.c, + gctx->gcm.Xi.u); ++ ++ gctx->gcm.len.u[1] += bulk; ++ bulk += res; ++ } ++#elif defined(AES_GCM_ASM_PPC) && defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) ++ if (PPC_AES_GCM_CAPABLE && len >= AES_GCM_ENC_BYTES && AES_GCM_ASM_PPC(gctx)) { ++ size_t res = (16 - gctx->gcm.mres) % 16; ++ ++ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) ++ return -1; ++ ++ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, ++ gctx->gcm.key, ++ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 1); + gctx->gcm.len.u[1] += bulk; + bulk += res; + } +@@ -3372,6 +3361,19 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + gctx->gcm.len.u[1] += bulk; + bulk += res; + } ++#elif defined(AES_GCM_ASM_PPC) && defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) ++ if (PPC_AES_GCM_CAPABLE && len >= AES_GCM_DEC_BYTES && AES_GCM_ASM_PPC(gctx)) { ++ size_t res = (16 - gctx->gcm.mres) % 16; ++ ++ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) ++ return -1; ++ ++ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, ++ gctx->gcm.key, ++ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 0); ++ gctx->gcm.len.u[1] += bulk; ++ bulk += res; ++ } + #endif + if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, + in + bulk,