From 0ce74d253bcb2cbc6696e0cea7bd18c9494b8ca025a62aefe6539d7310ec3b97 Mon Sep 17 00:00:00 2001 From: Otto Hollmann Date: Wed, 14 Dec 2022 09:46:30 +0000 Subject: [PATCH 1/2] Accepting request 1042846 from home:ohollmann:branches:security:tls - POWER10 performance enhancements for cryptography [jsc#PED-512] * openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch * openssl-1_1-Fixed-counter-overflow.patch * openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch * openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch * openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch OBS-URL: https://build.opensuse.org/request/show/1042846 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=123 --- ...nce-optimzation-with-stitched-method.patch | 1588 +++++++++++++++++ openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch | 230 +++ ...l-statement-testing-64-and-256-bytes.patch | 103 ++ openssl-1_1-Fixed-counter-overflow.patch | 136 ++ ...ance-optimizations-for-ppc64le-with-.patch | 1535 ++++++++++++++++ openssl-1_1.changes | 10 + openssl-1_1.spec | 8 + 7 files changed, 3610 insertions(+) create mode 100644 openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch create mode 100644 openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch create mode 100644 openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch create mode 100644 openssl-1_1-Fixed-counter-overflow.patch create mode 100644 openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch diff --git a/openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch b/openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch new file mode 100644 index 0000000..b6cfe60 --- /dev/null +++ b/openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch @@ -0,0 +1,1588 @@ +From 44a563dde1584cd9284e80b6e45ee5019be8d36c Mon Sep 17 00:00:00 2001 +From: Danny Tsen +Date: Mon, 18 Oct 2021 10:51:42 -0400 +Subject: [PATCH] AES-GCM performance optimzation with stitched method for p9+ + ppc64le + +Assembly code reviewed by Shricharan Srivatsan + +Reviewed-by: Tomas Mraz +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/16854) +--- + Configurations/00-base-templates.conf | 2 + crypto/evp/e_aes.c | 33 + crypto/modes/asm/aes-gcm-ppc.pl | 1439 ++++++++++++++++++++++++++++++++++ + crypto/modes/build.info | 1 + 4 files changed, 1466 insertions(+), 9 deletions(-) + create mode 100644 crypto/modes/asm/aes-gcm-ppc.pl + create mode 100644 providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc + +--- a/Configurations/00-base-templates.conf ++++ b/Configurations/00-base-templates.conf +@@ -344,7 +344,7 @@ my %targets=( + bn_asm_src => "bn-ppc.s ppc-mont.s", + aes_asm_src => "aes_core.c aes_cbc.c aes-ppc.s vpaes-ppc.s aesp8-ppc.s", + sha1_asm_src => "sha1-ppc.s sha256-ppc.s sha512-ppc.s sha256p8-ppc.s sha512p8-ppc.s", +- modes_asm_src => "ghashp8-ppc.s", ++ modes_asm_src => "ghashp8-ppc.s aes-gcm-ppc.s", + chacha_asm_src => "chacha-ppc.s", + poly1305_asm_src=> "poly1305-ppc.s poly1305-ppcfp.s", + }, +--- a/crypto/evp/e_aes.c ++++ b/crypto/evp/e_aes.c +@@ -178,6 +178,20 @@ static void ctr64_inc(unsigned char *cou + # define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks + # define HWAES_xts_encrypt aes_p8_xts_encrypt + # define HWAES_xts_decrypt aes_p8_xts_decrypt ++# define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300) ++# define AES_GCM_ENC_BYTES 128 ++# define AES_GCM_DEC_BYTES 128 ++size_t 
ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); ++# if PPC_AES_GCM_CAPABLE ++# define AES_gcm_encrypt ppc_aes_gcm_encrypt ++# define AES_gcm_decrypt ppc_aes_gcm_decrypt ++# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \ ++ (gctx)->gcm.ghash==gcm_ghash_p8) ++# endif + #endif + + #if defined(OPENSSL_CPUID_OBJ) && ( \ +@@ -199,6 +213,9 @@ extern unsigned int OPENSSL_ia32cap_P[]; + */ + # define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32))) + ++# define AES_GCM_ENC_BYTES 32 ++# define AES_GCM_DEC_BYTES 16 ++ + int aesni_set_encrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); + int aesni_set_decrypt_key(const unsigned char *userKey, int bits, +@@ -3101,7 +3118,7 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + if (gctx->ctr) { + size_t bulk = 0; + #if defined(AES_GCM_ASM) +- if (len >= 32 && AES_GCM_ASM(gctx)) { ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM(gctx)) { + if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0)) + return -1; + +@@ -3119,7 +3136,7 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + } else { + size_t bulk = 0; + #if defined(AES_GCM_ASM2) +- if (len >= 32 && AES_GCM_ASM2(gctx)) { ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM2(gctx)) { + if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0)) + return -1; + +@@ -3142,7 +3159,7 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + if (gctx->ctr) { + size_t bulk = 0; + #if defined(AES_GCM_ASM) +- if (len >= 16 && AES_GCM_ASM(gctx)) { ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM(gctx)) { + if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0)) + return -1; + +@@ -3160,7 +3177,7 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + } else { + size_t bulk = 0; + #if defined(AES_GCM_ASM2) +- if (len >= 16 && AES_GCM_ASM2(gctx)) { ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM2(gctx)) { + if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0)) + return -1; + +@@ -3211,7 +3228,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + if (gctx->ctr) { + size_t bulk = 0; + #if defined(AES_GCM_ASM) +- if (len >= 32 && AES_GCM_ASM(gctx)) { ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM(gctx)) { + size_t res = (16 - gctx->gcm.mres) % 16; + + if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) +@@ -3233,7 +3250,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + } else { + size_t bulk = 0; + #if defined(AES_GCM_ASM2) +- if (len >= 32 && AES_GCM_ASM2(gctx)) { ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM2(gctx)) { + size_t res = (16 - gctx->gcm.mres) % 16; + + if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) +@@ -3255,7 +3272,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + if (gctx->ctr) { + size_t bulk = 0; + #if defined(AES_GCM_ASM) +- if (len >= 16 && AES_GCM_ASM(gctx)) { ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM(gctx)) { + size_t res = (16 - gctx->gcm.mres) % 16; + + if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) +@@ -3277,7 +3294,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + } else { + size_t bulk = 0; + #if defined(AES_GCM_ASM2) +- if (len >= 16 && AES_GCM_ASM2(gctx)) { ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM2(gctx)) { + size_t res = (16 - gctx->gcm.mres) % 16; + + if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) +--- /dev/null ++++ b/crypto/modes/asm/aes-gcm-ppc.pl +@@ -0,0 +1,1439 @@ ++#! 
/usr/bin/env perl ++# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. ++# Copyright 2021- IBM Inc. All rights reserved ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++#=================================================================================== ++# Written by Danny Tsen for OpenSSL Project, ++# ++# GHASH is based on the Karatsuba multiplication method. ++# ++# Xi xor X1 ++# ++# X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H = ++# (X1.h * H4.h + xX.l * H4.l + X1 * H4) + ++# (X2.h * H3.h + X2.l * H3.l + X2 * H3) + ++# (X3.h * H2.h + X3.l * H2.l + X3 * H2) + ++# (X4.h * H.h + X4.l * H.l + X4 * H) ++# ++# Xi = v0 ++# H Poly = v2 ++# Hash keys = v3 - v14 ++# ( H.l, H, H.h) ++# ( H^2.l, H^2, H^2.h) ++# ( H^3.l, H^3, H^3.h) ++# ( H^4.l, H^4, H^4.h) ++# ++# v30 is IV ++# v31 - counter 1 ++# ++# AES used, ++# vs0 - vs14 for round keys ++# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) ++# ++# This implementation uses stitched AES-GCM approach to improve overall performance. ++# AES is implemented with 8x blocks and GHASH is using 2 4x blocks. ++# ++# Current large block (16384 bytes) performance per second with 128 bit key -- ++# ++# Encrypt Decrypt ++# Power10[le] (3.5GHz) 5.32G 5.26G ++# ++# =================================================================================== ++# ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++if ($flavour =~ /64/) { ++ $SIZE_T=8; ++ $LRSAVE=2*$SIZE_T; ++ $STU="stdu"; ++ $POP="ld"; ++ $PUSH="std"; ++ $UCMP="cmpld"; ++ $SHRI="srdi"; ++} elsif ($flavour =~ /32/) { ++ $SIZE_T=4; ++ $LRSAVE=$SIZE_T; ++ $STU="stwu"; ++ $POP="lwz"; ++ $PUSH="stw"; ++ $UCMP="cmplw"; ++ $SHRI="srwi"; ++} else { die "nonsense $flavour"; } ++ ++$sp="r1"; ++$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open STDOUT,"| $^X $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++ ++$code=<<___; ++.machine "any" ++.abiversion 2 ++.text ++ ++# 4x loops ++# v15 - v18 - input states ++# vs1 - vs9 - round keys ++# ++.macro Loop_aes_middle4x ++ xxlor 19+32, 1, 1 ++ xxlor 20+32, 2, 2 ++ xxlor 21+32, 3, 3 ++ xxlor 22+32, 4, 4 ++ ++ vcipher 15, 15, 19 ++ vcipher 16, 16, 19 ++ vcipher 17, 17, 19 ++ vcipher 18, 18, 19 ++ ++ vcipher 15, 15, 20 ++ vcipher 16, 16, 20 ++ vcipher 17, 17, 20 ++ vcipher 18, 18, 20 ++ ++ vcipher 15, 15, 21 ++ vcipher 16, 16, 21 ++ vcipher 17, 17, 21 ++ vcipher 18, 18, 21 ++ ++ vcipher 15, 15, 22 ++ vcipher 16, 16, 22 ++ vcipher 17, 17, 22 ++ vcipher 18, 18, 22 ++ ++ xxlor 19+32, 5, 5 ++ xxlor 20+32, 6, 6 ++ xxlor 21+32, 7, 7 ++ xxlor 22+32, 8, 8 ++ ++ vcipher 15, 15, 19 ++ vcipher 16, 16, 19 ++ vcipher 17, 17, 19 ++ vcipher 18, 18, 19 ++ ++ vcipher 15, 15, 20 ++ vcipher 16, 16, 20 ++ vcipher 17, 17, 20 ++ vcipher 18, 18, 20 ++ ++ vcipher 15, 15, 21 ++ vcipher 16, 16, 21 ++ vcipher 17, 17, 21 ++ vcipher 18, 18, 21 ++ ++ vcipher 15, 15, 22 ++ vcipher 16, 16, 22 ++ vcipher 17, 17, 22 ++ vcipher 18, 18, 22 ++ ++ xxlor 23+32, 9, 9 ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++.endm ++ ++# 8x loops ++# v15 - v22 - input states ++# vs1 - vs9 - round keys ++# ++.macro Loop_aes_middle8x ++ xxlor 23+32, 1, 1 ++ xxlor 24+32, 2, 2 ++ xxlor 25+32, 3, 3 ++ xxlor 26+32, 4, 4 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ vcipher 15, 15, 25 ++ vcipher 16, 16, 25 ++ vcipher 17, 17, 25 ++ vcipher 18, 18, 25 ++ vcipher 19, 19, 25 ++ vcipher 20, 20, 25 ++ vcipher 21, 21, 25 ++ vcipher 22, 22, 25 ++ ++ vcipher 15, 15, 26 ++ vcipher 16, 16, 26 ++ vcipher 17, 17, 26 ++ vcipher 18, 18, 26 ++ vcipher 19, 19, 26 ++ vcipher 20, 20, 26 ++ vcipher 21, 21, 26 ++ vcipher 22, 22, 26 ++ ++ xxlor 23+32, 5, 5 ++ xxlor 24+32, 6, 6 ++ xxlor 25+32, 7, 7 ++ xxlor 26+32, 8, 8 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ vcipher 15, 15, 25 ++ vcipher 16, 16, 25 ++ vcipher 17, 17, 25 ++ vcipher 18, 18, 25 ++ vcipher 19, 19, 25 ++ vcipher 20, 20, 25 ++ vcipher 21, 21, 25 ++ vcipher 22, 22, 25 ++ ++ vcipher 15, 15, 26 ++ vcipher 16, 16, 26 ++ vcipher 17, 17, 26 ++ vcipher 18, 18, 26 ++ vcipher 19, 19, 26 ++ vcipher 20, 20, 26 ++ vcipher 
21, 21, 26 ++ vcipher 22, 22, 26 ++ ++ xxlor 23+32, 9, 9 ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++.endm ++ ++# ++# Compute 4x hash values based on Karatsuba method. ++# ++ppc_aes_gcm_ghash: ++ vxor 15, 15, 0 ++ ++ xxlxor 29, 29, 29 ++ ++ vpmsumd 23, 12, 15 # H4.L * X.L ++ vpmsumd 24, 9, 16 ++ vpmsumd 25, 6, 17 ++ vpmsumd 26, 3, 18 ++ ++ vxor 23, 23, 24 ++ vxor 23, 23, 25 ++ vxor 23, 23, 26 # L ++ ++ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L ++ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L ++ vpmsumd 26, 7, 17 ++ vpmsumd 27, 4, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 # M ++ ++ # sum hash and reduction with H Poly ++ vpmsumd 28, 23, 2 # reduction ++ ++ xxlor 29+32, 29, 29 ++ vsldoi 26, 24, 29, 8 # mL ++ vsldoi 29, 29, 24, 8 # mH ++ vxor 23, 23, 26 # mL + L ++ ++ vsldoi 23, 23, 23, 8 # swap ++ vxor 23, 23, 28 ++ ++ vpmsumd 24, 14, 15 # H4.H * X.H ++ vpmsumd 25, 11, 16 ++ vpmsumd 26, 8, 17 ++ vpmsumd 27, 5, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 ++ ++ vxor 24, 24, 29 ++ ++ # sum hash and reduction with H Poly ++ vsldoi 27, 23, 23, 8 # swap ++ vpmsumd 23, 23, 2 ++ vxor 27, 27, 24 ++ vxor 23, 23, 27 ++ ++ xxlor 32, 23+32, 23+32 # update hash ++ ++ blr ++ ++# ++# Combine two 4x ghash ++# v15 - v22 - input blocks ++# ++.macro ppc_aes_gcm_ghash2_4x ++ # first 4x hash ++ vxor 15, 15, 0 # Xi + X ++ ++ xxlxor 29, 29, 29 ++ ++ vpmsumd 23, 12, 15 # H4.L * X.L ++ vpmsumd 24, 9, 16 ++ vpmsumd 25, 6, 17 ++ vpmsumd 26, 3, 18 ++ ++ vxor 23, 23, 24 ++ vxor 23, 23, 25 ++ vxor 23, 23, 26 # L ++ ++ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L ++ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L ++ vpmsumd 26, 7, 17 ++ vpmsumd 27, 4, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ ++ # sum hash and reduction with H Poly ++ vpmsumd 28, 23, 2 # reduction ++ ++ xxlor 29+32, 29, 29 ++ ++ vxor 24, 24, 27 # M ++ vsldoi 26, 24, 29, 8 # mL ++ vsldoi 29, 29, 24, 8 # mH ++ vxor 23, 23, 26 # mL + L ++ ++ vsldoi 23, 23, 23, 8 # swap ++ vxor 23, 23, 28 ++ ++ vpmsumd 24, 14, 15 # H4.H * X.H ++ vpmsumd 25, 11, 16 ++ vpmsumd 26, 8, 17 ++ vpmsumd 27, 5, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 # H ++ ++ vxor 24, 24, 29 # H + mH ++ ++ # sum hash and reduction with H Poly ++ vsldoi 27, 23, 23, 8 # swap ++ vpmsumd 23, 23, 2 ++ vxor 27, 27, 24 ++ vxor 27, 23, 27 # 1st Xi ++ ++ # 2nd 4x hash ++ vpmsumd 24, 9, 20 ++ vpmsumd 25, 6, 21 ++ vpmsumd 26, 3, 22 ++ vxor 19, 19, 27 # Xi + X ++ vpmsumd 23, 12, 19 # H4.L * X.L ++ ++ vxor 23, 23, 24 ++ vxor 23, 23, 25 ++ vxor 23, 23, 26 # L ++ ++ vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L ++ vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L ++ vpmsumd 26, 7, 21 ++ vpmsumd 27, 4, 22 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ ++ # sum hash and reduction with H Poly ++ vpmsumd 28, 23, 2 # reduction ++ ++ xxlor 29+32, 29, 29 ++ ++ vxor 24, 24, 27 # M ++ vsldoi 26, 24, 29, 8 # mL ++ vsldoi 29, 29, 24, 8 # mH ++ vxor 23, 23, 26 # mL + L ++ ++ vsldoi 23, 23, 23, 8 # swap ++ vxor 23, 23, 28 ++ ++ vpmsumd 24, 14, 19 # H4.H * X.H ++ vpmsumd 25, 11, 20 ++ vpmsumd 26, 8, 21 ++ vpmsumd 27, 5, 22 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 # H ++ ++ vxor 24, 24, 29 # H + mH ++ ++ # sum hash and reduction with H Poly ++ vsldoi 27, 23, 23, 8 # swap ++ vpmsumd 23, 23, 2 ++ vxor 27, 27, 24 ++ vxor 23, 23, 27 ++ ++ xxlor 32, 23+32, 23+32 # update hash ++ ++.endm ++ ++# ++# Compute update single 
hash ++# ++.macro ppc_update_hash_1x ++ vxor 28, 28, 0 ++ ++ vxor 19, 19, 19 ++ ++ vpmsumd 22, 3, 28 # L ++ vpmsumd 23, 4, 28 # M ++ vpmsumd 24, 5, 28 # H ++ ++ vpmsumd 27, 22, 2 # reduction ++ ++ vsldoi 25, 23, 19, 8 # mL ++ vsldoi 26, 19, 23, 8 # mH ++ vxor 22, 22, 25 # LL + LL ++ vxor 24, 24, 26 # HH + HH ++ ++ vsldoi 22, 22, 22, 8 # swap ++ vxor 22, 22, 27 ++ ++ vsldoi 20, 22, 22, 8 # swap ++ vpmsumd 22, 22, 2 # reduction ++ vxor 20, 20, 24 ++ vxor 22, 22, 20 ++ ++ vmr 0, 22 # update hash ++ ++.endm ++ ++# ++# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len, ++# const AES_KEY *key, unsigned char iv[16], ++# void *Xip); ++# ++# r3 - inp ++# r4 - out ++# r5 - len ++# r6 - AES round keys ++# r7 - iv ++# r8 - Xi, HPoli, hash keys ++# ++.global ppc_aes_gcm_encrypt ++.align 5 ++ppc_aes_gcm_encrypt: ++_ppc_aes_gcm_encrypt: ++ ++ stdu 1,-512(1) ++ mflr 0 ++ ++ std 14,112(1) ++ std 15,120(1) ++ std 16,128(1) ++ std 17,136(1) ++ std 18,144(1) ++ std 19,152(1) ++ std 20,160(1) ++ std 21,168(1) ++ li 9, 256 ++ stvx 20, 9, 1 ++ addi 9, 9, 16 ++ stvx 21, 9, 1 ++ addi 9, 9, 16 ++ stvx 22, 9, 1 ++ addi 9, 9, 16 ++ stvx 23, 9, 1 ++ addi 9, 9, 16 ++ stvx 24, 9, 1 ++ addi 9, 9, 16 ++ stvx 25, 9, 1 ++ addi 9, 9, 16 ++ stvx 26, 9, 1 ++ addi 9, 9, 16 ++ stvx 27, 9, 1 ++ addi 9, 9, 16 ++ stvx 28, 9, 1 ++ addi 9, 9, 16 ++ stvx 29, 9, 1 ++ addi 9, 9, 16 ++ stvx 30, 9, 1 ++ addi 9, 9, 16 ++ stvx 31, 9, 1 ++ std 0, 528(1) ++ ++ # Load Xi ++ lxvb16x 32, 0, 8 # load Xi ++ ++ # load Hash - h^4, h^3, h^2, h ++ li 10, 32 ++ lxvd2x 2+32, 10, 8 # H Poli ++ li 10, 48 ++ lxvd2x 3+32, 10, 8 # Hl ++ li 10, 64 ++ lxvd2x 4+32, 10, 8 # H ++ li 10, 80 ++ lxvd2x 5+32, 10, 8 # Hh ++ ++ li 10, 96 ++ lxvd2x 6+32, 10, 8 # H^2l ++ li 10, 112 ++ lxvd2x 7+32, 10, 8 # H^2 ++ li 10, 128 ++ lxvd2x 8+32, 10, 8 # H^2h ++ ++ li 10, 144 ++ lxvd2x 9+32, 10, 8 # H^3l ++ li 10, 160 ++ lxvd2x 10+32, 10, 8 # H^3 ++ li 10, 176 ++ lxvd2x 11+32, 10, 8 # H^3h ++ ++ li 10, 192 ++ lxvd2x 12+32, 10, 8 # H^4l ++ li 10, 208 ++ lxvd2x 13+32, 10, 8 # H^4 ++ li 10, 224 ++ lxvd2x 14+32, 10, 8 # H^4h ++ ++ # initialize ICB: GHASH( IV ), IV - r7 ++ lxvb16x 30+32, 0, 7 # load IV - v30 ++ ++ mr 12, 5 # length ++ li 11, 0 # block index ++ ++ # counter 1 ++ vxor 31, 31, 31 ++ vspltisb 22, 1 ++ vsldoi 31, 31, 22,1 # counter 1 ++ ++ # load round key to VSR ++ lxv 0, 0(6) ++ lxv 1, 0x10(6) ++ lxv 2, 0x20(6) ++ lxv 3, 0x30(6) ++ lxv 4, 0x40(6) ++ lxv 5, 0x50(6) ++ lxv 6, 0x60(6) ++ lxv 7, 0x70(6) ++ lxv 8, 0x80(6) ++ lxv 9, 0x90(6) ++ lxv 10, 0xa0(6) ++ ++ # load rounds - 10 (128), 12 (192), 14 (256) ++ lwz 9,240(6) ++ ++ # ++ # vxor state, state, w # addroundkey ++ xxlor 32+29, 0, 0 ++ vxor 15, 30, 29 # IV + round key - add round key 0 ++ ++ cmpdi 9, 10 ++ beq Loop_aes_gcm_8x ++ ++ # load 2 more round keys (v11, v12) ++ lxv 11, 0xb0(6) ++ lxv 12, 0xc0(6) ++ ++ cmpdi 9, 12 ++ beq Loop_aes_gcm_8x ++ ++ # load 2 more round keys (v11, v12, v13, v14) ++ lxv 13, 0xd0(6) ++ lxv 14, 0xe0(6) ++ cmpdi 9, 14 ++ beq Loop_aes_gcm_8x ++ ++ b aes_gcm_out ++ ++.align 5 ++Loop_aes_gcm_8x: ++ mr 14, 3 ++ mr 9, 4 ++ ++ # n blcoks ++ li 10, 128 ++ divdu 10, 5, 10 # n 128 bytes-blocks ++ cmpdi 10, 0 ++ beq Loop_last_block ++ ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 16, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 29 ++ ++ mtctr 10 ++ ++ li 15, 16 ++ li 16, 32 ++ li 17, 48 
++ li 18, 64 ++ li 19, 80 ++ li 20, 96 ++ li 21, 112 ++ ++ lwz 10, 240(6) ++ ++Loop_8x_block: ++ ++ lxvb16x 15, 0, 14 # load block ++ lxvb16x 16, 15, 14 # load block ++ lxvb16x 17, 16, 14 # load block ++ lxvb16x 18, 17, 14 # load block ++ lxvb16x 19, 18, 14 # load block ++ lxvb16x 20, 19, 14 # load block ++ lxvb16x 21, 20, 14 # load block ++ lxvb16x 22, 21, 14 # load block ++ addi 14, 14, 128 ++ ++ Loop_aes_middle8x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_next_ghash ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_next_ghash ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_next_ghash ++ b aes_gcm_out ++ ++Do_next_ghash: ++ ++ # ++ # last round ++ vcipherlast 15, 15, 23 ++ vcipherlast 16, 16, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ xxlxor 48, 48, 16 ++ stxvb16x 48, 15, 9 # store output ++ ++ vcipherlast 17, 17, 23 ++ vcipherlast 18, 18, 23 ++ ++ xxlxor 49, 49, 17 ++ stxvb16x 49, 16, 9 # store output ++ xxlxor 50, 50, 18 ++ stxvb16x 50, 17, 9 # store output ++ ++ vcipherlast 19, 19, 23 ++ vcipherlast 20, 20, 23 ++ ++ xxlxor 51, 51, 19 ++ stxvb16x 51, 18, 9 # store output ++ xxlxor 52, 52, 20 ++ stxvb16x 52, 19, 9 # store output ++ ++ vcipherlast 21, 21, 23 ++ vcipherlast 22, 22, 23 ++ ++ xxlxor 53, 53, 21 ++ stxvb16x 53, 20, 9 # store output ++ xxlxor 54, 54, 22 ++ stxvb16x 54, 21, 9 # store output ++ ++ addi 9, 9, 128 ++ ++ # ghash here ++ ppc_aes_gcm_ghash2_4x ++ ++ xxlor 27+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vmr 29, 30 ++ vxor 15, 30, 27 # add round key ++ vaddudm 30, 30, 31 ++ vxor 16, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 27 ++ ++ addi 12, 12, -128 ++ addi 11, 11, 128 ++ ++ bdnz Loop_8x_block ++ ++ vmr 30, 29 ++ ++Loop_last_block: ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++ # loop last few blocks ++ li 10, 16 ++ divdu 10, 12, 10 ++ ++ mtctr 10 ++ ++ lwz 10, 240(6) ++ ++ cmpdi 12, 16 ++ blt Final_block ++ ++.macro Loop_aes_middle_1x ++ xxlor 19+32, 1, 1 ++ xxlor 20+32, 2, 2 ++ xxlor 21+32, 3, 3 ++ xxlor 22+32, 4, 4 ++ ++ vcipher 15, 15, 19 ++ vcipher 15, 15, 20 ++ vcipher 15, 15, 21 ++ vcipher 15, 15, 22 ++ ++ xxlor 19+32, 5, 5 ++ xxlor 20+32, 6, 6 ++ xxlor 21+32, 7, 7 ++ xxlor 22+32, 8, 8 ++ ++ vcipher 15, 15, 19 ++ vcipher 15, 15, 20 ++ vcipher 15, 15, 21 ++ vcipher 15, 15, 22 ++ ++ xxlor 19+32, 9, 9 ++ vcipher 15, 15, 19 ++.endm ++ ++Next_rem_block: ++ lxvb16x 15, 0, 14 # load block ++ ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_next_1x ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 
++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_next_1x ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_next_1x ++ ++Do_next_1x: ++ vcipherlast 15, 15, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ addi 14, 14, 16 ++ addi 9, 9, 16 ++ ++ vmr 28, 15 ++ ppc_update_hash_1x ++ ++ addi 12, 12, -16 ++ addi 11, 11, 16 ++ xxlor 19+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 15, 30, 19 # add round key ++ ++ bdnz Next_rem_block ++ ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++Final_block: ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_final_1x ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_final_1x ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_final_1x ++ ++Do_final_1x: ++ vcipherlast 15, 15, 23 ++ ++ lxvb16x 15, 0, 14 # load last block ++ xxlxor 47, 47, 15 ++ ++ # create partial block mask ++ li 15, 16 ++ sub 15, 15, 12 # index to the mask ++ ++ vspltisb 16, -1 # first 16 bytes - 0xffff...ff ++ vspltisb 17, 0 # second 16 bytes - 0x0000...00 ++ li 10, 192 ++ stvx 16, 10, 1 ++ addi 10, 10, 16 ++ stvx 17, 10, 1 ++ ++ addi 10, 1, 192 ++ lxvb16x 16, 15, 10 # load partial block mask ++ xxland 47, 47, 16 ++ ++ vmr 28, 15 ++ ppc_update_hash_1x ++ ++ # * should store only the remaining bytes. ++ bl Write_partial_block ++ ++ b aes_gcm_out ++ ++# ++# Write partial block ++# r9 - output ++# r12 - remaining bytes ++# v15 - partial input data ++# ++Write_partial_block: ++ li 10, 192 ++ stxvb16x 15+32, 10, 1 # last block ++ ++ #add 10, 9, 11 # Output ++ addi 10, 9, -1 ++ addi 16, 1, 191 ++ ++ mtctr 12 # remaining bytes ++ li 15, 0 ++ ++Write_last_byte: ++ lbzu 14, 1(16) ++ stbu 14, 1(10) ++ bdnz Write_last_byte ++ blr ++ ++aes_gcm_out: ++ # out = state ++ stxvb16x 32, 0, 8 # write out Xi ++ add 3, 11, 12 # return count ++ ++ li 9, 256 ++ lvx 20, 9, 1 ++ addi 9, 9, 16 ++ lvx 21, 9, 1 ++ addi 9, 9, 16 ++ lvx 22, 9, 1 ++ addi 9, 9, 16 ++ lvx 23, 9, 1 ++ addi 9, 9, 16 ++ lvx 24, 9, 1 ++ addi 9, 9, 16 ++ lvx 25, 9, 1 ++ addi 9, 9, 16 ++ lvx 26, 9, 1 ++ addi 9, 9, 16 ++ lvx 27, 9, 1 ++ addi 9, 9, 16 ++ lvx 28, 9, 1 ++ addi 9, 9, 16 ++ lvx 29, 9, 1 ++ addi 9, 9, 16 ++ lvx 30, 9, 1 ++ addi 9, 9, 16 ++ lvx 31, 9, 1 ++ ++ ld 0, 528(1) ++ ld 14,112(1) ++ ld 15,120(1) ++ ld 16,128(1) ++ ld 17,136(1) ++ ld 18,144(1) ++ ld 19,152(1) ++ ld 20,160(1) ++ ld 21,168(1) ++ ++ mtlr 0 ++ addi 1, 1, 512 ++ blr ++ ++# ++# 8x Decrypt ++# ++.global ppc_aes_gcm_decrypt ++.align 5 ++ppc_aes_gcm_decrypt: ++_ppc_aes_gcm_decrypt: ++ ++ stdu 1,-512(1) ++ mflr 0 ++ ++ std 14,112(1) ++ std 15,120(1) ++ std 16,128(1) ++ std 17,136(1) ++ std 18,144(1) ++ std 19,152(1) ++ std 20,160(1) ++ std 21,168(1) ++ li 9, 256 ++ stvx 20, 9, 1 ++ addi 9, 9, 16 ++ stvx 21, 9, 1 ++ addi 9, 9, 16 ++ stvx 22, 9, 1 ++ addi 9, 9, 16 ++ stvx 23, 9, 1 ++ addi 9, 9, 16 ++ stvx 24, 9, 1 ++ addi 9, 9, 16 ++ stvx 25, 9, 1 ++ addi 9, 9, 16 ++ stvx 26, 9, 1 ++ addi 9, 9, 16 ++ stvx 27, 9, 1 ++ addi 9, 9, 16 ++ stvx 28, 9, 1 ++ addi 9, 9, 16 ++ stvx 29, 9, 1 ++ addi 9, 9, 16 ++ stvx 30, 9, 1 ++ addi 9, 9, 16 ++ stvx 31, 9, 1 ++ std 0, 528(1) ++ ++ # Load Xi ++ lxvb16x 32, 0, 8 # load Xi ++ ++ # load Hash - h^4, h^3, h^2, h ++ li 10, 32 ++ lxvd2x 2+32, 10, 8 # H Poli ++ li 10, 48 ++ lxvd2x 3+32, 10, 8 # Hl ++ li 10, 
64 ++ lxvd2x 4+32, 10, 8 # H ++ li 10, 80 ++ lxvd2x 5+32, 10, 8 # Hh ++ ++ li 10, 96 ++ lxvd2x 6+32, 10, 8 # H^2l ++ li 10, 112 ++ lxvd2x 7+32, 10, 8 # H^2 ++ li 10, 128 ++ lxvd2x 8+32, 10, 8 # H^2h ++ ++ li 10, 144 ++ lxvd2x 9+32, 10, 8 # H^3l ++ li 10, 160 ++ lxvd2x 10+32, 10, 8 # H^3 ++ li 10, 176 ++ lxvd2x 11+32, 10, 8 # H^3h ++ ++ li 10, 192 ++ lxvd2x 12+32, 10, 8 # H^4l ++ li 10, 208 ++ lxvd2x 13+32, 10, 8 # H^4 ++ li 10, 224 ++ lxvd2x 14+32, 10, 8 # H^4h ++ ++ # initialize ICB: GHASH( IV ), IV - r7 ++ lxvb16x 30+32, 0, 7 # load IV - v30 ++ ++ mr 12, 5 # length ++ li 11, 0 # block index ++ ++ # counter 1 ++ vxor 31, 31, 31 ++ vspltisb 22, 1 ++ vsldoi 31, 31, 22,1 # counter 1 ++ ++ # load round key to VSR ++ lxv 0, 0(6) ++ lxv 1, 0x10(6) ++ lxv 2, 0x20(6) ++ lxv 3, 0x30(6) ++ lxv 4, 0x40(6) ++ lxv 5, 0x50(6) ++ lxv 6, 0x60(6) ++ lxv 7, 0x70(6) ++ lxv 8, 0x80(6) ++ lxv 9, 0x90(6) ++ lxv 10, 0xa0(6) ++ ++ # load rounds - 10 (128), 12 (192), 14 (256) ++ lwz 9,240(6) ++ ++ # ++ # vxor state, state, w # addroundkey ++ xxlor 32+29, 0, 0 ++ vxor 15, 30, 29 # IV + round key - add round key 0 ++ ++ cmpdi 9, 10 ++ beq Loop_aes_gcm_8x_dec ++ ++ # load 2 more round keys (v11, v12) ++ lxv 11, 0xb0(6) ++ lxv 12, 0xc0(6) ++ ++ cmpdi 9, 12 ++ beq Loop_aes_gcm_8x_dec ++ ++ # load 2 more round keys (v11, v12, v13, v14) ++ lxv 13, 0xd0(6) ++ lxv 14, 0xe0(6) ++ cmpdi 9, 14 ++ beq Loop_aes_gcm_8x_dec ++ ++ b aes_gcm_out ++ ++.align 5 ++Loop_aes_gcm_8x_dec: ++ mr 14, 3 ++ mr 9, 4 ++ ++ # n blcoks ++ li 10, 128 ++ divdu 10, 5, 10 # n 128 bytes-blocks ++ cmpdi 10, 0 ++ beq Loop_last_block_dec ++ ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 16, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 29 ++ ++ mtctr 10 ++ ++ li 15, 16 ++ li 16, 32 ++ li 17, 48 ++ li 18, 64 ++ li 19, 80 ++ li 20, 96 ++ li 21, 112 ++ ++ lwz 10, 240(6) ++ ++Loop_8x_block_dec: ++ ++ lxvb16x 15, 0, 14 # load block ++ lxvb16x 16, 15, 14 # load block ++ lxvb16x 17, 16, 14 # load block ++ lxvb16x 18, 17, 14 # load block ++ lxvb16x 19, 18, 14 # load block ++ lxvb16x 20, 19, 14 # load block ++ lxvb16x 21, 20, 14 # load block ++ lxvb16x 22, 21, 14 # load block ++ addi 14, 14, 128 ++ ++ Loop_aes_middle8x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_last_aes_dec ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_last_aes_dec ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_last_aes_dec ++ b aes_gcm_out ++ ++Do_last_aes_dec: ++ ++ # ++ # last round ++ vcipherlast 15, 15, 23 ++ vcipherlast 16, 16, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store 
output ++ xxlxor 48, 48, 16 ++ stxvb16x 48, 15, 9 # store output ++ ++ vcipherlast 17, 17, 23 ++ vcipherlast 18, 18, 23 ++ ++ xxlxor 49, 49, 17 ++ stxvb16x 49, 16, 9 # store output ++ xxlxor 50, 50, 18 ++ stxvb16x 50, 17, 9 # store output ++ ++ vcipherlast 19, 19, 23 ++ vcipherlast 20, 20, 23 ++ ++ xxlxor 51, 51, 19 ++ stxvb16x 51, 18, 9 # store output ++ xxlxor 52, 52, 20 ++ stxvb16x 52, 19, 9 # store output ++ ++ vcipherlast 21, 21, 23 ++ vcipherlast 22, 22, 23 ++ ++ xxlxor 53, 53, 21 ++ stxvb16x 53, 20, 9 # store output ++ xxlxor 54, 54, 22 ++ stxvb16x 54, 21, 9 # store output ++ ++ addi 9, 9, 128 ++ ++ xxlor 15+32, 15, 15 ++ xxlor 16+32, 16, 16 ++ xxlor 17+32, 17, 17 ++ xxlor 18+32, 18, 18 ++ xxlor 19+32, 19, 19 ++ xxlor 20+32, 20, 20 ++ xxlor 21+32, 21, 21 ++ xxlor 22+32, 22, 22 ++ ++ # ghash here ++ ppc_aes_gcm_ghash2_4x ++ ++ xxlor 27+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vmr 29, 30 ++ vxor 15, 30, 27 # add round key ++ vaddudm 30, 30, 31 ++ vxor 16, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 27 ++ addi 12, 12, -128 ++ addi 11, 11, 128 ++ ++ bdnz Loop_8x_block_dec ++ ++ vmr 30, 29 ++ ++Loop_last_block_dec: ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++ # loop last few blocks ++ li 10, 16 ++ divdu 10, 12, 10 ++ ++ mtctr 10 ++ ++ lwz 10,240(6) ++ ++ cmpdi 12, 16 ++ blt Final_block_dec ++ ++Next_rem_block_dec: ++ lxvb16x 15, 0, 14 # load block ++ ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_next_1x_dec ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_next_1x_dec ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_next_1x_dec ++ ++Do_next_1x_dec: ++ vcipherlast 15, 15, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ addi 14, 14, 16 ++ addi 9, 9, 16 ++ ++ xxlor 28+32, 15, 15 ++ ppc_update_hash_1x ++ ++ addi 12, 12, -16 ++ addi 11, 11, 16 ++ xxlor 19+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 15, 30, 19 # add round key ++ ++ bdnz Next_rem_block_dec ++ ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++Final_block_dec: ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_final_1x_dec ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_final_1x_dec ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_final_1x_dec ++ ++Do_final_1x_dec: ++ vcipherlast 15, 15, 23 ++ ++ lxvb16x 15, 0, 14 # load block ++ xxlxor 47, 47, 15 ++ ++ # create partial block mask ++ li 15, 16 ++ sub 15, 15, 12 # index to the mask ++ ++ vspltisb 16, -1 # first 16 bytes - 0xffff...ff ++ vspltisb 17, 0 # second 16 bytes - 0x0000...00 ++ li 10, 192 ++ stvx 16, 10, 1 ++ addi 10, 10, 16 ++ stvx 17, 10, 1 ++ ++ addi 10, 1, 192 ++ lxvb16x 16, 15, 10 # load block mask ++ xxland 47, 47, 16 ++ ++ xxlor 28+32, 15, 15 ++ ppc_update_hash_1x ++ ++ # * should store only the remaining bytes. 
++ bl Write_partial_block ++ ++ b aes_gcm_out ++ ++ ++___ ++ ++foreach (split("\n",$code)) { ++ s/\`([^\`]*)\`/eval $1/geo; ++ ++ if ($flavour =~ /le$/o) { # little-endian ++ s/le\?//o or ++ s/be\?/#be#/o; ++ } else { ++ s/le\?/#le#/o or ++ s/be\?//o; ++ } ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; # enforce flush +--- a/crypto/modes/build.info ++++ b/crypto/modes/build.info +@@ -16,6 +16,7 @@ INCLUDE[ghash-sparcv9.o]=.. + GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl $(PERLASM_SCHEME) + GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl $(PERLASM_SCHEME) + GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl $(PERLASM_SCHEME) ++GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl $(PERLASM_SCHEME) + GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl $(PERLASM_SCHEME) + INCLUDE[ghash-armv4.o]=.. + GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME) diff --git a/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch b/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch new file mode 100644 index 0000000..bba48b6 --- /dev/null +++ b/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch @@ -0,0 +1,230 @@ +From 9ab6b64ac856157a31a54c0d12207c2338bfa8e2 Mon Sep 17 00:00:00 2001 +From: Tomas Mraz +Date: Fri, 9 Sep 2022 14:46:24 +0200 +Subject: [PATCH] Fix AES-GCM on Power 8 CPUs + +Properly fallback to the default implementation on CPUs +missing necessary instructions. + +Fixes #19163 + +Reviewed-by: Dmitry Belyavskiy +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/19182) +--- + crypto/evp/e_aes.c | 179 +++++++++++++++++++++++++++++++---------------------- + 1 file changed, 107 insertions(+), 72 deletions(-) + +--- a/crypto/evp/e_aes.c ++++ b/crypto/evp/e_aes.c +@@ -181,30 +181,16 @@ static void ctr64_inc(unsigned char *cou + # define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300) + # define AES_GCM_ENC_BYTES 128 + # define AES_GCM_DEC_BYTES 128 +-# if PPC_AES_GCM_CAPABLE + size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, unsigned char ivec[16], + u64 *Xi); + size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, unsigned char ivec[16], + u64 *Xi); +-size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, +- size_t len, const void *key, +- unsigned char ivec[16], u64 *Xi); +-size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, +- size_t len, const void *key, +- unsigned char ivec[16], u64 *Xi); +-# define AES_gcm_encrypt ppc_aes_gcm_encrypt_wrap +-# define AES_gcm_decrypt ppc_aes_gcm_decrypt_wrap +-# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \ +- (gctx)->gcm.ghash==gcm_ghash_p8) ++# define AES_GCM_ASM_PPC(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \ ++ (gctx)->gcm.ghash==gcm_ghash_p8) + void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); + +-extern size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi); +-extern size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi); +- + static inline u32 UTO32(unsigned char *buf) + { + return ((u32) buf[0] << 24) | ((u32) buf[1] << 16) | ((u32) buf[2] << 8) | ((u32) buf[3]); +@@ -223,62 +209,6 @@ static inline u32 add32TOU(unsigned char + return r; + } + +-static size_t aes_p10_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char 
ivec[16], u64 *Xi, int encrypt) +-{ +- int s = 0; +- int ndone = 0; +- int ctr_reset = 0; +- u64 blocks_unused; +- u64 nb = len / 16; +- u64 next_ctr = 0; +- unsigned char ctr_saved[12]; +- +- memcpy(ctr_saved, ivec, 12); +- +- while (nb) { +- blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12); +- if (nb > blocks_unused) { +- len = blocks_unused * 16; +- nb -= blocks_unused; +- next_ctr = blocks_unused; +- ctr_reset = 1; +- } else { +- len = nb * 16; +- next_ctr = nb; +- nb = 0; +- } +- +- s = encrypt ? ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi) +- : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi); +- +- /* add counter to ivec */ +- add32TOU(ivec + 12, (u32) next_ctr); +- if (ctr_reset) { +- ctr_reset = 0; +- in += len; +- out += len; +- } +- memcpy(ivec, ctr_saved, 12); +- ndone += s; +- } +- +- return ndone; +-} +- +-size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi) +-{ +- return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 1); +-} +- +-size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi) +-{ +- return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 0); +-} +- +-# endif + #endif + + #if defined(OPENSSL_CPUID_OBJ) && ( \ +@@ -3294,9 +3224,114 @@ static int aes_gcm_tls_cipher(EVP_CIPHER + return rv; + } + ++static size_t ppc_aes_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi, int encrypt) ++{ ++ int s = 0; ++ int ndone = 0; ++ int ctr_reset = 0; ++ u64 blocks_unused; ++ u64 nb = len / 16; ++ u64 next_ctr = 0; ++ unsigned char ctr_saved[12]; ++ ++ memcpy(ctr_saved, ivec, 12); ++ ++ while (nb) { ++ blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12); ++ if (nb > blocks_unused) { ++ len = blocks_unused * 16; ++ nb -= blocks_unused; ++ next_ctr = blocks_unused; ++ ctr_reset = 1; ++ } else { ++ len = nb * 16; ++ next_ctr = nb; ++ nb = 0; ++ } ++ ++ s = encrypt ? 
ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi) ++ : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi); ++ ++ /* add counter to ivec */ ++ add32TOU(ivec + 12, (u32) next_ctr); ++ if (ctr_reset) { ++ ctr_reset = 0; ++ in += len; ++ out += len; ++ } ++ memcpy(ivec, ctr_saved, 12); ++ ndone += s; ++ } ++ ++ return ndone; ++} ++ ++#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) ++static int ppc_aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) ++{ ++ EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,ctx); ++ if (ctx->encrypt) { ++ if (gctx->ctr != NULL) { ++ size_t bulk = 0; ++ ++ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM_PPC(gctx)) { ++ size_t res = (16 - gctx->gcm.mres) % 16; ++ ++ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) ++ return 0; ++ ++ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, ++ gctx->gcm.key, ++ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 1); ++ ++ gctx->gcm.len.u[1] += bulk; ++ bulk += res; ++ } ++ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, in + bulk, out + bulk, ++ len - bulk, gctx->ctr)) ++ return 0; ++ } else { ++ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len)) ++ return 0; ++ } ++ } else { ++ if (gctx->ctr != NULL) { ++ size_t bulk = 0; ++ ++ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM_PPC(gctx)) { ++ size_t res = (16 - gctx->gcm.mres) % 16; ++ ++ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) ++ return -1; ++ ++ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, ++ gctx->gcm.key, ++ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 0); ++ ++ gctx->gcm.len.u[1] += bulk; ++ bulk += res; ++ } ++ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, in + bulk, out + bulk, ++ len - bulk, gctx->ctr)) ++ return 0; ++ } else { ++ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len)) ++ return 0; ++ } ++ } ++ return 1; ++} ++#endif ++ + static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { ++#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) ++ if (PPC_AES_GCM_CAPABLE) ++ return ppc_aes_gcm_cipher(ctx, out, in, len); ++#endif + EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,ctx); + /* If not set up, return error */ + if (!gctx->key_set) diff --git a/openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch b/openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch new file mode 100644 index 0000000..a727083 --- /dev/null +++ b/openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch @@ -0,0 +1,103 @@ +From 7e1f3ffcc5bc15fb9a12b9e3bb202f544c6ed5aa Mon Sep 17 00:00:00 2001 +From: Danny Tsen +Date: Wed, 23 Feb 2022 13:18:35 -0600 +Subject: [PATCH] Fixed conditional statement testing 64 and 256 bytes + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/17760) +--- + crypto/chacha/asm/chachap10-ppc.pl | 68 ------------------------------------- + 1 file changed, 1 insertion(+), 67 deletions(-) + +--- a/crypto/chacha/asm/chachap10-ppc.pl ++++ b/crypto/chacha/asm/chachap10-ppc.pl +@@ -101,29 +101,6 @@ my ($x00,$x10,$x20,$x30) = (0, map("r$_" + + my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload + +-sub VSX_lane_ROUND_1x { +-my $a=@_[0]; +-my $b=@_[1]; +-my $c=@_[2]; +-my $d=@_[3]; +-my $odd=@_[4]; +- vadduwm ($a,$a,$b); +- vxor ($d,$d,$a); +- vrlw ($d,$d,$sixteen); +- vadduwm ($c,$c,$d); +- vxor ($b,$b,$c); +- vrlw ($b,$b,$twelve); +- vadduwm ($a,$a,$b); +- vxor ($d,$d,$a); +- vrlw 
($d,$d,$eight); +- vadduwm ($c,$c,$d); +- vxor ($b,$b,$c); +- vrlw ($b,$b,$seven); +- xxsldwi ($c,$c,$c,2); +- xxsldwi ($b,$b,$b,$odd?3:1); +- xxsldwi ($d,$d,$d,$odd?1:3); +-} +- + + sub VSX_lane_ROUND_4x { + my ($a0,$b0,$c0,$d0)=@_; +@@ -192,7 +169,7 @@ $code.=<<___; + .globl .ChaCha20_ctr32_vsx_p10 + .align 5 + .ChaCha20_ctr32_vsx_p10: +- ${UCMP}i $len,256 ++ ${UCMP}i $len,255 + bgt ChaCha20_ctr32_vsx_8x + $STU $sp,-$FRAME($sp) + mflr r0 +@@ -268,49 +245,6 @@ Loop_outer_vsx: + vspltisw $eight,8 + vspltisw $seven,7 + +- ${UCMP}i $len,64 +- bgt Loop_vsx_4x +- +- vmr $xa0,@K[0] +- vmr $xb0,@K[1] +- vmr $xc0,@K[2] +- vmr $xd0,@K[3] +- +-Loop_vsx_1x: +-___ +- VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,0); +- VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,1); +- +-$code.=<<___; +- +- bdnz Loop_vsx_1x +- +- vadduwm $xa0, $xa0, @K[0] +- vadduwm $xb0, $xb0, @K[1] +- vadduwm $xc0, $xc0, @K[2] +- vadduwm $xd0, $xd0, @K[3] +- ${UCMP}i $len,0x40 +- blt Ltail_vsx +- +- lvx_4w $xt0,$x00, $inp +- lvx_4w $xt1,$x10, $inp +- lvx_4w $xt2,$x20, $inp +- lvx_4w $xt3,$x30, $inp +- +- vxor $xa0,$xa0,$xt0 +- vxor $xb0,$xb0,$xt1 +- vxor $xc0,$xc0,$xt2 +- vxor $xd0,$xd0,$xt3 +- +- stvx_4w $xa0,$x00,$out +- stvx_4w $xb0,$x10,$out +- addi $inp,$inp,0x40 +- stvx_4w $xc0,$x20,$out +- subi $len,$len,0x40 +- stvx_4w $xd0,$x30,$out +- addi $out,$out,0x40 +- beq Ldone_vsx +- + Loop_vsx_4x: + ___ + foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; } diff --git a/openssl-1_1-Fixed-counter-overflow.patch b/openssl-1_1-Fixed-counter-overflow.patch new file mode 100644 index 0000000..40d8213 --- /dev/null +++ b/openssl-1_1-Fixed-counter-overflow.patch @@ -0,0 +1,136 @@ +From 345c99b6654b8313c792d54f829943068911ddbd Mon Sep 17 00:00:00 2001 +From: Danny Tsen +Date: Thu, 27 Jan 2022 18:49:59 -0600 +Subject: [PATCH] Fixed counter overflow + +Reviewed-by: Tomas Mraz +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/17607) +--- + crypto/evp/e_aes.c | 101 +++++++++++++++++++++++++++++++++++++--- + crypto/modes/asm/aes-gcm-ppc.pl | 1 + 2 files changed, 94 insertions(+), 8 deletions(-) + +--- a/crypto/evp/e_aes.c ++++ b/crypto/evp/e_aes.c +@@ -181,16 +181,103 @@ static void ctr64_inc(unsigned char *cou + # define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300) + # define AES_GCM_ENC_BYTES 128 + # define AES_GCM_DEC_BYTES 128 +-size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi); +-size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, +- const void *key, unsigned char ivec[16], u64 *Xi); +-void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); + # if PPC_AES_GCM_CAPABLE +-# define AES_gcm_encrypt ppc_aes_gcm_encrypt +-# define AES_gcm_decrypt ppc_aes_gcm_decrypt ++size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, unsigned char ivec[16], ++ u64 *Xi); ++size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, unsigned char ivec[16], ++ u64 *Xi); ++size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ unsigned char ivec[16], u64 *Xi); ++size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ unsigned char ivec[16], u64 *Xi); ++# define AES_gcm_encrypt ppc_aes_gcm_encrypt_wrap ++# define AES_gcm_decrypt ppc_aes_gcm_decrypt_wrap + # define AES_GCM_ASM(gctx) 
((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \ + (gctx)->gcm.ghash==gcm_ghash_p8) ++void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); ++ ++extern size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++extern size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++ ++static inline u32 UTO32(unsigned char *buf) ++{ ++ return ((u32) buf[0] << 24) | ((u32) buf[1] << 16) | ((u32) buf[2] << 8) | ((u32) buf[3]); ++} ++ ++static inline u32 add32TOU(unsigned char buf[4], u32 n) ++{ ++ u32 r; ++ ++ r = UTO32(buf); ++ r += n; ++ buf[0] = (unsigned char) (r >> 24) & 0xFF; ++ buf[1] = (unsigned char) (r >> 16) & 0xFF; ++ buf[2] = (unsigned char) (r >> 8) & 0xFF; ++ buf[3] = (unsigned char) r & 0xFF; ++ return r; ++} ++ ++static size_t aes_p10_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi, int encrypt) ++{ ++ int s = 0; ++ int ndone = 0; ++ int ctr_reset = 0; ++ u64 blocks_unused; ++ u64 nb = len / 16; ++ u64 next_ctr = 0; ++ unsigned char ctr_saved[12]; ++ ++ memcpy(ctr_saved, ivec, 12); ++ ++ while (nb) { ++ blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12); ++ if (nb > blocks_unused) { ++ len = blocks_unused * 16; ++ nb -= blocks_unused; ++ next_ctr = blocks_unused; ++ ctr_reset = 1; ++ } else { ++ len = nb * 16; ++ next_ctr = nb; ++ nb = 0; ++ } ++ ++ s = encrypt ? ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi) ++ : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi); ++ ++ /* add counter to ivec */ ++ add32TOU(ivec + 12, (u32) next_ctr); ++ if (ctr_reset) { ++ ctr_reset = 0; ++ in += len; ++ out += len; ++ } ++ memcpy(ivec, ctr_saved, 12); ++ ndone += s; ++ } ++ ++ return ndone; ++} ++ ++size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi) ++{ ++ return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 1); ++} ++ ++size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi) ++{ ++ return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 0); ++} ++ + # endif + #endif + +--- a/crypto/modes/asm/aes-gcm-ppc.pl ++++ b/crypto/modes/asm/aes-gcm-ppc.pl +@@ -81,7 +81,6 @@ open STDOUT,"| $^X $xlate $flavour \"$ou + + $code=<<___; + .machine "any" +-.abiversion 2 + .text + + # 4x loops diff --git a/openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch b/openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch new file mode 100644 index 0000000..c28231c --- /dev/null +++ b/openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch @@ -0,0 +1,1535 @@ +From f596bbe4da779b56eea34d96168b557d78e1149a Mon Sep 17 00:00:00 2001 +From: Deepankar Bhattacharjee +Date: Mon, 20 Sep 2021 10:45:15 -0400 +Subject: [PATCH] chacha20 performance optimizations for ppc64le with 8x lanes, + Performance increase around 50%. 
+ +Co-authored-by: Madhusudhanan Duraisamy + +Co-authored-by: Nilamjyoti Goswami + +Co-authored-by: Siva Sundar Anbareeswaran + +Reviewed-by: Danny Tsen +Tested-by: Danny Tsen +Signed-off-by: Danny + +Reviewed-by: Tomas Mraz +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/16637) +--- + Configurations/00-base-templates.conf | 2 + crypto/chacha/asm/chachap10-ppc.pl | 1354 ++++++++++++++++++++++++++++++++++ + crypto/chacha/build.info | 1 + crypto/perlasm/ppc-xlate.pl | 17 + crypto/ppc_arch.h | 1 + crypto/ppccap.c | 24 + crypto/ppccpuid.pl | 11 + 7 files changed, 1404 insertions(+), 6 deletions(-) + create mode 100755 crypto/chacha/asm/chachap10-ppc.pl + +--- a/Configurations/00-base-templates.conf ++++ b/Configurations/00-base-templates.conf +@@ -345,7 +345,7 @@ my %targets=( + aes_asm_src => "aes_core.c aes_cbc.c aes-ppc.s vpaes-ppc.s aesp8-ppc.s", + sha1_asm_src => "sha1-ppc.s sha256-ppc.s sha512-ppc.s sha256p8-ppc.s sha512p8-ppc.s", + modes_asm_src => "ghashp8-ppc.s aes-gcm-ppc.s", +- chacha_asm_src => "chacha-ppc.s", ++ chacha_asm_src => "chacha-ppc.s chachap10-ppc.s", + poly1305_asm_src=> "poly1305-ppc.s poly1305-ppcfp.s", + }, + ppc64_asm => { +--- /dev/null ++++ b/crypto/chacha/asm/chachap10-ppc.pl +@@ -0,0 +1,1354 @@ ++#! /usr/bin/env perl ++# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# ==================================================================== ++# Written by Andy Polyakov for the OpenSSL ++# project. The module is, however, dual licensed under OpenSSL and ++# CRYPTOGAMS licenses depending on where you obtain it. For further ++# details see http://www.openssl.org/~appro/cryptogams/. ++# ==================================================================== ++# ++# October 2015 ++# ++# ChaCha20 for PowerPC/AltiVec. ++# ++# June 2018 ++# ++# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for ++# processors that can't issue more than one vector instruction per ++# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x ++# interleave would perform better. Incidentally PowerISA 2.07 (first ++# implemented by POWER8) defined new usable instructions, hence 4xVSX ++# code path... ++# ++# Performance in cycles per byte out of large buffer. ++# ++# IALU/gcc-4.x 3xAltiVec+1xIALU 4xVSX ++# ++# Freescale e300 13.6/+115% - - ++# PPC74x0/G4e 6.81/+310% 3.81 - ++# PPC970/G5 9.29/+160% ? - ++# POWER7 8.62/+61% 3.35 - ++# POWER8 8.70/+51% 2.91 2.09 ++# POWER9 8.80/+29% 4.44(*) 2.45(**) ++# ++# (*) this is trade-off result, it's possible to improve it, but ++# then it would negatively affect all others; ++# (**) POWER9 seems to be "allergic" to mixing vector and integer ++# instructions, which is why switch to vector-only code pays ++# off that much; ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++if ($flavour =~ /64/) { ++ $SIZE_T =8; ++ $LRSAVE =2*$SIZE_T; ++ $STU ="stdu"; ++ $POP ="ld"; ++ $PUSH ="std"; ++ $UCMP ="cmpld"; ++} elsif ($flavour =~ /32/) { ++ $SIZE_T =4; ++ $LRSAVE =$SIZE_T; ++ $STU ="stwu"; ++ $POP ="lwz"; ++ $PUSH ="stw"; ++ $UCMP ="cmplw"; ++} else { die "nonsense $flavour"; } ++ ++$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open STDOUT,"| $^X $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++ ++$LOCALS=6*$SIZE_T; ++$FRAME=$LOCALS+64+18*$SIZE_T; # 64 is for local variables ++ ++sub AUTOLOAD() # thunk [simplified] x86-style perlasm ++{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; ++ $code .= "\t$opcode\t".join(',',@_)."\n"; ++} ++ ++my $sp = "r1"; ++ ++my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7)); ++ ++ ++{{{ ++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, ++ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15)); ++my @K = map("v$_",(16..19)); ++my $CTR = "v26"; ++my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30)); ++my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3); ++my $beperm = "v31"; ++ ++my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); ++ ++my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload ++ ++sub VSX_lane_ROUND_1x { ++my $a=@_[0]; ++my $b=@_[1]; ++my $c=@_[2]; ++my $d=@_[3]; ++my $odd=@_[4]; ++ vadduwm ($a,$a,$b); ++ vxor ($d,$d,$a); ++ vrlw ($d,$d,$sixteen); ++ vadduwm ($c,$c,$d); ++ vxor ($b,$b,$c); ++ vrlw ($b,$b,$twelve); ++ vadduwm ($a,$a,$b); ++ vxor ($d,$d,$a); ++ vrlw ($d,$d,$eight); ++ vadduwm ($c,$c,$d); ++ vxor ($b,$b,$c); ++ vrlw ($b,$b,$seven); ++ xxsldwi ($c,$c,$c,2); ++ xxsldwi ($b,$b,$b,$odd?3:1); ++ xxsldwi ($d,$d,$d,$odd?1:3); ++} ++ ++ ++sub VSX_lane_ROUND_4x { ++my ($a0,$b0,$c0,$d0)=@_; ++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); ++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); ++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); ++my @x=map("\"v$_\"",(0..15)); ++ ++ ( ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vrlw (@x[$d0],@x[$d0],'$sixteen')", ++ "&vrlw (@x[$d1],@x[$d1],'$sixteen')", ++ "&vrlw (@x[$d2],@x[$d2],'$sixteen')", ++ "&vrlw (@x[$d3],@x[$d3],'$sixteen')", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vrlw (@x[$b0],@x[$b0],'$twelve')", ++ "&vrlw (@x[$b1],@x[$b1],'$twelve')", ++ "&vrlw (@x[$b2],@x[$b2],'$twelve')", ++ "&vrlw (@x[$b3],@x[$b3],'$twelve')", ++ ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vrlw (@x[$d0],@x[$d0],'$eight')", ++ "&vrlw 
(@x[$d1],@x[$d1],'$eight')", ++ "&vrlw (@x[$d2],@x[$d2],'$eight')", ++ "&vrlw (@x[$d3],@x[$d3],'$eight')", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vrlw (@x[$b0],@x[$b0],'$seven')", ++ "&vrlw (@x[$b1],@x[$b1],'$seven')", ++ "&vrlw (@x[$b2],@x[$b2],'$seven')", ++ "&vrlw (@x[$b3],@x[$b3],'$seven')" ++ ); ++} ++ ++$code.=<<___; ++ ++.globl .ChaCha20_ctr32_vsx_p10 ++.align 5 ++.ChaCha20_ctr32_vsx_p10: ++ ${UCMP}i $len,256 ++ bgt ChaCha20_ctr32_vsx_8x ++ $STU $sp,-$FRAME($sp) ++ mflr r0 ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ mfspr r12,256 ++ stvx v26,r10,$sp ++ addi r10,r10,32 ++ stvx v27,r11,$sp ++ addi r11,r11,32 ++ stvx v28,r10,$sp ++ addi r10,r10,32 ++ stvx v29,r11,$sp ++ addi r11,r11,32 ++ stvx v30,r10,$sp ++ stvx v31,r11,$sp ++ stw r12,`$FRAME-4`($sp) # save vrsave ++ li r12,-4096+63 ++ $PUSH r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # preserve 29 AltiVec registers ++ ++ bl Lconsts # returns pointer Lsigma in r12 ++ lvx_4w @K[0],0,r12 # load sigma ++ addi r12,r12,0x70 ++ li $x10,16 ++ li $x20,32 ++ li $x30,48 ++ li r11,64 ++ ++ lvx_4w @K[1],0,$key # load key ++ lvx_4w @K[2],$x10,$key ++ lvx_4w @K[3],0,$ctr # load counter ++ ++ vxor $xt0,$xt0,$xt0 ++ lvx_4w $xt1,r11,r12 ++ vspltw $CTR,@K[3],0 ++ vsldoi @K[3],@K[3],$xt0,4 ++ vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0] ++ vadduwm $CTR,$CTR,$xt1 ++ ++ be?lvsl $beperm,0,$x10 # 0x00..0f ++ be?vspltisb $xt0,3 # 0x03..03 ++ be?vxor $beperm,$beperm,$xt0 # swap bytes within words ++ ++ li r0,10 # inner loop counter ++ mtctr r0 ++ b Loop_outer_vsx ++ ++.align 5 ++Loop_outer_vsx: ++ lvx $xa0,$x00,r12 # load [smashed] sigma ++ lvx $xa1,$x10,r12 ++ lvx $xa2,$x20,r12 ++ lvx $xa3,$x30,r12 ++ ++ vspltw $xb0,@K[1],0 # smash the key ++ vspltw $xb1,@K[1],1 ++ vspltw $xb2,@K[1],2 ++ vspltw $xb3,@K[1],3 ++ ++ vspltw $xc0,@K[2],0 ++ vspltw $xc1,@K[2],1 ++ vspltw $xc2,@K[2],2 ++ vspltw $xc3,@K[2],3 ++ ++ vmr $xd0,$CTR # smash the counter ++ vspltw $xd1,@K[3],1 ++ vspltw $xd2,@K[3],2 ++ vspltw $xd3,@K[3],3 ++ ++ vspltisw $sixteen,-16 # synthesize constants ++ vspltisw $twelve,12 ++ vspltisw $eight,8 ++ vspltisw $seven,7 ++ ++ ${UCMP}i $len,64 ++ bgt Loop_vsx_4x ++ ++ vmr $xa0,@K[0] ++ vmr $xb0,@K[1] ++ vmr $xc0,@K[2] ++ vmr $xd0,@K[3] ++ ++Loop_vsx_1x: ++___ ++ VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,0); ++ VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,1); ++ ++$code.=<<___; ++ ++ bdnz Loop_vsx_1x ++ ++ vadduwm $xa0, $xa0, @K[0] ++ vadduwm $xb0, $xb0, @K[1] ++ vadduwm $xc0, $xc0, @K[2] ++ vadduwm $xd0, $xd0, @K[3] ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00, $inp ++ lvx_4w $xt1,$x10, $inp ++ lvx_4w $xt2,$x20, $inp ++ lvx_4w $xt3,$x30, $inp ++ ++ vxor $xa0,$xa0,$xt0 ++ vxor $xb0,$xb0,$xt1 ++ vxor $xc0,$xc0,$xt2 ++ vxor $xd0,$xd0,$xt3 ++ ++ stvx_4w $xa0,$x00,$out ++ stvx_4w $xb0,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xc0,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xd0,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++Loop_vsx_4x: ++___ ++ foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; } ++ foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; } ++$code.=<<___; ++ ++ bdnz Loop_vsx_4x ++ ++ vadduwm $xd0,$xd0,$CTR ++ ++ vmrgew $xt0,$xa0,$xa1 # transpose data ++ vmrgew $xt1,$xa2,$xa3 ++ vmrgow $xa0,$xa0,$xa1 ++ vmrgow $xa2,$xa2,$xa3 ++ vmrgew 
$xt2,$xb0,$xb1 ++ vmrgew $xt3,$xb2,$xb3 ++ vpermdi $xa1,$xa0,$xa2,0b00 ++ vpermdi $xa3,$xa0,$xa2,0b11 ++ vpermdi $xa0,$xt0,$xt1,0b00 ++ vpermdi $xa2,$xt0,$xt1,0b11 ++ ++ vmrgow $xb0,$xb0,$xb1 ++ vmrgow $xb2,$xb2,$xb3 ++ vmrgew $xt0,$xc0,$xc1 ++ vmrgew $xt1,$xc2,$xc3 ++ vpermdi $xb1,$xb0,$xb2,0b00 ++ vpermdi $xb3,$xb0,$xb2,0b11 ++ vpermdi $xb0,$xt2,$xt3,0b00 ++ vpermdi $xb2,$xt2,$xt3,0b11 ++ ++ vmrgow $xc0,$xc0,$xc1 ++ vmrgow $xc2,$xc2,$xc3 ++ vmrgew $xt2,$xd0,$xd1 ++ vmrgew $xt3,$xd2,$xd3 ++ vpermdi $xc1,$xc0,$xc2,0b00 ++ vpermdi $xc3,$xc0,$xc2,0b11 ++ vpermdi $xc0,$xt0,$xt1,0b00 ++ vpermdi $xc2,$xt0,$xt1,0b11 ++ ++ vmrgow $xd0,$xd0,$xd1 ++ vmrgow $xd2,$xd2,$xd3 ++ vspltisw $xt0,4 ++ vadduwm $CTR,$CTR,$xt0 # next counter value ++ vpermdi $xd1,$xd0,$xd2,0b00 ++ vpermdi $xd3,$xd0,$xd2,0b11 ++ vpermdi $xd0,$xt2,$xt3,0b00 ++ vpermdi $xd2,$xt2,$xt3,0b11 ++ ++ vadduwm $xa0,$xa0,@K[0] ++ vadduwm $xb0,$xb0,@K[1] ++ vadduwm $xc0,$xc0,@K[2] ++ vadduwm $xd0,$xd0,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++ vadduwm $xa0,$xa1,@K[0] ++ vadduwm $xb0,$xb1,@K[1] ++ vadduwm $xc0,$xc1,@K[2] ++ vadduwm $xd0,$xd1,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++ vadduwm $xa0,$xa2,@K[0] ++ vadduwm $xb0,$xb2,@K[1] ++ vadduwm $xc0,$xc2,@K[2] ++ vadduwm $xd0,$xd2,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++ vadduwm $xa0,$xa3,@K[0] ++ vadduwm $xb0,$xb3,@K[1] ++ vadduwm $xc0,$xc3,@K[2] ++ vadduwm $xd0,$xd3,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w 
$xt3,$x30,$out ++ addi $out,$out,0x40 ++ mtctr r0 ++ bne Loop_outer_vsx ++ ++Ldone_vsx: ++ lwz r12,`$FRAME-4`($sp) # pull vrsave ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ $POP r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # restore vrsave ++ lvx v26,r10,$sp ++ addi r10,r10,32 ++ lvx v27,r11,$sp ++ addi r11,r11,32 ++ lvx v28,r10,$sp ++ addi r10,r10,32 ++ lvx v29,r11,$sp ++ addi r11,r11,32 ++ lvx v30,r10,$sp ++ lvx v31,r11,$sp ++ mtlr r0 ++ addi $sp,$sp,$FRAME ++ blr ++ ++.align 4 ++Ltail_vsx: ++ addi r11,$sp,$LOCALS ++ mtctr $len ++ stvx_4w $xa0,$x00,r11 # offload block to stack ++ stvx_4w $xb0,$x10,r11 ++ stvx_4w $xc0,$x20,r11 ++ stvx_4w $xd0,$x30,r11 ++ subi r12,r11,1 # prepare for *++ptr ++ subi $inp,$inp,1 ++ subi $out,$out,1 ++ ++Loop_tail_vsx: ++ lbzu r6,1(r12) ++ lbzu r7,1($inp) ++ xor r6,r6,r7 ++ stbu r6,1($out) ++ bdnz Loop_tail_vsx ++ ++ stvx_4w $K[0],$x00,r11 # wipe copy of the block ++ stvx_4w $K[0],$x10,r11 ++ stvx_4w $K[0],$x20,r11 ++ stvx_4w $K[0],$x30,r11 ++ ++ b Ldone_vsx ++ .long 0 ++ .byte 0,12,0x04,1,0x80,0,5,0 ++ .long 0 ++.size .ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10 ++___ ++}}} ++ ++##This is 8 block in parallel implementation. The heart of chacha round uses vector instruction that has access to ++# vsr[32+X]. To perform the 8 parallel block we tend to use all 32 register to hold the 8 block info. ++# WE need to store few register value on side, so we can use VSR{32+X} for few vector instructions used in round op and hold intermediate value. ++# WE use the VSR[0]-VSR[31] for holding intermediate value and perform 8 block in parallel. ++# ++{{{ ++#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7)); ++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, ++ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3, ++ $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7, ++ $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31)); ++my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15)); ++my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3)); ++my @K = map("v$_",27,(24..26)); ++my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31)); ++my $xr0 = "v4"; ++my $CTR0 = "v22"; ++my $CTR1 = "v5"; ++my $beperm = "v31"; ++my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); ++my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7)); ++my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17)); ++my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21)); ++my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26)); ++ ++my $FRAME=$LOCALS+64+9*16; # 8*16 is for v24-v31 offload ++ ++sub VSX_lane_ROUND_8x { ++my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_; ++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); ++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); ++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); ++my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4)); ++my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5)); ++my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6)); ++my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17)); ++my @x=map("\"v$_\"",(0..31)); ++ ++ ( ++ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", #copy v30 to v13 ++ "&vxxlorc (@x[$c7], $xv9,$xv9)", ++ ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 ++ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", # Q1 ++ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", # Q2 ++ "&vadduwm 
(@x[$a6],@x[$a6],@x[$b6])", # Q3 ++ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", # Q4 ++ ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vxor (@x[$d4],@x[$d4],@x[$a4])", ++ "&vxor (@x[$d5],@x[$d5],@x[$a5])", ++ "&vxor (@x[$d6],@x[$d6],@x[$a6])", ++ "&vxor (@x[$d7],@x[$d7],@x[$a7])", ++ ++ "&vrlw (@x[$d0],@x[$d0],@x[$c7])", ++ "&vrlw (@x[$d1],@x[$d1],@x[$c7])", ++ "&vrlw (@x[$d2],@x[$d2],@x[$c7])", ++ "&vrlw (@x[$d3],@x[$d3],@x[$c7])", ++ "&vrlw (@x[$d4],@x[$d4],@x[$c7])", ++ "&vrlw (@x[$d5],@x[$d5],@x[$c7])", ++ "&vrlw (@x[$d6],@x[$d6],@x[$c7])", ++ "&vrlw (@x[$d7],@x[$d7],@x[$c7])", ++ ++ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])", ++ "&vxxlorc (@x[$c7], $xv15,$xv15)", ++ "&vxxlorc (@x[$a7], $xv10,$xv10)", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])", ++ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])", ++ "&vadduwm (@x[$c6],@x[$c6],@x[$d6])", ++ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])", ++ ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vxor (@x[$b4],@x[$b4],@x[$c4])", ++ "&vxor (@x[$b5],@x[$b5],@x[$c5])", ++ "&vxor (@x[$b6],@x[$b6],@x[$c6])", ++ "&vxor (@x[$b7],@x[$b7],@x[$c7])", ++ ++ "&vrlw (@x[$b0],@x[$b0],@x[$a7])", ++ "&vrlw (@x[$b1],@x[$b1],@x[$a7])", ++ "&vrlw (@x[$b2],@x[$b2],@x[$a7])", ++ "&vrlw (@x[$b3],@x[$b3],@x[$a7])", ++ "&vrlw (@x[$b4],@x[$b4],@x[$a7])", ++ "&vrlw (@x[$b5],@x[$b5],@x[$a7])", ++ "&vrlw (@x[$b6],@x[$b6],@x[$a7])", ++ "&vrlw (@x[$b7],@x[$b7],@x[$a7])", ++ ++ "&vxxlorc (@x[$a7], $xv13,$xv13)", ++ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", ++ "&vxxlorc (@x[$c7], $xv11,$xv11)", ++ ++ ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", ++ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", ++ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", ++ "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", ++ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", ++ ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vxor (@x[$d4],@x[$d4],@x[$a4])", ++ "&vxor (@x[$d5],@x[$d5],@x[$a5])", ++ "&vxor (@x[$d6],@x[$d6],@x[$a6])", ++ "&vxor (@x[$d7],@x[$d7],@x[$a7])", ++ ++ "&vrlw (@x[$d0],@x[$d0],@x[$c7])", ++ "&vrlw (@x[$d1],@x[$d1],@x[$c7])", ++ "&vrlw (@x[$d2],@x[$d2],@x[$c7])", ++ "&vrlw (@x[$d3],@x[$d3],@x[$c7])", ++ "&vrlw (@x[$d4],@x[$d4],@x[$c7])", ++ "&vrlw (@x[$d5],@x[$d5],@x[$c7])", ++ "&vrlw (@x[$d6],@x[$d6],@x[$c7])", ++ "&vrlw (@x[$d7],@x[$d7],@x[$c7])", ++ ++ "&vxxlorc (@x[$c7], $xv15,$xv15)", ++ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])", ++ "&vxxlorc (@x[$a7], $xv12,$xv12)", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])", ++ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])", ++ "&vadduwm (@x[$c6],@x[$c6],@x[$d6])", ++ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])", ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vxor (@x[$b4],@x[$b4],@x[$c4])", ++ "&vxor (@x[$b5],@x[$b5],@x[$c5])", ++ "&vxor (@x[$b6],@x[$b6],@x[$c6])", ++ "&vxor 
(@x[$b7],@x[$b7],@x[$c7])", ++ "&vrlw (@x[$b0],@x[$b0],@x[$a7])", ++ "&vrlw (@x[$b1],@x[$b1],@x[$a7])", ++ "&vrlw (@x[$b2],@x[$b2],@x[$a7])", ++ "&vrlw (@x[$b3],@x[$b3],@x[$a7])", ++ "&vrlw (@x[$b4],@x[$b4],@x[$a7])", ++ "&vrlw (@x[$b5],@x[$b5],@x[$a7])", ++ "&vrlw (@x[$b6],@x[$b6],@x[$a7])", ++ "&vrlw (@x[$b7],@x[$b7],@x[$a7])", ++ ++ "&vxxlorc (@x[$a7], $xv13,$xv13)", ++ ); ++} ++ ++$code.=<<___; ++ ++.globl .ChaCha20_ctr32_vsx_8x ++.align 5 ++.ChaCha20_ctr32_vsx_8x: ++ $STU $sp,-$FRAME($sp) ++ mflr r0 ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ mfspr r12,256 ++ stvx v24,r10,$sp ++ addi r10,r10,32 ++ stvx v25,r11,$sp ++ addi r11,r11,32 ++ stvx v26,r10,$sp ++ addi r10,r10,32 ++ stvx v27,r11,$sp ++ addi r11,r11,32 ++ stvx v28,r10,$sp ++ addi r10,r10,32 ++ stvx v29,r11,$sp ++ addi r11,r11,32 ++ stvx v30,r10,$sp ++ stvx v31,r11,$sp ++ stw r12,`$FRAME-4`($sp) # save vrsave ++ li r12,-4096+63 ++ $PUSH r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # preserve 29 AltiVec registers ++ ++ bl Lconsts # returns pointer Lsigma in r12 ++ ++ lvx_4w @K[0],0,r12 # load sigma ++ addi r12,r12,0x70 ++ li $x10,16 ++ li $x20,32 ++ li $x30,48 ++ li r11,64 ++ ++ vspltisw $xa4,-16 # synthesize constants ++ vspltisw $xb4,12 # synthesize constants ++ vspltisw $xc4,8 # synthesize constants ++ vspltisw $xd4,7 # synthesize constants ++ ++ lvx $xa0,$x00,r12 # load [smashed] sigma ++ lvx $xa1,$x10,r12 ++ lvx $xa2,$x20,r12 ++ lvx $xa3,$x30,r12 ++ ++ vxxlor $xv9 ,$xa4,$xa4 #save shift val in vr9-12 ++ vxxlor $xv10 ,$xb4,$xb4 ++ vxxlor $xv11 ,$xc4,$xc4 ++ vxxlor $xv12 ,$xd4,$xd4 ++ vxxlor $xv22 ,$xa0,$xa0 #save sigma in vr22-25 ++ vxxlor $xv23 ,$xa1,$xa1 ++ vxxlor $xv24 ,$xa2,$xa2 ++ vxxlor $xv25 ,$xa3,$xa3 ++ ++ lvx_4w @K[1],0,$key # load key ++ lvx_4w @K[2],$x10,$key ++ lvx_4w @K[3],0,$ctr # load counter ++ vspltisw $xt3,4 ++ ++ ++ vxor $xt2,$xt2,$xt2 ++ lvx_4w $xt1,r11,r12 ++ vspltw $xa2,@K[3],0 #save the original count after spltw ++ vsldoi @K[3],@K[3],$xt2,4 ++ vsldoi @K[3],$xt2,@K[3],12 # clear @K[3].word[0] ++ vadduwm $xt1,$xa2,$xt1 ++ vadduwm $xt3,$xt1,$xt3 # next counter value ++ vspltw $xa0,@K[2],2 # save the K[2] spltw 2 and save v8. 
++ ++ be?lvsl $beperm,0,$x10 # 0x00..0f ++ be?vspltisb $xt0,3 # 0x03..03 ++ be?vxor $beperm,$beperm,$xt0 # swap bytes within words ++ be?vxxlor $xv26 ,$beperm,$beperm ++ ++ vxxlor $xv0 ,@K[0],@K[0] # K0,k1,k2 to vr0,1,2 ++ vxxlor $xv1 ,@K[1],@K[1] ++ vxxlor $xv2 ,@K[2],@K[2] ++ vxxlor $xv3 ,@K[3],@K[3] ++ vxxlor $xv4 ,$xt1,$xt1 #CTR ->4, CTR+4-> 5 ++ vxxlor $xv5 ,$xt3,$xt3 ++ vxxlor $xv8 ,$xa0,$xa0 ++ ++ li r0,10 # inner loop counter ++ mtctr r0 ++ b Loop_outer_vsx_8x ++ ++.align 5 ++Loop_outer_vsx_8x: ++ vxxlorc $xa0,$xv22,$xv22 # load [smashed] sigma ++ vxxlorc $xa1,$xv23,$xv23 ++ vxxlorc $xa2,$xv24,$xv24 ++ vxxlorc $xa3,$xv25,$xv25 ++ vxxlorc $xa4,$xv22,$xv22 ++ vxxlorc $xa5,$xv23,$xv23 ++ vxxlorc $xa6,$xv24,$xv24 ++ vxxlorc $xa7,$xv25,$xv25 ++ ++ vspltw $xb0,@K[1],0 # smash the key ++ vspltw $xb1,@K[1],1 ++ vspltw $xb2,@K[1],2 ++ vspltw $xb3,@K[1],3 ++ vspltw $xb4,@K[1],0 # smash the key ++ vspltw $xb5,@K[1],1 ++ vspltw $xb6,@K[1],2 ++ vspltw $xb7,@K[1],3 ++ ++ vspltw $xc0,@K[2],0 ++ vspltw $xc1,@K[2],1 ++ vspltw $xc2,@K[2],2 ++ vspltw $xc3,@K[2],3 ++ vspltw $xc4,@K[2],0 ++ vspltw $xc7,@K[2],3 ++ vspltw $xc5,@K[2],1 ++ ++ vxxlorc $xd0,$xv4,$xv4 # smash the counter ++ vspltw $xd1,@K[3],1 ++ vspltw $xd2,@K[3],2 ++ vspltw $xd3,@K[3],3 ++ vxxlorc $xd4,$xv5,$xv5 # smash the counter ++ vspltw $xd5,@K[3],1 ++ vspltw $xd6,@K[3],2 ++ vspltw $xd7,@K[3],3 ++ vxxlorc $xc6,$xv8,$xv8 #copy of vlspt k[2],2 is in v8.v26 ->k[3] so need to wait until k3 is done ++ ++Loop_vsx_8x: ++___ ++ foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; } ++ foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; } ++$code.=<<___; ++ ++ bdnz Loop_vsx_8x ++ vxxlor $xv13 ,$xd4,$xd4 # save the register vr24-31 ++ vxxlor $xv14 ,$xd5,$xd5 # ++ vxxlor $xv15 ,$xd6,$xd6 # ++ vxxlor $xv16 ,$xd7,$xd7 # ++ ++ vxxlor $xv18 ,$xc4,$xc4 # ++ vxxlor $xv19 ,$xc5,$xc5 # ++ vxxlor $xv20 ,$xc6,$xc6 # ++ vxxlor $xv21 ,$xc7,$xc7 # ++ ++ vxxlor $xv6 ,$xb6,$xb6 # save vr23, so we get 8 regs ++ vxxlor $xv7 ,$xb7,$xb7 # save vr23, so we get 8 regs ++ be?vxxlorc $beperm,$xv26,$xv26 # copy back the the beperm. 
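# Editorial note (not part of the patch): each VSX_lane_ROUND_8x expansion in the
# loop above is eight interleaved copies of the standard ChaCha20 quarter-round
# (RFC 8439); the two expansions per iteration are the column and diagonal halves:
#     a += b;  d ^= a;  d = rotl32(d, 16);
#     c += d;  b ^= c;  b = rotl32(b, 12);
#     a += b;  d ^= a;  d = rotl32(d,  8);
#     c += d;  b ^= c;  b = rotl32(b,  7);
# The vrlw rotate amounts (16/12/8/7) are the constants splatted at entry and
# parked in the script's $xv9-$xv12 slots via xxlor, then fetched back with
# xxlorc inside each round, because all 32 VRs are occupied by block state.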
++ ++ vxxlorc @K[0],$xv0,$xv0 #27 ++ vxxlorc @K[1],$xv1,$xv1 #24 ++ vxxlorc @K[2],$xv2,$xv2 #25 ++ vxxlorc @K[3],$xv3,$xv3 #26 ++ vxxlorc $CTR0,$xv4,$xv4 ++###changing to vertical ++ ++ vmrgew $xt0,$xa0,$xa1 # transpose data ++ vmrgew $xt1,$xa2,$xa3 ++ vmrgow $xa0,$xa0,$xa1 ++ vmrgow $xa2,$xa2,$xa3 ++ ++ vmrgew $xt2,$xb0,$xb1 ++ vmrgew $xt3,$xb2,$xb3 ++ vmrgow $xb0,$xb0,$xb1 ++ vmrgow $xb2,$xb2,$xb3 ++ ++ vadduwm $xd0,$xd0,$CTR0 ++ ++ vpermdi $xa1,$xa0,$xa2,0b00 ++ vpermdi $xa3,$xa0,$xa2,0b11 ++ vpermdi $xa0,$xt0,$xt1,0b00 ++ vpermdi $xa2,$xt0,$xt1,0b11 ++ vpermdi $xb1,$xb0,$xb2,0b00 ++ vpermdi $xb3,$xb0,$xb2,0b11 ++ vpermdi $xb0,$xt2,$xt3,0b00 ++ vpermdi $xb2,$xt2,$xt3,0b11 ++ ++ vmrgew $xt0,$xc0,$xc1 ++ vmrgew $xt1,$xc2,$xc3 ++ vmrgow $xc0,$xc0,$xc1 ++ vmrgow $xc2,$xc2,$xc3 ++ vmrgew $xt2,$xd0,$xd1 ++ vmrgew $xt3,$xd2,$xd3 ++ vmrgow $xd0,$xd0,$xd1 ++ vmrgow $xd2,$xd2,$xd3 ++ ++ vpermdi $xc1,$xc0,$xc2,0b00 ++ vpermdi $xc3,$xc0,$xc2,0b11 ++ vpermdi $xc0,$xt0,$xt1,0b00 ++ vpermdi $xc2,$xt0,$xt1,0b11 ++ vpermdi $xd1,$xd0,$xd2,0b00 ++ vpermdi $xd3,$xd0,$xd2,0b11 ++ vpermdi $xd0,$xt2,$xt3,0b00 ++ vpermdi $xd2,$xt2,$xt3,0b11 ++ ++ vspltisw $xt0,8 ++ vadduwm $CTR0,$CTR0,$xt0 # next counter value ++ vxxlor $xv4 ,$CTR0,$CTR0 #CTR+4-> 5 ++ ++ vadduwm $xa0,$xa0,@K[0] ++ vadduwm $xb0,$xb0,@K[1] ++ vadduwm $xc0,$xc0,@K[2] ++ vadduwm $xd0,$xd0,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xa0,$xa1,@K[0] ++ vadduwm $xb0,$xb1,@K[1] ++ vadduwm $xc0,$xc1,@K[2] ++ vadduwm $xd0,$xd1,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xa0,$xa2,@K[0] ++ vadduwm $xb0,$xb2,@K[1] ++ vadduwm $xc0,$xc2,@K[2] ++ vadduwm $xd0,$xd2,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xa0,$xa3,@K[0] ++ vadduwm $xb0,$xb3,@K[1] ++ vadduwm $xc0,$xc3,@K[2] ++ vadduwm $xd0,$xd3,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ 
be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++#blk4-7: 24:31 remain the same as we can use the same logic above . Reg a4-b7 remain same.Load c4,d7--> position 8-15.we can reuse vr24-31. ++#VR0-3 : are used to load temp value, vr4 --> as xr0 instead of xt0. ++ ++ vxxlorc $CTR1 ,$xv5,$xv5 ++ ++ vxxlorc $xcn4 ,$xv18,$xv18 ++ vxxlorc $xcn5 ,$xv19,$xv19 ++ vxxlorc $xcn6 ,$xv20,$xv20 ++ vxxlorc $xcn7 ,$xv21,$xv21 ++ ++ vxxlorc $xdn4 ,$xv13,$xv13 ++ vxxlorc $xdn5 ,$xv14,$xv14 ++ vxxlorc $xdn6 ,$xv15,$xv15 ++ vxxlorc $xdn7 ,$xv16,$xv16 ++ vadduwm $xdn4,$xdn4,$CTR1 ++ ++ vxxlorc $xb6 ,$xv6,$xv6 ++ vxxlorc $xb7 ,$xv7,$xv7 ++#use xa1->xr0, as xt0...in the block 4-7 ++ ++ vmrgew $xr0,$xa4,$xa5 # transpose data ++ vmrgew $xt1,$xa6,$xa7 ++ vmrgow $xa4,$xa4,$xa5 ++ vmrgow $xa6,$xa6,$xa7 ++ vmrgew $xt2,$xb4,$xb5 ++ vmrgew $xt3,$xb6,$xb7 ++ vmrgow $xb4,$xb4,$xb5 ++ vmrgow $xb6,$xb6,$xb7 ++ ++ vpermdi $xa5,$xa4,$xa6,0b00 ++ vpermdi $xa7,$xa4,$xa6,0b11 ++ vpermdi $xa4,$xr0,$xt1,0b00 ++ vpermdi $xa6,$xr0,$xt1,0b11 ++ vpermdi $xb5,$xb4,$xb6,0b00 ++ vpermdi $xb7,$xb4,$xb6,0b11 ++ vpermdi $xb4,$xt2,$xt3,0b00 ++ vpermdi $xb6,$xt2,$xt3,0b11 ++ ++ vmrgew $xr0,$xcn4,$xcn5 ++ vmrgew $xt1,$xcn6,$xcn7 ++ vmrgow $xcn4,$xcn4,$xcn5 ++ vmrgow $xcn6,$xcn6,$xcn7 ++ vmrgew $xt2,$xdn4,$xdn5 ++ vmrgew $xt3,$xdn6,$xdn7 ++ vmrgow $xdn4,$xdn4,$xdn5 ++ vmrgow $xdn6,$xdn6,$xdn7 ++ ++ vpermdi $xcn5,$xcn4,$xcn6,0b00 ++ vpermdi $xcn7,$xcn4,$xcn6,0b11 ++ vpermdi $xcn4,$xr0,$xt1,0b00 ++ vpermdi $xcn6,$xr0,$xt1,0b11 ++ vpermdi $xdn5,$xdn4,$xdn6,0b00 ++ vpermdi $xdn7,$xdn4,$xdn6,0b11 ++ vpermdi $xdn4,$xt2,$xt3,0b00 ++ vpermdi $xdn6,$xt2,$xt3,0b11 ++ ++ vspltisw $xr0,8 ++ vadduwm $CTR1,$CTR1,$xr0 # next counter value ++ vxxlor $xv5 ,$CTR1,$CTR1 #CTR+4-> 5 ++ ++ vadduwm $xan0,$xa4,@K[0] ++ vadduwm $xbn0,$xb4,@K[1] ++ vadduwm $xcn0,$xcn4,@K[2] ++ vadduwm $xdn0,$xdn4,@K[3] ++ ++ be?vperm $xan0,$xa4,$xa4,$beperm ++ be?vperm $xbn0,$xb4,$xb4,$beperm ++ be?vperm $xcn0,$xcn4,$xcn4,$beperm ++ be?vperm $xdn0,$xdn4,$xdn4,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xan0,$xa5,@K[0] ++ vadduwm $xbn0,$xb5,@K[1] ++ vadduwm $xcn0,$xcn5,@K[2] ++ vadduwm $xdn0,$xdn5,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ 
addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xan0,$xa6,@K[0] ++ vadduwm $xbn0,$xb6,@K[1] ++ vadduwm $xcn0,$xcn6,@K[2] ++ vadduwm $xdn0,$xdn6,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xan0,$xa7,@K[0] ++ vadduwm $xbn0,$xb7,@K[1] ++ vadduwm $xcn0,$xcn7,@K[2] ++ vadduwm $xdn0,$xdn7,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ mtctr r0 ++ bne Loop_outer_vsx_8x ++ ++Ldone_vsx_8x: ++ lwz r12,`$FRAME-4`($sp) # pull vrsave ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ $POP r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # restore vrsave ++ lvx v24,r10,$sp ++ addi r10,r10,32 ++ lvx v25,r11,$sp ++ addi r11,r11,32 ++ lvx v26,r10,$sp ++ addi r10,r10,32 ++ lvx v27,r11,$sp ++ addi r11,r11,32 ++ lvx v28,r10,$sp ++ addi r10,r10,32 ++ lvx v29,r11,$sp ++ addi r11,r11,32 ++ lvx v30,r10,$sp ++ lvx v31,r11,$sp ++ mtlr r0 ++ addi $sp,$sp,$FRAME ++ blr ++ ++.align 4 ++Ltail_vsx_8x: ++ addi r11,$sp,$LOCALS ++ mtctr $len ++ stvx_4w $xa0,$x00,r11 # offload block to stack ++ stvx_4w $xb0,$x10,r11 ++ stvx_4w $xc0,$x20,r11 ++ stvx_4w $xd0,$x30,r11 ++ subi r12,r11,1 # prepare for *++ptr ++ subi $inp,$inp,1 ++ subi $out,$out,1 ++ bl Loop_tail_vsx_8x ++Ltail_vsx_8x_1: ++ addi r11,$sp,$LOCALS ++ mtctr $len ++ stvx_4w $xan0,$x00,r11 # offload block to stack ++ stvx_4w $xbn0,$x10,r11 ++ stvx_4w $xcn0,$x20,r11 ++ stvx_4w $xdn0,$x30,r11 ++ subi r12,r11,1 # prepare for *++ptr ++ subi $inp,$inp,1 ++ subi $out,$out,1 ++ bl Loop_tail_vsx_8x ++ ++Loop_tail_vsx_8x: ++ lbzu r6,1(r12) ++ lbzu r7,1($inp) ++ xor r6,r6,r7 ++ stbu r6,1($out) ++ bdnz Loop_tail_vsx_8x ++ ++ stvx_4w $K[0],$x00,r11 # wipe copy of the block ++ stvx_4w $K[0],$x10,r11 ++ stvx_4w $K[0],$x20,r11 ++ stvx_4w $K[0],$x30,r11 ++ ++ b Ldone_vsx_8x ++ .long 0 ++ .byte 0,12,0x04,1,0x80,0,5,0 ++ .long 0 ++.size .ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x ++___ ++}}} ++ ++ ++$code.=<<___; ++.align 5 ++Lconsts: ++ mflr r0 ++ bcl 20,31,\$+4 ++ mflr r12 #vvvvv "distance between . 
and Lsigma ++ addi r12,r12,`64-8` ++ mtlr r0 ++ blr ++ .long 0 ++ .byte 0,12,0x14,0,0,0,0,0 ++ .space `64-9*4` ++Lsigma: ++ .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 ++ .long 1,0,0,0 ++ .long 2,0,0,0 ++ .long 3,0,0,0 ++ .long 4,0,0,0 ++___ ++$code.=<<___ if ($LITTLE_ENDIAN); ++ .long 0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001 ++ .long 0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300 ++___ ++$code.=<<___ if (!$LITTLE_ENDIAN); # flipped words ++ .long 0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d ++ .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c ++___ ++$code.=<<___; ++ .long 0x61707865,0x61707865,0x61707865,0x61707865 ++ .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e ++ .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 ++ .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 ++ .long 0,1,2,3 ++ .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c ++.asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by " ++.align 2 ++___ ++ ++foreach (split("\n",$code)) { ++ s/\`([^\`]*)\`/eval $1/ge; ++ ++ # instructions prefixed with '?' are endian-specific and need ++ # to be adjusted accordingly... ++ if ($flavour !~ /le$/) { # big-endian ++ s/be\?// or ++ s/le\?/#le#/ or ++ s/\?lvsr/lvsl/ or ++ s/\?lvsl/lvsr/ or ++ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or ++ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/; ++ } else { # little-endian ++ s/le\?// or ++ s/be\?/#be#/ or ++ s/\?([a-z]+)/$1/ or ++ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/; ++ } ++ ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +--- a/crypto/chacha/build.info ++++ b/crypto/chacha/build.info +@@ -5,6 +5,7 @@ GENERATE[chacha-x86.s]=asm/chacha-x86.pl + $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR) + GENERATE[chacha-x86_64.s]=asm/chacha-x86_64.pl $(PERLASM_SCHEME) + GENERATE[chacha-ppc.s]=asm/chacha-ppc.pl $(PERLASM_SCHEME) ++GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl $(PERLASM_SCHEME) + GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl $(PERLASM_SCHEME) + INCLUDE[chacha-armv4.o]=.. + GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl $(PERLASM_SCHEME) +--- a/crypto/perlasm/ppc-xlate.pl ++++ b/crypto/perlasm/ppc-xlate.pl +@@ -288,6 +288,14 @@ my $vpermdi = sub { # xxpermdi + $dm = oct($dm) if ($dm =~ /^0/); + " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($dm<<8)|(10<<3)|7; + }; ++my $vxxlor = sub { # xxlor ++ my ($f, $vrt, $vra, $vrb) = @_; ++ " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|6; ++}; ++my $vxxlorc = sub { # xxlor ++ my ($f, $vrt, $vra, $vrb) = @_; ++ " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|1; ++}; + + # PowerISA 2.07 stuff + sub vcrypto_op { +@@ -370,6 +378,15 @@ my $addex = sub { + }; + my $vmsumudm = sub { vfour_vsr(@_, 35); }; + ++# PowerISA 3.1 stuff ++my $brd = sub { ++ my ($f, $ra, $rs) = @_; ++ " .long ".sprintf "0x%X",(31<<26)|($rs<<21)|($ra<<16)|(187<<1); ++}; ++my $vsrq = sub { vcrypto_op(@_, 517); }; ++ ++ ++ + while($line=<>) { + + $line =~ s|[#!;].*$||; # get rid of asm-style comments... 
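The hunks that follow wire the new 8x path into runtime CPU detection: crypto/ppc_arch.h gains a PPC_BRD31 capability bit, crypto/ppccap.c declares ChaCha20_ctr32_vsx_p10() and sets the bit when AT_HWCAP2 advertises Power ISA 3.1 (with an OPENSSL_brd31_probe() helper added for the probe-based path), and ChaCha20_ctr32() then prefers the POWER10 routine over the older VSX/AltiVec/integer fallbacks. The standalone C program below is an editorial sketch of that detect-then-dispatch flow, not part of the patch; bit values other than PPC_BRD31 (1<<7) and HWCAP_ARCH_3_1 (1U<<18) are illustrative, and the cipher routines are reduced to printing stubs.

    /*
     * Editorial sketch -- NOT part of the patch.  Mirrors the capability
     * detection and dispatch added to crypto/ppccap.c (Linux/glibc only).
     */
    #include <stdio.h>
    #include <sys/auxv.h>                   /* getauxval() */

    #define PPC_ALTIVEC    (1 << 1)         /* illustrative bit value */
    #define PPC_CRYPTO207  (1 << 2)         /* illustrative bit value */
    #define PPC_BRD31      (1 << 7)         /* as added to crypto/ppc_arch.h */
    #define HWCAP_ARCH_3_1 (1U << 18)       /* AT_HWCAP2: Power ISA 3.1 (POWER10) */

    static unsigned int ppccap_P;           /* stands in for OPENSSL_ppccap_P */

    static void cpuid_setup(void)
    {
        unsigned long hwcap2 = getauxval(AT_HWCAP2);

        if (hwcap2 & HWCAP_ARCH_3_1)
            ppccap_P |= PPC_BRD31;          /* POWER10 and later */
        /* the real OPENSSL_cpuid_setup() also fills in CRYPTO207, ALTIVEC, ... */
    }

    static void chacha20_ctr32(void)        /* mirrors the patched dispatch */
    {
        if (ppccap_P & PPC_BRD31)
            puts("ChaCha20_ctr32_vsx_p10: 8-way VSX path (POWER10)");
        else if (ppccap_P & PPC_CRYPTO207)
            puts("ChaCha20_ctr32_vsx: 4-way VSX path (POWER8/9)");
        else if (ppccap_P & PPC_ALTIVEC)
            puts("ChaCha20_ctr32_vmx: AltiVec path");
        else
            puts("ChaCha20_ctr32_int: generic integer path");
    }

    int main(void)
    {
        cpuid_setup();
        chacha20_ctr32();
        return 0;
    }

On a POWER10 machine this selects the _vsx_p10 line; detection of the older capability bits is elided here, so the sketch falls through to the generic path elsewhere.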
+--- a/crypto/ppc_arch.h ++++ b/crypto/ppc_arch.h +@@ -24,5 +24,6 @@ extern unsigned int OPENSSL_ppccap_P; + # define PPC_MADD300 (1<<4) + # define PPC_MFTB (1<<5) + # define PPC_MFSPR268 (1<<6) ++# define PPC_BRD31 (1<<7) + + #endif +--- a/crypto/ppccap.c ++++ b/crypto/ppccap.c +@@ -108,15 +108,20 @@ void ChaCha20_ctr32_vmx(unsigned char *o + void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp, + size_t len, const unsigned int key[8], + const unsigned int counter[4]); ++void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp, ++ size_t len, const unsigned int key[8], ++ const unsigned int counter[4]); + void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, + size_t len, const unsigned int key[8], + const unsigned int counter[4]) + { +- OPENSSL_ppccap_P & PPC_CRYPTO207 +- ? ChaCha20_ctr32_vsx(out, inp, len, key, counter) +- : OPENSSL_ppccap_P & PPC_ALTIVEC +- ? ChaCha20_ctr32_vmx(out, inp, len, key, counter) +- : ChaCha20_ctr32_int(out, inp, len, key, counter); ++ OPENSSL_ppccap_P & PPC_BRD31 ++ ? ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter) ++ :OPENSSL_ppccap_P & PPC_CRYPTO207 ++ ? ChaCha20_ctr32_vsx(out, inp, len, key, counter) ++ : OPENSSL_ppccap_P & PPC_ALTIVEC ++ ? ChaCha20_ctr32_vmx(out, inp, len, key, counter) ++ : ChaCha20_ctr32_int(out, inp, len, key, counter); + } + #endif + +@@ -182,6 +187,7 @@ void OPENSSL_ppc64_probe(void); + void OPENSSL_altivec_probe(void); + void OPENSSL_crypto207_probe(void); + void OPENSSL_madd300_probe(void); ++void OPENSSL_brd31_probe(void); + + long OPENSSL_rdtsc_mftb(void); + long OPENSSL_rdtsc_mfspr268(void); +@@ -264,6 +270,7 @@ static unsigned long getauxval(unsigned + #define HWCAP2 26 /* AT_HWCAP2 */ + #define HWCAP_VEC_CRYPTO (1U << 25) + #define HWCAP_ARCH_3_00 (1U << 23) ++#define HWCAP_ARCH_3_1 (1U << 18) + + # if defined(__GNUC__) && __GNUC__>=2 + __attribute__ ((constructor)) +@@ -324,6 +331,9 @@ void OPENSSL_cpuid_setup(void) + if (__power_set(0xffffffffU<<17)) /* POWER9 and later */ + OPENSSL_ppccap_P |= PPC_MADD300; + ++ if (__power_set(0xffffffffU<<18)) /* POWER10 and later */ ++ OPENSSL_ppccap_P |= PPC_BRD31; ++ + return; + # endif + #endif +@@ -379,6 +389,10 @@ void OPENSSL_cpuid_setup(void) + if (hwcap2 & HWCAP_ARCH_3_00) { + OPENSSL_ppccap_P |= PPC_MADD300; + } ++ ++ if (hwcap2 & HWCAP_ARCH_3_1) { ++ OPENSSL_ppccap_P |= PPC_BRD31; ++ } + } + #endif + +--- a/crypto/ppccpuid.pl ++++ b/crypto/ppccpuid.pl +@@ -77,6 +77,17 @@ $code=<<___; + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + ++.globl .OPENSSL_brd31_probe ++.align 4 ++.OPENSSL_brd31_probe: ++ xor r0,r0,r0 ++ brd r3,r0 ++ blr ++ .long 0 ++ .byte 0,12,0x14,0,0,0,0,0 ++.size .OPENSSL_brd31_probe,.-.OPENSSL_brd31_probe ++ ++ + .globl .OPENSSL_wipe_cpu + .align 4 + .OPENSSL_wipe_cpu: diff --git a/openssl-1_1.changes b/openssl-1_1.changes index acf925f..d7bad32 100644 --- a/openssl-1_1.changes +++ b/openssl-1_1.changes @@ -1,3 +1,13 @@ +------------------------------------------------------------------- +Wed Dec 14 09:04:40 UTC 2022 - Otto Hollmann + +- POWER10 performance enhancements for cryptography [jsc#PED-512] + * openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch + * openssl-1_1-Fixed-counter-overflow.patch + * openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch + * openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch + * openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch + ------------------------------------------------------------------- Wed Nov 2 12:00:40 UTC 2022 - Otto 
Hollmann diff --git a/openssl-1_1.spec b/openssl-1_1.spec index 96adb4b..94ae402 100644 --- a/openssl-1_1.spec +++ b/openssl-1_1.spec @@ -123,6 +123,14 @@ Patch72: openssl-1_1-Optimize-AES-GCM-uarchs.patch Patch73: openssl-1_1-FIPS-fix-error-reason-codes.patch #PATCH-FIX-SUSE bsc#1180995 Default to RFC7919 groups in FIPS mode Patch74: openssl-1_1-paramgen-default_to_rfc7919.patch +# PATCH-FIX-UPSTREAM jsc#PED-512 +# POWER10 performance enhancements for cryptography +Patch75: openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch +Patch76: openssl-1_1-Fixed-counter-overflow.patch +Patch77: openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch +Patch78: openssl-1_1-Fixed-conditional-statement-testing-64-and-256-bytes.patch +Patch79: openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch + Requires: libopenssl1_1 = %{version}-%{release} BuildRequires: pkgconfig BuildRequires: pkgconfig(zlib) From 93c266235b911436405030ac6ada0e9ce88cd86fb14a0eaec79fa58da3ce7901 Mon Sep 17 00:00:00 2001 From: Pedro Monreal Gonzalez Date: Wed, 14 Dec 2022 20:20:45 +0000 Subject: [PATCH 2/2] Accepting request 1042984 from home:ohollmann:branches:security:tls OBS-URL: https://build.opensuse.org/request/show/1042984 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=124 --- openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch | 112 +++++++----------- 1 file changed, 45 insertions(+), 67 deletions(-) diff --git a/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch b/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch index bba48b6..e990782 100644 --- a/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch +++ b/openssl-1_1-Fix-AES-GCM-on-Power-8-CPUs.patch @@ -12,8 +12,8 @@ Reviewed-by: Dmitry Belyavskiy Reviewed-by: Paul Dale (Merged from https://github.com/openssl/openssl/pull/19182) --- - crypto/evp/e_aes.c | 179 +++++++++++++++++++++++++++++++---------------------- - 1 file changed, 107 insertions(+), 72 deletions(-) + crypto/evp/e_aes.c | 146 ++++++++++++++++++++++++++--------------------------- + 1 file changed, 74 insertions(+), 72 deletions(-) --- a/crypto/evp/e_aes.c +++ b/crypto/evp/e_aes.c @@ -113,10 +113,11 @@ Reviewed-by: Paul Dale #endif #if defined(OPENSSL_CPUID_OBJ) && ( \ -@@ -3294,9 +3224,114 @@ static int aes_gcm_tls_cipher(EVP_CIPHER +@@ -3294,6 +3224,51 @@ static int aes_gcm_tls_cipher(EVP_CIPHER return rv; } ++#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) +static size_t ppc_aes_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len, + const void *key, unsigned char ivec[16], u64 *Xi, int encrypt) +{ @@ -159,72 +160,49 @@ Reviewed-by: Paul Dale + + return ndone; +} -+ -+#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) -+static int ppc_aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, -+ const unsigned char *in, size_t len) -+{ -+ EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,ctx); -+ if (ctx->encrypt) { -+ if (gctx->ctr != NULL) { -+ size_t bulk = 0; -+ -+ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM_PPC(gctx)) { -+ size_t res = (16 - gctx->gcm.mres) % 16; -+ -+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) -+ return 0; -+ -+ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, -+ gctx->gcm.key, -+ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 1); -+ -+ gctx->gcm.len.u[1] += bulk; -+ bulk += res; -+ } -+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, in + bulk, out + bulk, -+ len - bulk, gctx->ctr)) -+ return 0; -+ } else { -+ if 
(CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len)) -+ return 0; -+ } -+ } else { -+ if (gctx->ctr != NULL) { -+ size_t bulk = 0; -+ -+ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM_PPC(gctx)) { -+ size_t res = (16 - gctx->gcm.mres) % 16; -+ -+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) -+ return -1; -+ -+ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, -+ gctx->gcm.key, -+ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 0); -+ -+ gctx->gcm.len.u[1] += bulk; -+ bulk += res; -+ } -+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, in + bulk, out + bulk, -+ len - bulk, gctx->ctr)) -+ return 0; -+ } else { -+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len)) -+ return 0; -+ } -+ } -+ return 1; -+} +#endif + static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t len) { -+#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) -+ if (PPC_AES_GCM_CAPABLE) -+ return ppc_aes_gcm_cipher(ctx, out, in, len); -+#endif - EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,ctx); - /* If not set up, return error */ - if (!gctx->key_set) +@@ -3325,6 +3300,20 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + out + res, len - res, + gctx->gcm.key, gctx->gcm.Yi.c, + gctx->gcm.Xi.u); ++ ++ gctx->gcm.len.u[1] += bulk; ++ bulk += res; ++ } ++#elif defined(AES_GCM_ASM_PPC) && defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) ++ if (PPC_AES_GCM_CAPABLE && len >= AES_GCM_ENC_BYTES && AES_GCM_ASM_PPC(gctx)) { ++ size_t res = (16 - gctx->gcm.mres) % 16; ++ ++ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res)) ++ return -1; ++ ++ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, ++ gctx->gcm.key, ++ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 1); + gctx->gcm.len.u[1] += bulk; + bulk += res; + } +@@ -3372,6 +3361,19 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX + gctx->gcm.len.u[1] += bulk; + bulk += res; + } ++#elif defined(AES_GCM_ASM_PPC) && defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC)) ++ if (PPC_AES_GCM_CAPABLE && len >= AES_GCM_DEC_BYTES && AES_GCM_ASM_PPC(gctx)) { ++ size_t res = (16 - gctx->gcm.mres) % 16; ++ ++ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res)) ++ return -1; ++ ++ bulk = ppc_aes_gcm_crypt(in + res, out + res, len - res, ++ gctx->gcm.key, ++ gctx->gcm.Yi.c, gctx->gcm.Xi.u, 0); ++ gctx->gcm.len.u[1] += bulk; ++ bulk += res; ++ } + #endif + if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, + in + bulk,