From 8903999f6a4ab300c13a45b0af6e1a8bd8a13dfe9c5b31fd3f69d4be5362cf4a Mon Sep 17 00:00:00 2001 From: Pedro Monreal Gonzalez Date: Fri, 28 Jan 2022 17:51:43 +0000 Subject: [PATCH] Accepting request 949750 from home:pmonrealgonzalez:branches:security:tls - Backport cryptographic improvements from OpenSSL 3 [jsc#SLE-19766] * Optimize RSA on armv8: openssl-1_1-Optimize-RSA-armv8.patch * Optimize AES-XTS mode for aarch64: openssl-1_1-Optimize-AES-XTS-aarch64.patch * Optimize AES-GCM for uarchs with unroll and new instructions: openssl-1_1-Optimize-AES-GCM-uarchs.patch - POWER10 performance enhancements for cryptography [jsc#SLE-19409] * openssl-1_1-Optimize-ppc64.patch OBS-URL: https://build.opensuse.org/request/show/949750 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=102 --- openssl-1_1-Optimize-AES-GCM-uarchs.patch | 7709 ++++++++++++++++++++ openssl-1_1-Optimize-AES-XTS-aarch64.patch | 1616 ++++ openssl-1_1-Optimize-RSA-armv8.patch | 575 ++ openssl-1_1-Optimize-ppc64.patch | 2308 ++++++ openssl-1_1.changes | 16 + openssl-1_1.spec | 11 +- 6 files changed, 12233 insertions(+), 2 deletions(-) create mode 100644 openssl-1_1-Optimize-AES-GCM-uarchs.patch create mode 100644 openssl-1_1-Optimize-AES-XTS-aarch64.patch create mode 100644 openssl-1_1-Optimize-RSA-armv8.patch create mode 100644 openssl-1_1-Optimize-ppc64.patch diff --git a/openssl-1_1-Optimize-AES-GCM-uarchs.patch b/openssl-1_1-Optimize-AES-GCM-uarchs.patch new file mode 100644 index 0000000..69e4302 --- /dev/null +++ b/openssl-1_1-Optimize-AES-GCM-uarchs.patch @@ -0,0 +1,7709 @@ +From 954f45ba4c504570206ff5bed811e512cf92dc8e Mon Sep 17 00:00:00 2001 +From: XiaokangQian +Date: Wed, 9 Jun 2021 06:35:46 +0000 +Subject: [PATCH] Optimize AES-GCM for uarchs with unroll and new instructions + +Increase the block numbers to 8 for every iteration. Increase the hash +table capacity. Make use of EOR3 instruction to improve the performance. + +This can improve performance 25-40% on out-of-order microarchitectures +with a large number of fast execution units, such as Neoverse V1. We also +see 20-30% performance improvements on other architectures such as the M1. + +Assembly code reviewd by Tom Cosgrove (ARM). 
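As an illustration of why this kernel precomputes a larger table of hash-key powers (H^1 through H^8), the 8-blocks-per-reduction folding it relies on can be modelled in a few lines of Python. This is only a sketch in ordinary (non-bit-reflected) GF(2^128) polynomial arithmetic with a hypothetical random key and random data blocks; the assembly itself works in GHASH's bit-reflected domain with "twisted" powers of H, and it uses the SHA3 EOR3 instruction to fuse the three-way XOR accumulations that this folding produces.

    import os

    P = (1 << 128) | 0x87                     # x^128 + x^7 + x^2 + x + 1

    def gf128_mul(a, b):
        # carry-less multiply of two 128-bit polynomials, then reduce mod P
        r = 0
        for i in range(128):
            if (b >> i) & 1:
                r ^= a << i
        for i in range(r.bit_length() - 1, 127, -1):
            if (r >> i) & 1:
                r ^= P << (i - 128)
        return r

    h = int.from_bytes(os.urandom(16), "big")  # hypothetical hash key
    blocks = [int.from_bytes(os.urandom(16), "big") for _ in range(8)]

    hpow = [h]                                 # hpow[i] == H^(i+1)
    for _ in range(7):
        hpow.append(gf128_mul(hpow[-1], h))

    # serial GHASH recurrence, one block at a time
    acc = 0
    for x in blocks:
        acc = gf128_mul(acc ^ x, h)

    # 8-way aggregated form: all eight products computed independently,
    # folded together, and reduced once per iteration
    agg = 0
    for i, x in enumerate(blocks):
        agg ^= gf128_mul(x, hpow[7 - i])       # block i paired with H^(8-i)

    assert agg == acc

The assertion holds because feeding eight blocks through the serial recurrence y = (y ^ x_i)*H expands to x_1*H^8 ^ x_2*H^7 ^ ... ^ x_8*H, which is why the unrolled loop can hash eight blocks with independent multiplies and a single modulo reduction.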
+ +Reviewed-by: Bernd Edlinger +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/15916) +--- + crypto/arm64cpuid.pl | 8 + + crypto/arm_arch.h | 6 + + crypto/armcap.c | 24 +- + crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl | 7369 +++++++++++++++++ + crypto/modes/asm/ghashv8-armx.pl | 105 +- + crypto/modes/build.info | 4 +- + include/crypto/aes_platform.h | 12 + + .../ciphers/cipher_aes_gcm_hw_armv8.inc | 36 +- + 8 files changed, 7546 insertions(+), 18 deletions(-) + create mode 100644 crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl + +Index: openssl-1.1.1m/crypto/arm64cpuid.pl +=================================================================== +--- openssl-1.1.1m.orig/crypto/arm64cpuid.pl ++++ openssl-1.1.1m/crypto/arm64cpuid.pl +@@ -78,6 +78,14 @@ _armv8_sha512_probe: + ret + .size _armv8_sha512_probe,.-_armv8_sha512_probe + ++.globl _armv8_eor3_probe ++.type _armv8_eor3_probe,%function ++_armv8_eor3_probe: ++ AARCH64_VALID_CALL_TARGET ++ .long 0xce010800 // eor3 v0.16b, v0.16b, v1.16b, v2.16b ++ ret ++.size _armv8_eor3_probe,.-_armv8_eor3_probe ++ + .globl _armv8_cpuid_probe + .type _armv8_cpuid_probe,%function + _armv8_cpuid_probe: +Index: openssl-1.1.1m/crypto/arm_arch.h +=================================================================== +--- openssl-1.1.1m.orig/crypto/arm_arch.h ++++ openssl-1.1.1m/crypto/arm_arch.h +@@ -83,6 +83,9 @@ extern unsigned int OPENSSL_arm_midr; + # define ARMV8_SHA512 (1<<6) + # define ARMV8_CPUID (1<<7) + ++# define ARMV8_SHA3 (1<<11) ++# define ARMV8_UNROLL8_EOR3 (1<<12) ++ + /* + * MIDR_EL1 system register + * +@@ -97,6 +100,7 @@ extern unsigned int OPENSSL_arm_midr; + + # define ARM_CPU_PART_CORTEX_A72 0xD08 + # define ARM_CPU_PART_N1 0xD0C ++# define ARM_CPU_PART_V1 0xD40 + + # define MIDR_PARTNUM_SHIFT 4 + # define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT) +@@ -125,4 +129,29 @@ extern unsigned int OPENSSL_arm_midr; + + # define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ + (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) ++ ++# define IS_CPU_SUPPORT_UNROLL8_EOR3() \ ++ (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3) ++ ++#if defined(__ASSEMBLER__) ++ ++ /* ++ * Support macros for ++ * - Armv8.3-A Pointer Authentication and ++ * - Armv8.5-A Branch Target Identification ++ * features which require emitting a .note.gnu.property section with the ++ * appropriate architecture-dependent feature bits set. 
++ * Read more: "ELF for the ArmĀ® 64-bit Architecture" ++ */ ++ ++# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 ++# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ ++# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ ++# else ++# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ ++# define AARCH64_VALID_CALL_TARGET ++# endif ++ ++# endif /* defined __ASSEMBLER__ */ ++ + #endif +Index: openssl-1.1.1m/crypto/armcap.c +=================================================================== +--- openssl-1.1.1m.orig/crypto/armcap.c ++++ openssl-1.1.1m/crypto/armcap.c +@@ -13,6 +13,9 @@ + #include + #include + #include ++#ifdef __APPLE__ ++#include ++#endif + #include "internal/cryptlib.h" + + #include "arm_arch.h" +@@ -134,6 +137,7 @@ static unsigned long getauxval(unsigned + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) + # define HWCAP_CPUID (1 << 11) ++# define HWCAP_SHA3 (1 << 17) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -148,12 +152,15 @@ void OPENSSL_cpuid_setup(void) + return; + trigger = 1; + ++ OPENSSL_armcap_P = 0; ++ + if ((e = getenv("OPENSSL_armcap"))) { + OPENSSL_armcap_P = (unsigned int)strtoul(e, NULL, 0); + return; + } + +-# if defined(__APPLE__) && !defined(__aarch64__) ++# if defined(__APPLE__) ++# if !defined(__aarch64__) + /* + * Capability probing by catching SIGILL appears to be problematic + * on iOS. But since Apple universe is "monocultural", it's actually +@@ -169,9 +176,25 @@ void OPENSSL_cpuid_setup(void) + * Unified code works because it never triggers SIGILL on Apple + * devices... + */ +-# endif ++# else ++ { ++ unsigned int feature; ++ size_t len = sizeof(feature); ++ char uarch[64]; + +- OPENSSL_armcap_P = 0; ++ if (sysctlbyname("hw.optional.armv8_2_sha512", &feature, &len, NULL, 0) == 0 && feature == 1) ++ OPENSSL_armcap_P |= ARMV8_SHA512; ++ feature = 0; ++ if (sysctlbyname("hw.optional.armv8_2_sha3", &feature, &len, NULL, 0) == 0 && feature == 1) { ++ OPENSSL_armcap_P |= ARMV8_SHA3; ++ len = sizeof(uarch); ++ if ((sysctlbyname("machdep.cpu.brand_string", uarch, &len, NULL, 0) == 0) && ++ (strncmp(uarch, "Apple M1", 8) == 0)) ++ OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3; ++ } ++ } ++# endif ++# endif + + # ifdef OSSL_IMPLEMENT_GETAUXVAL + if (getauxval(HWCAP) & HWCAP_NEON) { +@@ -197,6 +220,9 @@ void OPENSSL_cpuid_setup(void) + + if (hwcap & HWCAP_CPUID) + OPENSSL_armcap_P |= ARMV8_CPUID; ++ ++ if (hwcap & HWCAP_SHA3) ++ OPENSSL_armcap_P |= ARMV8_SHA3; + # endif + } + # endif +@@ -240,6 +266,10 @@ void OPENSSL_cpuid_setup(void) + _armv8_sha512_probe(); + OPENSSL_armcap_P |= ARMV8_SHA512; + } ++ if (sigsetjmp(ill_jmp, 1) == 0) { ++ _armv8_eor3_probe(); ++ OPENSSL_armcap_P |= ARMV8_SHA3; ++ } + # endif + } + # endif +@@ -262,6 +292,9 @@ void OPENSSL_cpuid_setup(void) + (OPENSSL_armcap_P & ARMV7_NEON)) { + OPENSSL_armv8_rsa_neonized = 1; + } ++ if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1)) && ++ (OPENSSL_armcap_P & ARMV8_SHA3)) ++ OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3; + # endif + } + #endif +Index: openssl-1.1.1m/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl +=================================================================== +--- /dev/null ++++ openssl-1.1.1m/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl +@@ -0,0 +1,7369 @@ ++#! /usr/bin/env perl ++# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). 
You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++#======================================================================== ++# Written by Xiaokang Qian for the OpenSSL project, ++# derived from https://github.com/ARM-software/AArch64cryptolib, original ++# author Samuel Lee . The module is, however, dual ++# licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you ++# obtain it. ++#======================================================================== ++# ++# Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading ++# Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated ++# intermediate hashesfrom the 8 blocks. ++# ++# ____________________________________________________ ++# | | ++# | PRE | ++# |____________________________________________________| ++# | | | | ++# | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 | ++# |________________|________________|__________________| ++# | | | | ++# | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 | ++# |________________|____(mostly)____|__________________| ++# | | ++# | MODULO | ++# |____________________________________________________| ++# ++# PRE: ++# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0 ++# EXT low_acc, low_acc, low_acc, #8 ++# EOR res_curr (8k+0), res_curr (4k+0), low_acc ++# ++# CTR block: ++# Increment and byte reverse counter in scalar registers and transfer to SIMD registers ++# REV ctr32, rev_ctr32 ++# ORR ctr64, constctr96_top32, ctr32, LSL #32 ++# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF ++# INS ctr_next.d[1], ctr64X ++# ADD rev_ctr32, #1 ++# ++# AES block: ++# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take 256 bytes key below for example. 
++# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring ++# Given we are very constrained in our ASIMD registers this is quite important ++# ++# Encrypt: ++# LDR input_low, [ input_ptr ], #8 ++# LDR input_high, [ input_ptr ], #8 ++# EOR input_low, k14_low ++# EOR input_high, k14_high ++# INS res_curr.d[0], input_low ++# INS res_curr.d[1], input_high ++# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k13 ++# EOR res_curr, res_curr, ctr_curr ++# ST1 { res_curr.16b }, [ output_ptr ], #16 ++# ++# Decrypt: ++# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr ++# AESE ctr_curr, k13 ++# LDR res_curr, [ input_ptr ], #16 ++# EOR res_curr, res_curr, ctr_curr ++# MOV output_low, res_curr.d[0] ++# MOV output_high, res_curr.d[1] ++# EOR output_low, k14_low ++# EOR output_high, k14_high ++# STP output_low, output_high, [ output_ptr ], #16 ++ ++# GHASH block X: ++# Do 128b karatsuba polynomial multiplication on block ++# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b ++# ++# multiplication: ++# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64 ++# ++# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies: ++# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64 ++# ++# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are ++# multiplying with "twisted" powers of H ++# ++# Note: We can PMULL directly into the acc_x in first GHASH of the loop ++# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical ++# path latency dominates the performance ++# ++# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers ++# than indicated here ++# REV64 res_curr, res_curr ++# INS t_m.d[0], res_curr.d[1] ++# EOR t_m.8B, t_m.8B, res_curr.8B ++# PMULL2 t_h, res_curr, HX ++# PMULL t_l, res_curr, HX ++# PMULL t_m, t_m, HX_k ++# EOR acc_h, acc_h, t_h ++# EOR acc_l, acc_l, t_l ++# EOR acc_m, acc_m, t_m ++# ++# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them ++# There is some 
complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo ++# with a reversed constant ++# EOR3 acc_m, acc_m, acc_l, acc_h // Finish off karatsuba processing ++# PMULL t_mod, acc_h, mod_constant ++# EXT acc_h, acc_h, acc_h, #8 ++# EOR3 acc_m, acc_m, t_mod, acc_h ++# PMULL acc_h, acc_m, mod_constant ++# EXT acc_m, acc_m, acc_m, #8 ++# EOR3 acc_l, acc_l, acc_m, acc_h ++ ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or ++die "can't locate arm-xlate.pl"; ++ ++die "only for 64 bit" if $flavour !~ /64/; ++ ++open OUT,"| \"$^X\" $xlate $flavour $output"; ++*STDOUT=*OUT; ++ ++$code=<<___; ++#include "arm_arch.h" ++ ++#if __ARM_MAX_ARCH__>=8 ++___ ++$code.=".arch armv8.2-a+crypto\n.arch_extension sha3\n.text\n"; ++ ++$input_ptr="x0"; #argument block ++$bit_length="x1"; ++$output_ptr="x2"; ++$current_tag="x3"; ++$counter="x16"; ++$constant_temp="x15"; ++$modulo_constant="x10"; ++$cc="x8"; ++{ ++my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); ++my ($temp2_x,$temp3_x)=map("x$_",(13..14)); ++my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); ++my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); ++my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); ++my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); ++my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); ++ ++my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); ++my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); ++my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); ++ ++my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); ++my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); ++ ++my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); ++my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); ++my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); ++my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); ++ ++my $t0="v16"; ++my $t0d="d16"; ++ ++my $t1="v29"; ++my $t2=$res1; ++my $t3=$t1; ++ ++my $t4=$res0; ++my $t5=$res2; ++my $t6=$t0; ++ ++my $t7=$res3; ++my $t8=$res4; ++my $t9=$res5; ++ ++my $t10=$res6; ++my $t11="v21"; ++my $t12=$t1; ++ ++my $rtmp_ctr="v30"; ++my $rtmp_ctrq="q30"; ++my $rctr_inc="v31"; ++my $rctr_incd="d31"; ++ ++my $mod_constantd=$t0d; ++my $mod_constant=$t0; ++ ++my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); ++my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); ++my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28)); ++my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28)); ++my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28)); ++my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); ++my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28)); ++my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28)); ++my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28)); ++my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28)); ++my $rk2q1="v28.1q"; ++my $rk3q1="v26.1q"; ++my $rk4v="v27"; ++ ++ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_enc_128_kernel(const 
unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_enc_128_kernel ++.type unroll8_eor3_aes_gcm_enc_128_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_enc_128_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L128_enc_ret ++ stp d8, d9, [sp, #-80]! ++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ ++ aese $ctr1b, $rk2 \n 
aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr0b, 
$rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ ++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ ++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9 ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9 ++ ++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9 ++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9 ++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ b.ge .L128_enc_tail @ handle tail ++ ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext ++ ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext ++ ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 0 - result ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 1 - result ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 2 - result ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 3 - result ++ eor3 $res7b, $ctr_t7b, $ctr7b,$rk10 @ AES block 7 - result ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ b.ge .L128_enc_prepretail @ do prepretail ++ ++.L128_enc_main_loop: @ main loop start ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ 
ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free) ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free) ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h3l | h3h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b,$t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free) ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 
8k+13 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free) ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH 
block 8k+6, 8k+7 - mid ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ ++ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, 
$ctr0b @ AES block 8k+8 - round 8 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9 ++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9 ++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9 ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result ++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9 ++ ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9 ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 8k+10 - result ++ ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk10 @ AES block 7 - result ++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9 ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 8k+9 - result ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 8k+11 - result ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 8k+8 - result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result ++ ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ b.lt .L128_enc_main_loop ++ ++.L128_enc_prepretail: @ PREPRETAIL ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h6k | h5k ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free) ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH 
block 8k+1 - low ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free) ++ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free) ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free) ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ aese $ctr5b, 
$rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ 
eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ eor3 $acc_lb, $acc_lb, $acc_hb, $acc_mb @ MODULO - fold into low ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9 ++ ++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9 ++ ++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9 ++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9 ++ ++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9 ++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9 ++.L128_enc_tail: @ TAIL ++ ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext ++ ++ mov $t1.16b, $rk10 ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ 
++ eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6k | h5k ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h7l | h7h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ cmp $main_end_input_ptr, #112 ++ b.gt .L128_enc_blocks_more_than_7 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ movi $acc_h.8b, #0 ++ ++ cmp $main_end_input_ptr, #96 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr2b ++ mov $ctr2b, $ctr1b ++ ++ movi $acc_l.8b, #0 ++ movi $acc_m.8b, #0 ++ b.gt .L128_enc_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ cmp $main_end_input_ptr, #80 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr1b ++ b.gt .L128_enc_blocks_more_than_5 ++ ++ cmp $main_end_input_ptr, #64 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr1b ++ b.gt .L128_enc_blocks_more_than_4 ++ ++ mov $ctr7b, $ctr6b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr1b ++ cmp $main_end_input_ptr, #48 ++ b.gt .L128_enc_blocks_more_than_3 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr1b ++ ++ cmp $main_end_input_ptr, #32 ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ b.gt .L128_enc_blocks_more_than_2 ++ ++ cmp $main_end_input_ptr, #16 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr1b ++ b.gt .L128_enc_blocks_more_than_1 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b .L128_enc_blocks_less_than_1 ++.L128_enc_blocks_more_than_7: @ blocks left > 7 ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++.L128_enc_blocks_more_than_6: @ blocks left > 6 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ ++ eor 
$acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++.L128_enc_blocks_more_than_5: @ blocks left > 5 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++.L128_enc_blocks_more_than_4: @ blocks left > 4 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++.L128_enc_blocks_more_than_3: @ blocks left > 3 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ++ eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++.L128_enc_blocks_more_than_2: @ blocks left > 2 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext ++ ++ ins $rk4v.d[0], $res0.d[1] @ 
GHASH final-2 block - mid ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result ++ ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++.L128_enc_blocks_more_than_1: @ blocks left > 1 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ ldr $h2q, [$current_tag, #64] @ load h2l | h2h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result ++ ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++.L128_enc_blocks_less_than_1: @ blocks left <= 1 ++ ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ cmp $bit_length, #64 ++ ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ csel $temp3_x, $temp0_x, xzr, lt ++ ++ mov $ctr0.d[1], $temp3_x ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ st1 { $res1b}, [$output_ptr] @ store all 16B ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ 
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ lsr x0, $bit_length, #3 @ return sizes ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L128_enc_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel ++___ ++ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# u64 *Xi, ++# unsigned char ivec[16], ++# const void *key); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_dec_128_kernel ++.type unroll8_eor3_aes_gcm_dec_128_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_dec_128_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L128_dec_ret ++ stp d8, d9, [sp, #-80]! ++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, 
$rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese 
$ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ ++ aese $ctr0b, $rk9 @ AES block 0 - round 9 ++ aese $ctr1b, $rk9 @ AES block 1 - round 9 ++ aese $ctr6b, $rk9 @ AES block 6 - round 9 ++ ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ aese $ctr4b, $rk9 @ AES block 4 - round 9 ++ aese $ctr3b, $rk9 @ AES block 3 - round 9 ++ ++ aese $ctr2b, $rk9 @ AES block 2 - round 9 ++ aese $ctr5b, $rk9 @ AES block 5 - round 9 ++ aese $ctr7b, $rk9 @ AES block 7 - round 9 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ b.ge .L128_dec_tail @ handle tail ++ ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext ++ ++ eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 0 - result ++ eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 1 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext ++ ++ eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 3 - result ++ eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 2 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 6 - result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 4 - result ++ eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 5 - result ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ 
AES block 4, 5 - store result ++ ++ eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 7 - result ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ b.ge .L128_dec_prepretail @ do prepretail ++ ++.L128_dec_main_loop: @ main loop start ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h7l | h7h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h6k | h5k ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr 
$h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese 
$ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, 
$acc_lb @ MODULO - karatsuba tidy up ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext ++ ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9 ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9 ++ ++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9 ++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ ++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9 ++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9 ++ eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 8k+9 - result ++ ++ eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 8k+8 - result ++ eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 8k+15 - result ++ eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 8k+14 - result ++ ++ eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 8k+10 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 8k+12 - result ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ ++ eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 8k+11 - result ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 8k+13 - result ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ b.lt .L128_dec_main_loop ++ ++.L128_dec_prepretail: @ PREPRETAIL ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ rev32 $ctr5.16b, 
$rtmp_ctr.16b @ CTR block 8k+13 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h6k | h5k ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ aese $ctr1b, 
$rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ ++ pmull2 
$t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ldr $rk10q, [$cc, #160] @ load rk10 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ ++ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9 ++ aese 
$ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9 ++ ++ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9 ++ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9 ++ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9 ++ ++ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9 ++ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9 ++ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9 ++ ++.L128_dec_tail: @ TAIL ++ ++ mov $t1.16b, $rk10 ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ ++ cmp $main_end_input_ptr, #112 ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h7l | h7h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext ++ ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6k | h5k ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ++ eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ b.gt .L128_dec_blocks_more_than_7 ++ ++ cmp $main_end_input_ptr, #96 ++ mov $ctr7b, $ctr6b ++ movi $acc_l.8b, #0 ++ ++ movi $acc_h.8b, #0 ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr2b ++ mov $ctr2b, $ctr1b ++ ++ movi $acc_m.8b, #0 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L128_dec_blocks_more_than_6 ++ ++ cmp $main_end_input_ptr, #80 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr1b ++ b.gt .L128_dec_blocks_more_than_5 ++ ++ cmp $main_end_input_ptr, #64 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L128_dec_blocks_more_than_4 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr1b ++ cmp $main_end_input_ptr, #48 ++ b.gt .L128_dec_blocks_more_than_3 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ cmp $main_end_input_ptr, #32 ++ ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ mov $ctr6b, $ctr1b ++ b.gt .L128_dec_blocks_more_than_2 ++ ++ cmp $main_end_input_ptr, #16 ++ ++ mov $ctr7b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt L128_dec_blocks_more_than_1 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ b .L128_dec_blocks_less_than_1 ++.L128_dec_blocks_more_than_7: @ blocks left > 7 ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result ++ eor3 
$res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++.L128_dec_blocks_more_than_6: @ blocks left > 6 ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result ++.L128_dec_blocks_more_than_5: @ blocks left > 5 ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++.L128_dec_blocks_more_than_4: @ blocks left > 4 ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++.L128_dec_blocks_more_than_3: @ blocks left > 3 ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ++ ldr $res1q, [$input_ptr], #16 @ AES 
final-2 block - load ciphertext ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++.L128_dec_blocks_more_than_2: @ blocks left > 2 ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++ ++ eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++.L128_dec_blocks_more_than_1: @ blocks left > 1 ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result ++ ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++.L128_dec_blocks_less_than_1: @ blocks left <= 1 ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ csel $temp3_x, $temp0_x, xzr, lt ++ ++ mov $ctr0.d[1], $temp3_x ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, 
$h1.16b, $h1.16b, #8 ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ++ bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ st1 { $res4b}, [$output_ptr] @ store all 16B ++ ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ ++ lsr x0, $bit_length, #3 ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++.L128_dec_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel ++___ ++} ++ ++{ ++my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); ++my ($temp2_x,$temp3_x)=map("x$_",(13..14)); ++my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); ++my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); ++my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); ++my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); ++my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); ++ ++my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); ++my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); ++my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); ++ ++my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); ++my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); ++ ++my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); ++my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); ++my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); ++my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); ++ ++my $t0="v16"; ++my $t0d="d16"; ++ ++my $t1="v29"; ++my $t2=$res1; ++my $t3=$t1; ++ ++my $t4=$res0; ++my $t5=$res2; ++my $t6=$t0; ++ ++my $t7=$res3; ++my $t8=$res4; ++my $t9=$res5; ++ ++my 
$t10=$res6; ++my $t11="v21"; ++my $t12=$t1; ++ ++my $rtmp_ctr="v30"; ++my $rtmp_ctrq="q30"; ++my $rctr_inc="v31"; ++my $rctr_incd="d31"; ++ ++my $mod_constantd=$t0d; ++my $mod_constant=$t0; ++ ++my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); ++my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); ++my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28)); ++my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28)); ++my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28)); ++my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); ++my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28)); ++my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28)); ++my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28)); ++my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28)); ++my $rk2q1="v28.1q"; ++my $rk3q1="v26.1q"; ++my $rk4v="v27"; ++ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_enc_192_kernel ++.type unroll8_eor3_aes_gcm_enc_192_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_enc_192_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L192_enc_ret ++ stp d8, d9, [sp, #-80]! ++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 
5 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr6b, $rk7 \n aesmc 
$ctr6b, $ctr6b @ AES block 6 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 ++ ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9 ++ ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 14 - round 10 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 11 - round 10 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 9 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 13 - round 10 ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 12 - round 10 ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 10 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 15 - round 10 ++ ++ aese $ctr6b, $rk11 @ AES block 14 - round 11 ++ aese $ctr3b, $rk11 @ AES block 11 - round 11 ++ ++ aese $ctr4b, $rk11 @ AES block 12 - round 11 ++ aese $ctr7b, $rk11 @ AES block 15 - round 11 ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ++ aese $ctr1b, $rk11 @ AES block 9 - round 11 ++ aese $ctr5b, $rk11 @ AES block 13 - round 11 ++ ++ aese $ctr2b, $rk11 @ AES block 10 - round 11 ++ aese $ctr0b, $rk11 @ AES block 8 - round 11 ++ b.ge .L192_enc_tail @ handle tail ++ ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext ++ ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext ++ ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext ++ ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 0 - result ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ 
AES block 3 - result ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 1 - result ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result ++ ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 2 - result ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result ++ ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ ++ b.ge .L192_enc_prepretail @ do prepretail ++ ++.L192_enc_main_loop: @ main loop start ++ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free) ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free) ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ldr $h56kq, [$current_tag, 
#144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc 
$ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free) ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free) ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ aese $ctr0b, $rk7 \n 
aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext ++ ++ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext ++ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ pmull $acc_h.1q, 
$acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ ++ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11 ++ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11 ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result ++ ++ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11 ++ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11 ++ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11 ++ ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 8k+10 - result ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 8k+8 - result ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 8k+9 - result ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 8k+11 - result ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ b.lt .L192_enc_main_loop ++ ++.L192_enc_prepretail: @ PREPRETAIL ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ 
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free) ++ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free) ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ pmull2 
$t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free) ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free) ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - 
high ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ ++ aese $ctr1b, $rk11 @ AES 
block 8k+9 - round 11 ++ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11 ++ ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11 ++ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11 ++ ++ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11 ++ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11 ++ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11 ++ ++.L192_enc_tail: @ TAIL ++ ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ ++ ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - l3ad plaintext ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ mov $t1.16b, $rk12 ++ ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ cmp $main_end_input_ptr, #112 ++ ++ eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ b.gt .L192_enc_blocks_more_than_7 ++ ++ cmp $main_end_input_ptr, #96 ++ mov $ctr7b, $ctr6b ++ movi $acc_h.8b, #0 ++ ++ mov $ctr6b, $ctr5b ++ movi $acc_l.8b, #0 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr2b ++ ++ mov $ctr2b, $ctr1b ++ movi $acc_m.8b, #0 ++ b.gt .L192_enc_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ cmp $main_end_input_ptr, #80 ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ ++ mov $ctr3b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L192_enc_blocks_more_than_5 ++ ++ cmp $main_end_input_ptr, #64 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr1b ++ b.gt .L192_enc_blocks_more_than_4 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr1b ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ cmp $main_end_input_ptr, #48 ++ b.gt .L192_enc_blocks_more_than_3 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ cmp $main_end_input_ptr, #32 ++ b.gt .L192_enc_blocks_more_than_2 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ cmp $main_end_input_ptr, #16 ++ mov $ctr7b, $ctr1b ++ b.gt .L192_enc_blocks_more_than_1 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ b .L192_enc_blocks_less_than_1 ++.L192_enc_blocks_more_than_7: @ blocks left > 7 ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++ ++ pmull2 
$acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result ++.L192_enc_blocks_more_than_6: @ blocks left > 6 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++.L192_enc_blocks_more_than_5: @ blocks left > 5 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++.L192_enc_blocks_more_than_4: @ blocks left > 4 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result ++.L192_enc_blocks_more_than_3: @ blocks left > 3 ++ ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext ++ ldr $h4q, [$current_tag, #112] @ load h4l | 
h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++.L192_enc_blocks_more_than_2: @ blocks left > 2 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result ++.L192_enc_blocks_more_than_1: @ blocks left > 1 ++ ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++.L192_enc_blocks_less_than_1: @ blocks left <= 1 ++ ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ csel $temp3_x, $temp0_x, xzr, lt ++ ++ mov $ctr0.d[1], $temp3_x ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ 
ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ ++ st1 { $res1b}, [$output_ptr] @ store all 16B ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ ++ lsr x0, $bit_length, #3 @ return sizes ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L192_enc_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel ++___ ++ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_dec_192_kernel ++.type unroll8_eor3_aes_gcm_dec_192_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_dec_192_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L192_dec_ret ++ stp d8, d9, [sp, #-80]! 
++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ld1 { $acc_lb}, [$current_tag] ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, 
$ctr4b @ AES block 4 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext 
$acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9 ++ ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 ++ ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 ++ ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 ++ ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10 ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ++ aese $ctr0b, $rk11 @ AES block 0 - round 11 ++ aese $ctr1b, $rk11 @ AES block 1 - round 11 ++ aese $ctr4b, $rk11 @ AES block 4 - round 11 ++ ++ aese $ctr6b, $rk11 @ AES block 6 - round 11 ++ aese $ctr5b, $rk11 @ AES block 5 - round 11 ++ aese $ctr7b, $rk11 @ AES block 7 - round 11 ++ ++ aese $ctr2b, $rk11 @ AES block 2 - round 11 ++ aese $ctr3b, $rk11 @ AES block 3 - round 11 ++ b.ge .L192_dec_tail @ handle tail ++ ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext ++ ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext ++ ++ eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 1 - result ++ eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 0 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 3 - result ++ ++ eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 2 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 4 - result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 5 - result ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 6 - result ++ eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 7 - result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ b.ge 
.L192_dec_prepretail @ do prepretail ++ ++.L192_dec_main_loop: @ main loop start ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ rev64 $res0b, $res0b @ GHASH block 8k ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, 
$ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ 
AES block 8k+14 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ ++ aese $ctr5b, 
$rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext ++ ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext ++ ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ ++ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11 ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11 ++ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11 ++ ++ eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 8k+8 - result ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ ++ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11 ++ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ ++ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11 ++ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11 ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ++ eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 8k+9 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 8k+11 - result ++ ++ eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 8k+10 - result ++ eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 8k+15 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 8k+13 - result ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ mov $ctr3.16b, 
$h4.16b @ CTR block 8k+19 ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 8k+12 - result ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 8k+14 - result ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ b.lt .L192_dec_main_loop ++ ++.L192_dec_prepretail: @ PREPRETAIL ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ ldr $h56kq, [$current_tag, #144] @ 
load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr7b, $rk5 \n 
aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ext $t11.16b, 
$acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ldr $rk12q, [$cc, #192] @ load rk12 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ ++ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11 ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11 ++ ++ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ ++ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11 ++ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11 ++ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11 ++ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11 ++ ++.L192_dec_tail: @ TAIL ++ ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ mov $t1.16b, $rk12 ++ ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ ++ eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ cmp $main_end_input_ptr, #112 ++ b.gt 
.L192_dec_blocks_more_than_7 ++ ++ mov $ctr7b, $ctr6b ++ movi $acc_h.8b, #0 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ ++ cmp $main_end_input_ptr, #96 ++ movi $acc_l.8b, #0 ++ mov $ctr3b, $ctr2b ++ ++ mov $ctr2b, $ctr1b ++ movi $acc_m.8b, #0 ++ b.gt .L192_dec_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr1b ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ cmp $main_end_input_ptr, #80 ++ b.gt .L192_dec_blocks_more_than_5 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr1b ++ cmp $main_end_input_ptr, #64 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L192_dec_blocks_more_than_4 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr1b ++ cmp $main_end_input_ptr, #48 ++ b.gt .L192_dec_blocks_more_than_3 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ cmp $main_end_input_ptr, #32 ++ ++ mov $ctr6b, $ctr1b ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ b.gt .L192_dec_blocks_more_than_2 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr7b, $ctr1b ++ cmp $main_end_input_ptr, #16 ++ b.gt .L192_dec_blocks_more_than_1 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ b .L192_dec_blocks_less_than_1 ++.L192_dec_blocks_more_than_7: @ blocks left > 7 ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext ++ ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++.L192_dec_blocks_more_than_6: @ blocks left > 6 ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result ++ eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++.L192_dec_blocks_more_than_5: @ blocks left > 5 ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ ++ ins $rk4v.d[1], 
$rk4v.d[0] @ GHASH final-5 block - mid ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++ eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result ++.L192_dec_blocks_more_than_4: @ blocks left > 4 ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++.L192_dec_blocks_more_than_3: @ blocks left > 3 ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++.L192_dec_blocks_more_than_2: @ blocks left > 2 ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext ++ ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_lb, 
$acc_lb, $rk3 @ GHASH final-2 block - low ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result ++.L192_dec_blocks_more_than_1: @ blocks left > 1 ++ ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++.L192_dec_blocks_less_than_1: @ blocks left <= 1 ++ ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 ++ ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ csel $temp3_x, $temp0_x, xzr, lt ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ++ mov $ctr0.d[1], $temp3_x ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ st1 { $res4b}, [$output_ptr] @ store all 16B ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ ++ eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up ++ 
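++ @ MODULO - the 256-bit Karatsuba result held in acc_h:acc_m:acc_l is now
++ @ reduced modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1; the modulo
++ @ constant (0xc2 << 56) is its bit-reflected low-term encoding, so each
++ @ pmull by it folds 64 bits of the upper half downwards - first high into
++ @ mid, then mid into low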
++ eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L192_dec_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel ++___ ++} ++ ++{ ++ ++my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); ++my ($temp2_x,$temp3_x)=map("x$_",(13..14)); ++my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); ++my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); ++my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); ++my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); ++my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); ++ ++my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); ++my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); ++my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); ++ ++my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); ++my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); ++ ++my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); ++my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); ++my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); ++my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); ++ ++my $t0="v16"; ++my $t0d="d16"; ++ ++my $t1="v29"; ++my $t2=$res1; ++my $t3=$t1; ++ ++my $t4=$res0; ++my $t5=$res2; ++my $t6=$t0; ++ ++my $t7=$res3; ++my $t8=$res4; ++my $t9=$res5; ++ ++my $t10=$res6; ++my $t11="v21"; ++my $t12=$t1; ++ ++my $rtmp_ctr="v30"; ++my $rtmp_ctrq="q30"; ++my $rctr_inc="v31"; ++my $rctr_incd="d31"; ++ ++my $mod_constantd=$t0d; ++my $mod_constant=$t0; ++ ++my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); ++my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); ++my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28)); ++my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28)); ++my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28)); ++my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); ++my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28)); ++my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28)); ++my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28)); ++my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28)); ++my $rk2q1="v28.1q"; ++my $rk3q1="v26.1q"; ++my $rk4v="v27"; ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_enc_256_kernel ++.type unroll8_eor3_aes_gcm_enc_256_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_enc_256_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L256_enc_ret ++ stp d8, d9, [sp, #-80]! 
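++ @ 80-byte frame: d8-d15 (callee-saved under AAPCS64) occupy the first 64
++ @ bytes; the GHASH reduction constant 0xc2 << 56 is staged at sp + 64 and
++ @ addressed through $modulo_constant for the rest of the kernel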
++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ++ ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load 
rk4, rk5 ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - 
round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 ++ ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 ++ ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 ++ ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10 ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10 ++ ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11 ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11 ++ ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11 ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12 ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 ++ ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12 ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 ++ ++ aese $ctr2b, $rk13 @ AES block 2 - round 13 ++ aese $ctr1b, $rk13 @ AES block 1 - round 13 ++ aese $ctr4b, $rk13 @ AES block 4 - round 13 ++ ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12 ++ ++ aese $ctr0b, $rk13 @ AES block 0 - round 13 ++ aese $ctr5b, $rk13 @ AES block 5 - round 13 ++ ++ aese $ctr6b, $rk13 @ AES block 6 - round 13 ++ aese $ctr7b, $rk13 @ AES block 7 - round 13 ++ aese $ctr3b, $rk13 @ AES block 3 - round 13 ++ ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ b.ge .L256_enc_tail @ handle tail ++ ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext ++ ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext ++ ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 0 - result ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 1 - result ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 3 - result ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ ldp $ctr_t4q, 
$ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 2 - result ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result ++ ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result ++ ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ b.ge .L256_enc_prepretail @ do prepretail ++ ++.L256_enc_main_loop: @ main loop start ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ 
pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 
8k+2, 8k+3 - mid ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - 
high ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11 ++ ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11 ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11 ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11 ++ ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12 ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11 ++ ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12 ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12 ++ 
rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11 ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12 ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13 ++ ++ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12 ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12 ++ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext ++ ++ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext ++ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13 ++ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13 ++ ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13 ++ ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12 ++ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13 ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ ++ eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 8k+10 - result ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ ++ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13 ++ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13 ++ eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result ++ ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13 ++ ++ eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 8k+11 - result ++ ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 8k+9 - result ++ eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 8k+8 - result ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ ++ eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result ++ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low ++ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ ++ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ b.lt .L256_enc_main_loop ++ ++.L256_enc_prepretail: @ PREPRETAIL ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ldr $h56kq, [$current_tag, #144] @ 
load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ aese 
$ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, 
$res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ ++ aese $ctr5b, $rk10 \n aesmc 
$ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11 ++ ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11 ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11 ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11 ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12 ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12 ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12 ++ ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12 ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12 ++ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13 ++ ++ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low ++ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13 ++ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13 ++ ++ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13 ++ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13 ++ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13 ++ ++ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13 ++ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13 ++.L256_enc_tail: @ TAIL ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ ++ ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext ++ ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, 
#8 ++ mov $t1.16b, $rk14 ++ ++ cmp $main_end_input_ptr, #112 ++ eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ b.gt .L256_enc_blocks_more_than_7 ++ ++ movi $acc_l.8b, #0 ++ mov $ctr7b, $ctr6b ++ movi $acc_h.8b, #0 ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ ++ mov $ctr3b, $ctr2b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr2b, $ctr1b ++ ++ movi $acc_m.8b, #0 ++ cmp $main_end_input_ptr, #96 ++ b.gt .L256_enc_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ cmp $main_end_input_ptr, #80 ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr1b ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L256_enc_blocks_more_than_5 ++ ++ mov $ctr7b, $ctr6b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr4b ++ ++ cmp $main_end_input_ptr, #64 ++ mov $ctr4b, $ctr1b ++ b.gt .L256_enc_blocks_more_than_4 ++ ++ cmp $main_end_input_ptr, #48 ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L256_enc_blocks_more_than_3 ++ ++ cmp $main_end_input_ptr, #32 ++ mov $ctr7b, $ctr6b ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ mov $ctr6b, $ctr1b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ b.gt .L256_enc_blocks_more_than_2 ++ ++ mov $ctr7b, $ctr1b ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ cmp $main_end_input_ptr, #16 ++ b.gt .L256_enc_blocks_more_than_1 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ b .L256_enc_blocks_less_than_1 ++.L256_enc_blocks_more_than_7: @ blocks left > 7 ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++.L256_enc_blocks_more_than_6: @ blocks left > 6 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++.L256_enc_blocks_more_than_5: @ blocks left > 5 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ rev64 $res0b, $res1b 
@ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++ eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result ++.L256_enc_blocks_more_than_4: @ blocks left > 4 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++.L256_enc_blocks_more_than_3: @ blocks left > 3 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ ++ eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++.L256_enc_blocks_more_than_2: @ blocks left > 2 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ eor3 $res1b, $ctr_t1b, 
$ctr6b, $t1.16b @ AES final-1 block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++ ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++.L256_enc_blocks_more_than_1: @ blocks left > 1 ++ ++ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++ ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++.L256_enc_blocks_less_than_1: @ blocks left <= 1 ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 ++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ ++ csel $temp3_x, $temp0_x, xzr, lt ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ mov $ctr0.d[1], $temp3_x ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ st1 { $res1b}, [$output_ptr] @ store all 16B ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ eor3 $acc_mb, 
$acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ ++ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ lsr x0, $bit_length, #3 @ return sizes ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L256_enc_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel ++___ ++ ++{ ++######################################################################################### ++# size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in, ++# size_t len, ++# unsigned char *out, ++# const void *key, ++# unsigned char ivec[16], ++# u64 *Xi); ++# ++$code.=<<___; ++.global unroll8_eor3_aes_gcm_dec_256_kernel ++.type unroll8_eor3_aes_gcm_dec_256_kernel,%function ++.align 4 ++unroll8_eor3_aes_gcm_dec_256_kernel: ++ AARCH64_VALID_CALL_TARGET ++ cbz x1, .L256_dec_ret ++ stp d8, d9, [sp, #-80]! ++ mov $counter, x4 ++ mov $cc, x5 ++ stp d10, d11, [sp, #16] ++ stp d12, d13, [sp, #32] ++ stp d14, d15, [sp, #48] ++ mov x5, #0xc200000000000000 ++ stp x5, xzr, [sp, #64] ++ add $modulo_constant, sp, #64 ++ ++ ld1 { $ctr0b}, [$counter] @ CTR block 0 ++ ++ mov $constant_temp, #0x100000000 @ set up counter increment ++ movi $rctr_inc.16b, #0x0 ++ mov $rctr_inc.d[1], $constant_temp ++ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len ++ ++ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ++ ++ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0 ++ ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1 ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3 ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ++ ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5 ++ ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0 ++ ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1 ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, 
$ctr0b @ AES block 0 - round 1 ++ ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1 ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ++ ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2 ++ ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2 ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2 ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2 ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ++ ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3 ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ++ ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3 ++ ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4 ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ++ ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4 ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5 ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5 ++ ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5 ++ ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 ++ ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6 ++ ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7 ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 ++ ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 
++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7 ++ ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7 ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7 ++ ++ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8 ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8 ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 ++ ++ ld1 { $acc_lb}, [$current_tag] ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ++ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr ++ ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9 ++ ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9 ++ ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9 ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 ++ ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7 ++ ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11 ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 ++ ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11 ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 ++ ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11 ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12 ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12 ++ ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ aese $ctr3b, $rk12 \n aesmc 
$ctr3b, $ctr3b @ AES block 3 - round 12 ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 ++ ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12 ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12 ++ ++ aese $ctr5b, $rk13 @ AES block 5 - round 13 ++ aese $ctr1b, $rk13 @ AES block 1 - round 13 ++ aese $ctr2b, $rk13 @ AES block 2 - round 13 ++ ++ aese $ctr0b, $rk13 @ AES block 0 - round 13 ++ aese $ctr4b, $rk13 @ AES block 4 - round 13 ++ aese $ctr6b, $rk13 @ AES block 6 - round 13 ++ ++ aese $ctr3b, $rk13 @ AES block 3 - round 13 ++ aese $ctr7b, $rk13 @ AES block 7 - round 13 ++ b.ge .L256_dec_tail @ handle tail ++ ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext ++ ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext ++ ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext ++ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks ++ ++ eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 1 - result ++ eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 0 - result ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result ++ ++ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8 ++ eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 3 - result ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 5 - result ++ ++ eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 4 - result ++ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9 ++ ++ eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 2 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result ++ ++ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10 ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 6 - result ++ ++ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11 ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result ++ ++ eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 7 - result ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result ++ ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12 ++ b.ge .L256_dec_prepretail @ do prepretail ++ ++.L256_dec_main_loop: @ main loop start ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ aese $ctr2b, $rk0 \n aesmc 
$ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ 
aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, 
[$current_tag, #96] @ load h4k | h3k ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ ++ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - 
round 10 ++ ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16 ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16 ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11 ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11 ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11 ++ ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17 ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11 ++ ++ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11 ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17 ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11 ++ ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12 ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12 ++ ++ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18 ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11 ++ ++ ldr $rk14q, [$cc, #224] @ load rk14 ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12 ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12 ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12 ++ ++ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext ++ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13 ++ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13 ++ ++ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext ++ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13 ++ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13 ++ ++ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19 ++ eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 8k+10 - result ++ eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 8k+9 - result ++ ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13 ++ ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19 ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13 ++ ++ eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 8k+13 - result ++ eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 8k+8 - result ++ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13 ++ ++ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result ++ mov $ctr0.16b, $h1.16b @ CTR block 8k+16 ++ eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 8k+12 - result ++ ++ eor3 $acc_lb, 
$acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low ++ eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 8k+11 - result ++ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result ++ ++ mov $ctr3.16b, $h4.16b @ CTR block 8k+19 ++ mov $ctr2.16b, $h3.16b @ CTR block 8k+18 ++ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13 ++ ++ mov $ctr1.16b, $h2.16b @ CTR block 8k+17 ++ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result ++ eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 8k+15 - result ++ ++ eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 8k+14 - result ++ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20 ++ ++ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL ++ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result ++ b.lt .L256_dec_main_loop ++ ++.L256_dec_prepretail: @ PREPRETAIL ++ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1 ++ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13 ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13 ++ ++ rev64 $res4b, $res4b @ GHASH block 8k+4 ++ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k ++ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k ++ ++ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14 ++ rev64 $res0b, $res0b @ GHASH block 8k ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14 ++ ++ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 ++ ldr $h7q, [$current_tag, #176] @ load h7l | h7h ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ ldr $h8q, [$current_tag, #208] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ rev64 $res1b, $res1b @ GHASH block 8k+1 ++ ++ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15 ++ rev64 $res2b, $res2b @ GHASH block 8k+2 ++ ldr $h5q, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ldr $h6q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ++ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0 ++ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0 ++ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0 ++ ++ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0 ++ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0 ++ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0 ++ ++ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1 ++ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0 ++ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0 ++ ++ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3 ++ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1 ++ eor $res0b, $res0b, $acc_lb @ PRE 1 ++ ++ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1 ++ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1 ++ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1 ++ ++ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1 ++ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1 ++ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1 ++ ++ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high ++ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low ++ ++ rev64 $res3b, $res3b @ GHASH block 8k+3 ++ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low ++ ++ aese $ctr5b, $rk2 \n aesmc $ctr5b, 
$ctr5b @ AES block 8k+13 - round 2 ++ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2 ++ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2 ++ ++ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2 ++ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2 ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high ++ ++ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2 ++ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3 ++ ++ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3 ++ rev64 $res6b, $res6b @ GHASH block 8k+6 ++ ++ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3 ++ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2 ++ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3 ++ ++ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high ++ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid ++ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2 ++ ++ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5 ++ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3 ++ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high ++ ++ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3 ++ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high ++ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid ++ ++ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3 ++ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low ++ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3 ++ ++ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high ++ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid ++ ++ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid ++ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low ++ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low ++ ++ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid ++ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4 ++ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4 ++ ++ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4 ++ ++ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4 ++ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4 ++ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid ++ ++ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5 ++ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4 ++ ++ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5 ++ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4 ++ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4 ++ ++ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5 ++ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid ++ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5 ++ ++ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5 ++ aese 
$ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5 ++ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid ++ ++ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5 ++ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5 ++ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7 ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res7b, $res7b @ GHASH block 8k+7 ++ rev64 $res5b, $res5b @ GHASH block 8k+5 ++ ++ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid ++ ++ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6 ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6 ++ ++ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6 ++ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6 ++ ++ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high ++ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high ++ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low ++ ++ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid ++ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low ++ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7 ++ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high ++ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6 ++ ++ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6 ++ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6 ++ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6 ++ ++ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9 ++ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low ++ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7 ++ ++ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7 ++ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7 ++ ++ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7 ++ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7 ++ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high ++ ++ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7 ++ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid ++ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7 ++ ++ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8 ++ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8 ++ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8 ++ ++ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8 ++ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8 ++ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8 ++ ++ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8 ++ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9 ++ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9 ++ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9 ++ eor $res6.16b, $res6.16b, 
$t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9 ++ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9 ++ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid ++ ++ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8 ++ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid ++ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high ++ ++ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid ++ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid ++ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low ++ ++ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11 ++ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low ++ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid ++ ++ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9 ++ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9 ++ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9 ++ ++ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high ++ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ ++ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid ++ ++ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10 ++ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10 ++ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10 ++ ++ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10 ++ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10 ++ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10 ++ ++ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10 ++ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10 ++ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13 ++ ++ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ ++ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11 ++ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11 ++ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11 ++ ++ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11 ++ ++ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11 ++ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11 ++ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11 ++ ++ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11 ++ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12 ++ ++ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid ++ ++ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13 ++ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12 ++ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12 ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12 ++ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12 ++ ++ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12 ++ ldr $rk14q, [$cc, 
#224] @ load rk14 ++ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12 ++ ++ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13 ++ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12 ++ ++ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13 ++ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13 ++ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13 ++ ++ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13 ++ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low ++ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15 ++ ++ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13 ++ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13 ++.L256_dec_tail: @ TAIL ++ ++ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag ++ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ++ cmp $main_end_input_ptr, #112 ++ ++ ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext ++ ++ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h ++ ext $h8.16b, $h8.16b, $h8.16b, #8 ++ mov $t1.16b, $rk14 ++ ++ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h ++ ext $h5.16b, $h5.16b, $h5.16b, #8 ++ ++ eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result ++ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h ++ ext $h6.16b, $h6.16b, $h6.16b, #8 ++ ext $h7.16b, $h7.16b, $h7.16b, #8 ++ b.gt .L256_dec_blocks_more_than_7 ++ ++ mov $ctr7b, $ctr6b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr3b ++ movi $acc_l.8b, #0 ++ ++ movi $acc_h.8b, #0 ++ movi $acc_m.8b, #0 ++ mov $ctr3b, $ctr2b ++ ++ cmp $main_end_input_ptr, #96 ++ mov $ctr2b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_6 ++ ++ mov $ctr7b, $ctr6b ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ cmp $main_end_input_ptr, #80 ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr4b, $ctr3b ++ mov $ctr3b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_5 ++ ++ cmp $main_end_input_ptr, #64 ++ mov $ctr7b, $ctr6b ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr6b, $ctr5b ++ ++ mov $ctr5b, $ctr4b ++ mov $ctr4b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_4 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ cmp $main_end_input_ptr, #48 ++ ++ mov $ctr6b, $ctr5b ++ mov $ctr5b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_3 ++ ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ mov $ctr7b, $ctr6b ++ ++ cmp $main_end_input_ptr, #32 ++ mov $ctr6b, $ctr1b ++ b.gt .L256_dec_blocks_more_than_2 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ++ mov $ctr7b, $ctr1b ++ cmp $main_end_input_ptr, #16 ++ b.gt .L256_dec_blocks_more_than_1 ++ ++ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ b .L256_dec_blocks_less_than_1 ++.L256_dec_blocks_more_than_7: @ blocks left > 7 ++ rev64 $res0b, $res1b @ GHASH final-7 block ++ ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result ++ ++ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid ++ eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result ++ ++ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high ++ ++ eor $rk4v.8b, 
$rk4v.8b, $res0.8b @ GHASH final-7 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low ++ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid ++.L256_dec_blocks_more_than_6: @ blocks left > 6 ++ ++ rev64 $res0b, $res1b @ GHASH final-6 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result ++ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high ++ ++ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low ++ ++ eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid ++ ++ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high ++.L256_dec_blocks_more_than_5: @ blocks left > 5 ++ ++ rev64 $res0b, $res1b @ GHASH final-5 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid ++ ++ ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result ++ ++ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high ++ eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++.L256_dec_blocks_more_than_4: @ blocks left > 4 ++ ++ rev64 $res0b, $res1b @ GHASH final-4 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low ++ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high ++ ++ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid ++ eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result ++.L256_dec_blocks_more_than_3: @ blocks left > 3 ++ ++ ldr $h4q, [$current_tag, #112] @ load h4l | h4h ++ ext $h4.16b, $h4.16b, $h4.16b, #8 ++ rev64 $res0b, $res1b @ GHASH final-3 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext ++ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid ++ st1 { 
$res4b}, [$output_ptr], #16 @ AES final-3 block - store result ++ ++ eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid ++ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low ++ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low ++ ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid ++.L256_dec_blocks_more_than_2: @ blocks left > 2 ++ ++ rev64 $res0b, $res1b @ GHASH final-2 block ++ ++ ldr $h3q, [$current_tag, #80] @ load h3l | h3h ++ ext $h3.16b, $h3.16b, $h3.16b, #8 ++ ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid ++ ++ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result ++ eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ ++ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid ++ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high ++.L256_dec_blocks_more_than_1: @ blocks left > 1 ++ ++ rev64 $res0b, $res1b @ GHASH final-1 block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid ++ ldr $h2q, [$current_tag, #64] @ load h1l | h1h ++ ext $h2.16b, $h2.16b, $h2.16b, #8 ++ ++ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid ++ ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext ++ st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result ++ ++ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k ++ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low ++ ++ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid ++ ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low ++ ++ eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result ++ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high ++ ++ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid ++ ++ movi $t0.8b, #0 @ surpress further partial tag feed in ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ++ ++ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid ++.L256_dec_blocks_less_than_1: @ blocks left <= 1 ++ ++ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored ++ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ sub $bit_length, $bit_length, #128 @ bit_length -= 128 ++ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b ++ str $rtmp_ctrq, [$counter] @ store the updated counter ++ ++ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ++ ++ and $bit_length, $bit_length, #127 @ bit_length %= 128 ++ ++ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block ++ cmp $bit_length, #64 
++ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff ++ ++ csel $temp3_x, $temp0_x, xzr, lt ++ csel $temp2_x, $temp1_x, $temp0_x, lt ++ ++ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block ++ mov $ctr0.d[1], $temp3_x ++ ++ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits ++ ldr $h1q, [$current_tag, #32] @ load h1l | h1h ++ ext $h1.16b, $h1.16b, $h1.16b, #8 ++ bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing ++ ++ rev64 $res0b, $res1b @ GHASH final block ++ ++ eor $res0b, $res0b, $t0.16b @ feed in partial tag ++ ++ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid ++ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high ++ ++ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid ++ ++ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ++ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high ++ ++ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid ++ ++ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid ++ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant ++ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low ++ ++ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ++ eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up ++ ++ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment ++ st1 { $res4b}, [$output_ptr] @ store all 16B ++ ++ eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up ++ ++ eor $t11.16b, $acc_hb, $t11.16b @ MODULO - fold into mid ++ eor $acc_mb, $acc_mb, $t11.16b @ MODULO - fold into mid ++ ++ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ++ ++ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment ++ eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low ++ ++ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low ++ ext $acc_lb, $acc_lb, $acc_lb, #8 ++ rev64 $acc_lb, $acc_lb ++ st1 { $acc_l.16b }, [$current_tag] ++ lsr x0, $bit_length, #3 @ return sizes ++ ++ ldp d10, d11, [sp, #16] ++ ldp d12, d13, [sp, #32] ++ ldp d14, d15, [sp, #48] ++ ldp d8, d9, [sp], #80 ++ ret ++ ++.L256_dec_ret: ++ mov w0, #0x0 ++ ret ++.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel ++___ ++} ++} ++ ++$code.=<<___; ++.asciz "AES GCM module for ARMv8, SPDX BSD-3-Clause by " ++.align 2 ++#endif ++___ ++ ++{ ++ my %opcode = ( ++ "rax1" => 0xce608c00, "eor3" => 0xce000000, ++ "bcax" => 0xce200000, "xar" => 0xce800000 ); ++ ++ sub unsha3 { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/ ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10), ++ $mnemonic,$arg; ++ } ++ sub unvmov { ++ my $arg=shift; ++ ++ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && ++ sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1, ++ $3<8?$3:$3+8,($4 eq "lo")?0:1; ++ } ++ ++ foreach(split("\n",$code)) { ++ s/@\s/\/\//o; # old->new style commentary ++ s/\`([^\`]*)\`/eval($1)/ge; ++ ++ m/\bld1r\b/ and s/\.16b/.2d/g or ++ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge; ++ print $_,"\n"; ++ } ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; # enforce flush +Index: openssl-1.1.1m/crypto/modes/asm/ghashv8-armx.pl +=================================================================== +--- openssl-1.1.1m.orig/crypto/modes/asm/ghashv8-armx.pl ++++ openssl-1.1.1m/crypto/modes/asm/ghashv8-armx.pl +@@ -141,6 +141,7 @@ gcm_init_v8: + ___ + if 
($flavour =~ /64/) { + my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); ++my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23)); + + $code.=<<___; + @ calculate H^3 and H^4 +@@ -175,15 +176,103 @@ $code.=<<___; + vpmull.p64 $Yl,$Yl,$xC2 + veor $t2,$t2,$Xh + veor $t3,$t3,$Yh +- veor $H, $Xl,$t2 @ H^3 +- veor $H2,$Yl,$t3 @ H^4 ++ veor $H3, $Xl,$t2 @ H^3 ++ veor $H4,$Yl,$t3 @ H^4 + +- vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing +- vext.8 $t1,$H2,$H2,#8 +- veor $t0,$t0,$H +- veor $t1,$t1,$H2 +- vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed +- vst1.64 {$H-$H2},[x0] @ store Htable[3..5] ++ vext.8 $t0,$H3, $H3,#8 @ Karatsuba pre-processing ++ vext.8 $t1,$H4,$H4,#8 ++ vext.8 $t2,$H2,$H2,#8 ++ veor $t0,$t0,$H3 ++ veor $t1,$t1,$H4 ++ veor $t2,$t2,$H2 ++ vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed ++ vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5] ++ ++ @ calculate H^5 and H^6 ++ vpmull.p64 $Xl,$H2, $H3 ++ vpmull.p64 $Yl,$H3,$H3 ++ vpmull2.p64 $Xh,$H2, $H3 ++ vpmull2.p64 $Yh,$H3,$H3 ++ vpmull.p64 $Xm,$t0,$t2 ++ vpmull.p64 $Ym,$t0,$t0 ++ ++ vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing ++ vext.8 $t1,$Yl,$Yh,#8 ++ veor $t2,$Xl,$Xh ++ veor $Xm,$Xm,$t0 ++ veor $t3,$Yl,$Yh ++ veor $Ym,$Ym,$t1 ++ veor $Xm,$Xm,$t2 ++ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase ++ veor $Ym,$Ym,$t3 ++ vpmull.p64 $t3,$Yl,$xC2 ++ ++ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result ++ vmov $Yh#lo,$Ym#hi ++ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl ++ vmov $Ym#hi,$Yl#lo ++ veor $Xl,$Xm,$t2 ++ veor $Yl,$Ym,$t3 ++ ++ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase ++ vext.8 $t3,$Yl,$Yl,#8 ++ vpmull.p64 $Xl,$Xl,$xC2 ++ vpmull.p64 $Yl,$Yl,$xC2 ++ veor $t2,$t2,$Xh ++ veor $t3,$t3,$Yh ++ veor $H5,$Xl,$t2 @ H^5 ++ veor $H6,$Yl,$t3 @ H^6 ++ ++ vext.8 $t0,$H5, $H5,#8 @ Karatsuba pre-processing ++ vext.8 $t1,$H6,$H6,#8 ++ vext.8 $t2,$H2,$H2,#8 ++ veor $t0,$t0,$H5 ++ veor $t1,$t1,$H6 ++ veor $t2,$t2,$H2 ++ vext.8 $H56k,$t0,$t1,#8 @ pack Karatsuba pre-processed ++ vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8] ++ ++ @ calculate H^7 and H^8 ++ vpmull.p64 $Xl,$H2,$H5 ++ vpmull.p64 $Yl,$H2,$H6 ++ vpmull2.p64 $Xh,$H2,$H5 ++ vpmull2.p64 $Yh,$H2,$H6 ++ vpmull.p64 $Xm,$t0,$t2 ++ vpmull.p64 $Ym,$t1,$t2 ++ ++ vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing ++ vext.8 $t1,$Yl,$Yh,#8 ++ veor $t2,$Xl,$Xh ++ veor $Xm,$Xm,$t0 ++ veor $t3,$Yl,$Yh ++ veor $Ym,$Ym,$t1 ++ veor $Xm,$Xm,$t2 ++ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase ++ veor $Ym,$Ym,$t3 ++ vpmull.p64 $t3,$Yl,$xC2 ++ ++ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result ++ vmov $Yh#lo,$Ym#hi ++ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl ++ vmov $Ym#hi,$Yl#lo ++ veor $Xl,$Xm,$t2 ++ veor $Yl,$Ym,$t3 ++ ++ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase ++ vext.8 $t3,$Yl,$Yl,#8 ++ vpmull.p64 $Xl,$Xl,$xC2 ++ vpmull.p64 $Yl,$Yl,$xC2 ++ veor $t2,$t2,$Xh ++ veor $t3,$t3,$Yh ++ veor $H7,$Xl,$t2 @ H^7 ++ veor $H8,$Yl,$t3 @ H^8 ++ ++ vext.8 $t0,$H7,$H7,#8 @ Karatsuba pre-processing ++ vext.8 $t1,$H8,$H8,#8 ++ veor $t0,$t0,$H7 ++ veor $t1,$t1,$H8 ++ vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed ++ vst1.64 {$H7-$H8},[x0] @ store Htable[9..11] + ___ + } + $code.=<<___; +Index: openssl-1.1.1m/crypto/modes/build.info +=================================================================== +--- openssl-1.1.1m.orig/crypto/modes/build.info ++++ openssl-1.1.1m/crypto/modes/build.info +@@ -20,6 +20,8 @@ GENERATE[ghash-armv4.S]=asm/ghash-armv4. + INCLUDE[ghash-armv4.o]=.. + GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME) + INCLUDE[ghashv8-armx.o]=.. 
++GENERATE[aes-gcm-armv8-unroll8_64.S]=asm/aes-gcm-armv8-unroll8_64.pl $(PERLASM_SCHEME) ++INCLUDE[aes-gcm-armv8-unroll8_64.o]=.. + GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl $(PERLASM_SCHEME) + INCLUDE[ghash-s390x.o]=.. + diff --git a/openssl-1_1-Optimize-AES-XTS-aarch64.patch b/openssl-1_1-Optimize-AES-XTS-aarch64.patch new file mode 100644 index 0000000..ea2c3e9 --- /dev/null +++ b/openssl-1_1-Optimize-AES-XTS-aarch64.patch @@ -0,0 +1,1616 @@ +From 9ce8e0d17e608de4f85f7543c52b146e3c6a2291 Mon Sep 17 00:00:00 2001 +From: XiaokangQian +Date: Fri, 13 Mar 2020 03:27:34 +0000 +Subject: [PATCH] Optimize AES-XTS mode in OpenSSL for aarch64 + +Aes-xts mode can be optimized by interleaving cipher operation on +several blocks and loop unrolling. Interleaving needs one ideal +unrolling factor, here we adopt the same factor with aes-cbc, +which is described as below: + If blocks number > 5, select 5 blocks as one iteration,every + loop, decrease the blocks number by 5. + If left blocks < 5, treat them as tail blocks. +Detailed implementation has a little adjustment for squeezing +code space. +With this way, for small size such as 16 bytes, the performance is +similar as before, but for big size such as 16k bytes, the performance +improves a lot, even reaches to 2x uplift, for some arches such as A57, +the improvement even reaches more than 2x uplift. We collect many +performance datas on different micro-archs such as thunderx2, +ampere-emag, a72, a75, a57, a53 and N1, all of which reach 0.5-2x uplift. +The following table lists the encryption performance data on aarch64, +take a72, a75, a57, a53 and N1 as examples. Performance value takes the +unit of cycles per byte, takes the format as comparision of values. +List them as below: + +A72: + Before optimization After optimization Improve +evp-aes-128-xts@16 8.899913518 5.949087263 49.60% +evp-aes-128-xts@64 4.525512668 3.389141845 33.53% +evp-aes-128-xts@256 3.502906908 1.633573479 114.43% +evp-aes-128-xts@1024 3.174210419 1.155952639 174.60% +evp-aes-128-xts@8192 3.053019303 1.028134888 196.95% +evp-aes-128-xts@16384 3.025292462 1.02021169 196.54% +evp-aes-256-xts@16 9.971105023 6.754233758 47.63% +evp-aes-256-xts@64 4.931479093 3.786527393 30.24% +evp-aes-256-xts@256 3.746788153 1.943975947 92.74% +evp-aes-256-xts@1024 3.401743802 1.477394648 130.25% +evp-aes-256-xts@8192 3.278769327 1.32950421 146.62% +evp-aes-256-xts@16384 3.27093296 1.325276257 146.81% + +A75: + Before optimization After optimization Improve +evp-aes-128-xts@16 8.397965173 5.126839098 63.80% +evp-aes-128-xts@64 4.176860631 2.59817764 60.76% +evp-aes-128-xts@256 3.069126585 1.284561028 138.92% +evp-aes-128-xts@1024 2.805962699 0.932754655 200.83% +evp-aes-128-xts@8192 2.725820131 0.829820397 228.48% +evp-aes-128-xts@16384 2.71521905 0.823251591 229.82% +evp-aes-256-xts@16 11.24790935 7.383914448 52.33% +evp-aes-256-xts@64 5.294128847 3.048641998 73.66% +evp-aes-256-xts@256 3.861649617 1.570359905 145.91% +evp-aes-256-xts@1024 3.537646797 1.200493533 194.68% +evp-aes-256-xts@8192 3.435353012 1.085345319 216.52% +evp-aes-256-xts@16384 3.437952563 1.097963822 213.12% + +A57: + Before optimization After optimization Improve +evp-aes-128-xts@16 10.57455446 7.165438012 47.58% +evp-aes-128-xts@64 5.418185447 3.721241202 45.60% +evp-aes-128-xts@256 3.855184592 1.747145379 120.66% +evp-aes-128-xts@1024 3.477199757 1.253049735 177.50% +evp-aes-128-xts@8192 3.36768104 1.091943159 208.41% +evp-aes-128-xts@16384 3.360373443 1.088942789 208.59% +evp-aes-256-xts@16 12.54559459 8.745489036 
43.45% +evp-aes-256-xts@64 6.542808937 4.326387568 51.23% +evp-aes-256-xts@256 4.62668822 2.119908754 118.25% +evp-aes-256-xts@1024 4.161716505 1.557335554 167.23% +evp-aes-256-xts@8192 4.032462227 1.377749511 192.68% +evp-aes-256-xts@16384 4.023293877 1.371558933 193.34% + +A53: + Before optimization After optimization Improve +evp-aes-128-xts@16 18.07842135 13.96980808 29.40% +evp-aes-128-xts@64 7.933818397 6.07159276 30.70% +evp-aes-128-xts@256 5.264604704 2.611155744 101.60% +evp-aes-128-xts@1024 4.606660117 1.722713454 167.40% +evp-aes-128-xts@8192 4.405160115 1.454379201 202.90% +evp-aes-128-xts@16384 4.401592028 1.442279392 205.20% +evp-aes-256-xts@16 20.07084054 16.00803726 25.40% +evp-aes-256-xts@64 9.192647294 6.883876732 33.50% +evp-aes-256-xts@256 6.336143161 3.108140452 103.90% +evp-aes-256-xts@1024 5.62502952 2.097960651 168.10% +evp-aes-256-xts@8192 5.412085608 1.807294191 199.50% +evp-aes-256-xts@16384 5.403062591 1.790135764 201.80% + +N1: + Before optimization After optimization Improve +evp-aes-128-xts@16 6.48147613 4.209415473 53.98% +evp-aes-128-xts@64 2.847744115 1.950757468 45.98% +evp-aes-128-xts@256 2.085711968 1.061903238 96.41% +evp-aes-128-xts@1024 1.842014669 0.798486302 130.69% +evp-aes-128-xts@8192 1.760449052 0.713853939 146.61% +evp-aes-128-xts@16384 1.760763546 0.707702009 148.80% +evp-aes-256-xts@16 7.264142817 5.265970454 37.94% +evp-aes-256-xts@64 3.251356212 2.41176323 34.81% +evp-aes-256-xts@256 2.380488469 1.342095742 77.37% +evp-aes-256-xts@1024 2.08853022 1.041718215 100.49% +evp-aes-256-xts@8192 2.027432668 0.944571334 114.64% +evp-aes-256-xts@16384 2.00740782 0.941991415 113.10% + +Add more XTS test cases to cover the cipher stealing mode and cases of different +number of blocks. + +CustomizedGitHooks: yes +Change-Id: I93ee31b2575e1413764e27b599af62994deb4c96 + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/11399) +--- + crypto/aes/asm/aesv8-armx.pl | 1426 +++++++++++++++++ + include/crypto/aes_platform.h | 4 + + .../30-test_evp_data/evpciph_aes_common.txt | 38 + + 3 files changed, 1468 insertions(+) + +Index: openssl-1.1.1d/crypto/aes/asm/aesv8-armx.pl +=================================================================== +--- openssl-1.1.1d.orig/crypto/aes/asm/aesv8-armx.pl ++++ openssl-1.1.1d/crypto/aes/asm/aesv8-armx.pl +@@ -897,6 +897,1432 @@ $code.=<<___; + .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks + ___ + }}} ++# Performance in cycles per byte. ++# Processed with AES-XTS different key size. ++# It shows the value before and after optimization as below: ++# (before/after): ++# ++# AES-128-XTS AES-256-XTS ++# Cortex-A57 3.36/1.09 4.02/1.37 ++# Cortex-A72 3.03/1.02 3.28/1.33 ++ ++# Optimization is implemented by loop unrolling and interleaving. ++# Commonly, we choose the unrolling factor as 5, if the input ++# data size smaller than 5 blocks, but not smaller than 3 blocks, ++# choose 3 as the unrolling factor. ++# If the input data size dsize >= 5*16 bytes, then take 5 blocks ++# as one iteration, every loop the left size lsize -= 5*16. ++# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes ++# will be processed specially, which be integrated into the 5*16 bytes ++# loop to improve the efficiency. ++# There is one special case, if the original input data size dsize ++# = 16 bytes, we will treat it seperately to improve the ++# performance: one independent code block without LR, FP load and ++# store. 
++# Encryption will process the (length -tailcnt) bytes as mentioned ++# previously, then encrypt the composite block as last second ++# cipher block. ++# Decryption will process the (length -tailcnt -1) bytes as mentioned ++# previously, then decrypt the last second cipher block to get the ++# last plain block(tail), decrypt the composite block as last second ++# plain text block. ++ ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($tmpin)=("v26.16b"); ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl ${prefix}_xts_encrypt ++.type ${prefix}_xts_encrypt,%function ++.align 5 ++${prefix}_xts_encrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_enc_big_size ++ // Encrypt the iv with key2, as the first XEX iv. ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_enc_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_enc_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aese $dat0,q20 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aese $dat0,q21 ++ aesmc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing ++ b.eq .Lxts_128_enc ++.Lxts_enc_round_loop: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_enc_round_loop ++.Lxts_128_enc: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... 
++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$dat0,$iv0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_enc_final_abort ++ ++.align 4 ++.Lxts_enc_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ // tailcnt store the tail value of length%16. ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_abort ++ csel $step,xzr,$step,eq ++ ++ // Firstly, encrypt the iv with key2, as the first iv of XEX. ++ ldr $rounds,[$key2,#240] ++ vld1.32 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.32 {$dat1},[$key2],#16 ++ ++.Loop_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // next starting point ++ vld1.8 {$dat},[$inp],$step ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... ++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 ++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ ++ // Encryption ++.Lxts_enc: ++ vld1.8 {$dat2},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_enc_tail ++ veor $dat,$dat,$iv0 // before encryption, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // the third block ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_enc_tail ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ vld1.8 {$dat3},[$inp],#16 ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_enc ++ ++.align 4 ++.Loop5x_xts_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese 
$dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_xts_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat1,q10 ++ aesmc $dat1,$dat1 ++ aese $dat2,q10 ++ aesmc $dat2,$dat2 ++ aese $dat3,q10 ++ aesmc $dat3,$dat3 ++ aese $dat4,q10 ++ aesmc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ aese $dat1,q11 ++ aesmc $dat1,$dat1 ++ aese $dat2,q11 ++ aesmc $dat2,$dat2 ++ aese $dat3,q11 ++ aesmc $dat3,$dat3 ++ aese $dat4,q11 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aese $dat0,q15 ++ // The iv for first block of one iteration ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$rndlast,$iv1 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q15 ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ veor $tmp2,$rndlast,$iv2 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q15 ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ veor $tmp3,$rndlast,$iv3 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat3,q15 ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ veor $tmp4,$rndlast,$iv4 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q15 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$in4},[$inp],#16 ++ cbz $xoffset,.Lxts_enc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $dat0,$in0,$iv0 ++ 
veor $tmp1,$tmp1,$dat1 ++ veor $dat1,$in1,$iv1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $dat2,$in2,$iv2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $dat3,$in3,$iv3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ veor $dat4,$in4,$iv4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $rounds,$rounds0 ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_xts_enc ++ ++ ++ // If left 4 blocks, borrow the five block's processing. ++ cmn $len,#0x10 ++ b.ne .Loop5x_enc_after ++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_enc ++ ++.Loop5x_enc_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_enc_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_enc_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_enc_tail ++ ++.align 4 ++.Lxts_enc_tail4x: ++ add $inp,$inp,#16 ++ veor $tmp1,$dat1,$tmp1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_enc_done ++.align 4 ++.Lxts_outer_enc_tail: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_enc_tail ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ //mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset ++ mov $key_,$key1 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat0,q15 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ add $rounds,$rounds0,#2 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ cmn $len,#0x30 ++ b.eq .Lxts_enc_done ++.Lxts_encxor_one: ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_enc_tail: ++ cmn $len,#0x10 ++ veor $dat1,$in3,$iv0 ++ veor $dat2,$in4,$iv1 ++ b.eq .Lxts_enc_tail_loop ++ veor $dat2,$in4,$iv0 ++.Lxts_enc_tail_loop: ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 
++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_enc_tail_loop ++ ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ cmn $len,#0x20 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ veor $tmp1,$iv0,$rndlast ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv1,$rndlast ++ aese $dat1,q15 ++ aese $dat2,q15 ++ b.eq .Lxts_enc_one ++ veor $tmp1,$tmp1,$dat1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $iv0,$iv1,$iv1 ++ vst1.8 {$tmp2},[$out],#16 ++ fmov $ivl,$ivd10 ++ fmov $ivh,$ivd11 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++ ++.Lxts_enc_one: ++ veor $tmp1,$tmp1,$dat2 ++ vorr $iv0,$iv0,$iv0 ++ vst1.8 {$tmp1},[$out],#16 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++.align 5 ++.Lxts_enc_done: ++ // Process the tail block with cipher stealing. ++ tst $tailcnt,#0xf ++ b.eq .Lxts_abort ++ ++ mov $tmpinp,$inp ++ mov $tmpoutp,$out ++ sub $out,$out,#16 ++.composite_enc_loop: ++ subs $tailcnt,$tailcnt,#1 ++ ldrb $l2outp,[$out,$tailcnt] ++ ldrb $loutp,[$tmpinp,$tailcnt] ++ strb $l2outp,[$tmpoutp,$tailcnt] ++ strb $loutp,[$out,$tailcnt] ++ b.gt .composite_enc_loop ++.Lxts_enc_load_done: ++ vld1.8 {$tmpin},[$out] ++ veor $tmpin,$tmpin,$iv0 ++ ++ // Encrypt the composite block to get the last second encrypted text block ++ ldr $rounds,[$key1,#240] // load key schedule... ++ vld1.8 {$dat},[$key1],#16 ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key1],#16 // load key schedule... 
++.Loop_final_enc: ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1],#16 ++ subs $rounds,$rounds,#2 ++ aese $tmpin,$dat1 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key1],#16 ++ b.gt .Loop_final_enc ++ ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1] ++ aese $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv0 ++ vst1.8 {$tmpin},[$out] ++ ++.Lxts_abort: ++ ldp $tailcnt,$midnumx,[sp,#48] ++ ldp $ivd10,$ivd20,[sp,#32] ++ ldp $ivd30,$ivd40,[sp,#16] ++ ldp $constnumx,$tmpinp,[sp],#64 ++.Lxts_enc_final_abort: ++ ret ++.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt ++___ ++ ++}}} ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++{ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl ${prefix}_xts_decrypt ++.type ${prefix}_xts_decrypt,%function ++.align 5 ++${prefix}_xts_decrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_dec_big_size ++ // Encrypt the iv with key2, as the first XEX iv. ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_small_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_small_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aesd $dat0,q20 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aesd $dat0,q21 ++ aesimc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // bias ++ b.eq .Lxts_128_dec ++.Lxts_dec_round_loop: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_dec_round_loop ++.Lxts_128_dec: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... 
++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$iv0,$dat0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_dec_final_abort ++.Lxts_dec_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_dec_abort ++ ++ // Encrypt the iv with key2, as the first XEX iv ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // load rounds number ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... ++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 // load key schedule... 
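++ // Together with q10-q11 above, the loads below fetch the last seven
++ // round keys (q12-q15 plus the final key in $rndlast); these stay
++ // resident for every block, while q8-q9 are re-loaded as the round
++ // loop walks the rest of the key schedule.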
++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ b .Lxts_dec ++ ++ // Decryption ++.align 5 ++.Lxts_dec: ++ tst $tailcnt,#0xf ++ b.eq .Lxts_dec_begin ++ subs $len,$len,#16 ++ csel $step,xzr,$step,eq ++ vld1.8 {$dat},[$inp],#16 ++ b.lo .Lxts_done ++ sub $inp,$inp,#16 ++.Lxts_dec_begin: ++ vld1.8 {$dat},[$inp],$step ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_dec_tail ++ veor $dat,$dat,$iv0 // before decryt, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // third block xox with third iv ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_dec_tail ++ ++ vld1.8 {$dat3},[$inp],#16 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_dec ++ ++.align 4 ++.Loop5x_xts_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 // load key schedule... 
++ b.gt .Loop5x_xts_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aesd $dat0,q15 ++ // The iv for first block of next iteration. 
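++ // Each new tweak is the previous one multiplied by x in GF(2^128):
++ // shift the 128-bit value held in $ivh:$ivl left by one bit and, if
++ // the bit shifted out was set, XOR 0x87 into the low byte (reduction
++ // modulo x^128 + x^7 + x^2 + x + 1). The same pattern repeats for the
++ // four tweaks that follow.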
++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$rndlast,$iv1 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat1,q15 ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ veor $tmp2,$rndlast,$iv2 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat2,q15 ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ veor $tmp3,$rndlast,$iv3 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat3,q15 ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ veor $tmp4,$rndlast,$iv4 ++ vld1.8 {$in3},[$inp],#16 ++ aesd $dat4,q15 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$in4},[$inp],#16 ++ cbz $xoffset,.Lxts_dec_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $dat0,$in0,$iv0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat1,$in1,$iv1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $dat2,$in2,$iv2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $dat3,$in3,$iv3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ veor $dat4,$in4,$iv4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $rounds,$rounds0 ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_xts_dec ++ ++ cmn $len,#0x10 ++ b.ne .Loop5x_dec_after ++ // If x2($len) equal to -0x10, the left blocks is 4. ++ // After specially processing, utilize the five blocks processing again. ++ // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3. 
++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_dec ++ ++.Loop5x_dec_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_dec_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_dec_tail ++ ++.align 4 ++.Lxts_dec_tail4x: ++ add $inp,$inp,#16 ++ vld1.32 {$dat0},[$inp],#16 ++ veor $tmp1,$dat1,$tmp0 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_done ++.align 4 ++.Lxts_outer_dec_tail: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_dec_tail ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset // $inp is adjusted to the last data ++ ++ mov $key_,$key1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat0,q15 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $rounds,$rounds0,#2 ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ ++ cmn $len,#0x30 ++ add $len,$len,#0x30 ++ b.eq .Lxts_done ++ sub $len,$len,#0x30 ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_dec_tail: ++ // $len == -0x10 means two blocks left. 
++ cmn $len,#0x10 ++ veor $dat1,$in3,$iv0 ++ veor $dat2,$in4,$iv1 ++ b.eq .Lxts_dec_tail_loop ++ veor $dat2,$in4,$iv0 ++.Lxts_dec_tail_loop: ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_dec_tail_loop ++ ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ cmn $len,#0x20 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ veor $tmp1,$iv0,$rndlast ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ veor $tmp2,$iv1,$rndlast ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ b.eq .Lxts_dec_one ++ veor $tmp1,$tmp1,$dat1 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $iv0,$iv2,$iv2 ++ vorr $iv1,$iv3,$iv3 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ add $len,$len,#16 ++ b .Lxts_done ++ ++.Lxts_dec_one: ++ veor $tmp1,$tmp1,$dat2 ++ vorr $iv0,$iv1,$iv1 ++ vorr $iv1,$iv2,$iv2 ++ vst1.8 {$tmp1},[$out],#16 ++ add $len,$len,#32 ++ ++.Lxts_done: ++ tst $tailcnt,#0xf ++ b.eq .Lxts_dec_abort ++ // Processing the last two blocks with cipher stealing. ++ mov x7,x3 ++ cbnz x2,.Lxts_dec_1st_done ++ vld1.32 {$dat0},[$inp],#16 ++ ++ // Decrypt the last secod block to get the last plain text block ++.Lxts_dec_1st_done: ++ eor $tmpin,$dat0,$iv1 ++ ldr $rounds,[$key1,#240] ++ vld1.32 {$dat0},[$key1],#16 ++ sub $rounds,$rounds,#2 ++ vld1.32 {$dat1},[$key1],#16 ++.Loop_final_2nd_dec: ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $tmpin,$dat1 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key1],#16 // load key schedule... ++ b.gt .Loop_final_2nd_dec ++ ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1] ++ aesd $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv1 ++ vst1.8 {$tmpin},[$out] ++ ++ mov $tmpinp,$inp ++ add $tmpoutp,$out,#16 ++ ++ // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks ++ // to get the last encrypted block. ++.composite_dec_loop: ++ subs $tailcnt,$tailcnt,#1 ++ ldrb $l2outp,[$out,$tailcnt] ++ ldrb $loutp,[$tmpinp,$tailcnt] ++ strb $l2outp,[$tmpoutp,$tailcnt] ++ strb $loutp,[$out,$tailcnt] ++ b.gt .composite_dec_loop ++.Lxts_dec_load_done: ++ vld1.8 {$tmpin},[$out] ++ veor $tmpin,$tmpin,$iv0 ++ ++ // Decrypt the composite block to get the last second plain text block ++ ldr $rounds,[$key_,#240] ++ vld1.8 {$dat},[$key_],#16 ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key_],#16 ++.Loop_final_dec: ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key_],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $tmpin,$dat1 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key_],#16 // load key schedule... 
++ b.gt .Loop_final_dec ++ ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key_] ++ aesd $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv0 ++ vst1.8 {$tmpin},[$out] ++ ++.Lxts_dec_abort: ++ ldp $tailcnt,$midnumx,[sp,#48] ++ ldp $ivd10,$ivd20,[sp,#32] ++ ldp $ivd30,$ivd40,[sp,#16] ++ ldp $constnumx,$tmpinp,[sp],#64 ++ ++.Lxts_dec_final_abort: ++ ret ++.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt ++___ ++} ++}}} + $code.=<<___; + #endif + ___ +Index: openssl-1.1.1d/crypto/evp/e_aes.c +=================================================================== +--- openssl-1.1.1d.orig/crypto/evp/e_aes.c ++++ openssl-1.1.1d/crypto/evp/e_aes.c +@@ -170,6 +170,10 @@ static void ctr64_inc(unsigned char *cou + # define HWAES_set_decrypt_key aes_p8_set_decrypt_key + # define HWAES_encrypt aes_p8_encrypt + # define HWAES_decrypt aes_p8_decrypt ++# if __ARM_MAX_ARCH__>=8 ++# define HWAES_xts_encrypt aes_v8_xts_encrypt ++# define HWAES_xts_decrypt aes_v8_xts_decrypt ++# endif + # define HWAES_cbc_encrypt aes_p8_cbc_encrypt + # define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks + # define HWAES_xts_encrypt aes_p8_xts_encrypt +Index: openssl-1.1.1d/test/recipes/30-test_evp_data/evpcase.txt +=================================================================== +--- openssl-1.1.1d.orig/test/recipes/30-test_evp_data/evpcase.txt ++++ openssl-1.1.1d/test/recipes/30-test_evp_data/evpcase.txt +@@ -15,6 +15,44 @@ + # These tests exercise the case insensitive handling of object names. + # They are contrived + ++Title = AES XTS Non standard test vectors - generated from reference implementation ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2021 ++Ciphertext = edbf9dace45d6f6a7306e64be5dd824b9dc31efeb418c373ce073b66755529982538 ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031 ++Ciphertext = edbf9dace45d6f6a7306e64be5dd824b2538f5724fcf24249ac111ab45ad39237a709959673bd8747d58690f8c762a353ad6 ++ ++Cipher = aes-128-xts ++Key = 2718281828459045235360287471352631415926535897932384626433832795 ++IV = 00000000000000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f ++Ciphertext = 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89cc78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412 ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f40 ++Ciphertext = edbf9dace45d6f6a7306e64be5dd824b2538f5724fcf24249ac111ab45ad39233ad6183c66fa548a3cdf3e36d2b21ccde9ffb48286ec211619e02decc7ca0883c6 ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f ++Ciphertext = 
edbf9dace45d6f6a7306e64be5dd824b2538f5724fcf24249ac111ab45ad39233ad6183c66fa548a3cdf3e36d2b21ccdc6bc657cb3aeb87ba2c5f58ffafacd76d0a098b687c0b6536d560ca007051b0b ++ ++Cipher = aes-128-xts ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 ++IV = 9a785634120000000000000000000000 ++Plaintext = 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f5051 ++Ciphertext = edbf9dace45d6f6a7306e64be5dd824b2538f5724fcf24249ac111ab45ad39233ad6183c66fa548a3cdf3e36d2b21ccdc6bc657cb3aeb87ba2c5f58ffafacd765ecc4c85c0a01bf317b823fbd6111956d0a0 ++ + Title = Case insensitive AES tests + + Cipher = Aes-128-eCb diff --git a/openssl-1_1-Optimize-RSA-armv8.patch b/openssl-1_1-Optimize-RSA-armv8.patch new file mode 100644 index 0000000..f5abceb --- /dev/null +++ b/openssl-1_1-Optimize-RSA-armv8.patch @@ -0,0 +1,575 @@ +From 5ea64b456b1a27ae046f23d632a968a7583bb9eb Mon Sep 17 00:00:00 2001 +From: "Fangming.Fang" +Date: Tue, 28 Apr 2020 02:33:50 +0000 +Subject: [PATCH] Read MIDR_EL1 system register on aarch64 + +MIDR_EL1 system register exposes microarchitecture information so that +people can make micro-arch related optimization such as exposing as +much instruction level parallelism as possible. + +MIDR_EL1 register can be read only if HWCAP_CPUID feature is supported. + +Change-Id: Iabb8a36c5d31b184dba6399f378598058d394d4e + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/11744) +--- + crypto/arm64cpuid.pl | 7 +++++++ + crypto/arm_arch.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ + crypto/armcap.c | 11 +++++++++++ + 3 files changed, 62 insertions(+) + +Index: openssl-1.1.1d/crypto/arm64cpuid.pl +=================================================================== +--- openssl-1.1.1d.orig/crypto/arm64cpuid.pl ++++ openssl-1.1.1d/crypto/arm64cpuid.pl +@@ -78,6 +78,13 @@ _armv8_sha512_probe: + ret + .size _armv8_sha512_probe,.-_armv8_sha512_probe + ++.globl _armv8_cpuid_probe ++.type _armv8_cpuid_probe,%function ++_armv8_cpuid_probe: ++ mrs x0, midr_el1 ++ ret ++.size _armv8_cpuid_probe,.-_armv8_cpuid_probe ++ + .globl OPENSSL_cleanse + .type OPENSSL_cleanse,%function + .align 5 +Index: openssl-1.1.1d/crypto/arm_arch.h +=================================================================== +--- openssl-1.1.1d.orig/crypto/arm_arch.h ++++ openssl-1.1.1d/crypto/arm_arch.h +@@ -71,6 +71,7 @@ + + # ifndef __ASSEMBLER__ + extern unsigned int OPENSSL_armcap_P; ++extern unsigned int OPENSSL_arm_midr; + # endif + + # define ARMV7_NEON (1<<0) +@@ -80,5 +81,48 @@ extern unsigned int OPENSSL_armcap_P; + # define ARMV8_SHA256 (1<<4) + # define ARMV8_PMULL (1<<5) + # define ARMV8_SHA512 (1<<6) ++# define ARMV8_CPUID (1<<7) + ++/* ++ * MIDR_EL1 system register ++ * ++ * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0 ++ * | | | | | | | ++ * |RES0 | Implementer | Variant | Arch | PartNum |Revision| ++ * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________| ++ * ++ */ ++ ++# define ARM_CPU_IMP_ARM 0x41 ++ ++# define ARM_CPU_PART_CORTEX_A72 0xD08 ++# define ARM_CPU_PART_N1 0xD0C ++ ++# define MIDR_PARTNUM_SHIFT 4 ++# define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT) ++# define MIDR_PARTNUM(midr) \ ++ (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) ++ ++# define MIDR_IMPLEMENTER_SHIFT 24 ++# define MIDR_IMPLEMENTER_MASK (0xff << MIDR_IMPLEMENTER_SHIFT) ++# define MIDR_IMPLEMENTER(midr) \ ++ (((midr) & 
MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) ++ ++# define MIDR_ARCHITECTURE_SHIFT 16 ++# define MIDR_ARCHITECTURE_MASK (0xf << MIDR_ARCHITECTURE_SHIFT) ++# define MIDR_ARCHITECTURE(midr) \ ++ (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) ++ ++# define MIDR_CPU_MODEL_MASK \ ++ (MIDR_IMPLEMENTER_MASK | \ ++ MIDR_PARTNUM_MASK | \ ++ MIDR_ARCHITECTURE_MASK) ++ ++# define MIDR_CPU_MODEL(imp, partnum) \ ++ (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ ++ (0xf << MIDR_ARCHITECTURE_SHIFT) | \ ++ ((partnum) << MIDR_PARTNUM_SHIFT)) ++ ++# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ ++ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) + #endif +Index: openssl-1.1.1d/crypto/armcap.c +=================================================================== +--- openssl-1.1.1d.orig/crypto/armcap.c ++++ openssl-1.1.1d/crypto/armcap.c +@@ -18,6 +18,8 @@ + #include "arm_arch.h" + + unsigned int OPENSSL_armcap_P = 0; ++unsigned int OPENSSL_arm_midr = 0; ++unsigned int OPENSSL_armv8_rsa_neonized = 0; + + #if __ARM_MAX_ARCH__<7 + void OPENSSL_cpuid_setup(void) +@@ -48,6 +50,7 @@ void _armv8_sha256_probe(void); + void _armv8_pmull_probe(void); + # ifdef __aarch64__ + void _armv8_sha512_probe(void); ++unsigned int _armv8_cpuid_probe(void); + # endif + uint32_t _armv7_tick(void); + +@@ -95,6 +98,7 @@ void OPENSSL_cpuid_setup(void) __attribu + # define HWCAP_CE_PMULL (1 << 4) + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) ++# define HWCAP_CPUID (1 << 11) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -155,6 +159,9 @@ void OPENSSL_cpuid_setup(void) + # ifdef __aarch64__ + if (hwcap & HWCAP_CE_SHA512) + OPENSSL_armcap_P |= ARMV8_SHA512; ++ ++ if (hwcap & HWCAP_CPUID) ++ OPENSSL_armcap_P |= ARMV8_CPUID; + # endif + } + # endif +@@ -210,5 +217,16 @@ void OPENSSL_cpuid_setup(void) + + sigaction(SIGILL, &ill_oact, NULL); + sigprocmask(SIG_SETMASK, &oset, NULL); ++ ++# ifdef __aarch64__ ++ if (OPENSSL_armcap_P & ARMV8_CPUID) ++ OPENSSL_arm_midr = _armv8_cpuid_probe(); ++ ++ if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) || ++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)) && ++ (OPENSSL_armcap_P & ARMV7_NEON)) { ++ OPENSSL_armv8_rsa_neonized = 1; ++ } ++# endif + } + #endif +Index: openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl +=================================================================== +--- openssl-1.1.1d.orig/crypto/bn/asm/armv8-mont.pl ++++ openssl-1.1.1d/crypto/bn/asm/armv8-mont.pl +@@ -64,16 +64,34 @@ $n0="x4"; # const BN_ULONG *n0, + $num="x5"; # int num); + + $code.=<<___; ++#ifndef __KERNEL__ ++# include "arm_arch.h" ++.extern OPENSSL_armv8_rsa_neonized ++.hidden OPENSSL_armv8_rsa_neonized ++#endif + .text + + .globl bn_mul_mont + .type bn_mul_mont,%function + .align 5 + bn_mul_mont: ++.Lbn_mul_mont: ++ tst $num,#3 ++ b.ne .Lmul_mont ++ cmp $num,#32 ++ b.le .Lscalar_impl ++#ifndef __KERNEL__ ++ adrp x17,OPENSSL_armv8_rsa_neonized ++ ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] ++ cbnz w17, bn_mul8x_mont_neon ++#endif ++ ++.Lscalar_impl: + tst $num,#7 + b.eq __bn_sqr8x_mont + tst $num,#3 + b.eq __bn_mul4x_mont ++ + .Lmul_mont: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 +@@ -271,6 +289,369 @@ bn_mul_mont: + .size bn_mul_mont,.-bn_mul_mont + ___ + { ++my ($A0,$A1,$N0,$N1)=map("v$_",(0..3)); ++my ($Z,$Temp)=("v4.16b","v5"); ++my @ACC=map("v$_",(6..13)); ++my ($Bi,$Ni,$M0)=map("v$_",(28..30)); ++my $sBi="s28"; ++my $sM0="s30"; ++my $zero="v14"; ++my $temp="v15"; ++my $ACCTemp="v16"; ++ ++my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5)); ++my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11)); ++ ++$code.=<<___; ++.type bn_mul8x_mont_neon,%function ++.align 5 ++bn_mul8x_mont_neon: ++ stp x29,x30,[sp,#-80]! ++ mov x16,sp ++ stp d8,d9,[sp,#16] ++ stp d10,d11,[sp,#32] ++ stp d12,d13,[sp,#48] ++ stp d14,d15,[sp,#64] ++ lsl $num,$num,#1 ++ eor $zero.16b,$zero.16b,$zero.16b ++ ++.align 4 ++.LNEON_8n: ++ eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b ++ sub $toutptr,sp,#128 ++ eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b ++ sub $toutptr,$toutptr,$num,lsl#4 ++ eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b ++ and $toutptr,$toutptr,#-64 ++ eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b ++ mov sp,$toutptr // alloca ++ eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b ++ add $toutptr,$toutptr,#256 ++ eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b ++ sub $inner,$num,#8 ++ eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b ++ eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b ++ ++.LNEON_8n_init: ++ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 ++ subs $inner,$inner,#8 ++ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 ++ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 ++ st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32 ++ bne .LNEON_8n_init ++ ++ add $tinptr,sp,#256 ++ ld1 {$A0.4s,$A1.4s},[$aptr],#32 ++ add $bnptr,sp,#8 ++ ldr $sM0,[$n0],#4 ++ mov $outer,$num ++ b .LNEON_8n_outer ++ ++.align 4 ++.LNEON_8n_outer: ++ ldr $sBi,[$bptr],#4 // *b++ ++ uxtl $Bi.4s,$Bi.4h ++ add $toutptr,sp,#128 ++ ld1 {$N0.4s,$N1.4s},[$nptr],#32 ++ ++ umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ++ umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ++ umlal @ACC[2].2d,$Bi.2s,$A0.s[2] ++ shl $Ni.2d,@ACC[0].2d,#16 ++ ext $Ni.16b,$Ni.16b,$Ni.16b,#8 ++ umlal @ACC[3].2d,$Bi.2s,$A0.s[3] ++ add $Ni.2d,$Ni.2d,@ACC[0].2d ++ umlal @ACC[4].2d,$Bi.2s,$A1.s[0] ++ mul $Ni.2s,$Ni.2s,$M0.2s ++ umlal @ACC[5].2d,$Bi.2s,$A1.s[1] ++ st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0] ++ umlal @ACC[6].2d,$Bi.2s,$A1.s[2] ++ uxtl $Ni.4s,$Ni.4h ++ umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ++___ ++for ($i=0; $i<7;) { ++$code.=<<___; ++ ldr $sBi,[$bptr],#4 // *b++ ++ umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ++ umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ++ uxtl $Bi.4s,$Bi.4h ++ umlal @ACC[2].2d,$Ni.2s,$N0.s[2] ++ ushr $temp.2d,@ACC[0].2d,#16 ++ umlal @ACC[3].2d,$Ni.2s,$N0.s[3] ++ umlal @ACC[4].2d,$Ni.2s,$N1.s[0] ++ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 ++ add @ACC[0].2d,@ACC[0].2d,$temp.2d ++ umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ++ ushr @ACC[0].2d,@ACC[0].2d,#16 ++ umlal @ACC[6].2d,$Ni.2s,$N1.s[2] ++ umlal @ACC[7].2d,$Ni.2s,$N1.s[3] ++ add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d ++ ins @ACC[1].d[0],$ACCTemp.d[0] ++ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] ++___ ++ push(@ACC,shift(@ACC)); $i++; ++$code.=<<___; ++ umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ++ ld1 {@ACC[7].2d},[$tinptr],#16 ++ umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ++ umlal @ACC[2].2d,$Bi.2s,$A0.s[2] ++ shl $Ni.2d,@ACC[0].2d,#16 ++ ext $Ni.16b,$Ni.16b,$Ni.16b,#8 ++ umlal @ACC[3].2d,$Bi.2s,$A0.s[3] ++ add $Ni.2d,$Ni.2d,@ACC[0].2d ++ umlal @ACC[4].2d,$Bi.2s,$A1.s[0] ++ mul $Ni.2s,$Ni.2s,$M0.2s ++ umlal @ACC[5].2d,$Bi.2s,$A1.s[1] ++ st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i] ++ umlal @ACC[6].2d,$Bi.2s,$A1.s[2] ++ uxtl $Ni.4s,$Ni.4h ++ umlal 
@ACC[7].2d,$Bi.2s,$A1.s[3] ++___ ++} ++$code.=<<___; ++ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] ++ umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ++ ld1 {$A0.4s,$A1.4s},[$aptr],#32 ++ umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ++ umlal @ACC[2].2d,$Ni.2s,$N0.s[2] ++ mov $Temp.16b,@ACC[0].16b ++ ushr $Temp.2d,$Temp.2d,#16 ++ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 ++ umlal @ACC[3].2d,$Ni.2s,$N0.s[3] ++ umlal @ACC[4].2d,$Ni.2s,$N1.s[0] ++ add @ACC[0].2d,@ACC[0].2d,$Temp.2d ++ umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ++ ushr @ACC[0].2d,@ACC[0].2d,#16 ++ eor $temp.16b,$temp.16b,$temp.16b ++ ins @ACC[0].d[1],$temp.d[0] ++ umlal @ACC[6].2d,$Ni.2s,$N1.s[2] ++ umlal @ACC[7].2d,$Ni.2s,$N1.s[3] ++ add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d ++ st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] ++ add $bnptr,sp,#8 // rewind ++___ ++ push(@ACC,shift(@ACC)); ++$code.=<<___; ++ sub $inner,$num,#8 ++ b .LNEON_8n_inner ++ ++.align 4 ++.LNEON_8n_inner: ++ subs $inner,$inner,#8 ++ umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ++ ld1 {@ACC[7].2d},[$tinptr] ++ umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ++ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0] ++ umlal @ACC[2].2d,$Bi.2s,$A0.s[2] ++ ld1 {$N0.4s,$N1.4s},[$nptr],#32 ++ umlal @ACC[3].2d,$Bi.2s,$A0.s[3] ++ b.eq .LInner_jump ++ add $tinptr,$tinptr,#16 // don't advance in last iteration ++.LInner_jump: ++ umlal @ACC[4].2d,$Bi.2s,$A1.s[0] ++ umlal @ACC[5].2d,$Bi.2s,$A1.s[1] ++ umlal @ACC[6].2d,$Bi.2s,$A1.s[2] ++ umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ++___ ++for ($i=1; $i<8; $i++) { ++$code.=<<___; ++ ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i] ++ umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ++ umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ++ umlal @ACC[2].2d,$Ni.2s,$N0.s[2] ++ umlal @ACC[3].2d,$Ni.2s,$N0.s[3] ++ umlal @ACC[4].2d,$Ni.2s,$N1.s[0] ++ umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ++ umlal @ACC[6].2d,$Ni.2s,$N1.s[2] ++ umlal @ACC[7].2d,$Ni.2s,$N1.s[3] ++ st1 {@ACC[0].2d},[$toutptr],#16 ++___ ++ push(@ACC,shift(@ACC)); ++$code.=<<___; ++ umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ++ ld1 {@ACC[7].2d},[$tinptr] ++ umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ++ ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i] ++ umlal @ACC[2].2d,$Bi.2s,$A0.s[2] ++ b.eq .LInner_jump$i ++ add $tinptr,$tinptr,#16 // don't advance in last iteration ++.LInner_jump$i: ++ umlal @ACC[3].2d,$Bi.2s,$A0.s[3] ++ umlal @ACC[4].2d,$Bi.2s,$A1.s[0] ++ umlal @ACC[5].2d,$Bi.2s,$A1.s[1] ++ umlal @ACC[6].2d,$Bi.2s,$A1.s[2] ++ umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ++___ ++} ++$code.=<<___; ++ b.ne .LInner_after_rewind$i ++ sub $aptr,$aptr,$num,lsl#2 // rewind ++.LInner_after_rewind$i: ++ umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ++ ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] ++ umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ++ ld1 {$A0.4s,$A1.4s},[$aptr],#32 ++ umlal @ACC[2].2d,$Ni.2s,$N0.s[2] ++ add $bnptr,sp,#8 // rewind ++ umlal @ACC[3].2d,$Ni.2s,$N0.s[3] ++ umlal @ACC[4].2d,$Ni.2s,$N1.s[0] ++ umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ++ umlal @ACC[6].2d,$Ni.2s,$N1.s[2] ++ st1 {@ACC[0].2d},[$toutptr],#16 ++ umlal @ACC[7].2d,$Ni.2s,$N1.s[3] ++ ++ bne .LNEON_8n_inner ++___ ++ push(@ACC,shift(@ACC)); ++$code.=<<___; ++ add $tinptr,sp,#128 ++ st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 ++ eor $N0.16b,$N0.16b,$N0.16b // $N0 ++ st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 ++ eor $N1.16b,$N1.16b,$N1.16b // $N1 ++ st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 ++ st1 {@ACC[6].2d},[$toutptr] ++ ++ subs $outer,$outer,#8 ++ ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32 ++ ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32 ++ ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32 ++ ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32 ++ ++ b.eq .LInner_8n_jump_2steps ++ sub 
$nptr,$nptr,$num,lsl#2 // rewind ++ b .LNEON_8n_outer ++ ++.LInner_8n_jump_2steps: ++ add $toutptr,sp,#128 ++ st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame ++ mov $Temp.16b,@ACC[0].16b ++ ushr $temp.2d,@ACC[0].2d,#16 ++ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 ++ st1 {$N0.2d,$N1.2d}, [sp],#32 ++ add @ACC[0].2d,@ACC[0].2d,$temp.2d ++ st1 {$N0.2d,$N1.2d}, [sp],#32 ++ ushr $temp.2d,@ACC[0].2d,#16 ++ st1 {$N0.2d,$N1.2d}, [sp],#32 ++ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h ++ ins $temp.d[1],$zero.d[0] ++ ++ mov $inner,$num ++ b .LNEON_tail_entry ++ ++.align 4 ++.LNEON_tail: ++ add @ACC[0].2d,@ACC[0].2d,$temp.2d ++ mov $Temp.16b,@ACC[0].16b ++ ushr $temp.2d,@ACC[0].2d,#16 ++ ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 ++ ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32 ++ add @ACC[0].2d,@ACC[0].2d,$temp.2d ++ ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32 ++ ushr $temp.2d,@ACC[0].2d,#16 ++ ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32 ++ zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h ++ ins $temp.d[1],$zero.d[0] ++ ++.LNEON_tail_entry: ++___ ++for ($i=1; $i<8; $i++) { ++$code.=<<___; ++ add @ACC[1].2d,@ACC[1].2d,$temp.2d ++ st1 {@ACC[0].s}[0], [$toutptr],#4 ++ ushr $temp.2d,@ACC[1].2d,#16 ++ mov $Temp.16b,@ACC[1].16b ++ ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8 ++ add @ACC[1].2d,@ACC[1].2d,$temp.2d ++ ushr $temp.2d,@ACC[1].2d,#16 ++ zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h ++ ins $temp.d[1],$zero.d[0] ++___ ++ push(@ACC,shift(@ACC)); ++} ++ push(@ACC,shift(@ACC)); ++$code.=<<___; ++ ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32 ++ subs $inner,$inner,#8 ++ st1 {@ACC[7].s}[0], [$toutptr],#4 ++ bne .LNEON_tail ++ ++ st1 {$temp.s}[0], [$toutptr],#4 // top-most bit ++ sub $nptr,$nptr,$num,lsl#2 // rewind $nptr ++ subs $aptr,sp,#0 // clear carry flag ++ add $bptr,sp,$num,lsl#2 ++ ++.LNEON_sub: ++ ldp w4,w5,[$aptr],#8 ++ ldp w6,w7,[$aptr],#8 ++ ldp w8,w9,[$nptr],#8 ++ ldp w10,w11,[$nptr],#8 ++ sbcs w8,w4,w8 ++ sbcs w9,w5,w9 ++ sbcs w10,w6,w10 ++ sbcs w11,w7,w11 ++ sub x17,$bptr,$aptr ++ stp w8,w9,[$rptr],#8 ++ stp w10,w11,[$rptr],#8 ++ cbnz x17,.LNEON_sub ++ ++ ldr w10, [$aptr] // load top-most bit ++ mov x11,sp ++ eor v0.16b,v0.16b,v0.16b ++ sub x11,$bptr,x11 // this is num*4 ++ eor v1.16b,v1.16b,v1.16b ++ mov $aptr,sp ++ sub $rptr,$rptr,x11 // rewind $rptr ++ mov $nptr,$bptr // second 3/4th of frame ++ sbcs w10,w10,wzr // result is carry flag ++ ++.LNEON_copy_n_zap: ++ ldp w4,w5,[$aptr],#8 ++ ldp w6,w7,[$aptr],#8 ++ ldp w8,w9,[$rptr],#8 ++ ldp w10,w11,[$rptr] ++ sub $rptr,$rptr,#8 ++ b.cs .LCopy_1 ++ mov w8,w4 ++ mov w9,w5 ++ mov w10,w6 ++ mov w11,w7 ++.LCopy_1: ++ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe ++ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe ++ ldp w4,w5,[$aptr],#8 ++ ldp w6,w7,[$aptr],#8 ++ stp w8,w9,[$rptr],#8 ++ stp w10,w11,[$rptr],#8 ++ sub $aptr,$aptr,#32 ++ ldp w8,w9,[$rptr],#8 ++ ldp w10,w11,[$rptr] ++ sub $rptr,$rptr,#8 ++ b.cs .LCopy_2 ++ mov w8, w4 ++ mov w9, w5 ++ mov w10, w6 ++ mov w11, w7 ++.LCopy_2: ++ st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe ++ st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe ++ sub x17,$bptr,$aptr // preserves carry ++ stp w8,w9,[$rptr],#8 ++ stp w10,w11,[$rptr],#8 ++ cbnz x17,.LNEON_copy_n_zap ++ ++ mov sp,x16 ++ ldp d14,d15,[sp,#64] ++ ldp d12,d13,[sp,#48] ++ ldp d10,d11,[sp,#32] ++ ldp d8,d9,[sp,#16] ++ ldr x29,[sp],#80 ++ ret // bx lr ++ ++.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon ++___ ++} ++{ + ######################################################################## + # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. 
+ +Index: openssl-1.1.1d/crypto/bn/build.info +=================================================================== +--- openssl-1.1.1d.orig/crypto/bn/build.info ++++ openssl-1.1.1d/crypto/bn/build.info +@@ -65,3 +65,4 @@ INCLUDE[armv4-mont.o]=.. + GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl $(PERLASM_SCHEME) + INCLUDE[armv4-gf2m.o]=.. + GENERATE[armv8-mont.S]=asm/armv8-mont.pl $(PERLASM_SCHEME) ++INCLUDE[armv8-mont.o]=.. diff --git a/openssl-1_1-Optimize-ppc64.patch b/openssl-1_1-Optimize-ppc64.patch new file mode 100644 index 0000000..28470ce --- /dev/null +++ b/openssl-1_1-Optimize-ppc64.patch @@ -0,0 +1,2308 @@ +From 4dba53694bf633c272075e62acdc5a5ca3003ce6 Mon Sep 17 00:00:00 2001 +From: Amitay Isaacs +Date: Mon, 29 Mar 2021 18:06:13 +1100 +Subject: [PATCH 01/29] numbers: Define 128-bit integers if compiler supports + +Signed-off-by: Amitay Isaacs + +Reviewed-by: Tomas Mraz +Reviewed-by: Matt Caswell +(Merged from https://github.com/openssl/openssl/pull/14784) + +(cherry picked from commit bbed0d1cbd436af6797d7837e270bff4ca4d5a10) +--- + include/internal/numbers.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +Index: openssl-1.1.1l/include/internal/numbers.h +=================================================================== +--- openssl-1.1.1l.orig/include/internal/numbers.h ++++ openssl-1.1.1l/include/internal/numbers.h +@@ -60,6 +60,16 @@ + # define UINT64_MAX __MAXUINT__(uint64_t) + # endif + ++# ifndef INT128_MAX ++# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16 ++typedef __int128_t int128_t; ++typedef __uint128_t uint128_t; ++# define INT128_MIN __MININT__(int128_t) ++# define INT128_MAX __MAXINT__(int128_t) ++# define UINT128_MAX __MAXUINT__(uint128_t) ++# endif ++# endif ++ + # ifndef SIZE_MAX + # define SIZE_MAX __MAXUINT__(size_t) + # endif +Index: openssl-1.1.1l/crypto/bn/bn_div.c +=================================================================== +--- openssl-1.1.1l.orig/crypto/bn/bn_div.c ++++ openssl-1.1.1l/crypto/bn/bn_div.c +@@ -97,7 +97,7 @@ BN_ULONG bn_div_3_words(const BN_ULONG * + */ + # if BN_BITS2 == 64 && defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 + # undef BN_ULLONG +-# define BN_ULLONG __uint128_t ++# define BN_ULLONG uint128_t + # define BN_LLONG + # endif + +Index: openssl-1.1.1l/crypto/bn/bn_local.h +=================================================================== +--- openssl-1.1.1l.orig/crypto/bn/bn_local.h ++++ openssl-1.1.1l/crypto/bn/bn_local.h +@@ -22,6 +22,7 @@ + # endif + + # include "crypto/bn.h" ++# include "internal/numbers.h" + + /* + * These preprocessor symbols control various aspects of the bignum headers +@@ -374,9 +375,9 @@ struct bn_gencb_st { + */ + # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 && \ + (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)) +-# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64) ++# define BN_UMULT_HIGH(a,b) (((uint128_t)(a)*(b))>>64) + # define BN_UMULT_LOHI(low,high,a,b) ({ \ +- __uint128_t ret=(__uint128_t)(a)*(b); \ ++ uint128_t ret=(uint128_t)(a)*(b); \ + (high)=ret>>64; (low)=ret; }) + # elif defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT)) + # if defined(__DECC) +Index: openssl-1.1.1l/crypto/ec/curve25519.c +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/curve25519.c ++++ openssl-1.1.1l/crypto/ec/curve25519.c +@@ -11,6 +11,8 @@ + #include "ec_local.h" + #include + ++#include "internal/numbers.h" ++ + #if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \ 
+ defined(_M_AMD64) || defined(_M_X64)) + +@@ -252,7 +254,7 @@ static void x25519_scalar_mulx(uint8_t o + #endif + + #if defined(X25519_ASM) \ +- || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \ ++ || ( defined(INT128_MAX) \ + && !defined(__sparc__) \ + && (!defined(__SIZEOF_LONG__) || (__SIZEOF_LONG__ == 8)) \ + && !(defined(__ANDROID__) && !defined(__clang__)) ) +@@ -385,7 +387,7 @@ void x25519_fe51_mul121666(fe51 h, fe51 + # define fe51_mul121666 x25519_fe51_mul121666 + # else + +-typedef __uint128_t u128; ++typedef uint128_t u128; + + static void fe51_mul(fe51 h, const fe51 f, const fe51 g) + { +Index: openssl-1.1.1l/crypto/ec/curve448/curve448utils.h +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/curve448/curve448utils.h ++++ openssl-1.1.1l/crypto/ec/curve448/curve448utils.h +@@ -15,6 +15,8 @@ + + # include + ++# include "internal/numbers.h" ++ + /* + * Internal word types. Somewhat tricky. This could be decided separately per + * platform. However, the structs do need to be all the same size and +@@ -41,9 +43,9 @@ typedef int64_t c448_sword_t; + /* "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */ + typedef uint64_t c448_bool_t; + /* Double-word size for internal computations */ +-typedef __uint128_t c448_dword_t; ++typedef uint128_t c448_dword_t; + /* Signed double-word size for internal computations */ +-typedef __int128_t c448_dsword_t; ++typedef int128_t c448_dsword_t; + # elif C448_WORD_BITS == 32 + /* Word size for internal computations */ + typedef uint32_t c448_word_t; +Index: openssl-1.1.1l/crypto/ec/curve448/word.h +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/curve448/word.h ++++ openssl-1.1.1l/crypto/ec/curve448/word.h +@@ -17,15 +17,20 @@ + # include + # include + # include +-# include "arch_intrinsics.h" + # include "curve448utils.h" + ++# ifdef INT128_MAX ++# include "arch_64/arch_intrinsics.h" ++# else ++# include "arch_32/arch_intrinsics.h" ++# endif ++ + # if (ARCH_WORD_BITS == 64) + typedef uint64_t word_t, mask_t; +-typedef __uint128_t dword_t; ++typedef uint128_t dword_t; + typedef int32_t hsword_t; + typedef int64_t sword_t; +-typedef __int128_t dsword_t; ++typedef int128_t dsword_t; + # elif (ARCH_WORD_BITS == 32) + typedef uint32_t word_t, mask_t; + typedef uint64_t dword_t; +Index: openssl-1.1.1l/crypto/ec/ecp_nistp224.c +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp224.c ++++ openssl-1.1.1l/crypto/ec/ecp_nistp224.c +@@ -40,11 +40,9 @@ NON_EMPTY_TRANSLATION_UNIT + # include + # include "ec_local.h" + +-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 +- /* even with gcc, the typedef won't work for 32-bit platforms */ +-typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit +- * platforms */ +-# else ++#include "internal/numbers.h" ++ ++#ifndef INT128_MAX + # error "Your compiler doesn't appear to support 128-bit integer types" + # endif + +Index: openssl-1.1.1l/crypto/ec/ecp_nistp256.c +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp256.c ++++ openssl-1.1.1l/crypto/ec/ecp_nistp256.c +@@ -41,14 +41,11 @@ NON_EMPTY_TRANSLATION_UNIT + # include + # include "ec_local.h" + +-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 +- /* even with gcc, the typedef won't work for 32-bit platforms */ +-typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc 
on 64-bit +- * platforms */ +-typedef __int128_t int128_t; +-# else +-# error "Your compiler doesn't appear to support 128-bit integer types" +-# endif ++#include "internal/numbers.h" ++ ++#ifndef INT128_MAX ++# error "Your compiler doesn't appear to support 128-bit integer types" ++#endif + + typedef uint8_t u8; + typedef uint32_t u32; +Index: openssl-1.1.1l/crypto/ec/ecp_nistp521.c +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp521.c ++++ openssl-1.1.1l/crypto/ec/ecp_nistp521.c +@@ -40,13 +40,11 @@ NON_EMPTY_TRANSLATION_UNIT + # include + # include "ec_local.h" + +-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 +- /* even with gcc, the typedef won't work for 32-bit platforms */ +-typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit +- * platforms */ +-# else +-# error "Your compiler doesn't appear to support 128-bit integer types" +-# endif ++#include "internal/numbers.h" ++ ++#ifndef INT128_MAX ++# error "Your compiler doesn't appear to support 128-bit integer types" ++#endif + + typedef uint8_t u8; + typedef uint64_t u64; +@@ -400,7 +398,7 @@ static void felem_diff128(largefelem out + * On exit: + * out[i] < 17 * max(in[i]) * max(in[i]) + */ +-static void felem_square(largefelem out, const felem in) ++static void felem_square_ref(largefelem out, const felem in) + { + felem inx2, inx4; + felem_scalar(inx2, in, 2); +@@ -484,7 +482,7 @@ static void felem_square(largefelem out, + * On exit: + * out[i] < 17 * max(in1[i]) * max(in2[i]) + */ +-static void felem_mul(largefelem out, const felem in1, const felem in2) ++static void felem_mul_ref(largefelem out, const felem in1, const felem in2) + { + felem in2x2; + felem_scalar(in2x2, in2, 2); +@@ -674,6 +672,57 @@ static void felem_reduce(felem out, cons + */ + } + ++#if defined(ECP_NISTP521_ASM) ++void felem_square_wrapper(largefelem out, const felem in); ++void felem_mul_wrapper(largefelem out, const felem in1, const felem in2); ++ ++static void (*felem_square_p)(largefelem out, const felem in) = ++ felem_square_wrapper; ++static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) = ++ felem_mul_wrapper; ++ ++void p521_felem_square(largefelem out, const felem in); ++void p521_felem_mul(largefelem out, const felem in1, const felem in2); ++ ++# if defined(_ARCH_PPC64) ++# include "../ppc_arch.h" ++# endif ++ ++void felem_select(void) ++{ ++# if defined(_ARCH_PPC64) ++ if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { ++ felem_square_p = p521_felem_square; ++ felem_mul_p = p521_felem_mul; ++ ++ return; ++ } ++# endif ++ ++ /* Default */ ++ felem_square_p = felem_square_ref; ++ felem_mul_p = felem_mul_ref; ++} ++ ++void felem_square_wrapper(largefelem out, const felem in) ++{ ++ felem_select(); ++ felem_square_p(out, in); ++} ++ ++void felem_mul_wrapper(largefelem out, const felem in1, const felem in2) ++{ ++ felem_select(); ++ felem_mul_p(out, in1, in2); ++} ++ ++# define felem_square felem_square_p ++# define felem_mul felem_mul_p ++#else ++# define felem_square felem_square_ref ++# define felem_mul felem_mul_ref ++#endif ++ + static void felem_square_reduce(felem out, const felem in) + { + largefelem tmp; +Index: openssl-1.1.1l/crypto/poly1305/poly1305.c +=================================================================== +--- openssl-1.1.1l.orig/crypto/poly1305/poly1305.c ++++ openssl-1.1.1l/crypto/poly1305/poly1305.c +@@ -95,11 +95,10 @@ poly1305_blocks(void *ctx, const unsigne + (a ^ ((a ^ b) | ((a - 
b) ^ b))) >> (sizeof(a) * 8 - 1) \ + ) + +-# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \ +- (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8) ++# if defined(INT64_MAX) && defined(INT128_MAX) + + typedef unsigned long u64; +-typedef __uint128_t u128; ++typedef uint128_t u128; + + typedef struct { + u64 h[3]; +Index: openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c +=================================================================== +--- openssl-1.1.1l.orig/crypto/poly1305/poly1305_base2_44.c ++++ openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c +@@ -18,7 +18,7 @@ + typedef unsigned char u8; + typedef unsigned int u32; + typedef unsigned long u64; +-typedef unsigned __int128 u128; ++typedef uint128_t u128; + + typedef struct { + u64 h[3]; +Index: openssl-1.1.1l/crypto/ec/build.info +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/build.info ++++ openssl-1.1.1l/crypto/ec/build.info +@@ -6,8 +13,9 @@ SOURCE[../../libcrypto]=\ + ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \ + ecp_oct.c ec2_oct.c ec_oct.c ec_kmeth.c ecdh_ossl.c ecdh_kdf.c \ + ecdsa_ossl.c ecdsa_sign.c ecdsa_vrf.c curve25519.c ecx_meth.c \ +- curve448/arch_32/f_impl.c curve448/f_generic.c curve448/scalar.c \ ++ curve448/f_generic.c curve448/scalar.c \ + curve448/curve448_tables.c curve448/eddsa.c curve448/curve448.c \ ++ curve448/arch_64/f_impl64.c curve448/arch_32/f_impl32.c \ + {- $target{ec_asm_src} -} + + GENERATE[ecp_nistz256-x86.s]=asm/ecp_nistz256-x86.pl \ +@@ -29,6 +38,8 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_n + INCLUDE[ecp_nistz256-armv8.o]=.. + GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl $(PERLASM_SCHEME) + ++GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl $(PERLASM_SCHEME) ++ + GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl $(PERLASM_SCHEME) + GENERATE[x25519-ppc64.s]=asm/x25519-ppc64.pl $(PERLASM_SCHEME) + +@@ -36,10 +47,3 @@ BEGINRAW[Makefile] + {- $builddir -}/ecp_nistz256-%.S: {- $sourcedir -}/asm/ecp_nistz256-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ + ENDRAW[Makefile] +- +-INCLUDE[curve448/arch_32/f_impl.o]=curve448/arch_32 curve448 +-INCLUDE[curve448/f_generic.o]=curve448/arch_32 curve448 +-INCLUDE[curve448/scalar.o]=curve448/arch_32 curve448 +-INCLUDE[curve448/curve448_tables.o]=curve448/arch_32 curve448 +-INCLUDE[curve448/eddsa.o]=curve448/arch_32 curve448 +-INCLUDE[curve448/curve448.o]=curve448/arch_32 curve448 +Index: openssl-1.1.1l/crypto/ec/curve448/field.h +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/curve448/field.h ++++ openssl-1.1.1l/crypto/ec/curve448/field.h +@@ -66,10 +66,15 @@ void gf_serialize(uint8_t *serial, const + mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit, + uint8_t hi_nmask); + +-# include "f_impl.h" /* Bring in the inline implementations */ + + # define LIMBPERM(i) (i) +-# define LIMB_MASK(i) (((1)<limb[i] = a->limb[i] + b->limb[i]; ++ ++ gf_weak_reduce(out); ++} ++ ++void gf_sub_RAW(gf out, const gf a, const gf b) ++{ ++ uint64_t co1 = ((1ULL << 56) - 1) * 2, co2 = co1 - 2; ++ unsigned int i; ++ ++ for (i = 0; i < NLIMBS; i++) ++ out->limb[i] = a->limb[i] - b->limb[i] + ((i == NLIMBS / 2) ? 
co2 : co1); ++ ++ gf_weak_reduce(out); ++} ++ ++void gf_bias(gf a, int amt) ++{ ++} ++ ++void gf_weak_reduce(gf a) ++{ ++ uint64_t mask = (1ULL << 56) - 1; ++ uint64_t tmp = a->limb[NLIMBS - 1] >> 56; ++ unsigned int i; ++ ++ a->limb[NLIMBS / 2] += tmp; ++ for (i = NLIMBS - 1; i > 0; i--) ++ a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56); ++ a->limb[0] = (a->limb[0] & mask) + tmp; ++} ++ ++#endif /* OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H */ +Index: openssl-1.1.1l/include/internal/constant_time.h +=================================================================== +--- openssl-1.1.1l.orig/include/internal/constant_time.h ++++ openssl-1.1.1l/include/internal/constant_time.h +@@ -181,6 +181,11 @@ static ossl_inline uint32_t constant_tim + return constant_time_msb_32(~a & (a - 1)); + } + ++static ossl_inline uint64_t constant_time_is_zero_64(uint64_t a) ++{ ++ return constant_time_msb_64(~a & (a - 1)); ++} ++ + static ossl_inline unsigned int constant_time_eq(unsigned int a, + unsigned int b) + { +Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c +=================================================================== +--- /dev/null ++++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c +@@ -0,0 +1,104 @@ ++/* ++ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. ++ * Copyright 2014 Cryptography Research, Inc. ++ * ++ * Licensed under the OpenSSL license (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ * ++ * Originally written by Mike Hamburg ++ */ ++ ++#include ++#include "internal/numbers.h" ++ ++#ifdef UINT128_MAX ++/* We have support for 128 bit ints, so do nothing here */ ++NON_EMPTY_TRANSLATION_UNIT ++#else ++ ++# include "../field.h" ++ ++void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) ++{ ++ const uint32_t *a = as->limb, *b = bs->limb; ++ uint32_t *c = cs->limb; ++ uint64_t accum0 = 0, accum1 = 0, accum2 = 0; ++ uint32_t mask = (1 << 28) - 1; ++ uint32_t aa[8], bb[8]; ++ int i, j; ++ ++ for (i = 0; i < 8; i++) { ++ aa[i] = a[i] + a[i + 8]; ++ bb[i] = b[i] + b[i + 8]; ++ } ++ ++ for (j = 0; j < 8; j++) { ++ accum2 = 0; ++ for (i = 0; i < j + 1; i++) { ++ accum2 += widemul(a[j - i], b[i]); ++ accum1 += widemul(aa[j - i], bb[i]); ++ accum0 += widemul(a[8 + j - i], b[8 + i]); ++ } ++ accum1 -= accum2; ++ accum0 += accum2; ++ accum2 = 0; ++ for (i = j + 1; i < 8; i++) { ++ accum0 -= widemul(a[8 + j - i], b[i]); ++ accum2 += widemul(aa[8 + j - i], bb[i]); ++ accum1 += widemul(a[16 + j - i], b[8 + i]); ++ } ++ accum1 += accum2; ++ accum0 += accum2; ++ c[j] = ((uint32_t)(accum0)) & mask; ++ c[j + 8] = ((uint32_t)(accum1)) & mask; ++ accum0 >>= 28; ++ accum1 >>= 28; ++ } ++ ++ accum0 += accum1; ++ accum0 += c[8]; ++ accum1 += c[0]; ++ c[8] = ((uint32_t)(accum0)) & mask; ++ c[0] = ((uint32_t)(accum1)) & mask; ++ ++ accum0 >>= 28; ++ accum1 >>= 28; ++ c[9] += ((uint32_t)(accum0)); ++ c[1] += ((uint32_t)(accum1)); ++} ++ ++void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b) ++{ ++ const uint32_t *a = as->limb; ++ uint32_t *c = cs->limb; ++ uint64_t accum0 = 0, accum8 = 0; ++ uint32_t mask = (1 << 28) - 1; ++ int i; ++ ++ assert(b <= mask); ++ ++ for (i = 0; i < 8; i++) { ++ accum0 += widemul(b, a[i]); ++ accum8 += widemul(b, a[i + 8]); ++ c[i] = accum0 & mask; ++ accum0 >>= 28; ++ c[i + 8] = accum8 & mask; ++ accum8 >>= 28; ++ } ++ ++ accum0 += accum8 + c[8]; ++ 
c[8] = ((uint32_t)accum0) & mask; ++ c[9] += (uint32_t)(accum0 >> 28); ++ ++ accum8 += c[0]; ++ c[0] = ((uint32_t)accum8) & mask; ++ c[1] += (uint32_t)(accum8 >> 28); ++} ++ ++void gf_sqr(gf_s * RESTRICT cs, const gf as) ++{ ++ gf_mul(cs, as, as); /* Performs better with a dedicated square */ ++} ++#endif +Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c +=================================================================== +--- /dev/null ++++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c +@@ -0,0 +1,210 @@ ++/* ++ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. ++ * Copyright 2014 Cryptography Research, Inc. ++ * ++ * Licensed under the OpenSSL license (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ * ++ * Originally written by Mike Hamburg ++ */ ++ ++#include ++#include "internal/numbers.h" ++ ++#ifndef UINT128_MAX ++/* No support for 128 bit ints, so do nothing here */ ++NON_EMPTY_TRANSLATION_UNIT ++#else ++ ++# include "../field.h" ++ ++void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) ++{ ++ const uint64_t *a = as->limb, *b = bs->limb; ++ uint64_t *c = cs->limb; ++ uint128_t accum0 = 0, accum1 = 0, accum2; ++ uint64_t mask = (1ULL << 56) - 1; ++ uint64_t aa[4], bb[4], bbb[4]; ++ unsigned int i, j; ++ ++ for (i = 0; i < 4; i++) { ++ aa[i] = a[i] + a[i + 4]; ++ bb[i] = b[i] + b[i + 4]; ++ bbb[i] = bb[i] + b[i + 4]; ++ } ++ ++ for (i = 0; i < 4; i++) { ++ accum2 = 0; ++ ++ for (j = 0; j <= i; j++) { ++ accum2 += widemul(a[j], b[i - j]); ++ accum1 += widemul(aa[j], bb[i - j]); ++ accum0 += widemul(a[j + 4], b[i - j + 4]); ++ } ++ for (; j < 4; j++) { ++ accum2 += widemul(a[j], b[i - j + 8]); ++ accum1 += widemul(aa[j], bbb[i - j + 4]); ++ accum0 += widemul(a[j + 4], bb[i - j + 4]); ++ } ++ ++ accum1 -= accum2; ++ accum0 += accum2; ++ ++ c[i] = ((uint64_t)(accum0)) & mask; ++ c[i + 4] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ } ++ ++ accum0 += accum1; ++ accum0 += c[4]; ++ accum1 += c[0]; ++ c[4] = ((uint64_t)(accum0)) & mask; ++ c[0] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ ++ c[5] += ((uint64_t)(accum0)); ++ c[1] += ((uint64_t)(accum1)); ++} ++ ++void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b) ++{ ++ const uint64_t *a = as->limb; ++ uint64_t *c = cs->limb; ++ uint128_t accum0 = 0, accum4 = 0; ++ uint64_t mask = (1ULL << 56) - 1; ++ int i; ++ ++ for (i = 0; i < 4; i++) { ++ accum0 += widemul(b, a[i]); ++ accum4 += widemul(b, a[i + 4]); ++ c[i] = accum0 & mask; ++ accum0 >>= 56; ++ c[i + 4] = accum4 & mask; ++ accum4 >>= 56; ++ } ++ ++ accum0 += accum4 + c[4]; ++ c[4] = accum0 & mask; ++ c[5] += accum0 >> 56; ++ ++ accum4 += c[0]; ++ c[0] = accum4 & mask; ++ c[1] += accum4 >> 56; ++} ++ ++void gf_sqr(gf_s * __restrict__ cs, const gf as) ++{ ++ const uint64_t *a = as->limb; ++ uint64_t *c = cs->limb; ++ uint128_t accum0 = 0, accum1 = 0, accum2; ++ uint64_t mask = (1ULL << 56) - 1; ++ uint64_t aa[4]; ++ ++ /* For some reason clang doesn't vectorize this without prompting? 
*/ ++ unsigned int i; ++ for (i = 0; i < 4; i++) { ++ aa[i] = a[i] + a[i + 4]; ++ } ++ ++ accum2 = widemul(a[0], a[3]); ++ accum0 = widemul(aa[0], aa[3]); ++ accum1 = widemul(a[4], a[7]); ++ ++ accum2 += widemul(a[1], a[2]); ++ accum0 += widemul(aa[1], aa[2]); ++ accum1 += widemul(a[5], a[6]); ++ ++ accum0 -= accum2; ++ accum1 += accum2; ++ ++ c[3] = ((uint64_t)(accum1)) << 1 & mask; ++ c[7] = ((uint64_t)(accum0)) << 1 & mask; ++ ++ accum0 >>= 55; ++ accum1 >>= 55; ++ ++ accum0 += widemul(2 * aa[1], aa[3]); ++ accum1 += widemul(2 * a[5], a[7]); ++ accum0 += widemul(aa[2], aa[2]); ++ accum1 += accum0; ++ ++ accum0 -= widemul(2 * a[1], a[3]); ++ accum1 += widemul(a[6], a[6]); ++ ++ accum2 = widemul(a[0], a[0]); ++ accum1 -= accum2; ++ accum0 += accum2; ++ ++ accum0 -= widemul(a[2], a[2]); ++ accum1 += widemul(aa[0], aa[0]); ++ accum0 += widemul(a[4], a[4]); ++ ++ c[0] = ((uint64_t)(accum0)) & mask; ++ c[4] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ ++ accum2 = widemul(2 * aa[2], aa[3]); ++ accum0 -= widemul(2 * a[2], a[3]); ++ accum1 += widemul(2 * a[6], a[7]); ++ ++ accum1 += accum2; ++ accum0 += accum2; ++ ++ accum2 = widemul(2 * a[0], a[1]); ++ accum1 += widemul(2 * aa[0], aa[1]); ++ accum0 += widemul(2 * a[4], a[5]); ++ ++ accum1 -= accum2; ++ accum0 += accum2; ++ ++ c[1] = ((uint64_t)(accum0)) & mask; ++ c[5] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ ++ accum2 = widemul(aa[3], aa[3]); ++ accum0 -= widemul(a[3], a[3]); ++ accum1 += widemul(a[7], a[7]); ++ ++ accum1 += accum2; ++ accum0 += accum2; ++ ++ accum2 = widemul(2 * a[0], a[2]); ++ accum1 += widemul(2 * aa[0], aa[2]); ++ accum0 += widemul(2 * a[4], a[6]); ++ ++ accum2 += widemul(a[1], a[1]); ++ accum1 += widemul(aa[1], aa[1]); ++ accum0 += widemul(a[5], a[5]); ++ ++ accum1 -= accum2; ++ accum0 += accum2; ++ ++ c[2] = ((uint64_t)(accum0)) & mask; ++ c[6] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ ++ accum0 += c[3]; ++ accum1 += c[7]; ++ c[3] = ((uint64_t)(accum0)) & mask; ++ c[7] = ((uint64_t)(accum1)) & mask; ++ ++ /* we could almost stop here, but it wouldn't be stable, so... */ ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); ++ c[0] += ((uint64_t)(accum1)); ++} ++#endif +Index: openssl-1.1.1l/Configurations/00-base-templates.conf +=================================================================== +--- openssl-1.1.1l.orig/Configurations/00-base-templates.conf ++++ openssl-1.1.1l/Configurations/00-base-templates.conf +@@ -351,7 +351,8 @@ my %targets=( + ppc64_asm => { + inherit_from => [ "ppc32_asm" ], + template => 1, +- ec_asm_src => "ecp_nistz256.c ecp_nistz256-ppc64.s x25519-ppc64.s", ++ bn_asm_src => add("ppc64-mont-fixed.s"), ++ ec_asm_src => "ecp_nistz256.c ecp_nistz256-ppc64.s ecp_nistp521-ppc64.s x25519-ppc64.s", + keccak1600_asm_src => "keccak1600-ppc64.s", + }, + ); +Index: openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl +=================================================================== +--- /dev/null ++++ openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl +@@ -0,0 +1,581 @@ ++#! /usr/bin/env perl ++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. 
You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ==================================================================== ++# Written by Amitay Isaacs , Martin Schwenke ++# & Alastair D'Silva for ++# the OpenSSL project. ++# ==================================================================== ++ ++# ++# Fixed length (n=6), unrolled PPC Montgomery Multiplication ++# ++ ++# 2021 ++# ++# Although this is a generic implementation for unrolling Montgomery ++# Multiplication for arbitrary values of n, this is currently only ++# used for n = 6 to improve the performance of ECC p384. ++# ++# Unrolling allows intermediate results to be stored in registers, ++# rather than on the stack, improving performance by ~7% compared to ++# the existing PPC assembly code. ++# ++# The ISA 3.0 implementation uses combination multiply/add ++# instructions (maddld, maddhdu) to improve performance by an ++# additional ~10% on Power 9. ++# ++# Finally, saving non-volatile registers into volatile vector ++# registers instead of onto the stack saves a little more. ++# ++# On a Power 9 machine we see an overall improvement of ~18%. ++# ++ ++use strict; ++use warnings; ++ ++my ($flavour, $output, $dir, $xlate); ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open STDOUT,"| $^X $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++ ++if ($flavour !~ /64/) { ++ die "bad flavour ($flavour) - only ppc64 permitted"; ++} ++ ++my $SIZE_T= 8; ++ ++# Registers are global so the code is remotely readable ++ ++# Parameters for Montgomery multiplication ++my $sp = "r1"; ++my $toc = "r2"; ++my $rp = "r3"; ++my $ap = "r4"; ++my $bp = "r5"; ++my $np = "r6"; ++my $n0 = "r7"; ++my $num = "r8"; ++ ++my $i = "r9"; ++my $c0 = "r10"; ++my $bp0 = "r11"; ++my $bpi = "r11"; ++my $bpj = "r11"; ++my $tj = "r12"; ++my $apj = "r12"; ++my $npj = "r12"; ++my $lo = "r14"; ++my $c1 = "r14"; ++ ++# Non-volatile registers used for tp[i] ++# ++# 12 registers are available but the limit on unrolling is 10, ++# since registers from $tp[0] to $tp[$n+1] are used. ++my @tp = ("r20" .. "r31"); ++ ++# volatile VSRs for saving non-volatile GPRs - faster than stack ++my @vsrs = ("v32" .. "v46"); ++ ++package Mont; ++ ++sub new($$) ++{ ++ my ($class, $n) = @_; ++ ++ if ($n > 10) { ++ die "Can't unroll for BN length ${n} (maximum 10)" ++ } ++ ++ my $self = { ++ code => "", ++ n => $n, ++ }; ++ bless $self, $class; ++ ++ return $self; ++} ++ ++sub add_code($$) ++{ ++ my ($self, $c) = @_; ++ ++ $self->{code} .= $c; ++} ++ ++sub get_code($) ++{ ++ my ($self) = @_; ++ ++ return $self->{code}; ++} ++ ++sub get_function_name($) ++{ ++ my ($self) = @_; ++ ++ return "bn_mul_mont_fixed_n" . $self->{n}; ++} ++ ++sub get_label($$) ++{ ++ my ($self, $l) = @_; ++ ++ return "L" . $l . "_" . 
$self->{n}; ++} ++ ++sub get_labels($@) ++{ ++ my ($self, @labels) = @_; ++ ++ my %out = (); ++ ++ foreach my $l (@labels) { ++ $out{"$l"} = $self->get_label("$l"); ++ } ++ ++ return \%out; ++} ++ ++sub nl($) ++{ ++ my ($self) = @_; ++ ++ $self->add_code("\n"); ++} ++ ++sub copy_result($) ++{ ++ my ($self) = @_; ++ ++ my ($n) = $self->{n}; ++ ++ for (my $j = 0; $j < $n; $j++) { ++ $self->add_code(<<___); ++ std $tp[$j],`$j*$SIZE_T`($rp) ++___ ++ } ++ ++} ++ ++sub mul_mont_fixed($) ++{ ++ my ($self) = @_; ++ ++ my ($n) = $self->{n}; ++ my $fname = $self->get_function_name(); ++ my $label = $self->get_labels("outer", "enter", "sub", "copy", "end"); ++ ++ $self->add_code(<<___); ++ ++.globl .${fname} ++.align 5 ++.${fname}: ++ ++___ ++ ++ $self->save_registers(); ++ ++ $self->add_code(<<___); ++ ld $n0,0($n0) ++ ++ ld $bp0,0($bp) ++ ++ ld $apj,0($ap) ++___ ++ ++ $self->mul_c_0($tp[0], $apj, $bp0, $c0); ++ ++ for (my $j = 1; $j < $n - 1; $j++) { ++ $self->add_code(<<___); ++ ld $apj,`$j*$SIZE_T`($ap) ++___ ++ $self->mul($tp[$j], $apj, $bp0, $c0); ++ } ++ ++ $self->add_code(<<___); ++ ld $apj,`($n-1)*$SIZE_T`($ap) ++___ ++ ++ $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0); ++ ++ $self->add_code(<<___); ++ li $tp[$n+1],0 ++ ++___ ++ ++ $self->add_code(<<___); ++ li $i,0 ++ mtctr $num ++ b $label->{"enter"} ++ ++.align 4 ++$label->{"outer"}: ++ ldx $bpi,$bp,$i ++ ++ ld $apj,0($ap) ++___ ++ ++ $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0); ++ ++ for (my $j = 1; $j < $n; $j++) { ++ $self->add_code(<<___); ++ ld $apj,`$j*$SIZE_T`($ap) ++___ ++ $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0); ++ } ++ ++ $self->add_code(<<___); ++ addc $tp[$n],$tp[$n],$c0 ++ addze $tp[$n+1],$tp[$n+1] ++___ ++ ++ $self->add_code(<<___); ++.align 4 ++$label->{"enter"}: ++ mulld $bpi,$tp[0],$n0 ++ ++ ld $npj,0($np) ++___ ++ ++ $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0); ++ ++ for (my $j = 1; $j < $n; $j++) { ++ $self->add_code(<<___); ++ ld $npj,`$j*$SIZE_T`($np) ++___ ++ $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0); ++ } ++ ++ $self->add_code(<<___); ++ addc $tp[$n-1],$tp[$n],$c0 ++ addze $tp[$n],$tp[$n+1] ++ ++ addi $i,$i,$SIZE_T ++ bdnz $label->{"outer"} ++ ++ and. $tp[$n],$tp[$n],$tp[$n] ++ bne $label->{"sub"} ++ ++ cmpld $tp[$n-1],$npj ++ blt $label->{"copy"} ++ ++$label->{"sub"}: ++___ ++ ++ # ++ # Reduction ++ # ++ ++ $self->add_code(<<___); ++ ld $bpj,`0*$SIZE_T`($np) ++ subfc $c1,$bpj,$tp[0] ++ std $c1,`0*$SIZE_T`($rp) ++ ++___ ++ for (my $j = 1; $j < $n - 1; $j++) { ++ $self->add_code(<<___); ++ ld $bpj,`$j*$SIZE_T`($np) ++ subfe $c1,$bpj,$tp[$j] ++ std $c1,`$j*$SIZE_T`($rp) ++ ++___ ++ } ++ ++ $self->add_code(<<___); ++ subfe $c1,$npj,$tp[$n-1] ++ std $c1,`($n-1)*$SIZE_T`($rp) ++ ++___ ++ ++ $self->add_code(<<___); ++ addme. 
$tp[$n],$tp[$n] ++ beq $label->{"end"} ++ ++$label->{"copy"}: ++___ ++ ++ $self->copy_result(); ++ ++ $self->add_code(<<___); ++ ++$label->{"end"}: ++___ ++ ++ $self->restore_registers(); ++ ++ $self->add_code(<<___); ++ li r3,1 ++ blr ++.size .${fname},.-.${fname} ++___ ++ ++} ++ ++package Mont::GPR; ++ ++our @ISA = ('Mont'); ++ ++sub new($$) ++{ ++ my ($class, $n) = @_; ++ ++ return $class->SUPER::new($n); ++} ++ ++sub save_registers($) ++{ ++ my ($self) = @_; ++ ++ my $n = $self->{n}; ++ ++ $self->add_code(<<___); ++ std $lo,-8($sp) ++___ ++ ++ for (my $j = 0; $j <= $n+1; $j++) { ++ $self->{code}.=<<___; ++ std $tp[$j],-`($j+2)*8`($sp) ++___ ++ } ++ ++ $self->add_code(<<___); ++ ++___ ++} ++ ++sub restore_registers($) ++{ ++ my ($self) = @_; ++ ++ my $n = $self->{n}; ++ ++ $self->add_code(<<___); ++ ld $lo,-8($sp) ++___ ++ ++ for (my $j = 0; $j <= $n+1; $j++) { ++ $self->{code}.=<<___; ++ ld $tp[$j],-`($j+2)*8`($sp) ++___ ++ } ++ ++ $self->{code} .=<<___; ++ ++___ ++} ++ ++# Direct translation of C mul() ++sub mul($$$$$) ++{ ++ my ($self, $r, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $lo,$a,$w ++ addc $r,$lo,$c ++ mulhdu $c,$a,$w ++ addze $c,$c ++ ++___ ++} ++ ++# Like mul() but $c is ignored as an input - an optimisation to save a ++# preliminary instruction that would set input $c to 0 ++sub mul_c_0($$$$$) ++{ ++ my ($self, $r, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $r,$a,$w ++ mulhdu $c,$a,$w ++ ++___ ++} ++ ++# Like mul() but does not to the final addition of CA into $c - an ++# optimisation to save an instruction ++sub mul_last($$$$$$) ++{ ++ my ($self, $r1, $r2, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $lo,$a,$w ++ addc $r1,$lo,$c ++ mulhdu $c,$a,$w ++ ++ addze $r2,$c ++___ ++} ++ ++# Like C mul_add() but allow $r_out and $r_in to be different ++sub mul_add($$$$$$) ++{ ++ my ($self, $r_out, $r_in, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $lo,$a,$w ++ addc $lo,$lo,$c ++ mulhdu $c,$a,$w ++ addze $c,$c ++ addc $r_out,$r_in,$lo ++ addze $c,$c ++ ++___ ++} ++ ++# Like mul_add() but $c is ignored as an input - an optimisation to save a ++# preliminary instruction that would set input $c to 0 ++sub mul_add_c_0($$$$$$) ++{ ++ my ($self, $r_out, $r_in, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $lo,$a,$w ++ addc $r_out,$r_in,$lo ++ mulhdu $c,$a,$w ++ addze $c,$c ++ ++___ ++} ++ ++package Mont::GPR_300; ++ ++our @ISA = ('Mont::GPR'); ++ ++sub new($$) ++{ ++ my ($class, $n) = @_; ++ ++ my $mont = $class->SUPER::new($n); ++ ++ return $mont; ++} ++ ++sub get_function_name($) ++{ ++ my ($self) = @_; ++ ++ return "bn_mul_mont_300_fixed_n" . $self->{n}; ++} ++ ++sub get_label($$) ++{ ++ my ($self, $l) = @_; ++ ++ return "L" . $l . "_300_" . 
$self->{n}; ++} ++ ++# Direct translation of C mul() ++sub mul($$$$$) ++{ ++ my ($self, $r, $a, $w, $c, $last) = @_; ++ ++ $self->add_code(<<___); ++ maddld $r,$a,$w,$c ++ maddhdu $c,$a,$w,$c ++ ++___ ++} ++ ++# Save the last carry as the final entry ++sub mul_last($$$$$) ++{ ++ my ($self, $r1, $r2, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ maddld $r1,$a,$w,$c ++ maddhdu $r2,$a,$w,$c ++ ++___ ++} ++ ++# Like mul() but $c is ignored as an input - an optimisation to save a ++# preliminary instruction that would set input $c to 0 ++sub mul_c_0($$$$$) ++{ ++ my ($self, $r, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $r,$a,$w ++ mulhdu $c,$a,$w ++ ++___ ++} ++ ++# Like C mul_add() but allow $r_out and $r_in to be different ++sub mul_add($$$$$$) ++{ ++ my ($self, $r_out, $r_in, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ maddld $lo,$a,$w,$c ++ maddhdu $c,$a,$w,$c ++ addc $r_out,$r_in,$lo ++ addze $c,$c ++ ++___ ++} ++ ++# Like mul_add() but $c is ignored as an input - an optimisation to save a ++# preliminary instruction that would set input $c to 0 ++sub mul_add_c_0($$$$$$) ++{ ++ my ($self, $r_out, $r_in, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ maddld $lo,$a,$w,$r_in ++ maddhdu $c,$a,$w,$r_in ++___ ++ ++ if ($r_out ne $lo) { ++ $self->add_code(<<___); ++ mr $r_out,$lo ++___ ++ } ++ ++ $self->nl(); ++} ++ ++ ++package main; ++ ++my $code; ++ ++$code.=<<___; ++.machine "any" ++.text ++___ ++ ++my $mont; ++ ++$mont = new Mont::GPR(6); ++$mont->mul_mont_fixed(); ++$code .= $mont->get_code(); ++ ++$mont = new Mont::GPR_300(6); ++$mont->mul_mont_fixed(); ++$code .= $mont->get_code(); ++ ++$code =~ s/\`([^\`]*)\`/eval $1/gem; ++ ++$code.=<<___; ++.asciz "Montgomery Multiplication for PPC by , " ++___ ++ ++print $code; ++close STDOUT or die "error closing STDOUT: $!"; +Index: openssl-1.1.1l/crypto/bn/build.info +=================================================================== +--- openssl-1.1.1l.orig/crypto/bn/build.info ++++ openssl-1.1.1l/crypto/bn/build.info +@@ -56,6 +56,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont. + GENERATE[bn-ppc.s]=asm/ppc.pl $(PERLASM_SCHEME) + GENERATE[ppc-mont.s]=asm/ppc-mont.pl $(PERLASM_SCHEME) + GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl $(PERLASM_SCHEME) ++GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl $(PERLASM_SCHEME) + + GENERATE[alpha-mont.S]=asm/alpha-mont.pl $(PERLASM_SCHEME) + +Index: openssl-1.1.1l/crypto/ppccap.c +=================================================================== +--- openssl-1.1.1l.orig/crypto/ppccap.c ++++ openssl-1.1.1l/crypto/ppccap.c +@@ -46,6 +46,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U + const BN_ULONG *np, const BN_ULONG *n0, int num); + int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, int num); ++ int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, ++ const BN_ULONG *bp, const BN_ULONG *np, ++ const BN_ULONG *n0, int num); ++ int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, ++ const BN_ULONG *bp, const BN_ULONG *np, ++ const BN_ULONG *n0, int num); + + if (num < 4) + return 0; +@@ -61,6 +67,15 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U + * no opportunity to figure it out... 
+ */ + ++#if defined(_ARCH_PPC64) ++ if (num == 6) { ++ if (OPENSSL_ppccap_P & PPC_MADD300) ++ return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num); ++ else ++ return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num); ++ } ++#endif ++ + return bn_mul_mont_int(rp, ap, bp, np, n0, num); + } + #endif +Index: openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl +=================================================================== +--- openssl-1.1.1l.orig/crypto/perlasm/ppc-xlate.pl ++++ openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl +@@ -136,6 +136,71 @@ my $quad = sub { + }; + + ################################################################ ++# vector register number hacking ++################################################################ ++ ++# It is convenient to be able to set a variable like: ++# my $foo = "v33"; ++# and use this in different contexts where: ++# * a VSR (Vector-Scaler Register) number (i.e. "v33") is required ++# * a VR (Vector Register) number (i.e. "v1") is required ++# Map VSR numbering to VR number for certain vector instructions. ++ ++# vs -> v if N > 32 ++sub vsr2vr1 { ++ my $in = shift; ++ ++ my $n = int($in); ++ if ($n >= 32) { ++ $n -= 32; ++ } ++ ++ return "$n"; ++} ++# As above for first $num register args, returns list ++sub _vsr2vr { ++ my $num = shift; ++ my @rest = @_; ++ my @subst = splice(@rest, 0, $num); ++ ++ @subst = map { vsr2vr1($_); } @subst; ++ ++ return (@subst, @rest); ++} ++# As above but 1st arg ($f) is extracted and reinserted after ++# processing so that it can be ignored by a code generation function ++# that consumes the result ++sub vsr2vr_args { ++ my $num = shift; ++ my $f = shift; ++ ++ my @out = _vsr2vr($num, @_); ++ ++ return ($f, @out); ++} ++# As above but 1st arg is mnemonic, return formatted instruction ++sub vsr2vr { ++ my $mnemonic = shift; ++ my $num = shift; ++ my $f = shift; ++ ++ my @out = _vsr2vr($num, @_); ++ ++ " ${mnemonic}${f} " . 
join(",", @out); ++} ++ ++# ISA 2.03 ++my $vsel = sub { vsr2vr("vsel", 4, @_); }; ++my $vsl = sub { vsr2vr("vsl", 3, @_); }; ++my $vspltisb = sub { vsr2vr("vspltisb", 1, @_); }; ++my $vspltisw = sub { vsr2vr("vspltisw", 1, @_); }; ++my $vsr = sub { vsr2vr("vsr", 3, @_); }; ++my $vsro = sub { vsr2vr("vsro", 3, @_); }; ++ ++# ISA 3.0 ++my $lxsd = sub { vsr2vr("lxsd", 1, @_); }; ++ ++################################################################ + # simplified mnemonics not handled by at least one assembler + ################################################################ + my $cmplw = sub { +@@ -226,13 +291,18 @@ my $vpermdi = sub { # xxpermdi + + # PowerISA 2.07 stuff + sub vcrypto_op { +- my ($f, $vrt, $vra, $vrb, $op) = @_; ++ my ($f, $vrt, $vra, $vrb, $op) = vsr2vr_args(3, @_); + " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op; + } + sub vfour { + my ($f, $vrt, $vra, $vrb, $vrc, $op) = @_; + " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op; + }; ++sub vfour_vsr { ++ my ($f, $vrt, $vra, $vrb, $vrc, $op) = vsr2vr_args(4, @_); ++ " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op; ++}; ++ + my $vcipher = sub { vcrypto_op(@_, 1288); }; + my $vcipherlast = sub { vcrypto_op(@_, 1289); }; + my $vncipher = sub { vcrypto_op(@_, 1352); }; +@@ -254,10 +324,10 @@ my $vsld = sub { vcrypto_op(@_, 1476); } + my $vsrd = sub { vcrypto_op(@_, 1732); }; + my $vsubudm = sub { vcrypto_op(@_, 1216); }; + my $vaddcuq = sub { vcrypto_op(@_, 320); }; +-my $vaddeuqm = sub { vfour(@_,60); }; +-my $vaddecuq = sub { vfour(@_,61); }; +-my $vmrgew = sub { vfour(@_,0,1932); }; +-my $vmrgow = sub { vfour(@_,0,1676); }; ++my $vaddeuqm = sub { vfour_vsr(@_,60); }; ++my $vaddecuq = sub { vfour_vsr(@_,61); }; ++my $vmrgew = sub { vfour_vsr(@_,0,1932); }; ++my $vmrgow = sub { vfour_vsr(@_,0,1676); }; + + my $mtsle = sub { + my ($f, $arg) = @_; +@@ -298,7 +368,7 @@ my $addex = sub { + my ($f, $rt, $ra, $rb, $cy) = @_; # only cy==0 is specified in 3.0B + " .long ".sprintf "0x%X",(31<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($cy<<9)|(170<<1); + }; +-my $vmsumudm = sub { vfour(@_,35); }; ++my $vmsumudm = sub { vfour_vsr(@_, 35); }; + + while($line=<>) { + +Index: openssl-1.1.1l/Configurations/10-main.conf +=================================================================== +--- openssl-1.1.1l.orig/Configurations/10-main.conf ++++ openssl-1.1.1l/Configurations/10-main.conf +@@ -669,7 +669,7 @@ my %targets = ( + inherit_from => [ "linux-generic64", asm("ppc64_asm") ], + cflags => add("-m64"), + cxxflags => add("-m64"), +- lib_cppflags => add("-DB_ENDIAN"), ++ lib_cppflags => add("-DB_ENDIAN -DECP_NISTP521_ASM"), + perlasm_scheme => "linux64", + multilib => "64", + }, +@@ -677,7 +677,7 @@ my %targets = ( + inherit_from => [ "linux-generic64", asm("ppc64_asm") ], + cflags => add("-m64"), + cxxflags => add("-m64"), +- lib_cppflags => add("-DL_ENDIAN"), ++ lib_cppflags => add("-DL_ENDIAN -DECP_NISTP521_ASM"), + perlasm_scheme => "linux64le", + }, + +Index: openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl +=================================================================== +--- /dev/null ++++ openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl +@@ -0,0 +1,435 @@ ++#! /usr/bin/env perl ++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the OpenSSL license (the "License"). You may not use ++# this file except in compliance with the License. 
You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# ==================================================================== ++# Written by Amitay Isaacs and Martin Schwenke ++# for the OpenSSL project. ++# ==================================================================== ++# ++# p521 lower-level primitives for PPC64 using vector instructions. ++# ++ ++use strict; ++use warnings; ++ ++my $flavour = shift; ++my $output = ""; ++while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} ++if (!$output) { ++ $output = "-"; ++} ++ ++my ($xlate, $dir); ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour $output"; ++*STDOUT=*OUT; ++ ++my $code = ""; ++ ++my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12"); ++ ++my $vzero = "v32"; ++ ++sub startproc($) ++{ ++ my ($name) = @_; ++ ++ $code.=<<___; ++ .globl ${name} ++ .align 5 ++${name}: ++ ++___ ++} ++ ++sub endproc($) ++{ ++ my ($name) = @_; ++ ++ $code.=<<___; ++ blr ++ .size ${name},.-${name} ++ ++___ ++} ++ ++ ++sub push_vrs($$) ++{ ++ my ($min, $max) = @_; ++ ++ my $count = $max - $min + 1; ++ ++ $code.=<<___; ++ mr $savesp,$sp ++ stdu $sp,-16*`$count+1`($sp) ++ ++___ ++ for (my $i = $min; $i <= $max; $i++) { ++ my $mult = $max - $i + 1; ++ $code.=<<___; ++ stxv $i,-16*$mult($savesp) ++___ ++ ++ } ++ ++ $code.=<<___; ++ ++___ ++} ++ ++sub pop_vrs($$) ++{ ++ my ($min, $max) = @_; ++ ++ $code.=<<___; ++ ld $savesp,0($sp) ++___ ++ for (my $i = $min; $i <= $max; $i++) { ++ my $mult = $max - $i + 1; ++ $code.=<<___; ++ lxv $i,-16*$mult($savesp) ++___ ++ } ++ ++ $code.=<<___; ++ mr $sp,$savesp ++ ++___ ++} ++ ++sub load_vrs($$) ++{ ++ my ($pointer, $reg_list) = @_; ++ ++ for (my $i = 0; $i <= 8; $i++) { ++ my $offset = $i * 8; ++ $code.=<<___; ++ lxsd $reg_list->[$i],$offset($pointer) ++___ ++ } ++ ++ $code.=<<___; ++ ++___ ++} ++ ++sub store_vrs($$) ++{ ++ my ($pointer, $reg_list) = @_; ++ ++ for (my $i = 0; $i <= 8; $i++) { ++ my $offset = $i * 16; ++ $code.=<<___; ++ stxv $reg_list->[$i],$offset($pointer) ++___ ++ } ++ ++ $code.=<<___; ++ ++___ ++} ++ ++$code.=<<___; ++.text ++ ++___ ++ ++{ ++ # mul/square common ++ my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54"); ++ my ($zero, $one) = ("r8", "r9"); ++ my @out = map("v$_",(55..63)); ++ ++ { ++ # ++ # p521_felem_mul ++ # ++ ++ my ($in1p, $in2p) = ("r4", "r5"); ++ my @in1 = map("v$_",(45..53)); ++ my @in2 = map("v$_",(35..43)); ++ ++ startproc("p521_felem_mul"); ++ ++ push_vrs(52, 63); ++ ++ $code.=<<___; ++ vspltisw $vzero,0 ++ ++___ ++ ++ load_vrs($in1p, \@in1); ++ load_vrs($in2p, \@in2); ++ ++ $code.=<<___; ++ vmsumudm $out[0],$in1[0],$in2[0],$vzero ++ ++ xxpermdi $t1,$in1[0],$in1[1],0b00 ++ xxpermdi $t2,$in2[1],$in2[0],0b00 ++ vmsumudm $out[1],$t1,$t2,$vzero ++ ++ xxpermdi $t2,$in2[2],$in2[1],0b00 ++ vmsumudm $out[2],$t1,$t2,$vzero ++ vmsumudm $out[2],$in1[2],$in2[0],$out[2] ++ ++ xxpermdi $t2,$in2[3],$in2[2],0b00 ++ vmsumudm $out[3],$t1,$t2,$vzero ++ xxpermdi $t3,$in1[2],$in1[3],0b00 ++ xxpermdi $t4,$in2[1],$in2[0],0b00 ++ vmsumudm $out[3],$t3,$t4,$out[3] ++ ++ xxpermdi $t2,$in2[4],$in2[3],0b00 ++ vmsumudm $out[4],$t1,$t2,$vzero ++ xxpermdi $t4,$in2[2],$in2[1],0b00 ++ vmsumudm $out[4],$t3,$t4,$out[4] ++ vmsumudm $out[4],$in1[4],$in2[0],$out[4] ++ ++ xxpermdi $t2,$in2[5],$in2[4],0b00 ++ vmsumudm $out[5],$t1,$t2,$vzero 
++ xxpermdi $t4,$in2[3],$in2[2],0b00 ++ vmsumudm $out[5],$t3,$t4,$out[5] ++ ++ xxpermdi $t2,$in2[6],$in2[5],0b00 ++ vmsumudm $out[6],$t1,$t2,$vzero ++ xxpermdi $t4,$in2[4],$in2[3],0b00 ++ vmsumudm $out[6],$t3,$t4,$out[6] ++ ++ xxpermdi $t2,$in2[7],$in2[6],0b00 ++ vmsumudm $out[7],$t1,$t2,$vzero ++ xxpermdi $t4,$in2[5],$in2[4],0b00 ++ vmsumudm $out[7],$t3,$t4,$out[7] ++ ++ xxpermdi $t2,$in2[8],$in2[7],0b00 ++ vmsumudm $out[8],$t1,$t2,$vzero ++ xxpermdi $t4,$in2[6],$in2[5],0b00 ++ vmsumudm $out[8],$t3,$t4,$out[8] ++ ++ xxpermdi $t1,$in1[4],$in1[5],0b00 ++ xxpermdi $t2,$in2[1],$in2[0],0b00 ++ vmsumudm $out[5],$t1,$t2,$out[5] ++ ++ xxpermdi $t2,$in2[2],$in2[1],0b00 ++ vmsumudm $out[6],$t1,$t2,$out[6] ++ vmsumudm $out[6],$in1[6],$in2[0],$out[6] ++ ++ xxpermdi $t2,$in2[3],$in2[2],0b00 ++ vmsumudm $out[7],$t1,$t2,$out[7] ++ xxpermdi $t3,$in1[6],$in1[7],0b00 ++ xxpermdi $t4,$in2[1],$in2[0],0b00 ++ vmsumudm $out[7],$t3,$t4,$out[7] ++ ++ xxpermdi $t2,$in2[4],$in2[3],0b00 ++ vmsumudm $out[8],$t1,$t2,$out[8] ++ xxpermdi $t4,$in2[2],$in2[1],0b00 ++ vmsumudm $out[8],$t3,$t4,$out[8] ++ vmsumudm $out[8],$in1[8],$in2[0],$out[8] ++ ++ li $zero,0 ++ li $one,1 ++ mtvsrdd $t1,$one,$zero ++___ ++ ++ for (my $i = 0; $i <= 8; $i++) { ++ $code.=<<___; ++ vsld $in2[$i],$in2[$i],$t1 ++___ ++ } ++ ++ $code.=<<___; ++ ++ vmsumudm $out[7],$in1[8],$in2[8],$out[7] ++ ++ xxpermdi $t2,$in2[8],$in2[7],0b00 ++ xxpermdi $t1,$in1[7],$in1[8],0b00 ++ vmsumudm $out[6],$t1,$t2,$out[6] ++ ++ xxpermdi $t1,$in1[6],$in1[7],0b00 ++ vmsumudm $out[5],$t1,$t2,$out[5] ++ vmsumudm $out[5],$in1[8],$in2[6],$out[5] ++ ++ xxpermdi $t1,$in1[5],$in1[6],0b00 ++ vmsumudm $out[4],$t1,$t2,$out[4] ++ xxpermdi $t4,$in2[6],$in2[5],0b00 ++ xxpermdi $t3,$in1[7],$in1[8],0b00 ++ vmsumudm $out[4],$t3,$t4,$out[4] ++ ++ xxpermdi $t1,$in1[4],$in1[5],0b00 ++ vmsumudm $out[3],$t1,$t2,$out[3] ++ xxpermdi $t3,$in1[6],$in1[7],0b00 ++ vmsumudm $out[3],$t3,$t4,$out[3] ++ vmsumudm $out[3],$in1[8],$in2[4],$out[3] ++ ++ xxpermdi $t1,$in1[3],$in1[4],0b00 ++ vmsumudm $out[2],$t1,$t2,$out[2] ++ xxpermdi $t3,$in1[5],$in1[6],0b00 ++ vmsumudm $out[2],$t3,$t4,$out[2] ++ ++ xxpermdi $t1,$in1[2],$in1[3],0b00 ++ vmsumudm $out[1],$t1,$t2,$out[1] ++ xxpermdi $t3,$in1[4],$in1[5],0b00 ++ vmsumudm $out[1],$t3,$t4,$out[1] ++ ++ xxpermdi $t1,$in1[1],$in1[2],0b00 ++ vmsumudm $out[0],$t1,$t2,$out[0] ++ xxpermdi $t3,$in1[3],$in1[4],0b00 ++ vmsumudm $out[0],$t3,$t4,$out[0] ++ ++ xxpermdi $t2,$in2[4],$in2[3],0b00 ++ xxpermdi $t1,$in1[7],$in1[8],0b00 ++ vmsumudm $out[2],$t1,$t2,$out[2] ++ ++ xxpermdi $t1,$in1[6],$in1[7],0b00 ++ vmsumudm $out[1],$t1,$t2,$out[1] ++ vmsumudm $out[1],$in1[8],$in2[2],$out[1] ++ ++ xxpermdi $t1,$in1[5],$in1[6],0b00 ++ vmsumudm $out[0],$t1,$t2,$out[0] ++ xxpermdi $t4,$in2[2],$in2[1],0b00 ++ xxpermdi $t3,$in1[7],$in1[8],0b00 ++ vmsumudm $out[0],$t3,$t4,$out[0] ++ ++___ ++ ++ store_vrs($outp, \@out); ++ ++ pop_vrs(52, 63); ++ ++ endproc("p521_felem_mul"); ++ } ++ ++ { ++ # ++ # p51_felem_square ++ # ++ ++ my ($inp) = ("r4"); ++ my @in = map("v$_",(45..53)); ++ my @inx2 = map("v$_",(35..43)); ++ ++ startproc("p521_felem_square"); ++ ++ push_vrs(52, 63); ++ ++ $code.=<<___; ++ vspltisw $vzero,0 ++ ++___ ++ ++ load_vrs($inp, \@in); ++ ++ $code.=<<___; ++ li $zero,0 ++ li $one,1 ++ mtvsrdd $t1,$one,$zero ++___ ++ ++ for (my $i = 0; $i <= 8; $i++) { ++ $code.=<<___; ++ vsld $inx2[$i],$in[$i],$t1 ++___ ++ } ++ ++ $code.=<<___; ++ vmsumudm $out[0],$in[0],$in[0],$vzero ++ ++ vmsumudm $out[1],$in[0],$inx2[1],$vzero ++ ++ xxpermdi $t1,$in[0],$in[1],0b00 ++ xxpermdi 
$t2,$inx2[2],$in[1],0b00 ++ vmsumudm $out[2],$t1,$t2,$vzero ++ ++ xxpermdi $t2,$inx2[3],$inx2[2],0b00 ++ vmsumudm $out[3],$t1,$t2,$vzero ++ ++ xxpermdi $t2,$inx2[4],$inx2[3],0b00 ++ vmsumudm $out[4],$t1,$t2,$vzero ++ vmsumudm $out[4],$in[2],$in[2],$out[4] ++ ++ xxpermdi $t2,$inx2[5],$inx2[4],0b00 ++ vmsumudm $out[5],$t1,$t2,$vzero ++ vmsumudm $out[5],$in[2],$inx2[3],$out[5] ++ ++ xxpermdi $t2,$inx2[6],$inx2[5],0b00 ++ vmsumudm $out[6],$t1,$t2,$vzero ++ xxpermdi $t3,$in[2],$in[3],0b00 ++ xxpermdi $t4,$inx2[4],$in[3],0b00 ++ vmsumudm $out[6],$t3,$t4,$out[6] ++ ++ xxpermdi $t2,$inx2[7],$inx2[6],0b00 ++ vmsumudm $out[7],$t1,$t2,$vzero ++ xxpermdi $t4,$inx2[5],$inx2[4],0b00 ++ vmsumudm $out[7],$t3,$t4,$out[7] ++ ++ xxpermdi $t2,$inx2[8],$inx2[7],0b00 ++ vmsumudm $out[8],$t1,$t2,$vzero ++ xxpermdi $t4,$inx2[6],$inx2[5],0b00 ++ vmsumudm $out[8],$t3,$t4,$out[8] ++ vmsumudm $out[8],$in[4],$in[4],$out[8] ++ ++ vmsumudm $out[1],$in[5],$inx2[5],$out[1] ++ ++ vmsumudm $out[3],$in[6],$inx2[6],$out[3] ++ ++ vmsumudm $out[5],$in[7],$inx2[7],$out[5] ++ ++ vmsumudm $out[7],$in[8],$inx2[8],$out[7] ++ ++ mtvsrdd $t1,$one,$zero ++___ ++ ++ for (my $i = 5; $i <= 8; $i++) { ++ $code.=<<___; ++ vsld $inx2[$i],$inx2[$i],$t1 ++___ ++ } ++ ++ $code.=<<___; ++ ++ vmsumudm $out[6],$in[7],$inx2[8],$out[6] ++ ++ vmsumudm $out[5],$in[6],$inx2[8],$out[5] ++ ++ xxpermdi $t2,$inx2[8],$inx2[7],0b00 ++ xxpermdi $t1,$in[5],$in[6],0b00 ++ vmsumudm $out[4],$t1,$t2,$out[4] ++ ++ xxpermdi $t1,$in[4],$in[5],0b00 ++ vmsumudm $out[3],$t1,$t2,$out[3] ++ ++ xxpermdi $t1,$in[3],$in[4],0b00 ++ vmsumudm $out[2],$t1,$t2,$out[2] ++ vmsumudm $out[2],$in[5],$inx2[6],$out[2] ++ ++ xxpermdi $t1,$in[2],$in[3],0b00 ++ vmsumudm $out[1],$t1,$t2,$out[1] ++ vmsumudm $out[1],$in[4],$inx2[6],$out[1] ++ ++ xxpermdi $t1,$in[1],$in[2],0b00 ++ vmsumudm $out[0],$t1,$t2,$out[0] ++ xxpermdi $t2,$inx2[6],$inx2[5],0b00 ++ xxpermdi $t1,$in[3],$in[4],0b00 ++ vmsumudm $out[0],$t1,$t2,$out[0] ++ ++___ ++ ++ store_vrs($outp, \@out); ++ ++ pop_vrs(52, 63); ++ ++ endproc("p521_felem_square"); ++ } ++} ++ ++$code =~ s/\`([^\`]*)\`/eval $1/gem; ++print $code; ++close STDOUT or die "error closing STDOUT: $!"; +Index: openssl-1.1.1l/crypto/ec/ec_local.h +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/ec_local.h ++++ openssl-1.1.1l/crypto/ec/ec_local.h +@@ -499,6 +499,10 @@ int ec_GF2m_simple_field_div(const EC_GR + const BIGNUM *b, BN_CTX *); + + #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 ++# ifdef B_ENDIAN ++# error "Can not enable ec_nistp_64_gcc_128 on big-endian systems" ++# endif ++ + /* method functions in ecp_nistp224.c */ + int ec_GFp_nistp224_group_init(EC_GROUP *group); + int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, +Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c +=================================================================== +--- openssl-1.1.1l.orig/crypto/ec/curve448/arch_32/f_impl.c ++++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c +@@ -10,7 +10,7 @@ + * Originally written by Mike Hamburg + */ + +-#include "field.h" ++#include "../field.h" + + void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) + { +Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c +=================================================================== +--- /dev/null ++++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c +@@ -0,0 +1,200 @@ ++/* ++ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. ++ * Copyright 2014 Cryptography Research, Inc. 
++ * ++ * Licensed under the OpenSSL license (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ * ++ * Originally written by Mike Hamburg ++ */ ++ ++#include "../field.h" ++ ++void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) ++{ ++ const uint64_t *a = as->limb, *b = bs->limb; ++ uint64_t *c = cs->limb; ++ uint128_t accum0 = 0, accum1 = 0, accum2; ++ uint64_t mask = (1ULL << 56) - 1; ++ uint64_t aa[4], bb[4], bbb[4]; ++ unsigned int i, j; ++ ++ for (i = 0; i < 4; i++) { ++ aa[i] = a[i] + a[i + 4]; ++ bb[i] = b[i] + b[i + 4]; ++ bbb[i] = bb[i] + b[i + 4]; ++ } ++ ++ for (i = 0; i < 4; i++) { ++ accum2 = 0; ++ ++ for (j = 0; j <= i; j++) { ++ accum2 += widemul(a[j], b[i - j]); ++ accum1 += widemul(aa[j], bb[i - j]); ++ accum0 += widemul(a[j + 4], b[i - j + 4]); ++ } ++ for (; j < 4; j++) { ++ accum2 += widemul(a[j], b[i - j + 8]); ++ accum1 += widemul(aa[j], bbb[i - j + 4]); ++ accum0 += widemul(a[j + 4], bb[i - j + 4]); ++ } ++ ++ accum1 -= accum2; ++ accum0 += accum2; ++ ++ c[i] = ((uint64_t)(accum0)) & mask; ++ c[i + 4] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ } ++ ++ accum0 += accum1; ++ accum0 += c[4]; ++ accum1 += c[0]; ++ c[4] = ((uint64_t)(accum0)) & mask; ++ c[0] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ ++ c[5] += ((uint64_t)(accum0)); ++ c[1] += ((uint64_t)(accum1)); ++} ++ ++void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b) ++{ ++ const uint64_t *a = as->limb; ++ uint64_t *c = cs->limb; ++ uint128_t accum0 = 0, accum4 = 0; ++ uint64_t mask = (1ULL << 56) - 1; ++ int i; ++ ++ for (i = 0; i < 4; i++) { ++ accum0 += widemul(b, a[i]); ++ accum4 += widemul(b, a[i + 4]); ++ c[i] = accum0 & mask; ++ accum0 >>= 56; ++ c[i + 4] = accum4 & mask; ++ accum4 >>= 56; ++ } ++ ++ accum0 += accum4 + c[4]; ++ c[4] = accum0 & mask; ++ c[5] += accum0 >> 56; ++ ++ accum4 += c[0]; ++ c[0] = accum4 & mask; ++ c[1] += accum4 >> 56; ++} ++ ++void gf_sqr(gf_s * RESTRICT cs, const gf as) ++{ ++ const uint64_t *a = as->limb; ++ uint64_t *c = cs->limb; ++ uint128_t accum0 = 0, accum1 = 0, accum2; ++ uint64_t mask = (1ULL << 56) - 1; ++ uint64_t aa[4]; ++ unsigned int i; ++ ++ /* For some reason clang doesn't vectorize this without prompting? 
*/ ++ for (i = 0; i < 4; i++) ++ aa[i] = a[i] + a[i + 4]; ++ ++ accum2 = widemul(a[0], a[3]); ++ accum0 = widemul(aa[0], aa[3]); ++ accum1 = widemul(a[4], a[7]); ++ ++ accum2 += widemul(a[1], a[2]); ++ accum0 += widemul(aa[1], aa[2]); ++ accum1 += widemul(a[5], a[6]); ++ ++ accum0 -= accum2; ++ accum1 += accum2; ++ ++ c[3] = ((uint64_t)(accum1)) << 1 & mask; ++ c[7] = ((uint64_t)(accum0)) << 1 & mask; ++ ++ accum0 >>= 55; ++ accum1 >>= 55; ++ ++ accum0 += widemul(2 * aa[1], aa[3]); ++ accum1 += widemul(2 * a[5], a[7]); ++ accum0 += widemul(aa[2], aa[2]); ++ accum1 += accum0; ++ ++ accum0 -= widemul(2 * a[1], a[3]); ++ accum1 += widemul(a[6], a[6]); ++ ++ accum2 = widemul(a[0], a[0]); ++ accum1 -= accum2; ++ accum0 += accum2; ++ ++ accum0 -= widemul(a[2], a[2]); ++ accum1 += widemul(aa[0], aa[0]); ++ accum0 += widemul(a[4], a[4]); ++ ++ c[0] = ((uint64_t)(accum0)) & mask; ++ c[4] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ ++ accum2 = widemul(2 * aa[2], aa[3]); ++ accum0 -= widemul(2 * a[2], a[3]); ++ accum1 += widemul(2 * a[6], a[7]); ++ ++ accum1 += accum2; ++ accum0 += accum2; ++ ++ accum2 = widemul(2 * a[0], a[1]); ++ accum1 += widemul(2 * aa[0], aa[1]); ++ accum0 += widemul(2 * a[4], a[5]); ++ ++ accum1 -= accum2; ++ accum0 += accum2; ++ ++ c[1] = ((uint64_t)(accum0)) & mask; ++ c[5] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ ++ accum2 = widemul(aa[3], aa[3]); ++ accum0 -= widemul(a[3], a[3]); ++ accum1 += widemul(a[7], a[7]); ++ ++ accum1 += accum2; ++ accum0 += accum2; ++ ++ accum2 = widemul(2 * a[0], a[2]); ++ accum1 += widemul(2 * aa[0], aa[2]); ++ accum0 += widemul(2 * a[4], a[6]); ++ ++ accum2 += widemul(a[1], a[1]); ++ accum1 += widemul(aa[1], aa[1]); ++ accum0 += widemul(a[5], a[5]); ++ ++ accum1 -= accum2; ++ accum0 += accum2; ++ ++ c[2] = ((uint64_t)(accum0)) & mask; ++ c[6] = ((uint64_t)(accum1)) & mask; ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ ++ accum0 += c[3]; ++ accum1 += c[7]; ++ c[3] = ((uint64_t)(accum0)) & mask; ++ c[7] = ((uint64_t)(accum1)) & mask; ++ ++ /* we could almost stop here, but it wouldn't be stable, so... 
*/ ++ ++ accum0 >>= 56; ++ accum1 >>= 56; ++ c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); ++ c[0] += ((uint64_t)(accum1)); ++} +Index: openssl-1.1.1l/Configure +=================================================================== +--- openssl-1.1.1l.orig/Configure ++++ openssl-1.1.1l/Configure +@@ -1476,6 +1476,20 @@ if (!$disabled{asm} && !$predefined_C{__ + } + } + ++# Check if __SIZEOF_INT128__ is defined by compiler ++$config{use_int128} = 0; ++{ ++ my $cc = $config{CROSS_COMPILE}.$config{CC}; ++ open(PIPE, "$cc -E -dM - &1 |"); ++ while() { ++ if (m/__SIZEOF_INT128__/) { ++ $config{use_int128} = 1; ++ last; ++ } ++ } ++ close(PIPE); ++} ++ + # Deal with bn_ops ################################################### + + $config{bn_ll} =0; diff --git a/openssl-1_1.changes b/openssl-1_1.changes index 8cf43e1..2a1622f 100644 --- a/openssl-1_1.changes +++ b/openssl-1_1.changes @@ -1,3 +1,19 @@ +------------------------------------------------------------------- +Fri Jan 28 17:33:51 UTC 2022 - Pedro Monreal + +- Backport cryptographic improvements from OpenSSL 3 [jsc#SLE-19766] + * Optimize RSA on armv8: openssl-1_1-Optimize-RSA-armv8.patch + * Optimize AES-XTS mode for aarch64: + openssl-1_1-Optimize-AES-XTS-aarch64.patch + * Optimize AES-GCM for uarchs with unroll and new instructions: + openssl-1_1-Optimize-AES-GCM-uarchs.patch + +------------------------------------------------------------------- +Fri Jan 28 17:33:23 UTC 2022 - Pedro Monreal + +- POWER10 performance enhancements for cryptography [jsc#SLE-19409] + * openssl-1_1-Optimize-ppc64.patch + ------------------------------------------------------------------- Tue Dec 28 12:34:04 UTC 2021 - Pedro Monreal diff --git a/openssl-1_1.spec b/openssl-1_1.spec index 63e2fad..ce2d744 100644 --- a/openssl-1_1.spec +++ b/openssl-1_1.spec @@ -1,7 +1,7 @@ # # spec file for package openssl-1_1 # -# Copyright (c) 2021 SUSE LLC +# Copyright (c) 2022 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -112,6 +112,12 @@ Patch54: openssl-1_1-use-seclevel2-in-tests.patch Patch55: openssl-1_1-disable-test_srp-sslapi.patch Patch56: openssl-add_rfc3526_rfc7919.patch Patch57: openssl-1_1-use-include-directive.patch +#PATCH-FIX-UPSTREAM jsc#SLE-19409 POWER10 performance enhancements for cryptography +Patch69: openssl-1_1-Optimize-ppc64.patch +#PATCH-FIX-UPSTREAM jsc#SLE-19766 Backport Arm improvements from OpenSSL 3 +Patch70: openssl-1_1-Optimize-RSA-armv8.patch +Patch71: openssl-1_1-Optimize-AES-XTS-aarch64.patch +Patch72: openssl-1_1-Optimize-AES-GCM-uarchs.patch BuildRequires: pkgconfig %if 0%{?sle_version} >= 150400 || 0%{?suse_version} >= 1550 Requires: crypto-policies @@ -245,6 +251,7 @@ make all %{?_smp_mflags} export MALLOC_CHECK_=3 export MALLOC_PERTURB_=$(($RANDOM % 255 + 1)) #export HARNESS_VERBOSE=1 +#export OPENSSL_FORCE_FIPS_MODE=1 LD_LIBRARY_PATH=`pwd` make test -j1 # show ciphers @@ -330,7 +337,7 @@ cp %{SOURCE5} . # invalidates a HMAC that may have been created earlier. # solution: create the hashes _after_ the macro runs. # -# this shows up earlier because otherwise the %%expand of +# this shows up earlier because otherwise the expand of # the macro is too late. # remark: This is the same as running # openssl dgst -sha256 -hmac 'ppaksykemnsecgtsttplmamstKMEs'
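
Editorial note, not part of the patch content above: the curve448 arch_64 code (f_impl64.c / arch_64/f_impl.c) and the new Configure probe both hinge on the compiler providing a 128-bit integer type — field elements are held as radix-2^56 limbs, products are accumulated in 128-bit variables, and carries are propagated by masking with (1ULL << 56) - 1 and shifting right by 56. The standalone C sketch below illustrates only that widening-multiply-and-carry step under the same __SIZEOF_INT128__ guard; the helper name wide_mul and the sample operands are invented for illustration and are not OpenSSL identifiers.

/*
 * Minimal sketch of the radix-2^56 limb arithmetic used by the 64-bit
 * curve448 field code: multiply two 56-bit limbs into a 128-bit
 * accumulator, keep the low 56 bits in place, carry the rest.
 * Guarded the same way Configure's use_int128 probe works.
 */
#include <stdint.h>
#include <stdio.h>

#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128;

/* 64x64 -> 128 bit widening multiply (the role widemul() plays above) */
static uint128 wide_mul(uint64_t a, uint64_t b)
{
    return (uint128)a * b;
}

int main(void)
{
    const uint64_t mask = (1ULL << 56) - 1;  /* radix-2^56 limb mask      */
    uint64_t a = 0x00ffffffffffffffULL;      /* a full 56-bit limb        */
    uint64_t b = 0x00123456789abcdeULL;      /* arbitrary second limb     */
    uint128 acc = wide_mul(a, b);            /* product is at most 112 bits */

    uint64_t low  = (uint64_t)acc & mask;    /* stays in the current limb */
    uint64_t high = (uint64_t)(acc >> 56);   /* carry into the next limb  */

    printf("low  = %016llx\n", (unsigned long long)low);
    printf("high = %016llx\n", (unsigned long long)high);
    return 0;
}
#else
/* No 128-bit integers: the 32-bit f_impl32.c path would be built instead. */
int main(void) { return 0; }
#endif

Building it with "gcc -O2 sketch.c" on any 64-bit gcc/clang target exercises the 128-bit branch, which is exactly the condition the new use_int128 check in Configure is detecting.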