forked from pool/openssl-1_1
Pedro Monreal Gonzalez
8903999f6a
- Backport cryptographic improvements from OpenSSL 3 [jsc#SLE-19766] * Optimize RSA on armv8: openssl-1_1-Optimize-RSA-armv8.patch * Optimize AES-XTS mode for aarch64: openssl-1_1-Optimize-AES-XTS-aarch64.patch * Optimize AES-GCM for uarchs with unroll and new instructions: openssl-1_1-Optimize-AES-GCM-uarchs.patch - POWER10 performance enhancements for cryptography [jsc#SLE-19409] * openssl-1_1-Optimize-ppc64.patch OBS-URL: https://build.opensuse.org/request/show/949750 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=102
7710 lines
335 KiB
Diff
7710 lines
335 KiB
Diff
From 954f45ba4c504570206ff5bed811e512cf92dc8e Mon Sep 17 00:00:00 2001
|
|
From: XiaokangQian <xiaokang.qian@arm.com>
|
|
Date: Wed, 9 Jun 2021 06:35:46 +0000
|
|
Subject: [PATCH] Optimize AES-GCM for uarchs with unroll and new instructions
|
|
|
|
Increase the block numbers to 8 for every iteration. Increase the hash
|
|
table capacity. Make use of EOR3 instruction to improve the performance.
|
|
|
|
This can improve performance 25-40% on out-of-order microarchitectures
|
|
with a large number of fast execution units, such as Neoverse V1. We also
|
|
see 20-30% performance improvements on other architectures such as the M1.
|
|
|
|
Assembly code reviewed by Tom Cosgrove (ARM).
|
|
|
|
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
|
|
Reviewed-by: Paul Dale <pauli@openssl.org>
|
|
(Merged from https://github.com/openssl/openssl/pull/15916)
|
|
---
|
|
crypto/arm64cpuid.pl | 8 +
|
|
crypto/arm_arch.h | 6 +
|
|
crypto/armcap.c | 24 +-
|
|
crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl | 7369 +++++++++++++++++
|
|
crypto/modes/asm/ghashv8-armx.pl | 105 +-
|
|
crypto/modes/build.info | 4 +-
|
|
include/crypto/aes_platform.h | 12 +
|
|
.../ciphers/cipher_aes_gcm_hw_armv8.inc | 36 +-
|
|
8 files changed, 7546 insertions(+), 18 deletions(-)
|
|
create mode 100644 crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
|
|
|
|
Index: openssl-1.1.1m/crypto/arm64cpuid.pl
|
|
===================================================================
|
|
--- openssl-1.1.1m.orig/crypto/arm64cpuid.pl
|
|
+++ openssl-1.1.1m/crypto/arm64cpuid.pl
|
|
@@ -78,6 +78,14 @@ _armv8_sha512_probe:
|
|
ret
|
|
.size _armv8_sha512_probe,.-_armv8_sha512_probe
|
|
|
|
+.globl _armv8_eor3_probe
|
|
+.type _armv8_eor3_probe,%function
|
|
+_armv8_eor3_probe:
|
|
+ AARCH64_VALID_CALL_TARGET
|
|
+ .long 0xce010800 // eor3 v0.16b, v0.16b, v1.16b, v2.16b
|
|
+ ret
|
|
+.size _armv8_eor3_probe,.-_armv8_eor3_probe
|
|
+
|
|
.globl _armv8_cpuid_probe
|
|
.type _armv8_cpuid_probe,%function
|
|
_armv8_cpuid_probe:
|
|
Index: openssl-1.1.1m/crypto/arm_arch.h
|
|
===================================================================
|
|
--- openssl-1.1.1m.orig/crypto/arm_arch.h
|
|
+++ openssl-1.1.1m/crypto/arm_arch.h
|
|
@@ -83,6 +83,9 @@ extern unsigned int OPENSSL_arm_midr;
|
|
# define ARMV8_SHA512 (1<<6)
|
|
# define ARMV8_CPUID (1<<7)
|
|
|
|
+# define ARMV8_SHA3 (1<<11)
|
|
+# define ARMV8_UNROLL8_EOR3 (1<<12)
|
|
+
|
|
/*
|
|
* MIDR_EL1 system register
|
|
*
|
|
@@ -97,6 +100,7 @@ extern unsigned int OPENSSL_arm_midr;
|
|
|
|
# define ARM_CPU_PART_CORTEX_A72 0xD08
|
|
# define ARM_CPU_PART_N1 0xD0C
|
|
+# define ARM_CPU_PART_V1 0xD40
|
|
|
|
# define MIDR_PARTNUM_SHIFT 4
|
|
# define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT)
|
|
@@ -125,4 +129,29 @@ extern unsigned int OPENSSL_arm_midr;
|
|
|
|
# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \
|
|
(((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum))
|
|
+
|
|
+# define IS_CPU_SUPPORT_UNROLL8_EOR3() \
|
|
+ (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3)
|
|
+
|
|
+#if defined(__ASSEMBLER__)
|
|
+
|
|
+ /*
|
|
+ * Support macros for
|
|
+ * - Armv8.3-A Pointer Authentication and
|
|
+ * - Armv8.5-A Branch Target Identification
|
|
+ * features which require emitting a .note.gnu.property section with the
|
|
+ * appropriate architecture-dependent feature bits set.
|
|
+ * Read more: "ELF for the Arm® 64-bit Architecture"
|
|
+ */
|
|
+
|
|
+# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
|
|
+# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */
|
|
+# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */
|
|
+# else
|
|
+# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */
|
|
+# define AARCH64_VALID_CALL_TARGET
|
|
+# endif
|
|
+
|
|
+# endif /* defined __ASSEMBLER__ */
|
|
+
|
|
#endif
|
|
Index: openssl-1.1.1m/crypto/armcap.c
|
|
===================================================================
|
|
--- openssl-1.1.1m.orig/crypto/armcap.c
|
|
+++ openssl-1.1.1m/crypto/armcap.c
|
|
@@ -13,6 +13,9 @@
|
|
#include <setjmp.h>
|
|
#include <signal.h>
|
|
#include <openssl/crypto.h>
|
|
+#ifdef __APPLE__
|
|
+#include <sys/sysctl.h>
|
|
+#endif
|
|
#include "internal/cryptlib.h"
|
|
|
|
#include "arm_arch.h"
|
|
@@ -134,6 +137,7 @@ static unsigned long getauxval(unsigned
|
|
# define HWCAP_CE_SHA1 (1 << 5)
|
|
# define HWCAP_CE_SHA256 (1 << 6)
|
|
# define HWCAP_CPUID (1 << 11)
|
|
+# define HWCAP_SHA3 (1 << 17)
|
|
# define HWCAP_CE_SHA512 (1 << 21)
|
|
# endif
|
|
|
|
@@ -148,12 +152,15 @@ void OPENSSL_cpuid_setup(void)
|
|
return;
|
|
trigger = 1;
|
|
|
|
+ OPENSSL_armcap_P = 0;
|
|
+
|
|
if ((e = getenv("OPENSSL_armcap"))) {
|
|
OPENSSL_armcap_P = (unsigned int)strtoul(e, NULL, 0);
|
|
return;
|
|
}
|
|
|
|
-# if defined(__APPLE__) && !defined(__aarch64__)
|
|
+# if defined(__APPLE__)
|
|
+# if !defined(__aarch64__)
|
|
/*
|
|
* Capability probing by catching SIGILL appears to be problematic
|
|
* on iOS. But since Apple universe is "monocultural", it's actually
|
|
@@ -169,9 +176,25 @@ void OPENSSL_cpuid_setup(void)
|
|
* Unified code works because it never triggers SIGILL on Apple
|
|
* devices...
|
|
*/
|
|
-# endif
|
|
+# else
|
|
+ {
|
|
+ unsigned int feature;
|
|
+ size_t len = sizeof(feature);
|
|
+ char uarch[64];
|
|
|
|
- OPENSSL_armcap_P = 0;
|
|
+ if (sysctlbyname("hw.optional.armv8_2_sha512", &feature, &len, NULL, 0) == 0 && feature == 1)
|
|
+ OPENSSL_armcap_P |= ARMV8_SHA512;
|
|
+ feature = 0;
|
|
+ if (sysctlbyname("hw.optional.armv8_2_sha3", &feature, &len, NULL, 0) == 0 && feature == 1) {
|
|
+ OPENSSL_armcap_P |= ARMV8_SHA3;
|
|
+ len = sizeof(uarch);
|
|
+ if ((sysctlbyname("machdep.cpu.brand_string", uarch, &len, NULL, 0) == 0) &&
|
|
+ (strncmp(uarch, "Apple M1", 8) == 0))
|
|
+ OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3;
|
|
+ }
|
|
+ }
|
|
+# endif
|
|
+# endif
|
|
|
|
# ifdef OSSL_IMPLEMENT_GETAUXVAL
|
|
if (getauxval(HWCAP) & HWCAP_NEON) {
|
|
@@ -197,6 +220,9 @@ void OPENSSL_cpuid_setup(void)
|
|
|
|
if (hwcap & HWCAP_CPUID)
|
|
OPENSSL_armcap_P |= ARMV8_CPUID;
|
|
+
|
|
+ if (hwcap & HWCAP_SHA3)
|
|
+ OPENSSL_armcap_P |= ARMV8_SHA3;
|
|
# endif
|
|
}
|
|
# endif
|
|
@@ -240,6 +266,10 @@ void OPENSSL_cpuid_setup(void)
|
|
_armv8_sha512_probe();
|
|
OPENSSL_armcap_P |= ARMV8_SHA512;
|
|
}
|
|
+ if (sigsetjmp(ill_jmp, 1) == 0) {
|
|
+ _armv8_eor3_probe();
|
|
+ OPENSSL_armcap_P |= ARMV8_SHA3;
|
|
+ }
|
|
# endif
|
|
}
|
|
# endif
|
|
@@ -262,6 +292,9 @@ void OPENSSL_cpuid_setup(void)
|
|
(OPENSSL_armcap_P & ARMV7_NEON)) {
|
|
OPENSSL_armv8_rsa_neonized = 1;
|
|
}
|
|
+ if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1)) &&
|
|
+ (OPENSSL_armcap_P & ARMV8_SHA3))
|
|
+ OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3;
|
|
# endif
|
|
}
|
|
#endif
|
|
Index: openssl-1.1.1m/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
|
|
===================================================================
|
|
--- /dev/null
|
|
+++ openssl-1.1.1m/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
|
|
@@ -0,0 +1,7369 @@
|
|
+#! /usr/bin/env perl
|
|
+# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
|
|
+#
|
|
+# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
+# this file except in compliance with the License. You can obtain a copy
|
|
+# in the file LICENSE in the source distribution or at
|
|
+# https://www.openssl.org/source/license.html
|
|
+
|
|
+#
|
|
+#========================================================================
|
|
+# Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
|
|
+# derived from https://github.com/ARM-software/AArch64cryptolib, original
|
|
+# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
|
|
+# licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
|
|
+# obtain it.
|
|
+#========================================================================
|
|
+#
|
|
+# Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
|
|
+# Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
|
|
+# intermediate hashes from the 8 blocks.
|
|
+#
|
|
+# ____________________________________________________
|
|
+# | |
|
|
+# | PRE |
|
|
+# |____________________________________________________|
|
|
+# | | | |
|
|
+# | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
|
|
+# |________________|________________|__________________|
|
|
+# | | | |
|
|
+# | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
|
|
+# |________________|________________|__________________|
|
|
+# | | | |
|
|
+# | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
|
|
+# |________________|________________|__________________|
|
|
+# | | | |
|
|
+# | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
|
|
+# |________________|________________|__________________|
|
|
+# | | | |
|
|
+# | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
|
|
+# |________________|________________|__________________|
|
|
+# | | | |
|
|
+# | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
|
|
+# |________________|________________|__________________|
|
|
+# | | | |
|
|
+# | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
|
|
+# |________________|________________|__________________|
|
|
+# | | | |
|
|
+# | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
|
|
+# |________________|____(mostly)____|__________________|
|
|
+# | |
|
|
+# | MODULO |
|
|
+# |____________________________________________________|
|
|
+#
|
|
+# PRE:
|
|
+# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
|
|
+# EXT low_acc, low_acc, low_acc, #8
|
|
+# EOR res_curr (8k+0), res_curr (4k+0), low_acc
|
|
+#
|
|
+# CTR block:
|
|
+# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
|
|
+# REV ctr32, rev_ctr32
|
|
+# ORR ctr64, constctr96_top32, ctr32, LSL #32
|
|
+# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
|
|
+# INS ctr_next.d[1], ctr64X
|
|
+# ADD rev_ctr32, #1
|
|
+#
|
|
+# AES block:
|
|
+# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take 256 bytes key below for example.
|
|
+# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
|
|
+# Given we are very constrained in our ASIMD registers this is quite important
|
|
+#
|
|
+# Encrypt:
|
|
+# LDR input_low, [ input_ptr ], #8
|
|
+# LDR input_high, [ input_ptr ], #8
|
|
+# EOR input_low, k14_low
|
|
+# EOR input_high, k14_high
|
|
+# INS res_curr.d[0], input_low
|
|
+# INS res_curr.d[1], input_high
|
|
+# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k13
|
|
+# EOR res_curr, res_curr, ctr_curr
|
|
+# ST1 { res_curr.16b }, [ output_ptr ], #16
|
|
+#
|
|
+# Decrypt:
|
|
+# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
|
|
+# AESE ctr_curr, k13
|
|
+# LDR res_curr, [ input_ptr ], #16
|
|
+# EOR res_curr, res_curr, ctr_curr
|
|
+# MOV output_low, res_curr.d[0]
|
|
+# MOV output_high, res_curr.d[1]
|
|
+# EOR output_low, k14_low
|
|
+# EOR output_high, k14_high
|
|
+# STP output_low, output_high, [ output_ptr ], #16
|
|
+
|
|
+# GHASH block X:
|
|
+# Do 128b karatsuba polynomial multiplication on block
|
|
+# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
|
|
+#
|
|
+# multiplication:
|
|
+# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
|
|
+#
|
|
+# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
|
|
+# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
|
|
+#
|
|
+# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
|
|
+# multiplying with "twisted" powers of H
|
|
+#
|
|
+# Note: We can PMULL directly into the acc_x in first GHASH of the loop
|
|
+# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
|
|
+# path latency dominates the performance
|
|
+#
|
|
+# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
|
|
+# than indicated here
|
|
+# REV64 res_curr, res_curr
|
|
+# INS t_m.d[0], res_curr.d[1]
|
|
+# EOR t_m.8B, t_m.8B, res_curr.8B
|
|
+# PMULL2 t_h, res_curr, HX
|
|
+# PMULL t_l, res_curr, HX
|
|
+# PMULL t_m, t_m, HX_k
|
|
+# EOR acc_h, acc_h, t_h
|
|
+# EOR acc_l, acc_l, t_l
|
|
+# EOR acc_m, acc_m, t_m
|
|
+#
|
|
+# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
|
|
+# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
|
|
+# with a reversed constant
|
|
+# EOR3 acc_m, acc_m, acc_l, acc_h // Finish off karatsuba processing
|
|
+# PMULL t_mod, acc_h, mod_constant
|
|
+# EXT acc_h, acc_h, acc_h, #8
|
|
+# EOR3 acc_m, acc_m, t_mod, acc_h
|
|
+# PMULL acc_h, acc_m, mod_constant
|
|
+# EXT acc_m, acc_m, acc_m, #8
|
|
+# EOR3 acc_l, acc_l, acc_m, acc_h
|
|
+
|
|
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
|
|
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
|
|
+
|
|
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
|
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
|
|
+die "can't locate arm-xlate.pl";
|
|
+
|
|
+die "only for 64 bit" if $flavour !~ /64/;
|
|
+
|
|
+open OUT,"| \"$^X\" $xlate $flavour $output";
|
|
+*STDOUT=*OUT;
|
|
+
|
|
+$code=<<___;
|
|
+#include "arm_arch.h"
|
|
+
|
|
+#if __ARM_MAX_ARCH__>=8
|
|
+___
|
|
+$code.=".arch armv8.2-a+crypto\n.arch_extension sha3\n.text\n";
|
|
+
|
|
+$input_ptr="x0"; #argument block
|
|
+$bit_length="x1";
|
|
+$output_ptr="x2";
|
|
+$current_tag="x3";
|
|
+$counter="x16";
|
|
+$constant_temp="x15";
|
|
+$modulo_constant="x10";
|
|
+$cc="x8";
|
|
+{
|
|
+my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
|
|
+my ($temp2_x,$temp3_x)=map("x$_",(13..14));
|
|
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
|
|
+my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
|
|
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
|
|
+my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
|
|
+my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
|
|
+
|
|
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
|
|
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
|
|
+my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
|
|
+
|
|
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
|
|
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
|
|
+
|
|
+my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
|
|
+my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
|
|
+my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
|
|
+my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
|
|
+
|
|
+my $t0="v16";
|
|
+my $t0d="d16";
|
|
+
|
|
+my $t1="v29";
|
|
+my $t2=$res1;
|
|
+my $t3=$t1;
|
|
+
|
|
+my $t4=$res0;
|
|
+my $t5=$res2;
|
|
+my $t6=$t0;
|
|
+
|
|
+my $t7=$res3;
|
|
+my $t8=$res4;
|
|
+my $t9=$res5;
|
|
+
|
|
+my $t10=$res6;
|
|
+my $t11="v21";
|
|
+my $t12=$t1;
|
|
+
|
|
+my $rtmp_ctr="v30";
|
|
+my $rtmp_ctrq="q30";
|
|
+my $rctr_inc="v31";
|
|
+my $rctr_incd="d31";
|
|
+
|
|
+my $mod_constantd=$t0d;
|
|
+my $mod_constant=$t0;
|
|
+
|
|
+my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
|
|
+my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
|
|
+my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
|
|
+my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
|
|
+my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
|
|
+my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
|
|
+my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
|
|
+my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
|
|
+my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
|
|
+my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
|
|
+my $rk2q1="v28.1q";
|
|
+my $rk3q1="v26.1q";
|
|
+my $rk4v="v27";
|
|
+
|
|
+
|
|
+#########################################################################################
|
|
+# size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,
|
|
+# size_t len,
|
|
+# unsigned char *out,
|
|
+# const void *key,
|
|
+# unsigned char ivec[16],
|
|
+# u64 *Xi);
|
|
+#
|
|
+$code.=<<___;
|
|
+.global unroll8_eor3_aes_gcm_enc_128_kernel
|
|
+.type unroll8_eor3_aes_gcm_enc_128_kernel,%function
|
|
+.align 4
|
|
+unroll8_eor3_aes_gcm_enc_128_kernel:
|
|
+ AARCH64_VALID_CALL_TARGET
|
|
+ cbz x1, .L128_enc_ret
|
|
+ stp d8, d9, [sp, #-80]!
|
|
+ mov $counter, x4
|
|
+ mov $cc, x5
|
|
+ stp d10, d11, [sp, #16]
|
|
+ stp d12, d13, [sp, #32]
|
|
+ stp d14, d15, [sp, #48]
|
|
+ mov x5, #0xc200000000000000
|
|
+ stp x5, xzr, [sp, #64]
|
|
+ add $modulo_constant, sp, #64
|
|
+
|
|
+ mov $constant_temp, #0x100000000 @ set up counter increment
|
|
+ movi $rctr_inc.16b, #0x0
|
|
+ mov $rctr_inc.d[1], $constant_temp
|
|
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
+ ld1 { $ctr0b}, [$counter] @ CTR block 0
|
|
+
|
|
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
+
|
|
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
|
|
+
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
|
|
+
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
+
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
+
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
+
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
+
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
|
|
+
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
|
|
+
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
+
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
|
|
+
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
|
|
+
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
|
|
+
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
+
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
+
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
+
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
|
|
+
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
+
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
|
|
+
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
|
|
+
|
|
+ ld1 { $acc_lb}, [$current_tag]
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
|
|
+
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
|
|
+
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
+
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+ ldr $rk10q, [$cc, #160] @ load rk10
|
|
+
|
|
+ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
|
|
+
|
|
+ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
|
|
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
+ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
|
|
+
|
|
+ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
|
|
+ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
|
|
+ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
|
|
+
|
|
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+ b.ge .L128_enc_tail @ handle tail
|
|
+
|
|
+ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
|
|
+
|
|
+ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
|
|
+
|
|
+ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
|
|
+
|
|
+ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+
|
|
+ eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 0 - result
|
|
+ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 1 - result
|
|
+ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
|
|
+ eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 5 - result
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
|
|
+
|
|
+ eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 2 - result
|
|
+ eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 6 - result
|
|
+ eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 4 - result
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
|
|
+
|
|
+ eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 3 - result
|
|
+ eor3 $res7b, $ctr_t7b, $ctr7b,$rk10 @ AES block 7 - result
|
|
+ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
|
|
+ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
|
|
+
|
|
+ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
|
|
+ b.ge .L128_enc_prepretail @ do prepretail
|
|
+
|
|
+.L128_enc_main_loop: @ main loop start
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
|
|
+
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b,$t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
|
|
+
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
|
|
+
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
|
|
+
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+
|
|
+ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+
|
|
+ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
|
|
+
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ ldr $rk10q, [$cc, #160] @ load rk10
|
|
+
|
|
+ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+
|
|
+ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
|
|
+ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
|
|
+ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
|
|
+
|
|
+ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
|
|
+ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
|
|
+
|
|
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
+ eor3 $res4b, $ctr_t4b, $ctr4b, $rk10 @ AES block 8k+12 - result
|
|
+ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
|
|
+
|
|
+ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
|
|
+ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
|
|
+
|
|
+ eor3 $res2b, $ctr_t2b, $ctr2b, $rk10 @ AES block 8k+10 - result
|
|
+
|
|
+ mov $ctr2.16b, $h3.16b @ CTR block 8k+18
|
|
+ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
|
|
+
|
|
+ eor3 $res7b, $ctr_t7b, $ctr7b, $rk10 @ AES block 8k+15 - result
|
|
+ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr1b, $rk10 @ AES block 8k+9 - result
|
|
+ eor3 $res3b, $ctr_t3b, $ctr3b, $rk10 @ AES block 8k+11 - result
|
|
+ mov $ctr3.16b, $h4.16b @ CTR block 8k+19
|
|
+
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+ eor3 $res5b, $ctr_t5b, $ctr5b, $rk10 @ AES block 8k+13 - result
|
|
+ mov $ctr1.16b, $h2.16b @ CTR block 8k+17
|
|
+
|
|
+ eor3 $res0b, $ctr_t0b, $ctr0b, $rk10 @ AES block 8k+8 - result
|
|
+ mov $ctr0.16b, $h1.16b @ CTR block 8k+16
|
|
+ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
|
|
+
|
|
+ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
|
|
+ eor3 $res6b, $ctr_t6b, $ctr6b, $rk10 @ AES block 8k+14 - result
|
|
+
|
|
+ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+
|
|
+ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
|
|
+ b.lt .L128_enc_main_loop
|
|
+
|
|
+.L128_enc_prepretail: @ PREPRETAIL
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
|
|
+
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+
|
|
+ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $acc_mb @ MODULO - fold into low
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+
|
|
+ ldr $rk10q, [$cc, #160] @ load rk10
|
|
+ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
|
|
+ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
|
|
+
|
|
+ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
|
|
+ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
|
|
+
|
|
+ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
|
|
+ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
|
|
+
|
|
+ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
|
|
+ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
|
|
+.L128_enc_tail: @ TAIL
|
|
+
|
|
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
+ ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
|
|
+
|
|
+ mov $t1.16b, $rk10
|
|
+ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+
|
|
+ eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
|
|
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
+ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+
|
|
+ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ cmp $main_end_input_ptr, #112
|
|
+ b.gt .L128_enc_blocks_more_than_7
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+ movi $acc_h.8b, #0
|
|
+
|
|
+ cmp $main_end_input_ptr, #96
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr5b, $ctr4b
|
|
+
|
|
+ mov $ctr4b, $ctr3b
|
|
+ mov $ctr3b, $ctr2b
|
|
+ mov $ctr2b, $ctr1b
|
|
+
|
|
+ movi $acc_l.8b, #0
|
|
+ movi $acc_m.8b, #0
|
|
+ b.gt .L128_enc_blocks_more_than_6
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ cmp $main_end_input_ptr, #80
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+
|
|
+ mov $ctr4b, $ctr3b
|
|
+ mov $ctr3b, $ctr1b
|
|
+ b.gt .L128_enc_blocks_more_than_5
|
|
+
|
|
+ cmp $main_end_input_ptr, #64
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+
|
|
+ mov $ctr5b, $ctr4b
|
|
+ mov $ctr4b, $ctr1b
|
|
+ b.gt .L128_enc_blocks_more_than_4
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr6b, $ctr5b
|
|
+
|
|
+ mov $ctr5b, $ctr1b
|
|
+ cmp $main_end_input_ptr, #48
|
|
+ b.gt .L128_enc_blocks_more_than_3
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr1b
|
|
+
|
|
+ cmp $main_end_input_ptr, #32
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ b.gt .L128_enc_blocks_more_than_2
|
|
+
|
|
+ cmp $main_end_input_ptr, #16
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr7b, $ctr1b
|
|
+ b.gt .L128_enc_blocks_more_than_1
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ b .L128_enc_blocks_less_than_1
|
|
+.L128_enc_blocks_more_than_7: @ blocks left > 7
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-7 block
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
|
|
+
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
|
|
+
|
|
+ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
|
|
+
|
|
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
|
|
+.L128_enc_blocks_more_than_6: @ blocks left > 6
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-6 block
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
|
|
+ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
|
|
+.L128_enc_blocks_more_than_5: @ blocks left > 5
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-5 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
|
|
+ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
|
|
+ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
|
|
+.L128_enc_blocks_more_than_4: @ blocks left > 4
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-4 block
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
|
|
+.L128_enc_blocks_more_than_3: @ blocks left > 3
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
|
|
+
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-3 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
|
|
+.L128_enc_blocks_more_than_2: @ blocks left > 2
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-2 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
+ eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
+.L128_enc_blocks_more_than_1: @ blocks left > 1
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
|
|
+
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ rev64 $res0b, $res1b @ GHASH final-1 block
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
|
|
+ eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
+.L128_enc_blocks_less_than_1: @ blocks left <= 1
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
|
|
+ str $rtmp_ctrq, [$counter] @ store the updated counter
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
+
|
|
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
+
|
|
+ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
|
|
+ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
|
|
+ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
|
|
+ cmp $bit_length, #64
|
|
+
|
|
+ csel $temp2_x, $temp1_x, $temp0_x, lt
|
|
+ csel $temp3_x, $temp0_x, xzr, lt
|
|
+
|
|
+ mov $ctr0.d[1], $temp3_x
|
|
+ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
|
|
+
|
|
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final block
|
|
+
|
|
+ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
|
|
+ st1 { $res1b}, [$output_ptr] @ store all 16B
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
|
|
+
|
|
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+
|
|
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
+
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+ st1 { $acc_l.16b }, [$current_tag]
|
|
+ lsr x0, $bit_length, #3 @ return sizes
|
|
+
|
|
+ ldp d10, d11, [sp, #16]
|
|
+ ldp d12, d13, [sp, #32]
|
|
+ ldp d14, d15, [sp, #48]
|
|
+ ldp d8, d9, [sp], #80
|
|
+ ret
|
|
+
|
|
+.L128_enc_ret:
|
|
+ mov w0, #0x0
|
|
+ ret
|
|
+.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
|
|
+___
|
|
+
|
|
+#########################################################################################
|
|
+# size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in,
|
|
+# size_t len,
|
|
+# unsigned char *out,
|
|
+# u64 *Xi,
|
|
+# unsigned char ivec[16],
|
|
+# const void *key);
|
|
+#
|
|
+$code.=<<___;
|
|
+.global unroll8_eor3_aes_gcm_dec_128_kernel
|
|
+.type unroll8_eor3_aes_gcm_dec_128_kernel,%function
|
|
+.align 4
|
|
+unroll8_eor3_aes_gcm_dec_128_kernel:
|
|
+ AARCH64_VALID_CALL_TARGET
|
|
+ cbz x1, .L128_dec_ret
|
|
+ stp d8, d9, [sp, #-80]!
|
|
+ mov $counter, x4
|
|
+ mov $cc, x5
|
|
+ stp d10, d11, [sp, #16]
|
|
+ stp d12, d13, [sp, #32]
|
|
+ stp d14, d15, [sp, #48]
|
|
+ mov x5, #0xc200000000000000
|
|
+ stp x5, xzr, [sp, #64]
|
|
+ add $modulo_constant, sp, #64
|
|
+
|
|
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
+ ld1 { $ctr0b}, [$counter] @ CTR block 0
|
|
+
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
+
|
|
+ mov $constant_temp, #0x100000000 @ set up counter increment
|
|
+ movi $rctr_inc.16b, #0x0
|
|
+ mov $rctr_inc.d[1], $constant_temp
|
|
+ ld1 { $acc_lb}, [$current_tag]
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
|
|
+
|
|
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
|
|
+
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
|
|
+
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
|
|
+
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
|
|
+
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
+
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
|
|
+
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
|
|
+
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
+
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
|
|
+
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
+
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
|
|
+
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
+
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
|
|
+
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
+
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
+
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
+
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
+
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
|
|
+
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
|
|
+
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
+
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
+
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
|
|
+
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
+
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
+
|
|
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
|
|
+
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
+
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
|
|
+
|
|
+ aese $ctr0b, $rk9 @ AES block 0 - round 9
|
|
+ aese $ctr1b, $rk9 @ AES block 1 - round 9
|
|
+ aese $ctr6b, $rk9 @ AES block 6 - round 9
|
|
+
|
|
+ ldr $rk10q, [$cc, #160] @ load rk10
|
|
+ aese $ctr4b, $rk9 @ AES block 4 - round 9
|
|
+ aese $ctr3b, $rk9 @ AES block 3 - round 9
|
|
+
|
|
+ aese $ctr2b, $rk9 @ AES block 2 - round 9
|
|
+ aese $ctr5b, $rk9 @ AES block 5 - round 9
|
|
+ aese $ctr7b, $rk9 @ AES block 7 - round 9
|
|
+
|
|
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+ b.ge .L128_dec_tail @ handle tail
|
|
+
|
|
+ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
|
|
+
|
|
+ eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 0 - result
|
|
+ eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 1 - result
|
|
+ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
|
|
+
|
|
+ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
|
|
+ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
|
|
+
|
|
+ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
|
|
+ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
|
|
+
|
|
+ eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 3 - result
|
|
+ eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 2 - result
|
|
+ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
|
|
+
|
|
+ eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 6 - result
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
|
|
+
|
|
+ eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 4 - result
|
|
+ eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 5 - result
|
|
+ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
|
|
+
|
|
+ eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 7 - result
|
|
+ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
|
|
+
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
|
|
+ b.ge .L128_dec_prepretail @ do prepretail
|
|
+
|
|
+.L128_dec_main_loop: @ main loop start
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5
|
|
+
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+
|
|
+ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
|
|
+
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
|
|
+
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
|
|
+
|
|
+ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
|
|
+
|
|
+ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
|
|
+
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+
|
|
+ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
|
|
+ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
|
|
+ ldr $rk10q, [$cc, #160] @ load rk10
|
|
+
|
|
+ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
|
|
+
|
|
+ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
|
|
+ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
|
|
+
|
|
+ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
|
|
+ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
|
|
+ eor3 $ctr1b, $res1b, $ctr1b, $rk10 @ AES block 8k+9 - result
|
|
+
|
|
+ eor3 $ctr0b, $res0b, $ctr0b, $rk10 @ AES block 8k+8 - result
|
|
+ eor3 $ctr7b, $res7b, $ctr7b, $rk10 @ AES block 8k+15 - result
|
|
+ eor3 $ctr6b, $res6b, $ctr6b, $rk10 @ AES block 8k+14 - result
|
|
+
|
|
+ eor3 $ctr2b, $res2b, $ctr2b, $rk10 @ AES block 8k+10 - result
|
|
+ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
|
|
+ mov $ctr1.16b, $h2.16b @ CTR block 8k+17
|
|
+
|
|
+ eor3 $ctr4b, $res4b, $ctr4b, $rk10 @ AES block 8k+12 - result
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+ mov $ctr0.16b, $h1.16b @ CTR block 8k+16
|
|
+
|
|
+ eor3 $ctr3b, $res3b, $ctr3b, $rk10 @ AES block 8k+11 - result
|
|
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
+ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
|
|
+
|
|
+ eor3 $ctr5b, $res5b, $ctr5b, $rk10 @ AES block 8k+13 - result
|
|
+ mov $ctr2.16b, $h3.16b @ CTR block 8k+18
|
|
+
|
|
+ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
|
|
+
|
|
+ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
|
|
+ mov $ctr3.16b, $h4.16b @ CTR block 8k+19
|
|
+ b.lt .L128_dec_main_loop
|
|
+
|
|
+.L128_dec_prepretail: @ PREPRETAIL
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5
|
|
+
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4
|
|
+
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6
|
|
+
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h1l | h1h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+ ldr $rk10q, [$cc, #160] @ load rk10
|
|
+
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+
|
|
+ aese $ctr6b, $rk9 @ AES block 8k+14 - round 9
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+ aese $ctr2b, $rk9 @ AES block 8k+10 - round 9
|
|
+
|
|
+ aese $ctr3b, $rk9 @ AES block 8k+11 - round 9
|
|
+ aese $ctr5b, $rk9 @ AES block 8k+13 - round 9
|
|
+ aese $ctr0b, $rk9 @ AES block 8k+8 - round 9
|
|
+
|
|
+ aese $ctr4b, $rk9 @ AES block 8k+12 - round 9
|
|
+ aese $ctr1b, $rk9 @ AES block 8k+9 - round 9
|
|
+ aese $ctr7b, $rk9 @ AES block 8k+15 - round 9
|
|
+
|
|
+.L128_dec_tail: @ TAIL
|
|
+
|
|
+ mov $t1.16b, $rk10
|
|
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
+
|
|
+ cmp $main_end_input_ptr, #112
|
|
+
|
|
+ ldp $h78kq, $h8q, [$current_tag, #192] @ load h7l | h7h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
|
|
+
|
|
+ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
+
|
|
+ ldp $h6q, $h7q, [$current_tag, #160] @ load h6k | h5k
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
|
|
+ b.gt .L128_dec_blocks_more_than_7
|
|
+
|
|
+ cmp $main_end_input_ptr, #96
|
|
+ mov $ctr7b, $ctr6b
|
|
+ movi $acc_l.8b, #0
|
|
+
|
|
+ movi $acc_h.8b, #0
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+
|
|
+ mov $ctr4b, $ctr3b
|
|
+ mov $ctr3b, $ctr2b
|
|
+ mov $ctr2b, $ctr1b
|
|
+
|
|
+ movi $acc_m.8b, #0
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ b.gt .L128_dec_blocks_more_than_6
|
|
+
|
|
+ cmp $main_end_input_ptr, #80
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+
|
|
+ mov $ctr4b, $ctr3b
|
|
+ mov $ctr3b, $ctr1b
|
|
+ b.gt .L128_dec_blocks_more_than_5
|
|
+
|
|
+ cmp $main_end_input_ptr, #64
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+
|
|
+ mov $ctr4b, $ctr1b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ b.gt .L128_dec_blocks_more_than_4
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+
|
|
+ mov $ctr5b, $ctr1b
|
|
+ cmp $main_end_input_ptr, #48
|
|
+ b.gt .L128_dec_blocks_more_than_3
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr7b, $ctr6b
|
|
+ cmp $main_end_input_ptr, #32
|
|
+
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ mov $ctr6b, $ctr1b
|
|
+ b.gt .L128_dec_blocks_more_than_2
|
|
+
|
|
+ cmp $main_end_input_ptr, #16
|
|
+
|
|
+ mov $ctr7b, $ctr1b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ b.gt L128_dec_blocks_more_than_1
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ b .L128_dec_blocks_less_than_1
|
|
+.L128_dec_blocks_more_than_7: @ blocks left > 7
|
|
+ rev64 $res0b, $res1b @ GHASH final-7 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
|
|
+
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
|
|
+
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
|
|
+
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
|
|
+ eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
|
|
+
|
|
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
|
|
+.L128_dec_blocks_more_than_6: @ blocks left > 6
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-6 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
|
|
+ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
|
|
+ eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
|
|
+.L128_dec_blocks_more_than_5: @ blocks left > 5
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-5 block
|
|
+
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
|
|
+.L128_dec_blocks_more_than_4: @ blocks left > 4
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-4 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
|
|
+
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
|
|
+.L128_dec_blocks_more_than_3: @ blocks left > 3
|
|
+
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
|
|
+ rev64 $res0b, $res1b @ GHASH final-3 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
|
|
+
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
+
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
+ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
+
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+ eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
|
|
+.L128_dec_blocks_more_than_2: @ blocks left > 2
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-2 block
|
|
+
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
+.L128_dec_blocks_more_than_1: @ blocks left > 1
|
|
+
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
|
|
+ rev64 $res0b, $res1b @ GHASH final-1 block
|
|
+
|
|
+ ldr $h2q, [$current_tag, #64] @ load h1l | h1h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
|
|
+
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
|
|
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
+ eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
+.L128_dec_blocks_less_than_1: @ blocks left <= 1
|
|
+
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
+
|
|
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
+
|
|
+ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
|
|
+ cmp $bit_length, #64
|
|
+ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
|
|
+
|
|
+ csel $temp2_x, $temp1_x, $temp0_x, lt
|
|
+ csel $temp3_x, $temp0_x, xzr, lt
|
|
+
|
|
+ mov $ctr0.d[1], $temp3_x
|
|
+ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
|
|
+
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
|
|
+
|
|
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
+ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
+
|
|
+ bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
|
|
+
|
|
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
+ st1 { $res4b}, [$output_ptr] @ store all 16B
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
+
|
|
+ eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+
|
|
+ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+ st1 { $acc_l.16b }, [$current_tag]
|
|
+ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
|
|
+
|
|
+ str $rtmp_ctrq, [$counter] @ store the updated counter
|
|
+
|
|
+ lsr x0, $bit_length, #3
|
|
+
|
|
+ ldp d10, d11, [sp, #16]
|
|
+ ldp d12, d13, [sp, #32]
|
|
+ ldp d14, d15, [sp, #48]
|
|
+ ldp d8, d9, [sp], #80
|
|
+ ret
|
|
+.L128_dec_ret:
|
|
+ mov w0, #0x0
|
|
+ ret
|
|
+.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
|
|
+___
|
|
+}
|
|
+
|
|
+{
|
|
+my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
|
|
+my ($temp2_x,$temp3_x)=map("x$_",(13..14));
|
|
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
|
|
+my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
|
|
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
|
|
+my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
|
|
+my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
|
|
+
|
|
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
|
|
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
|
|
+my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
|
|
+
|
|
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
|
|
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
|
|
+
|
|
+my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
|
|
+my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
|
|
+my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
|
|
+my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
|
|
+
|
|
+my $t0="v16";
|
|
+my $t0d="d16";
|
|
+
|
|
+my $t1="v29";
|
|
+my $t2=$res1;
|
|
+my $t3=$t1;
|
|
+
|
|
+my $t4=$res0;
|
|
+my $t5=$res2;
|
|
+my $t6=$t0;
|
|
+
|
|
+my $t7=$res3;
|
|
+my $t8=$res4;
|
|
+my $t9=$res5;
|
|
+
|
|
+my $t10=$res6;
|
|
+my $t11="v21";
|
|
+my $t12=$t1;
|
|
+
|
|
+my $rtmp_ctr="v30";
|
|
+my $rtmp_ctrq="q30";
|
|
+my $rctr_inc="v31";
|
|
+my $rctr_incd="d31";
|
|
+
|
|
+my $mod_constantd=$t0d;
|
|
+my $mod_constant=$t0;
|
|
+
|
|
+my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
|
|
+my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
|
|
+my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
|
|
+my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
|
|
+my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
|
|
+my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
|
|
+my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
|
|
+my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
|
|
+my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
|
|
+my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
|
|
+my $rk2q1="v28.1q";
|
|
+my $rk3q1="v26.1q";
|
|
+my $rk4v="v27";
|
|
+
|
|
+#########################################################################################
|
|
+# size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in,
|
|
+# size_t len,
|
|
+# unsigned char *out,
|
|
+# const void *key,
|
|
+# unsigned char ivec[16],
|
|
+# u64 *Xi);
|
|
+#
|
|
+$code.=<<___;
|
|
+.global unroll8_eor3_aes_gcm_enc_192_kernel
|
|
+.type unroll8_eor3_aes_gcm_enc_192_kernel,%function
|
|
+.align 4
|
|
+unroll8_eor3_aes_gcm_enc_192_kernel:
|
|
+ AARCH64_VALID_CALL_TARGET
|
|
+ cbz x1, .L192_enc_ret
|
|
+ stp d8, d9, [sp, #-80]!
|
|
+ mov $counter, x4
|
|
+ mov $cc, x5
|
|
+ stp d10, d11, [sp, #16]
|
|
+ stp d12, d13, [sp, #32]
|
|
+ stp d14, d15, [sp, #48]
|
|
+ mov x5, #0xc200000000000000
|
|
+ stp x5, xzr, [sp, #64]
|
|
+ add $modulo_constant, sp, #64
|
|
+
|
|
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
+ ld1 { $ctr0b}, [$counter] @ CTR block 0
|
|
+
|
|
+ mov $constant_temp, #0x100000000 @ set up counter increment
|
|
+ movi $rctr_inc.16b, #0x0
|
|
+ mov $rctr_inc.d[1], $constant_temp
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
|
|
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
+
|
|
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
+
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+
|
|
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
|
|
+
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
|
|
+
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
|
|
+
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
+
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
+
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
+
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
|
|
+
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
+
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
+
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
|
|
+
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
|
|
+
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
|
|
+
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
|
|
+
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
+
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
+
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
|
|
+
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
+
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
+
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
+
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
+
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
+
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
|
|
+
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
+
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
|
|
+
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
|
|
+
|
|
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
|
|
+
|
|
+ ld1 { $acc_lb}, [$current_tag]
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
|
|
+
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
|
|
+
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
|
|
+
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 14 - round 10
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 11 - round 10
|
|
+
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 9 - round 10
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 13 - round 10
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 12 - round 10
|
|
+
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8 - round 10
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 10 - round 10
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 15 - round 10
|
|
+
|
|
+ aese $ctr6b, $rk11 @ AES block 14 - round 11
|
|
+ aese $ctr3b, $rk11 @ AES block 11 - round 11
|
|
+
|
|
+ aese $ctr4b, $rk11 @ AES block 12 - round 11
|
|
+ aese $ctr7b, $rk11 @ AES block 15 - round 11
|
|
+ ldr $rk12q, [$cc, #192] @ load rk12
|
|
+
|
|
+ aese $ctr1b, $rk11 @ AES block 9 - round 11
|
|
+ aese $ctr5b, $rk11 @ AES block 13 - round 11
|
|
+
|
|
+ aese $ctr2b, $rk11 @ AES block 10 - round 11
|
|
+ aese $ctr0b, $rk11 @ AES block 8 - round 11
|
|
+ b.ge .L192_enc_tail @ handle tail
|
|
+
|
|
+ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
|
|
+
|
|
+ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
|
|
+
|
|
+ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
|
|
+
|
|
+ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
|
|
+
|
|
+ eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 0 - result
|
|
+ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
|
|
+
|
|
+ eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 3 - result
|
|
+ eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 1 - result
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
|
|
+ eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
|
|
+
|
|
+ eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
|
|
+ eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
|
|
+ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
|
|
+
|
|
+ eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 2 - result
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
|
|
+
|
|
+ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
|
|
+ eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
|
|
+
|
|
+ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
|
|
+ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
|
|
+
|
|
+ b.ge .L192_enc_prepretail @ do prepretail
|
|
+
|
|
+.L192_enc_main_loop: @ main loop start
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
|
|
+
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
|
|
+
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
|
|
+ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
|
|
+
|
|
+ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
|
|
+
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
|
|
+
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
|
|
+ ldr $rk12q, [$cc, #192] @ load rk12
|
|
+ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
|
|
+ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
|
|
+
|
|
+ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load plaintext
|
|
+
|
|
+ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load plaintext
|
|
+ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
|
|
+
|
|
+ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
|
|
+
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
|
|
+ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
|
|
+
|
|
+ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
|
|
+ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
|
|
+ eor3 $res4b, $ctr_t4b, $ctr4b, $rk12 @ AES block 4 - result
|
|
+
|
|
+ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
|
|
+ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
|
|
+ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
|
|
+
|
|
+ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
|
|
+ eor3 $res7b, $ctr_t7b, $ctr7b, $rk12 @ AES block 7 - result
|
|
+
|
|
+ eor3 $res2b, $ctr_t2b, $ctr2b, $rk12 @ AES block 8k+10 - result
|
|
+ eor3 $res0b, $ctr_t0b, $ctr0b, $rk12 @ AES block 8k+8 - result
|
|
+ mov $ctr2.16b, $h3.16b @ CTR block 8k+18
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr1b, $rk12 @ AES block 8k+9 - result
|
|
+ mov $ctr1.16b, $h2.16b @ CTR block 8k+17
|
|
+ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ eor3 $res6b, $ctr_t6b, $ctr6b, $rk12 @ AES block 6 - result
|
|
+ mov $ctr0.16b, $h1.16b @ CTR block 8k+16
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
|
|
+ eor3 $res5b, $ctr_t5b, $ctr5b, $rk12 @ AES block 5 - result
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+
|
|
+ eor3 $res3b, $ctr_t3b, $ctr3b, $rk12 @ AES block 8k+11 - result
|
|
+ mov $ctr3.16b, $h4.16b @ CTR block 8k+19
|
|
+
|
|
+ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
|
|
+
|
|
+ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
|
|
+
|
|
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
+ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
|
|
+ b.lt .L192_enc_main_loop
|
|
+
|
|
+.L192_enc_prepretail: @ PREPRETAIL
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6 (t0, t1, and t2 free)
|
|
+
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4 (t0, t1, and t2 free)
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ ext $t12.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
|
|
+
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
|
|
+
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ ldr $rk12q, [$cc, #192] @ load rk12
|
|
+
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
|
|
+
|
|
+ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
|
|
+ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
|
|
+
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
|
|
+ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
|
|
+
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
|
|
+ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
|
|
+
|
|
+ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
|
|
+ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
|
|
+ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
|
|
+
|
|
+.L192_enc_tail: @ TAIL
|
|
+
|
|
+ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
+
|
|
+ ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
|
|
+
|
|
+ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+
|
|
+ mov $t1.16b, $rk12
|
|
+
|
|
+ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ cmp $main_end_input_ptr, #112
|
|
+
|
|
+ eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
|
|
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
+ b.gt .L192_enc_blocks_more_than_7
|
|
+
|
|
+ cmp $main_end_input_ptr, #96
|
|
+ mov $ctr7b, $ctr6b
|
|
+ movi $acc_h.8b, #0
|
|
+
|
|
+ mov $ctr6b, $ctr5b
|
|
+ movi $acc_l.8b, #0
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr5b, $ctr4b
|
|
+ mov $ctr4b, $ctr3b
|
|
+ mov $ctr3b, $ctr2b
|
|
+
|
|
+ mov $ctr2b, $ctr1b
|
|
+ movi $acc_m.8b, #0
|
|
+ b.gt .L192_enc_blocks_more_than_6
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ cmp $main_end_input_ptr, #80
|
|
+
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+ mov $ctr4b, $ctr3b
|
|
+
|
|
+ mov $ctr3b, $ctr1b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ b.gt .L192_enc_blocks_more_than_5
|
|
+
|
|
+ cmp $main_end_input_ptr, #64
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+
|
|
+ mov $ctr4b, $ctr1b
|
|
+ b.gt .L192_enc_blocks_more_than_4
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr1b
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ cmp $main_end_input_ptr, #48
|
|
+ b.gt .L192_enc_blocks_more_than_3
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr1b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ cmp $main_end_input_ptr, #32
|
|
+ b.gt .L192_enc_blocks_more_than_2
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ cmp $main_end_input_ptr, #16
|
|
+ mov $ctr7b, $ctr1b
|
|
+ b.gt .L192_enc_blocks_more_than_1
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ b .L192_enc_blocks_less_than_1
|
|
+.L192_enc_blocks_more_than_7: @ blocks left > 7
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-7 block
|
|
+ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
|
|
+
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
|
|
+
|
|
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
|
|
+ eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
|
|
+.L192_enc_blocks_more_than_6: @ blocks left > 6
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-6 block
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
|
|
+ eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
|
|
+
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
|
|
+.L192_enc_blocks_more_than_5: @ blocks left > 5
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-5 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
|
|
+ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
|
|
+.L192_enc_blocks_more_than_4: @ blocks left > 4
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-4 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
|
|
+ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
|
|
+
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
|
|
+ eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
|
|
+.L192_enc_blocks_more_than_3: @ blocks left > 3
|
|
+
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-3 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
|
|
+.L192_enc_blocks_more_than_2: @ blocks left > 2
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-2 block
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
+ eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
|
|
+.L192_enc_blocks_more_than_1: @ blocks left > 1
|
|
+
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-1 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
+
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
+.L192_enc_blocks_less_than_1: @ blocks left <= 1
|
|
+
|
|
+ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
+
|
|
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
+
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
|
|
+ cmp $bit_length, #64
|
|
+ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
|
|
+
|
|
+ csel $temp2_x, $temp1_x, $temp0_x, lt
|
|
+ csel $temp3_x, $temp0_x, xzr, lt
|
|
+
|
|
+ mov $ctr0.d[1], $temp3_x
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+
|
|
+ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
|
|
+ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
|
|
+
|
|
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final block
|
|
+ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr] @ store all 16B
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
+
|
|
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
+
|
|
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
|
|
+
|
|
+ str $rtmp_ctrq, [$counter] @ store the updated counter
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+ st1 { $acc_l.16b }, [$current_tag]
|
|
+
|
|
+ lsr x0, $bit_length, #3 @ return sizes
|
|
+
|
|
+ ldp d10, d11, [sp, #16]
|
|
+ ldp d12, d13, [sp, #32]
|
|
+ ldp d14, d15, [sp, #48]
|
|
+ ldp d8, d9, [sp], #80
|
|
+ ret
|
|
+
|
|
+.L192_enc_ret:
|
|
+ mov w0, #0x0
|
|
+ ret
|
|
+.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
|
|
+___
|
|
+
|
|
+#########################################################################################
|
|
+# size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in,
|
|
+# size_t len,
|
|
+# unsigned char *out,
|
|
+# const void *key,
|
|
+# unsigned char ivec[16],
|
|
+# u64 *Xi);
|
|
+#
|
|
+$code.=<<___;
|
|
+.global unroll8_eor3_aes_gcm_dec_192_kernel
|
|
+.type unroll8_eor3_aes_gcm_dec_192_kernel,%function
|
|
+.align 4
|
|
+unroll8_eor3_aes_gcm_dec_192_kernel:
|
|
+ AARCH64_VALID_CALL_TARGET
|
|
+ cbz x1, .L192_dec_ret
|
|
+ stp d8, d9, [sp, #-80]!
|
|
+ mov $counter, x4
|
|
+ mov $cc, x5
|
|
+ stp d10, d11, [sp, #16]
|
|
+ stp d12, d13, [sp, #32]
|
|
+ stp d14, d15, [sp, #48]
|
|
+ mov x5, #0xc200000000000000
|
|
+ stp x5, xzr, [sp, #64]
|
|
+ add $modulo_constant, sp, #64
|
|
+
|
|
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
+ ld1 { $ctr0b}, [$counter] @ CTR block 0
|
|
+ ld1 { $acc_lb}, [$current_tag]
|
|
+
|
|
+ mov $constant_temp, #0x100000000 @ set up counter increment
|
|
+ movi $rctr_inc.16b, #0x0
|
|
+ mov $rctr_inc.d[1], $constant_temp
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
|
|
+
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
|
|
+
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
+
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
|
|
+
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
|
|
+
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
|
|
+
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
+
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
|
|
+
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
+
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
|
|
+
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
|
|
+
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
|
|
+
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
|
|
+
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
+
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
|
|
+
|
|
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
+
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
+
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
|
|
+
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
|
|
+
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
|
|
+
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
|
|
+
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
|
|
+
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
+
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
|
|
+
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
+
|
|
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
|
|
+
|
|
+ ld1 { $acc_lb}, [$current_tag]
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
|
|
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
+
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
|
|
+
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
|
|
+
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
|
|
+
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
|
|
+
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
|
|
+
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
|
|
+ ldr $rk12q, [$cc, #192] @ load rk12
|
|
+
|
|
+ aese $ctr0b, $rk11 @ AES block 0 - round 11
|
|
+ aese $ctr1b, $rk11 @ AES block 1 - round 11
|
|
+ aese $ctr4b, $rk11 @ AES block 4 - round 11
|
|
+
|
|
+ aese $ctr6b, $rk11 @ AES block 6 - round 11
|
|
+ aese $ctr5b, $rk11 @ AES block 5 - round 11
|
|
+ aese $ctr7b, $rk11 @ AES block 7 - round 11
|
|
+
|
|
+ aese $ctr2b, $rk11 @ AES block 2 - round 11
|
|
+ aese $ctr3b, $rk11 @ AES block 3 - round 11
|
|
+ b.ge .L192_dec_tail @ handle tail
|
|
+
|
|
+ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
|
|
+
|
|
+ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
|
|
+
|
|
+ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
|
|
+
|
|
+ eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 1 - result
|
|
+ eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 0 - result
|
|
+ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
|
|
+
|
|
+ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
|
|
+ eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 3 - result
|
|
+
|
|
+ eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 2 - result
|
|
+ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
|
|
+ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
|
|
+
|
|
+ eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 4 - result
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
|
|
+
|
|
+ eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 5 - result
|
|
+ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+
|
|
+ eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 6 - result
|
|
+ eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 7 - result
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
|
|
+ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
|
|
+ b.ge .L192_dec_prepretail @ do prepretail
|
|
+
|
|
+.L192_dec_main_loop: @ main loop start
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h1l | h1h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7
|
|
+
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
|
|
+
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
|
|
+
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
|
|
+
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
|
|
+ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
|
|
+
|
|
+ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
|
|
+
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
|
|
+ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
|
|
+
|
|
+ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
|
|
+ ldr $rk12q, [$cc, #192] @ load rk12
|
|
+
|
|
+ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
|
|
+
|
|
+ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
|
|
+
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
|
|
+ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
|
|
+ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
|
|
+
|
|
+ eor3 $ctr0b, $res0b, $ctr0b, $rk12 @ AES block 8k+8 - result
|
|
+ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
|
|
+
|
|
+ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
|
|
+ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
|
|
+
|
|
+ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
|
|
+ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+
|
|
+ eor3 $ctr1b, $res1b, $ctr1b, $rk12 @ AES block 8k+9 - result
|
|
+ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
|
|
+ eor3 $ctr3b, $res3b, $ctr3b, $rk12 @ AES block 8k+11 - result
|
|
+
|
|
+ eor3 $ctr2b, $res2b, $ctr2b, $rk12 @ AES block 8k+10 - result
|
|
+ eor3 $ctr7b, $res7b, $ctr7b, $rk12 @ AES block 8k+15 - result
|
|
+ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
|
|
+
|
|
+ eor3 $ctr5b, $res5b, $ctr5b, $rk12 @ AES block 8k+13 - result
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+ mov $ctr3.16b, $h4.16b @ CTR block 8k+19
|
|
+
|
|
+ eor3 $ctr4b, $res4b, $ctr4b, $rk12 @ AES block 8k+12 - result
|
|
+ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
|
|
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
+
|
|
+ eor3 $ctr6b, $res6b, $ctr6b, $rk12 @ AES block 8k+14 - result
|
|
+ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
|
|
+ mov $ctr0.16b, $h1.16b @ CTR block 8k+16
|
|
+
|
|
+ mov $ctr1.16b, $h2.16b @ CTR block 8k+17
|
|
+ mov $ctr2.16b, $h3.16b @ CTR block 8k+18
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
|
|
+ b.lt .L192_dec_main_loop
|
|
+
|
|
+.L192_dec_prepretail: @ PREPRETAIL
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h1l | h1h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4
|
|
+
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
|
|
+
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
|
|
+
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ ldr $rk12q, [$cc, #192] @ load rk12
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
|
|
+
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
|
|
+
|
|
+ aese $ctr0b, $rk11 @ AES block 8k+8 - round 11
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+ aese $ctr5b, $rk11 @ AES block 8k+13 - round 11
|
|
+
|
|
+ aese $ctr2b, $rk11 @ AES block 8k+10 - round 11
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
|
|
+
|
|
+ aese $ctr6b, $rk11 @ AES block 8k+14 - round 11
|
|
+ aese $ctr4b, $rk11 @ AES block 8k+12 - round 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+
|
|
+ aese $ctr3b, $rk11 @ AES block 8k+11 - round 11
|
|
+ aese $ctr1b, $rk11 @ AES block 8k+9 - round 11
|
|
+ aese $ctr7b, $rk11 @ AES block 8k+15 - round 11
|
|
+
|
|
+.L192_dec_tail: @ TAIL
|
|
+
|
|
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
+
|
|
+ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
|
|
+
|
|
+ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+
|
|
+ mov $t1.16b, $rk12
|
|
+
|
|
+ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
|
|
+ cmp $main_end_input_ptr, #112
|
|
+ b.gt .L192_dec_blocks_more_than_7
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ movi $acc_h.8b, #0
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+ mov $ctr4b, $ctr3b
|
|
+
|
|
+ cmp $main_end_input_ptr, #96
|
|
+ movi $acc_l.8b, #0
|
|
+ mov $ctr3b, $ctr2b
|
|
+
|
|
+ mov $ctr2b, $ctr1b
|
|
+ movi $acc_m.8b, #0
|
|
+ b.gt .L192_dec_blocks_more_than_6
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+
|
|
+ mov $ctr4b, $ctr3b
|
|
+ mov $ctr3b, $ctr1b
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ cmp $main_end_input_ptr, #80
|
|
+ b.gt .L192_dec_blocks_more_than_5
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+
|
|
+ mov $ctr5b, $ctr4b
|
|
+ mov $ctr4b, $ctr1b
|
|
+ cmp $main_end_input_ptr, #64
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ b.gt .L192_dec_blocks_more_than_4
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+
|
|
+ mov $ctr5b, $ctr1b
|
|
+ cmp $main_end_input_ptr, #48
|
|
+ b.gt .L192_dec_blocks_more_than_3
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr7b, $ctr6b
|
|
+ cmp $main_end_input_ptr, #32
|
|
+
|
|
+ mov $ctr6b, $ctr1b
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ b.gt .L192_dec_blocks_more_than_2
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr7b, $ctr1b
|
|
+ cmp $main_end_input_ptr, #16
|
|
+ b.gt .L192_dec_blocks_more_than_1
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ b .L192_dec_blocks_less_than_1
|
|
+.L192_dec_blocks_more_than_7: @ blocks left > 7
|
|
+ rev64 $res0b, $res1b @ GHASH final-7 block
|
|
+
|
|
+ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
|
|
+
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
|
|
+
|
|
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+.L192_dec_blocks_more_than_6: @ blocks left > 6
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-6 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
|
|
+
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
|
|
+ eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
|
|
+.L192_dec_blocks_more_than_5: @ blocks left > 5
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-5 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
|
|
+
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
|
|
+ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
|
|
+ eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
|
|
+.L192_dec_blocks_more_than_4: @ blocks left > 4
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-4 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
|
|
+ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
|
|
+.L192_dec_blocks_more_than_3: @ blocks left > 3
|
|
+
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ rev64 $res0b, $res1b @ GHASH final-3 block
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
+
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
+ eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
|
|
+.L192_dec_blocks_more_than_2: @ blocks left > 2
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-2 block
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
+ eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
|
|
+.L192_dec_blocks_more_than_1: @ blocks left > 1
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-1 block
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
+.L192_dec_blocks_less_than_1: @ blocks left <= 1
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
+ str $rtmp_ctrq, [$counter] @ store the updated counter
|
|
+
|
|
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
+ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
|
|
+
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
|
|
+ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
|
|
+ cmp $bit_length, #64
|
|
+
|
|
+ csel $temp2_x, $temp1_x, $temp0_x, lt
|
|
+ csel $temp3_x, $temp0_x, xzr, lt
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+
|
|
+ mov $ctr0.d[1], $temp3_x
|
|
+ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
|
|
+
|
|
+ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
|
|
+
|
|
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
+ bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final block
|
|
+
|
|
+ st1 { $res4b}, [$output_ptr] @ store all 16B
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
+
|
|
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
+
|
|
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
+
|
|
+ eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+
|
|
+ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $acc_mb, $acc_hb @ MODULO - fold into low
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+ st1 { $acc_l.16b }, [$current_tag]
|
|
+
|
|
+ ldp d10, d11, [sp, #16]
|
|
+ ldp d12, d13, [sp, #32]
|
|
+ ldp d14, d15, [sp, #48]
|
|
+ ldp d8, d9, [sp], #80
|
|
+ ret
|
|
+
|
|
+.L192_dec_ret:
|
|
+ mov w0, #0x0
|
|
+ ret
|
|
+.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
|
|
+___
|
|
+}
|
|
+
|
|
+{
|
|
+
|
|
+my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
|
|
+my ($temp2_x,$temp3_x)=map("x$_",(13..14));
|
|
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
|
|
+my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
|
|
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
|
|
+my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
|
|
+my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
|
|
+
|
|
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
|
|
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
|
|
+my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
|
|
+
|
|
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
|
|
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
|
|
+
|
|
+my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
|
|
+my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
|
|
+my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
|
|
+my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
|
|
+
|
|
+my $t0="v16";
|
|
+my $t0d="d16";
|
|
+
|
|
+my $t1="v29";
|
|
+my $t2=$res1;
|
|
+my $t3=$t1;
|
|
+
|
|
+my $t4=$res0;
|
|
+my $t5=$res2;
|
|
+my $t6=$t0;
|
|
+
|
|
+my $t7=$res3;
|
|
+my $t8=$res4;
|
|
+my $t9=$res5;
|
|
+
|
|
+my $t10=$res6;
|
|
+my $t11="v21";
|
|
+my $t12=$t1;
|
|
+
|
|
+my $rtmp_ctr="v30";
|
|
+my $rtmp_ctrq="q30";
|
|
+my $rctr_inc="v31";
|
|
+my $rctr_incd="d31";
|
|
+
|
|
+my $mod_constantd=$t0d;
|
|
+my $mod_constant=$t0;
|
|
+
|
|
+my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
|
|
+my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
|
|
+my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
|
|
+my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
|
|
+my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
|
|
+my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
|
|
+my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
|
|
+my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
|
|
+my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
|
|
+my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
|
|
+my $rk2q1="v28.1q";
|
|
+my $rk3q1="v26.1q";
|
|
+my $rk4v="v27";
|
|
+#########################################################################################
|
|
+# size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in,
|
|
+# size_t len,
|
|
+# unsigned char *out,
|
|
+# const void *key,
|
|
+# unsigned char ivec[16],
|
|
+# u64 *Xi);
|
|
+#
|
|
+$code.=<<___;
|
|
+.global unroll8_eor3_aes_gcm_enc_256_kernel
|
|
+.type unroll8_eor3_aes_gcm_enc_256_kernel,%function
|
|
+.align 4
|
|
+unroll8_eor3_aes_gcm_enc_256_kernel:
|
|
+ AARCH64_VALID_CALL_TARGET
|
|
+ cbz x1, .L256_enc_ret
|
|
+ stp d8, d9, [sp, #-80]!
|
|
+ mov $counter, x4
|
|
+ mov $cc, x5
|
|
+ stp d10, d11, [sp, #16]
|
|
+ stp d12, d13, [sp, #32]
|
|
+ stp d14, d15, [sp, #48]
|
|
+ mov x5, #0xc200000000000000
|
|
+ stp x5, xzr, [sp, #64]
|
|
+ add $modulo_constant, sp, #64
|
|
+
|
|
+ ld1 { $ctr0b}, [$counter] @ CTR block 0
|
|
+
|
|
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
+
|
|
+ mov $constant_temp, #0x100000000 @ set up counter increment
|
|
+ movi $rctr_inc.16b, #0x0
|
|
+ mov $rctr_inc.d[1], $constant_temp
|
|
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
+
|
|
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
+
|
|
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
|
|
+
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
|
|
+
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
+
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
+
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
|
|
+
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
+
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
|
|
+
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
|
|
+
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
|
|
+
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
+
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
+
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
+
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
|
|
+
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
|
|
+
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
|
|
+
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
|
|
+
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
+
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
+
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
+
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
+
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
|
|
+
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
+
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
+
|
|
+ ld1 { $acc_lb}, [$current_tag]
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
|
|
+
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
|
|
+
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
|
|
+
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
|
|
+
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
|
|
+
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
|
|
+
|
|
+ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
|
|
+ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
|
|
+ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
|
|
+
|
|
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
|
|
+ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
|
|
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
|
|
+
|
|
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
|
|
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
|
|
+ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
|
|
+ ldr $rk14q, [$cc, #224] @ load rk14
|
|
+
|
|
+ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
|
|
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
|
|
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
|
|
+
|
|
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
|
|
+ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
|
|
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
|
|
+
|
|
+ aese $ctr2b, $rk13 @ AES block 2 - round 13
|
|
+ aese $ctr1b, $rk13 @ AES block 1 - round 13
|
|
+ aese $ctr4b, $rk13 @ AES block 4 - round 13
|
|
+
|
|
+ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
|
|
+ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
|
|
+
|
|
+ aese $ctr0b, $rk13 @ AES block 0 - round 13
|
|
+ aese $ctr5b, $rk13 @ AES block 5 - round 13
|
|
+
|
|
+ aese $ctr6b, $rk13 @ AES block 6 - round 13
|
|
+ aese $ctr7b, $rk13 @ AES block 7 - round 13
|
|
+ aese $ctr3b, $rk13 @ AES block 3 - round 13
|
|
+
|
|
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+ b.ge .L256_enc_tail @ handle tail
|
|
+
|
|
+ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 0, 1 - load plaintext
|
|
+
|
|
+ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 2, 3 - load plaintext
|
|
+
|
|
+ eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 0 - result
|
|
+ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 1 - result
|
|
+ eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 3 - result
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
|
|
+ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
|
|
+
|
|
+ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
|
|
+ eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 2 - result
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
|
|
+ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 0, 1 - store result
|
|
+
|
|
+ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 2, 3 - store result
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
|
|
+
|
|
+ eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result
|
|
+
|
|
+ eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result
|
|
+ eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result
|
|
+ eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result
|
|
+
|
|
+ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
|
|
+
|
|
+ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
|
|
+ b.ge .L256_enc_prepretail @ do prepretail
|
|
+
|
|
+.L256_enc_main_loop: @ main loop start
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5
|
|
+
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7
|
|
+
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
|
|
+
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
|
|
+
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
|
|
+
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
|
|
+
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+
|
|
+ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
|
|
+ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
|
|
+
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+ ldp $ctr_t0q, $ctr_t1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load plaintext
|
|
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
|
|
+
|
|
+ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
|
|
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
|
|
+
|
|
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
|
|
+ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
|
|
+
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
|
|
+
|
|
+ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
|
|
+ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
|
|
+
|
|
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
|
|
+ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
|
|
+ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
|
|
+ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+
|
|
+ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
|
|
+ ldr $rk14q, [$cc, #224] @ load rk14
|
|
+ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
|
|
+
|
|
+ ldp $ctr_t2q, $ctr_t3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load plaintext
|
|
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
|
|
+ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
|
|
+ ldp $ctr_t4q, $ctr_t5q, [$input_ptr], #32 @ AES block 4, 5 - load plaintext
|
|
+
|
|
+ ldp $ctr_t6q, $ctr_t7q, [$input_ptr], #32 @ AES block 6, 7 - load plaintext
|
|
+ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
|
|
+ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
|
|
+
|
|
+ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
|
|
+ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
|
|
+
|
|
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
|
|
+ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
|
|
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
+
|
|
+ eor3 $res2b, $ctr_t2b, $ctr2b, $rk14 @ AES block 8k+10 - result
|
|
+ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
|
|
+
|
|
+ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
|
|
+ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
|
|
+ eor3 $res5b, $ctr_t5b, $ctr5b, $rk14 @ AES block 5 - result
|
|
+
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
|
|
+
|
|
+ eor3 $res4b, $ctr_t4b, $ctr4b, $rk14 @ AES block 4 - result
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
|
|
+ eor3 $res3b, $ctr_t3b, $ctr3b, $rk14 @ AES block 8k+11 - result
|
|
+
|
|
+ mov $ctr3.16b, $h4.16b @ CTR block 8k+19
|
|
+ eor3 $res1b, $ctr_t1b, $ctr1b, $rk14 @ AES block 8k+9 - result
|
|
+ eor3 $res0b, $ctr_t0b, $ctr0b, $rk14 @ AES block 8k+8 - result
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
|
|
+ stp $res0q, $res1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
|
|
+ mov $ctr2.16b, $h3.16b @ CTR block 8k+18
|
|
+
|
|
+ eor3 $res7b, $ctr_t7b, $ctr7b, $rk14 @ AES block 7 - result
|
|
+ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
|
|
+ stp $res2q, $res3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
|
|
+
|
|
+ eor3 $res6b, $ctr_t6b, $ctr6b, $rk14 @ AES block 6 - result
|
|
+ mov $ctr1.16b, $h2.16b @ CTR block 8k+17
|
|
+ stp $res4q, $res5q, [$output_ptr], #32 @ AES block 4, 5 - store result
|
|
+
|
|
+ stp $res6q, $res7q, [$output_ptr], #32 @ AES block 6, 7 - store result
|
|
+ mov $ctr0.16b, $h1.16b @ CTR block 8k+16
|
|
+ b.lt .L256_enc_main_loop
|
|
+
|
|
+.L256_enc_prepretail: @ PREPRETAIL
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h1l | h1h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
|
|
+
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
|
|
+
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
|
|
+
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
|
|
+
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
|
|
+
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
|
|
+
|
|
+ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
|
|
+ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
|
|
+
|
|
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
|
|
+ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
|
|
+ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
|
|
+ ldr $rk14q, [$cc, #224] @ load rk14
|
|
+
|
|
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
|
|
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
|
|
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
|
|
+
|
|
+ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
|
|
+ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+
|
|
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
|
|
+ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
|
|
+ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
|
|
+ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
|
|
+ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
|
|
+
|
|
+ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
|
|
+ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
|
|
+ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
|
|
+
|
|
+ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
|
|
+ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
|
|
+.L256_enc_tail: @ TAIL
|
|
+
|
|
+ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
+
|
|
+ ldr $ctr_t0q, [$input_ptr], #16 @ AES block 8k+8 - load plaintext
|
|
+
|
|
+ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+
|
|
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
+ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ mov $t1.16b, $rk14
|
|
+
|
|
+ cmp $main_end_input_ptr, #112
|
|
+ eor3 $res1b, $ctr_t0b, $ctr0b, $t1.16b @ AES block 8k+8 - result
|
|
+ b.gt .L256_enc_blocks_more_than_7
|
|
+
|
|
+ movi $acc_l.8b, #0
|
|
+ mov $ctr7b, $ctr6b
|
|
+ movi $acc_h.8b, #0
|
|
+
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+ mov $ctr4b, $ctr3b
|
|
+
|
|
+ mov $ctr3b, $ctr2b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr2b, $ctr1b
|
|
+
|
|
+ movi $acc_m.8b, #0
|
|
+ cmp $main_end_input_ptr, #96
|
|
+ b.gt .L256_enc_blocks_more_than_6
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+ cmp $main_end_input_ptr, #80
|
|
+
|
|
+ mov $ctr5b, $ctr4b
|
|
+ mov $ctr4b, $ctr3b
|
|
+ mov $ctr3b, $ctr1b
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ b.gt .L256_enc_blocks_more_than_5
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr4b
|
|
+
|
|
+ cmp $main_end_input_ptr, #64
|
|
+ mov $ctr4b, $ctr1b
|
|
+ b.gt .L256_enc_blocks_more_than_4
|
|
+
|
|
+ cmp $main_end_input_ptr, #48
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+
|
|
+ mov $ctr5b, $ctr1b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ b.gt .L256_enc_blocks_more_than_3
|
|
+
|
|
+ cmp $main_end_input_ptr, #32
|
|
+ mov $ctr7b, $ctr6b
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+
|
|
+ mov $ctr6b, $ctr1b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ b.gt .L256_enc_blocks_more_than_2
|
|
+
|
|
+ mov $ctr7b, $ctr1b
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ cmp $main_end_input_ptr, #16
|
|
+ b.gt .L256_enc_blocks_more_than_1
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ b .L256_enc_blocks_less_than_1
|
|
+.L256_enc_blocks_more_than_7: @ blocks left > 7
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-7 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-7 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-6 block - load plaintext
|
|
+
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
|
|
+ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
|
|
+
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
|
|
+ eor3 $res1b, $ctr_t1b, $ctr1b, $t1.16b @ AES final-6 block - result
|
|
+
|
|
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
|
|
+.L256_enc_blocks_more_than_6: @ blocks left > 6
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-6 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-6 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-5 block - load plaintext
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
|
|
+ eor3 $res1b, $ctr_t1b, $ctr2b, $t1.16b @ AES final-5 block - result
|
|
+
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
|
|
+.L256_enc_blocks_more_than_5: @ blocks left > 5
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-5 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-5 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-4 block - load plaintext
|
|
+ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
|
|
+ eor3 $res1b, $ctr_t1b, $ctr3b, $t1.16b @ AES final-4 block - result
|
|
+.L256_enc_blocks_more_than_4: @ blocks left > 4
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-4 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-4 block
|
|
+
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-3 block - load plaintext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr4b, $t1.16b @ AES final-3 block - result
|
|
+ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
|
|
+
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
|
|
+.L256_enc_blocks_more_than_3: @ blocks left > 3
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
|
|
+
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ rev64 $res0b, $res1b @ GHASH final-3 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-2 block - load plaintext
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr5b, $t1.16b @ AES final-2 block - result
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
|
|
+.L256_enc_blocks_more_than_2: @ blocks left > 2
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-2 block
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final-1 block - load plaintext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
|
|
+
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
+ eor3 $res1b, $ctr_t1b, $ctr6b, $t1.16b @ AES final-1 block - result
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
+.L256_enc_blocks_more_than_1: @ blocks left > 1
|
|
+
|
|
+ st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
|
|
+
|
|
+ ldr $h2q, [$current_tag, #64] @ load h1l | h1h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ rev64 $res0b, $res1b @ GHASH final-1 block
|
|
+ ldr $ctr_t1q, [$input_ptr], #16 @ AES final block - load plaintext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ movi $t0.8b, #0 @ surpress further partial tag feed in
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
+
|
|
+ eor3 $res1b, $ctr_t1b, $ctr7b, $t1.16b @ AES final block - result
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
+.L256_enc_blocks_less_than_1: @ blocks left <= 1
|
|
+
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
+
|
|
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
+
|
|
+ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
|
|
+ cmp $bit_length, #64
|
|
+ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
|
|
+
|
|
+ csel $temp3_x, $temp0_x, xzr, lt
|
|
+ csel $temp2_x, $temp1_x, $temp0_x, lt
|
|
+
|
|
+ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+
|
|
+ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
|
|
+ mov $ctr0.d[1], $temp3_x
|
|
+
|
|
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final block
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
|
|
+ bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
|
|
+ str $rtmp_ctrq, [$counter] @ store the updated counter
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ st1 { $res1b}, [$output_ptr] @ store all 16B
|
|
+
|
|
+ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
+
|
|
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
+
|
|
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $acc_hb, $t11.16b @ MODULO - fold into low
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+ st1 { $acc_l.16b }, [$current_tag]
|
|
+ lsr x0, $bit_length, #3 @ return sizes
|
|
+
|
|
+ ldp d10, d11, [sp, #16]
|
|
+ ldp d12, d13, [sp, #32]
|
|
+ ldp d14, d15, [sp, #48]
|
|
+ ldp d8, d9, [sp], #80
|
|
+ ret
|
|
+
|
|
+.L256_enc_ret:
|
|
+ mov w0, #0x0
|
|
+ ret
|
|
+.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
|
|
+___
|
|
+
|
|
+{
|
|
+#########################################################################################
|
|
+# size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in,
|
|
+# size_t len,
|
|
+# unsigned char *out,
|
|
+# const void *key,
|
|
+# unsigned char ivec[16],
|
|
+# u64 *Xi);
|
|
+#
|
|
+$code.=<<___;
|
|
+.global unroll8_eor3_aes_gcm_dec_256_kernel
|
|
+.type unroll8_eor3_aes_gcm_dec_256_kernel,%function
|
|
+.align 4
|
|
+unroll8_eor3_aes_gcm_dec_256_kernel:
|
|
+ AARCH64_VALID_CALL_TARGET
|
|
+ cbz x1, .L256_dec_ret
|
|
+ stp d8, d9, [sp, #-80]!
|
|
+ mov $counter, x4
|
|
+ mov $cc, x5
|
|
+ stp d10, d11, [sp, #16]
|
|
+ stp d12, d13, [sp, #32]
|
|
+ stp d14, d15, [sp, #48]
|
|
+ mov x5, #0xc200000000000000
|
|
+ stp x5, xzr, [sp, #64]
|
|
+ add $modulo_constant, sp, #64
|
|
+
|
|
+ ld1 { $ctr0b}, [$counter] @ CTR block 0
|
|
+
|
|
+ mov $constant_temp, #0x100000000 @ set up counter increment
|
|
+ movi $rctr_inc.16b, #0x0
|
|
+ mov $rctr_inc.d[1], $constant_temp
|
|
+ lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
|
|
+
|
|
+ sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
|
|
+
|
|
+ rev32 $rtmp_ctr.16b, $ctr0.16b @ set up reversed counter
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 0
|
|
+
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 1
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 2
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 2
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 3
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 3
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 4
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 4
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
|
|
+
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 5
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 5
|
|
+
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 6
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 6
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 7
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 0
|
|
+
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 0
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 0
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 1
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 1
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
|
|
+
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 1
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 1
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
|
|
+
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
|
|
+
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 2
|
|
+
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 2
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 2
|
|
+
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 2
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
|
|
+
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 3
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 3
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 3
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
|
|
+
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 3
|
|
+
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 4
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
|
|
+
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 4
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
|
|
+
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 4
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 4
|
|
+
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 5
|
|
+
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 5
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 5
|
|
+
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 5
|
|
+
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
|
|
+
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
|
|
+
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 6
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 6
|
|
+
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 6
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 6
|
|
+
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 7
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
|
|
+
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 7
|
|
+
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 7
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 7
|
|
+
|
|
+ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 8
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 8
|
|
+
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
|
|
+
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 8
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 8
|
|
+
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
|
|
+
|
|
+ ld1 { $acc_lb}, [$current_tag]
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+ add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
|
|
+ add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
+
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 9
|
|
+
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 9
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 9
|
|
+
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 9
|
|
+
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
|
|
+
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 10
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 10
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 10
|
|
+
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
|
|
+
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 10
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
|
|
+ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
|
|
+
|
|
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
|
|
+
|
|
+ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 11
|
|
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
|
|
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
|
|
+
|
|
+ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 11
|
|
+ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 11
|
|
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
|
|
+
|
|
+ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 11
|
|
+ ldr $rk14q, [$cc, #224] @ load rk14
|
|
+
|
|
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
|
|
+ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 4 - round 12
|
|
+ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 5 - round 12
|
|
+
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
|
|
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
|
|
+
|
|
+ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 6 - round 12
|
|
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
|
|
+ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 7 - round 12
|
|
+
|
|
+ aese $ctr5b, $rk13 @ AES block 5 - round 13
|
|
+ aese $ctr1b, $rk13 @ AES block 1 - round 13
|
|
+ aese $ctr2b, $rk13 @ AES block 2 - round 13
|
|
+
|
|
+ aese $ctr0b, $rk13 @ AES block 0 - round 13
|
|
+ aese $ctr4b, $rk13 @ AES block 4 - round 13
|
|
+ aese $ctr6b, $rk13 @ AES block 6 - round 13
|
|
+
|
|
+ aese $ctr3b, $rk13 @ AES block 3 - round 13
|
|
+ aese $ctr7b, $rk13 @ AES block 7 - round 13
|
|
+ b.ge .L256_dec_tail @ handle tail
|
|
+
|
|
+ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 0, 1 - load ciphertext
|
|
+
|
|
+ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 2, 3 - load ciphertext
|
|
+
|
|
+ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 4, 5 - load ciphertext
|
|
+
|
|
+ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 6, 7 - load ciphertext
|
|
+ cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
|
|
+
|
|
+ eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 1 - result
|
|
+ eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 0 - result
|
|
+ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 0, 1 - store result
|
|
+
|
|
+ rev32 $ctr0.16b, $rtmp_ctr.16b @ CTR block 8
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8
|
|
+ eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 3 - result
|
|
+
|
|
+ eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 5 - result
|
|
+
|
|
+ eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 4 - result
|
|
+ rev32 $ctr1.16b, $rtmp_ctr.16b @ CTR block 9
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 9
|
|
+
|
|
+ eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 2 - result
|
|
+ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 2, 3 - store result
|
|
+
|
|
+ rev32 $ctr2.16b, $rtmp_ctr.16b @ CTR block 10
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 10
|
|
+
|
|
+ eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 6 - result
|
|
+
|
|
+ rev32 $ctr3.16b, $rtmp_ctr.16b @ CTR block 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 11
|
|
+ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 4, 5 - store result
|
|
+
|
|
+ eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 7 - result
|
|
+ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 6, 7 - store result
|
|
+
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 12
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 12
|
|
+ b.ge .L256_dec_prepretail @ do prepretail
|
|
+
|
|
+.L256_dec_main_loop: @ main loop start
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5
|
|
+
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h1l | h1h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
|
|
+
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
|
|
+
|
|
+ ldp $res0q, $res1q, [$input_ptr], #32 @ AES block 8k+8, 8k+9 - load ciphertext
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
|
|
+
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
|
|
+
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
|
|
+
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
|
|
+
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+ rev32 $h1.16b, $rtmp_ctr.16b @ CTR block 8k+16
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+16
|
|
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
|
|
+ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
|
|
+
|
|
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
|
|
+ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+ rev32 $h2.16b, $rtmp_ctr.16b @ CTR block 8k+17
|
|
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
|
|
+
|
|
+ ldp $res2q, $res3q, [$input_ptr], #32 @ AES block 8k+10, 8k+11 - load ciphertext
|
|
+ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+17
|
|
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
|
|
+
|
|
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
|
|
+ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
|
|
+ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
|
|
+
|
|
+ rev32 $h3.16b, $rtmp_ctr.16b @ CTR block 8k+18
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+18
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
|
|
+ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
|
|
+
|
|
+ ldr $rk14q, [$cc, #224] @ load rk14
|
|
+ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
|
|
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
|
|
+ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
|
|
+
|
|
+ ldp $res4q, $res5q, [$input_ptr], #32 @ AES block 8k+12, 8k+13 - load ciphertext
|
|
+ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
|
|
+ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
|
|
+
|
|
+ ldp $res6q, $res7q, [$input_ptr], #32 @ AES block 8k+14, 8k+15 - load ciphertext
|
|
+ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
|
|
+ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
|
|
+
|
|
+ rev32 $h4.16b, $rtmp_ctr.16b @ CTR block 8k+19
|
|
+ eor3 $ctr2b, $res2b, $ctr2b, $rk14 @ AES block 8k+10 - result
|
|
+ eor3 $ctr1b, $res1b, $ctr1b, $rk14 @ AES block 8k+9 - result
|
|
+
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
|
|
+
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+19
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
|
|
+
|
|
+ eor3 $ctr5b, $res5b, $ctr5b, $rk14 @ AES block 8k+13 - result
|
|
+ eor3 $ctr0b, $res0b, $ctr0b, $rk14 @ AES block 8k+8 - result
|
|
+ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
|
|
+
|
|
+ stp $ctr0q, $ctr1q, [$output_ptr], #32 @ AES block 8k+8, 8k+9 - store result
|
|
+ mov $ctr0.16b, $h1.16b @ CTR block 8k+16
|
|
+ eor3 $ctr4b, $res4b, $ctr4b, $rk14 @ AES block 8k+12 - result
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
|
|
+ eor3 $ctr3b, $res3b, $ctr3b, $rk14 @ AES block 8k+11 - result
|
|
+ stp $ctr2q, $ctr3q, [$output_ptr], #32 @ AES block 8k+10, 8k+11 - store result
|
|
+
|
|
+ mov $ctr3.16b, $h4.16b @ CTR block 8k+19
|
|
+ mov $ctr2.16b, $h3.16b @ CTR block 8k+18
|
|
+ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
|
|
+
|
|
+ mov $ctr1.16b, $h2.16b @ CTR block 8k+17
|
|
+ stp $ctr4q, $ctr5q, [$output_ptr], #32 @ AES block 8k+12, 8k+13 - store result
|
|
+ eor3 $ctr7b, $res7b, $ctr7b, $rk14 @ AES block 8k+15 - result
|
|
+
|
|
+ eor3 $ctr6b, $res6b, $ctr6b, $rk14 @ AES block 8k+14 - result
|
|
+ rev32 $ctr4.16b, $rtmp_ctr.16b @ CTR block 8k+20
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+20
|
|
+
|
|
+ cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
|
|
+ stp $ctr6q, $ctr7q, [$output_ptr], #32 @ AES block 8k+14, 8k+15 - store result
|
|
+ b.lt .L256_dec_main_loop
|
|
+
|
|
+.L256_dec_prepretail: @ PREPRETAIL
|
|
+ ldp $rk0q, $rk1q, [$cc, #0] @ load rk0, rk1
|
|
+ rev32 $ctr5.16b, $rtmp_ctr.16b @ CTR block 8k+13
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+13
|
|
+
|
|
+ rev64 $res4b, $res4b @ GHASH block 8k+4
|
|
+ ldr $h56kq, [$current_tag, #144] @ load h6k | h5k
|
|
+ ldr $h78kq, [$current_tag, #192] @ load h8k | h7k
|
|
+
|
|
+ rev32 $ctr6.16b, $rtmp_ctr.16b @ CTR block 8k+14
|
|
+ rev64 $res0b, $res0b @ GHASH block 8k
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+14
|
|
+
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
|
|
+ ldr $h7q, [$current_tag, #176] @ load h7l | h7h
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ ldr $h8q, [$current_tag, #208] @ load h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ rev64 $res1b, $res1b @ GHASH block 8k+1
|
|
+
|
|
+ rev32 $ctr7.16b, $rtmp_ctr.16b @ CTR block 8k+15
|
|
+ rev64 $res2b, $res2b @ GHASH block 8k+2
|
|
+ ldr $h5q, [$current_tag, #128] @ load h5l | h5h
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+ ldr $h6q, [$current_tag, #160] @ load h6l | h6h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+
|
|
+ aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 0
|
|
+ aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 0
|
|
+ aese $ctr4b, $rk0 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 0
|
|
+
|
|
+ aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 0
|
|
+ aese $ctr5b, $rk0 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 0
|
|
+ aese $ctr6b, $rk0 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 0
|
|
+
|
|
+ aese $ctr4b, $rk1 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 1
|
|
+ aese $ctr7b, $rk0 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 0
|
|
+ aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 0
|
|
+
|
|
+ ldp $rk2q, $rk3q, [$cc, #32] @ load rk2, rk3
|
|
+ aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 1
|
|
+ eor $res0b, $res0b, $acc_lb @ PRE 1
|
|
+
|
|
+ aese $ctr7b, $rk1 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 1
|
|
+ aese $ctr6b, $rk1 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 1
|
|
+ aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 1
|
|
+
|
|
+ aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 1
|
|
+ aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 1
|
|
+ aese $ctr5b, $rk1 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 1
|
|
+
|
|
+ pmull2 $t0.1q, $res1.2d, $h7.2d @ GHASH block 8k+1 - high
|
|
+ trn1 $acc_m.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH block 8k - low
|
|
+
|
|
+ rev64 $res3b, $res3b @ GHASH block 8k+3
|
|
+ pmull $h7.1q, $res1.1d, $h7.1d @ GHASH block 8k+1 - low
|
|
+
|
|
+ aese $ctr5b, $rk2 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 2
|
|
+ aese $ctr7b, $rk2 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 2
|
|
+ aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 2
|
|
+
|
|
+ aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 2
|
|
+ aese $ctr6b, $rk2 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 2
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH block 8k - high
|
|
+
|
|
+ aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 2
|
|
+ aese $ctr7b, $rk3 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 3
|
|
+
|
|
+ aese $ctr5b, $rk3 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 3
|
|
+ rev64 $res6b, $res6b @ GHASH block 8k+6
|
|
+
|
|
+ aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 3
|
|
+ aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 2
|
|
+ aese $ctr6b, $rk3 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 3
|
|
+
|
|
+ pmull2 $t1.1q, $res2.2d, $h6.2d @ GHASH block 8k+2 - high
|
|
+ trn2 $res0.2d, $res1.2d, $res0.2d @ GHASH block 8k, 8k+1 - mid
|
|
+ aese $ctr4b, $rk2 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 2
|
|
+
|
|
+ ldp $rk4q, $rk5q, [$cc, #64] @ load rk4, rk5
|
|
+ aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 3
|
|
+ pmull2 $t2.1q, $res3.2d, $h5.2d @ GHASH block 8k+3 - high
|
|
+
|
|
+ aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 3
|
|
+ eor $acc_hb, $acc_hb, $t0.16b @ GHASH block 8k+1 - high
|
|
+ eor $res0.16b, $res0.16b, $acc_m.16b @ GHASH block 8k, 8k+1 - mid
|
|
+
|
|
+ aese $ctr4b, $rk3 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 3
|
|
+ pmull $h6.1q, $res2.1d, $h6.1d @ GHASH block 8k+2 - low
|
|
+ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 3
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t1.16b, $t2.16b @ GHASH block 8k+2, 8k+3 - high
|
|
+ trn1 $t3.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+ trn2 $res2.2d, $res3.2d, $res2.2d @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ pmull2 $acc_m.1q, $res0.2d, $h78k.2d @ GHASH block 8k - mid
|
|
+ pmull $h5.1q, $res3.1d, $h5.1d @ GHASH block 8k+3 - low
|
|
+ eor $acc_lb, $acc_lb, $h7.16b @ GHASH block 8k+1 - low
|
|
+
|
|
+ pmull $h78k.1q, $res0.1d, $h78k.1d @ GHASH block 8k+1 - mid
|
|
+ aese $ctr5b, $rk4 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 4
|
|
+ aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 4
|
|
+
|
|
+ eor3 $acc_lb, $acc_lb, $h6.16b, $h5.16b @ GHASH block 8k+2, 8k+3 - low
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+ aese $ctr7b, $rk4 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 4
|
|
+
|
|
+ aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 4
|
|
+ aese $ctr6b, $rk4 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 4
|
|
+ eor $acc_mb, $acc_mb, $h78k.16b @ GHASH block 8k+1 - mid
|
|
+
|
|
+ eor $res2.16b, $res2.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+ aese $ctr7b, $rk5 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 5
|
|
+ aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 4
|
|
+
|
|
+ aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 5
|
|
+ aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 4
|
|
+ aese $ctr4b, $rk4 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 4
|
|
+
|
|
+ aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 5
|
|
+ pmull2 $t3.1q, $res2.2d, $h56k.2d @ GHASH block 8k+2 - mid
|
|
+ aese $ctr6b, $rk5 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 5
|
|
+
|
|
+ aese $ctr4b, $rk5 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 5
|
|
+ aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 5
|
|
+ pmull $h56k.1q, $res2.1d, $h56k.1d @ GHASH block 8k+3 - mid
|
|
+
|
|
+ aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 5
|
|
+ aese $ctr5b, $rk5 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 5
|
|
+ ldp $rk6q, $rk7q, [$cc, #96] @ load rk6, rk7
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ rev64 $res7b, $res7b @ GHASH block 8k+7
|
|
+ rev64 $res5b, $res5b @ GHASH block 8k+5
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h56k.16b, $t3.16b @ GHASH block 8k+2, 8k+3 - mid
|
|
+
|
|
+ trn1 $t6.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 6
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ aese $ctr6b, $rk6 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 6
|
|
+
|
|
+ aese $ctr5b, $rk6 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 6
|
|
+ aese $ctr7b, $rk6 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 6
|
|
+
|
|
+ pmull2 $t4.1q, $res4.2d, $h4.2d @ GHASH block 8k+4 - high
|
|
+ pmull2 $t5.1q, $res5.2d, $h3.2d @ GHASH block 8k+5 - high
|
|
+ pmull $h4.1q, $res4.1d, $h4.1d @ GHASH block 8k+4 - low
|
|
+
|
|
+ trn2 $res4.2d, $res5.2d, $res4.2d @ GHASH block 8k+4, 8k+5 - mid
|
|
+ pmull $h3.1q, $res5.1d, $h3.1d @ GHASH block 8k+5 - low
|
|
+ trn1 $t9.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr7b, $rk7 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 7
|
|
+ pmull2 $t7.1q, $res6.2d, $h2.2d @ GHASH block 8k+6 - high
|
|
+ aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 6
|
|
+
|
|
+ aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 6
|
|
+ aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 6
|
|
+ aese $ctr4b, $rk6 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 6
|
|
+
|
|
+ ldp $rk8q, $rk9q, [$cc, #128] @ load rk8, rk9
|
|
+ pmull $h2.1q, $res6.1d, $h2.1d @ GHASH block 8k+6 - low
|
|
+ aese $ctr5b, $rk7 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 7
|
|
+
|
|
+ aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 7
|
|
+ aese $ctr4b, $rk7 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 7
|
|
+
|
|
+ aese $ctr6b, $rk7 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 7
|
|
+ aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 7
|
|
+ eor3 $acc_hb, $acc_hb, $t4.16b, $t5.16b @ GHASH block 8k+4, 8k+5 - high
|
|
+
|
|
+ aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 7
|
|
+ trn2 $res6.2d, $res7.2d, $res6.2d @ GHASH block 8k+6, 8k+7 - mid
|
|
+ aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 7
|
|
+
|
|
+ aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 8
|
|
+ aese $ctr7b, $rk8 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 8
|
|
+ aese $ctr4b, $rk8 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 8
|
|
+
|
|
+ aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 8
|
|
+ aese $ctr5b, $rk8 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 8
|
|
+ aese $ctr6b, $rk8 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 8
|
|
+
|
|
+ aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 8
|
|
+ aese $ctr4b, $rk9 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 9
|
|
+ eor $res4.16b, $res4.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 9
|
|
+ aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 9
|
|
+ eor $res6.16b, $res6.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr6b, $rk9 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 9
|
|
+ aese $ctr7b, $rk9 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 9
|
|
+ pmull2 $t6.1q, $res4.2d, $h34k.2d @ GHASH block 8k+4 - mid
|
|
+
|
|
+ aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 8
|
|
+ pmull $h34k.1q, $res4.1d, $h34k.1d @ GHASH block 8k+5 - mid
|
|
+ pmull2 $t8.1q, $res7.2d, $h1.2d @ GHASH block 8k+7 - high
|
|
+
|
|
+ pmull2 $t9.1q, $res6.2d, $h12k.2d @ GHASH block 8k+6 - mid
|
|
+ pmull $h12k.1q, $res6.1d, $h12k.1d @ GHASH block 8k+7 - mid
|
|
+ pmull $h1.1q, $res7.1d, $h1.1d @ GHASH block 8k+7 - low
|
|
+
|
|
+ ldp $rk10q, $rk11q, [$cc, #160] @ load rk10, rk11
|
|
+ eor3 $acc_lb, $acc_lb, $h4.16b, $h3.16b @ GHASH block 8k+4, 8k+5 - low
|
|
+ eor3 $acc_mb, $acc_mb, $h34k.16b, $t6.16b @ GHASH block 8k+4, 8k+5 - mid
|
|
+
|
|
+ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 9
|
|
+ aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 9
|
|
+ aese $ctr5b, $rk9 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 9
|
|
+
|
|
+ eor3 $acc_hb, $acc_hb, $t7.16b, $t8.16b @ GHASH block 8k+6, 8k+7 - high
|
|
+ eor3 $acc_lb, $acc_lb, $h2.16b, $h1.16b @ GHASH block 8k+6, 8k+7 - low
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $h12k.16b, $t9.16b @ GHASH block 8k+6, 8k+7 - mid
|
|
+
|
|
+ aese $ctr4b, $rk10 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 10
|
|
+ aese $ctr6b, $rk10 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 10
|
|
+ aese $ctr5b, $rk10 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 10
|
|
+
|
|
+ aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 10
|
|
+ aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 10
|
|
+ aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 10
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+
|
|
+ aese $ctr7b, $rk10 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 10
|
|
+ aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 10
|
|
+ ldp $rk12q, $rk13q, [$cc, #192] @ load rk12, rk13
|
|
+
|
|
+ ext $t11.16b, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+
|
|
+ aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 11
|
|
+ aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 11
|
|
+ aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 11
|
|
+
|
|
+ pmull $t12.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 11
|
|
+
|
|
+ aese $ctr7b, $rk11 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 11
|
|
+ aese $ctr6b, $rk11 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 11
|
|
+ aese $ctr4b, $rk11 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 11
|
|
+
|
|
+ aese $ctr5b, $rk11 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 11
|
|
+ aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 8k+11 - round 12
|
|
+
|
|
+ eor3 $acc_mb, $acc_mb, $t12.16b, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ aese $ctr3b, $rk13 @ AES block 8k+11 - round 13
|
|
+ aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 8k+10 - round 12
|
|
+ aese $ctr6b, $rk12 \n aesmc $ctr6b, $ctr6b @ AES block 8k+14 - round 12
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+ aese $ctr4b, $rk12 \n aesmc $ctr4b, $ctr4b @ AES block 8k+12 - round 12
|
|
+ aese $ctr7b, $rk12 \n aesmc $ctr7b, $ctr7b @ AES block 8k+15 - round 12
|
|
+
|
|
+ aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 8k+8 - round 12
|
|
+ ldr $rk14q, [$cc, #224] @ load rk14
|
|
+ aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 8k+9 - round 12
|
|
+
|
|
+ aese $ctr4b, $rk13 @ AES block 8k+12 - round 13
|
|
+ ext $t11.16b, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+ aese $ctr5b, $rk12 \n aesmc $ctr5b, $ctr5b @ AES block 8k+13 - round 12
|
|
+
|
|
+ aese $ctr6b, $rk13 @ AES block 8k+14 - round 13
|
|
+ aese $ctr2b, $rk13 @ AES block 8k+10 - round 13
|
|
+ aese $ctr1b, $rk13 @ AES block 8k+9 - round 13
|
|
+
|
|
+ aese $ctr5b, $rk13 @ AES block 8k+13 - round 13
|
|
+ eor3 $acc_lb, $acc_lb, $t11.16b, $acc_hb @ MODULO - fold into low
|
|
+ add $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 8k+15
|
|
+
|
|
+ aese $ctr7b, $rk13 @ AES block 8k+15 - round 13
|
|
+ aese $ctr0b, $rk13 @ AES block 8k+8 - round 13
|
|
+.L256_dec_tail: @ TAIL
|
|
+
|
|
+ ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
|
|
+ sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
|
|
+ cmp $main_end_input_ptr, #112
|
|
+
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES block 8k+8 - load ciphertext
|
|
+
|
|
+ ldp $h78kq, $h8q, [$current_tag, #192] @ load h8k | h7k and h8l | h8h
|
|
+ ext $h8.16b, $h8.16b, $h8.16b, #8
|
|
+ mov $t1.16b, $rk14
|
|
+
|
|
+ ldp $h5q, $h56kq, [$current_tag, #128] @ load h5l | h5h and h6k | h5k
|
|
+ ext $h5.16b, $h5.16b, $h5.16b, #8
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr0b, $t1.16b @ AES block 8k+8 - result
|
|
+ ldp $h6q, $h7q, [$current_tag, #160] @ load h6l | h6h and h7l | h7h
|
|
+ ext $h6.16b, $h6.16b, $h6.16b, #8
|
|
+ ext $h7.16b, $h7.16b, $h7.16b, #8
|
|
+ b.gt .L256_dec_blocks_more_than_7
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr6b, $ctr5b
|
|
+
|
|
+ mov $ctr5b, $ctr4b
|
|
+ mov $ctr4b, $ctr3b
|
|
+ movi $acc_l.8b, #0
|
|
+
|
|
+ movi $acc_h.8b, #0
|
|
+ movi $acc_m.8b, #0
|
|
+ mov $ctr3b, $ctr2b
|
|
+
|
|
+ cmp $main_end_input_ptr, #96
|
|
+ mov $ctr2b, $ctr1b
|
|
+ b.gt .L256_dec_blocks_more_than_6
|
|
+
|
|
+ mov $ctr7b, $ctr6b
|
|
+ mov $ctr6b, $ctr5b
|
|
+
|
|
+ mov $ctr5b, $ctr4b
|
|
+ cmp $main_end_input_ptr, #80
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr4b, $ctr3b
|
|
+ mov $ctr3b, $ctr1b
|
|
+ b.gt .L256_dec_blocks_more_than_5
|
|
+
|
|
+ cmp $main_end_input_ptr, #64
|
|
+ mov $ctr7b, $ctr6b
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr6b, $ctr5b
|
|
+
|
|
+ mov $ctr5b, $ctr4b
|
|
+ mov $ctr4b, $ctr1b
|
|
+ b.gt .L256_dec_blocks_more_than_4
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr7b, $ctr6b
|
|
+ cmp $main_end_input_ptr, #48
|
|
+
|
|
+ mov $ctr6b, $ctr5b
|
|
+ mov $ctr5b, $ctr1b
|
|
+ b.gt .L256_dec_blocks_more_than_3
|
|
+
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ mov $ctr7b, $ctr6b
|
|
+
|
|
+ cmp $main_end_input_ptr, #32
|
|
+ mov $ctr6b, $ctr1b
|
|
+ b.gt .L256_dec_blocks_more_than_2
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+
|
|
+ mov $ctr7b, $ctr1b
|
|
+ cmp $main_end_input_ptr, #16
|
|
+ b.gt .L256_dec_blocks_more_than_1
|
|
+
|
|
+ sub $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ b .L256_dec_blocks_less_than_1
|
|
+.L256_dec_blocks_more_than_7: @ blocks left > 7
|
|
+ rev64 $res0b, $res1b @ GHASH final-7 block
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-6 block - load ciphertext
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-7 block - store result
|
|
+
|
|
+ ins $acc_m.d[0], $h78k.d[1] @ GHASH final-7 block - mid
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-7 block - mid
|
|
+ eor3 $res4b, $res1b, $ctr1b, $t1.16b @ AES final-6 block - result
|
|
+
|
|
+ pmull2 $acc_h.1q, $res0.2d, $h8.2d @ GHASH final-7 block - high
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-7 block - mid
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ pmull $acc_l.1q, $res0.1d, $h8.1d @ GHASH final-7 block - low
|
|
+ pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-7 block - mid
|
|
+.L256_dec_blocks_more_than_6: @ blocks left > 6
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-6 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-5 block - load ciphertext
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-6 block - mid
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-6 block - store result
|
|
+ pmull2 $rk2q1, $res0.2d, $h7.2d @ GHASH final-6 block - high
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h7.1d @ GHASH final-6 block - low
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr2b, $t1.16b @ AES final-5 block - result
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-6 block - low
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-6 block - mid
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h78k.1d @ GHASH final-6 block - mid
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-6 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-6 block - high
|
|
+.L256_dec_blocks_more_than_5: @ blocks left > 5
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-5 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ pmull2 $rk2q1, $res0.2d, $h6.2d @ GHASH final-5 block - high
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-5 block - mid
|
|
+
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-4 block - load ciphertext
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-5 block - mid
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-5 block - store result
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h6.1d @ GHASH final-5 block - low
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-5 block - mid
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h56k.2d @ GHASH final-5 block - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-5 block - high
|
|
+ eor3 $res4b, $res1b, $ctr3b, $t1.16b @ AES final-4 block - result
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-5 block - low
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-5 block - mid
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+.L256_dec_blocks_more_than_4: @ blocks left > 4
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-4 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-4 block - mid
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-3 block - load ciphertext
|
|
+
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h5.1d @ GHASH final-4 block - low
|
|
+ pmull2 $rk2q1, $res0.2d, $h5.2d @ GHASH final-4 block - high
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-4 block - mid
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-4 block - high
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h56k.1d @ GHASH final-4 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-4 block - low
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-4 block - store result
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-4 block - mid
|
|
+ eor3 $res4b, $res1b, $ctr4b, $t1.16b @ AES final-3 block - result
|
|
+.L256_dec_blocks_more_than_3: @ blocks left > 3
|
|
+
|
|
+ ldr $h4q, [$current_tag, #112] @ load h4l | h4h
|
|
+ ext $h4.16b, $h4.16b, $h4.16b, #8
|
|
+ rev64 $res0b, $res1b @ GHASH final-3 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-2 block - load ciphertext
|
|
+ ldr $h34kq, [$current_tag, #96] @ load h4k | h3k
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-3 block - mid
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-3 block - store result
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr5b, $t1.16b @ AES final-2 block - result
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-3 block - mid
|
|
+ pmull $rk3q1, $res0.1d, $h4.1d @ GHASH final-3 block - low
|
|
+ pmull2 $rk2q1, $res0.2d, $h4.2d @ GHASH final-3 block - high
|
|
+
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h34k.2d @ GHASH final-3 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-3 block - low
|
|
+
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-3 block - high
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-3 block - mid
|
|
+.L256_dec_blocks_more_than_2: @ blocks left > 2
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-2 block
|
|
+
|
|
+ ldr $h3q, [$current_tag, #80] @ load h3l | h3h
|
|
+ ext $h3.16b, $h3.16b, $h3.16b, #8
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final-1 block - load ciphertext
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-2 block - mid
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-2 block - store result
|
|
+ eor3 $res4b, $res1b, $ctr6b, $t1.16b @ AES final-1 block - result
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+
|
|
+ pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
|
|
+.L256_dec_blocks_more_than_1: @ blocks left > 1
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final-1 block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $rk4v.d[0], $res0.d[1] @ GHASH final-1 block - mid
|
|
+ ldr $h2q, [$current_tag, #64] @ load h2l | h2h
|
|
+ ext $h2.16b, $h2.16b, $h2.16b, #8
|
|
+
|
|
+ eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
|
|
+ ldr $res1q, [$input_ptr], #16 @ AES final block - load ciphertext
|
|
+ st1 { $res4b}, [$output_ptr], #16 @ AES final-1 block - store result
|
|
+
|
|
+ ldr $h12kq, [$current_tag, #48] @ load h2k | h1k
|
|
+ pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
|
|
+
|
|
+ ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
|
|
+
|
|
+ eor3 $res4b, $res1b, $ctr7b, $t1.16b @ AES final block - result
|
|
+ pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
|
|
+
|
|
+ pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
|
|
+
|
|
+ movi $t0.8b, #0 @ suppress further partial tag feed in
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
|
|
+.L256_dec_blocks_less_than_1: @ blocks left <= 1
|
|
+
|
|
+ ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
|
|
+ mvn $temp0_x, xzr @ temp0_x = 0xffffffffffffffff
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ sub $bit_length, $bit_length, #128 @ bit_length -= 128
|
|
+ rev32 $rtmp_ctr.16b, $rtmp_ctr.16b
|
|
+ str $rtmp_ctrq, [$counter] @ store the updated counter
|
|
+
|
|
+ neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
|
|
+
|
|
+ and $bit_length, $bit_length, #127 @ bit_length %= 128
|
|
+
|
|
+ lsr $temp0_x, $temp0_x, $bit_length @ temp0_x is mask for top 64b of last block
|
|
+ cmp $bit_length, #64
|
|
+ mvn $temp1_x, xzr @ temp1_x = 0xffffffffffffffff
|
|
+
|
|
+ csel $temp3_x, $temp0_x, xzr, lt
|
|
+ csel $temp2_x, $temp1_x, $temp0_x, lt
|
|
+
|
|
+ mov $ctr0.d[0], $temp2_x @ ctr0b is mask for last block
|
|
+ mov $ctr0.d[1], $temp3_x
|
|
+
|
|
+ and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
|
|
+ ldr $h1q, [$current_tag, #32] @ load h1l | h1h
|
|
+ ext $h1.16b, $h1.16b, $h1.16b, #8
|
|
+ bif $res4b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
|
|
+
|
|
+ rev64 $res0b, $res1b @ GHASH final block
|
|
+
|
|
+ eor $res0b, $res0b, $t0.16b @ feed in partial tag
|
|
+
|
|
+ ins $t0.d[0], $res0.d[1] @ GHASH final block - mid
|
|
+ pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
|
|
+
|
|
+ eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
|
|
+
|
|
+ pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
|
|
+ eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
|
|
+
|
|
+ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
|
|
+ ldr $mod_constantd, [$modulo_constant] @ MODULO - load modulo constant
|
|
+ eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
|
|
+
|
|
+ pmull $t11.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
|
|
+ eor $t10.16b, $acc_hb, $acc_lb @ MODULO - karatsuba tidy up
|
|
+
|
|
+ ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
|
|
+ st1 { $res4b}, [$output_ptr] @ store all 16B
|
|
+
|
|
+ eor $acc_mb, $acc_mb, $t10.16b @ MODULO - karatsuba tidy up
|
|
+
|
|
+ eor $t11.16b, $acc_hb, $t11.16b @ MODULO - fold into mid
|
|
+ eor $acc_mb, $acc_mb, $t11.16b @ MODULO - fold into mid
|
|
+
|
|
+ pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
|
|
+
|
|
+ ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
|
|
+ eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
|
|
+
|
|
+ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
|
|
+ ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
+ rev64 $acc_lb, $acc_lb
|
|
+ st1 { $acc_l.16b }, [$current_tag]
|
|
+ lsr x0, $bit_length, #3 @ return sizes
|
|
+
|
|
+ ldp d10, d11, [sp, #16]
|
|
+ ldp d12, d13, [sp, #32]
|
|
+ ldp d14, d15, [sp, #48]
|
|
+ ldp d8, d9, [sp], #80
|
|
+ ret
|
|
+
|
|
+.L256_dec_ret:
|
|
+ mov w0, #0x0
|
|
+ ret
|
|
+.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
|
|
+___
|
|
+}
|
|
+}
|
|
+
|
|
+$code.=<<___;
|
|
+.asciz "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian\@arm.com>"
|
|
+.align 2
|
|
+#endif
|
|
+___
|
|
+
|
|
+{
|
|
+ my %opcode = (
|
|
+ "rax1" => 0xce608c00, "eor3" => 0xce000000,
|
|
+ "bcax" => 0xce200000, "xar" => 0xce800000 );
|
|
+
|
|
+ sub unsha3 {
|
|
+ my ($mnemonic,$arg)=@_;
|
|
+
|
|
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
|
|
+ &&
|
|
+ sprintf ".inst\t0x%08x\t//%s %s",
|
|
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
|
|
+ $mnemonic,$arg;
|
|
+ }
|
|
+ sub unvmov {
|
|
+ my $arg=shift;
|
|
+
|
|
+ $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
|
|
+ sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
|
|
+ $3<8?$3:$3+8,($4 eq "lo")?0:1;
|
|
+ }
|
|
+
|
|
+ foreach(split("\n",$code)) {
|
|
+ s/@\s/\/\//o; # old->new style commentary
|
|
+ s/\`([^\`]*)\`/eval($1)/ge;
|
|
+
|
|
+ m/\bld1r\b/ and s/\.16b/.2d/g or
|
|
+ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
|
|
+ print $_,"\n";
|
|
+ }
|
|
+}
|
|
+
|
|
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
|
|
Index: openssl-1.1.1m/crypto/modes/asm/ghashv8-armx.pl
|
|
===================================================================
|
|
--- openssl-1.1.1m.orig/crypto/modes/asm/ghashv8-armx.pl
|
|
+++ openssl-1.1.1m/crypto/modes/asm/ghashv8-armx.pl
|
|
@@ -141,6 +141,7 @@ gcm_init_v8:
|
|
___
|
|
if ($flavour =~ /64/) {
|
|
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
|
|
+my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23));
|
|
|
|
$code.=<<___;
|
|
@ calculate H^3 and H^4
|
|
@@ -175,15 +176,103 @@ $code.=<<___;
|
|
vpmull.p64 $Yl,$Yl,$xC2
|
|
veor $t2,$t2,$Xh
|
|
veor $t3,$t3,$Yh
|
|
- veor $H, $Xl,$t2 @ H^3
|
|
- veor $H2,$Yl,$t3 @ H^4
|
|
+ veor $H3, $Xl,$t2 @ H^3
|
|
+ veor $H4,$Yl,$t3 @ H^4
|
|
|
|
- vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing
|
|
- vext.8 $t1,$H2,$H2,#8
|
|
- veor $t0,$t0,$H
|
|
- veor $t1,$t1,$H2
|
|
- vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
|
|
- vst1.64 {$H-$H2},[x0] @ store Htable[3..5]
|
|
+ vext.8 $t0,$H3, $H3,#8 @ Karatsuba pre-processing
|
|
+ vext.8 $t1,$H4,$H4,#8
|
|
+ vext.8 $t2,$H2,$H2,#8
|
|
+ veor $t0,$t0,$H3
|
|
+ veor $t1,$t1,$H4
|
|
+ veor $t2,$t2,$H2
|
|
+ vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed
|
|
+ vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5]
|
|
+
|
|
+ @ calculate H^5 and H^6
|
|
+ vpmull.p64 $Xl,$H2, $H3
|
|
+ vpmull.p64 $Yl,$H3,$H3
|
|
+ vpmull2.p64 $Xh,$H2, $H3
|
|
+ vpmull2.p64 $Yh,$H3,$H3
|
|
+ vpmull.p64 $Xm,$t0,$t2
|
|
+ vpmull.p64 $Ym,$t0,$t0
|
|
+
|
|
+ vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
|
|
+ vext.8 $t1,$Yl,$Yh,#8
|
|
+ veor $t2,$Xl,$Xh
|
|
+ veor $Xm,$Xm,$t0
|
|
+ veor $t3,$Yl,$Yh
|
|
+ veor $Ym,$Ym,$t1
|
|
+ veor $Xm,$Xm,$t2
|
|
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
|
|
+ veor $Ym,$Ym,$t3
|
|
+ vpmull.p64 $t3,$Yl,$xC2
|
|
+
|
|
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
|
+ vmov $Yh#lo,$Ym#hi
|
|
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
|
+ vmov $Ym#hi,$Yl#lo
|
|
+ veor $Xl,$Xm,$t2
|
|
+ veor $Yl,$Ym,$t3
|
|
+
|
|
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
|
|
+ vext.8 $t3,$Yl,$Yl,#8
|
|
+ vpmull.p64 $Xl,$Xl,$xC2
|
|
+ vpmull.p64 $Yl,$Yl,$xC2
|
|
+ veor $t2,$t2,$Xh
|
|
+ veor $t3,$t3,$Yh
|
|
+ veor $H5,$Xl,$t2 @ H^5
|
|
+ veor $H6,$Yl,$t3 @ H^6
|
|
+
|
|
+ vext.8 $t0,$H5, $H5,#8 @ Karatsuba pre-processing
|
|
+ vext.8 $t1,$H6,$H6,#8
|
|
+ vext.8 $t2,$H2,$H2,#8
|
|
+ veor $t0,$t0,$H5
|
|
+ veor $t1,$t1,$H6
|
|
+ veor $t2,$t2,$H2
|
|
+ vext.8 $H56k,$t0,$t1,#8 @ pack Karatsuba pre-processed
|
|
+ vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8]
|
|
+
|
|
+ @ calculate H^7 and H^8
|
|
+ vpmull.p64 $Xl,$H2,$H5
|
|
+ vpmull.p64 $Yl,$H2,$H6
|
|
+ vpmull2.p64 $Xh,$H2,$H5
|
|
+ vpmull2.p64 $Yh,$H2,$H6
|
|
+ vpmull.p64 $Xm,$t0,$t2
|
|
+ vpmull.p64 $Ym,$t1,$t2
|
|
+
|
|
+ vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
|
|
+ vext.8 $t1,$Yl,$Yh,#8
|
|
+ veor $t2,$Xl,$Xh
|
|
+ veor $Xm,$Xm,$t0
|
|
+ veor $t3,$Yl,$Yh
|
|
+ veor $Ym,$Ym,$t1
|
|
+ veor $Xm,$Xm,$t2
|
|
+ vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
|
|
+ veor $Ym,$Ym,$t3
|
|
+ vpmull.p64 $t3,$Yl,$xC2
|
|
+
|
|
+ vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
|
+ vmov $Yh#lo,$Ym#hi
|
|
+ vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
|
+ vmov $Ym#hi,$Yl#lo
|
|
+ veor $Xl,$Xm,$t2
|
|
+ veor $Yl,$Ym,$t3
|
|
+
|
|
+ vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
|
|
+ vext.8 $t3,$Yl,$Yl,#8
|
|
+ vpmull.p64 $Xl,$Xl,$xC2
|
|
+ vpmull.p64 $Yl,$Yl,$xC2
|
|
+ veor $t2,$t2,$Xh
|
|
+ veor $t3,$t3,$Yh
|
|
+ veor $H7,$Xl,$t2 @ H^7
|
|
+ veor $H8,$Yl,$t3 @ H^8
|
|
+
|
|
+ vext.8 $t0,$H7,$H7,#8 @ Karatsuba pre-processing
|
|
+ vext.8 $t1,$H8,$H8,#8
|
|
+ veor $t0,$t0,$H7
|
|
+ veor $t1,$t1,$H8
|
|
+ vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed
|
|
+ vst1.64 {$H7-$H8},[x0] @ store Htable[9..11]
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
Index: openssl-1.1.1m/crypto/modes/build.info
|
|
===================================================================
|
|
--- openssl-1.1.1m.orig/crypto/modes/build.info
|
|
+++ openssl-1.1.1m/crypto/modes/build.info
|
|
@@ -20,6 +20,8 @@ GENERATE[ghash-armv4.S]=asm/ghash-armv4.
|
|
INCLUDE[ghash-armv4.o]=..
|
|
GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME)
|
|
INCLUDE[ghashv8-armx.o]=..
|
|
+GENERATE[aes-gcm-armv8-unroll8_64.S]=asm/aes-gcm-armv8-unroll8_64.pl $(PERLASM_SCHEME)
|
|
+INCLUDE[aes-gcm-armv8-unroll8_64.o]=..
|
|
GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl $(PERLASM_SCHEME)
|
|
INCLUDE[ghash-s390x.o]=..
|
|
|