forked from pool/openssl-1_1
Pedro Monreal Gonzalez
18ecb7a582
- Security fix: [bsc#1227138, CVE-2024-5535] * SSL_select_next_proto buffer overread * Add openssl-CVE-2024-5535.patch - Apply "openssl-CVE-2024-4741.patch" to fix a use-after-free security vulnerability. Calling the function SSL_free_buffers() potentially caused memory to be accessed that was previously freed in some situations and a malicious attacker could attempt to engineer a stituation where this occurs to facilitate a denial-of-service attack. [CVE-2024-4741, bsc#1225551] OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=164
1095 lines
28 KiB
Diff
1095 lines
28 KiB
Diff
From d1229190bfbb19439589557e4d65f9bccab09b2d Mon Sep 17 00:00:00 2001
|
|
From: Patrick Steuer <patrick.steuer@de.ibm.com>
|
|
Date: Mon, 25 Feb 2019 18:55:04 +0100
|
|
Subject: [PATCH] s390x assembly pack: import chacha from cryptogams repo
|
|
|
|
featuring 6x"horizontal" code path which is up to 25%
|
|
faster than present 4x"vertical" for larger blocks.
|
|
|
|
Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>
|
|
|
|
Reviewed-by: Matt Caswell <matt@openssl.org>
|
|
Reviewed-by: Richard Levitte <levitte@openssl.org>
|
|
(Merged from https://github.com/openssl/openssl/pull/8287)
|
|
---
|
|
crypto/chacha/asm/chacha-s390x.pl | 1006 +++++++++++++++++++++--------
|
|
1 file changed, 719 insertions(+), 287 deletions(-)
|
|
|
|
diff --git a/crypto/chacha/asm/chacha-s390x.pl b/crypto/chacha/asm/chacha-s390x.pl
|
|
index abf7283dd8..51efe64408 100755
|
|
--- a/crypto/chacha/asm/chacha-s390x.pl
|
|
+++ b/crypto/chacha/asm/chacha-s390x.pl
|
|
@@ -23,11 +23,20 @@
|
|
#
|
|
# August 2018
|
|
#
|
|
-# Add vx code path.
|
|
+# Add vx code path: 4x"vertical".
|
|
#
|
|
# Copyright IBM Corp. 2018
|
|
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
|
|
|
|
+#
|
|
+# February 2019
|
|
+#
|
|
+# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
|
|
+# 4x"vertical" submission [on z13] and >3 faster than scalar code.
|
|
+# But to harness overheads revert to transliteration of VSX code path
|
|
+# from chacha-ppc module, which is also 4x"vertical", to handle inputs
|
|
+# not longer than 256 bytes.
|
|
+
|
|
use strict;
|
|
use FindBin qw($Bin);
|
|
use lib "$Bin/../..";
|
|
@@ -50,11 +59,9 @@ while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
|
my $sp="%r15";
|
|
my $stdframe=16*$SIZE_T+4*8;
|
|
|
|
+sub ROUND {
|
|
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
|
|
my @t=map("%r$_",(8,9));
|
|
-my @v=map("%v$_",(16..31));
|
|
-
|
|
-sub ROUND {
|
|
my ($a0,$b0,$c0,$d0)=@_;
|
|
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
|
|
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
|
|
@@ -143,63 +150,92 @@ my ($xc,$xc_)=map("$_",@t);
|
|
rll (@x[$b3],@x[$b3],7);
|
|
}
|
|
|
|
-sub VX_ROUND {
|
|
+sub VX_lane_ROUND {
|
|
my ($a0,$b0,$c0,$d0)=@_;
|
|
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
|
|
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
|
|
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
|
|
+my @x=map("%v$_",(0..15));
|
|
+
|
|
+ vaf (@x[$a0],@x[$a0],@x[$b0]); # Q1
|
|
+ vx (@x[$d0],@x[$d0],@x[$a0]);
|
|
+ verllf (@x[$d0],@x[$d0],16);
|
|
+ vaf (@x[$a1],@x[$a1],@x[$b1]); # Q2
|
|
+ vx (@x[$d1],@x[$d1],@x[$a1]);
|
|
+ verllf (@x[$d1],@x[$d1],16);
|
|
+ vaf (@x[$a2],@x[$a2],@x[$b2]); # Q3
|
|
+ vx (@x[$d2],@x[$d2],@x[$a2]);
|
|
+ verllf (@x[$d2],@x[$d2],16);
|
|
+ vaf (@x[$a3],@x[$a3],@x[$b3]); # Q4
|
|
+ vx (@x[$d3],@x[$d3],@x[$a3]);
|
|
+ verllf (@x[$d3],@x[$d3],16);
|
|
+
|
|
+ vaf (@x[$c0],@x[$c0],@x[$d0]);
|
|
+ vx (@x[$b0],@x[$b0],@x[$c0]);
|
|
+ verllf (@x[$b0],@x[$b0],12);
|
|
+ vaf (@x[$c1],@x[$c1],@x[$d1]);
|
|
+ vx (@x[$b1],@x[$b1],@x[$c1]);
|
|
+ verllf (@x[$b1],@x[$b1],12);
|
|
+ vaf (@x[$c2],@x[$c2],@x[$d2]);
|
|
+ vx (@x[$b2],@x[$b2],@x[$c2]);
|
|
+ verllf (@x[$b2],@x[$b2],12);
|
|
+ vaf (@x[$c3],@x[$c3],@x[$d3]);
|
|
+ vx (@x[$b3],@x[$b3],@x[$c3]);
|
|
+ verllf (@x[$b3],@x[$b3],12);
|
|
+
|
|
+ vaf (@x[$a0],@x[$a0],@x[$b0]);
|
|
+ vx (@x[$d0],@x[$d0],@x[$a0]);
|
|
+ verllf (@x[$d0],@x[$d0],8);
|
|
+ vaf (@x[$a1],@x[$a1],@x[$b1]);
|
|
+ vx (@x[$d1],@x[$d1],@x[$a1]);
|
|
+ verllf (@x[$d1],@x[$d1],8);
|
|
+ vaf (@x[$a2],@x[$a2],@x[$b2]);
|
|
+ vx (@x[$d2],@x[$d2],@x[$a2]);
|
|
+ verllf (@x[$d2],@x[$d2],8);
|
|
+ vaf (@x[$a3],@x[$a3],@x[$b3]);
|
|
+ vx (@x[$d3],@x[$d3],@x[$a3]);
|
|
+ verllf (@x[$d3],@x[$d3],8);
|
|
+
|
|
+ vaf (@x[$c0],@x[$c0],@x[$d0]);
|
|
+ vx (@x[$b0],@x[$b0],@x[$c0]);
|
|
+ verllf (@x[$b0],@x[$b0],7);
|
|
+ vaf (@x[$c1],@x[$c1],@x[$d1]);
|
|
+ vx (@x[$b1],@x[$b1],@x[$c1]);
|
|
+ verllf (@x[$b1],@x[$b1],7);
|
|
+ vaf (@x[$c2],@x[$c2],@x[$d2]);
|
|
+ vx (@x[$b2],@x[$b2],@x[$c2]);
|
|
+ verllf (@x[$b2],@x[$b2],7);
|
|
+ vaf (@x[$c3],@x[$c3],@x[$d3]);
|
|
+ vx (@x[$b3],@x[$b3],@x[$c3]);
|
|
+ verllf (@x[$b3],@x[$b3],7);
|
|
+}
|
|
|
|
- vaf (@v[$a0],@v[$a0],@v[$b0]);
|
|
- vaf (@v[$a1],@v[$a1],@v[$b1]);
|
|
- vaf (@v[$a2],@v[$a2],@v[$b2]);
|
|
- vaf (@v[$a3],@v[$a3],@v[$b3]);
|
|
- vx (@v[$d0],@v[$d0],@v[$a0]);
|
|
- vx (@v[$d1],@v[$d1],@v[$a1]);
|
|
- vx (@v[$d2],@v[$d2],@v[$a2]);
|
|
- vx (@v[$d3],@v[$d3],@v[$a3]);
|
|
- verllf (@v[$d0],@v[$d0],16);
|
|
- verllf (@v[$d1],@v[$d1],16);
|
|
- verllf (@v[$d2],@v[$d2],16);
|
|
- verllf (@v[$d3],@v[$d3],16);
|
|
-
|
|
- vaf (@v[$c0],@v[$c0],@v[$d0]);
|
|
- vaf (@v[$c1],@v[$c1],@v[$d1]);
|
|
- vaf (@v[$c2],@v[$c2],@v[$d2]);
|
|
- vaf (@v[$c3],@v[$c3],@v[$d3]);
|
|
- vx (@v[$b0],@v[$b0],@v[$c0]);
|
|
- vx (@v[$b1],@v[$b1],@v[$c1]);
|
|
- vx (@v[$b2],@v[$b2],@v[$c2]);
|
|
- vx (@v[$b3],@v[$b3],@v[$c3]);
|
|
- verllf (@v[$b0],@v[$b0],12);
|
|
- verllf (@v[$b1],@v[$b1],12);
|
|
- verllf (@v[$b2],@v[$b2],12);
|
|
- verllf (@v[$b3],@v[$b3],12);
|
|
-
|
|
- vaf (@v[$a0],@v[$a0],@v[$b0]);
|
|
- vaf (@v[$a1],@v[$a1],@v[$b1]);
|
|
- vaf (@v[$a2],@v[$a2],@v[$b2]);
|
|
- vaf (@v[$a3],@v[$a3],@v[$b3]);
|
|
- vx (@v[$d0],@v[$d0],@v[$a0]);
|
|
- vx (@v[$d1],@v[$d1],@v[$a1]);
|
|
- vx (@v[$d2],@v[$d2],@v[$a2]);
|
|
- vx (@v[$d3],@v[$d3],@v[$a3]);
|
|
- verllf (@v[$d0],@v[$d0],8);
|
|
- verllf (@v[$d1],@v[$d1],8);
|
|
- verllf (@v[$d2],@v[$d2],8);
|
|
- verllf (@v[$d3],@v[$d3],8);
|
|
-
|
|
- vaf (@v[$c0],@v[$c0],@v[$d0]);
|
|
- vaf (@v[$c1],@v[$c1],@v[$d1]);
|
|
- vaf (@v[$c2],@v[$c2],@v[$d2]);
|
|
- vaf (@v[$c3],@v[$c3],@v[$d3]);
|
|
- vx (@v[$b0],@v[$b0],@v[$c0]);
|
|
- vx (@v[$b1],@v[$b1],@v[$c1]);
|
|
- vx (@v[$b2],@v[$b2],@v[$c2]);
|
|
- vx (@v[$b3],@v[$b3],@v[$c3]);
|
|
- verllf (@v[$b0],@v[$b0],7);
|
|
- verllf (@v[$b1],@v[$b1],7);
|
|
- verllf (@v[$b2],@v[$b2],7);
|
|
- verllf (@v[$b3],@v[$b3],7);
|
|
+sub VX_ROUND {
|
|
+my @a=@_[0..5];
|
|
+my @b=@_[6..11];
|
|
+my @c=@_[12..17];
|
|
+my @d=@_[18..23];
|
|
+my $odd=@_[24];
|
|
+
|
|
+ vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
|
|
+ vx (@d[$_],@d[$_],@a[$_]) for (0..5);
|
|
+ verllf (@d[$_],@d[$_],16) for (0..5);
|
|
+
|
|
+ vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
|
|
+ vx (@b[$_],@b[$_],@c[$_]) for (0..5);
|
|
+ verllf (@b[$_],@b[$_],12) for (0..5);
|
|
+
|
|
+ vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
|
|
+ vx (@d[$_],@d[$_],@a[$_]) for (0..5);
|
|
+ verllf (@d[$_],@d[$_],8) for (0..5);
|
|
+
|
|
+ vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
|
|
+ vx (@b[$_],@b[$_],@c[$_]) for (0..5);
|
|
+ verllf (@b[$_],@b[$_],7) for (0..5);
|
|
+
|
|
+ vsldb (@c[$_],@c[$_],@c[$_],8) for (0..5);
|
|
+ vsldb (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
|
|
+ vsldb (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
|
|
}
|
|
|
|
PERLASM_BEGIN($output);
|
|
@@ -210,13 +246,11 @@ TEXT ();
|
|
################
|
|
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
|
|
# const unsigned int key[8], const unsigned int counter[4])
|
|
-{
|
|
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
|
|
-
|
|
-# VX CODE PATH
|
|
{
|
|
-my $off=$z*8*16+8; # offset(initial state)
|
|
-my $frame=$stdframe+4*16+$off;
|
|
+my $frame=$stdframe+4*20;
|
|
+my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
|
|
+my @t=map("%r$_",(8,9));
|
|
|
|
GLOBL ("ChaCha20_ctr32");
|
|
TYPE ("ChaCha20_ctr32","\@function");
|
|
@@ -225,230 +259,16 @@ LABEL ("ChaCha20_ctr32");
|
|
larl ("%r1","OPENSSL_s390xcap_P");
|
|
|
|
lghi ("%r0",64);
|
|
+&{$z? \<gr:\<r} ($len,$len); # len==0?
|
|
+ bzr ("%r14");
|
|
+ lg ("%r1","S390X_STFLE+16(%r1)");
|
|
&{$z? \&clgr:\&clr} ($len,"%r0");
|
|
- jle ("_s390x_chacha_novx");
|
|
-
|
|
- lg ("%r0","S390X_STFLE+16(%r1)");
|
|
- tmhh ("%r0",0x4000); # check for vector facility
|
|
- jz ("_s390x_chacha_novx");
|
|
-
|
|
-if (!$z) {
|
|
- llgfr ($len,$len);
|
|
- std ("%f4","16*$SIZE_T+2*8($sp)");
|
|
- std ("%f6","16*$SIZE_T+3*8($sp)");
|
|
-}
|
|
-&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
|
|
+ jle (".Lshort");
|
|
|
|
- lghi ("%r1",-$frame);
|
|
- lgr ("%r0",$sp);
|
|
- la ($sp,"0(%r1,$sp)"); # allocate stack frame
|
|
+ tmhh ("%r1",0x4000); # check for vx bit
|
|
+ jnz (".LChaCha20_ctr32_vx");
|
|
|
|
- larl ("%r7",".Lsigma");
|
|
-&{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain
|
|
-
|
|
- vstm ("%v8","%v15","8($sp)") if ($z);
|
|
-
|
|
- vlm ("%v1","%v2","0($key)"); # load key
|
|
- vl ("%v0","0(%r7)"); # load sigma constant
|
|
- vl ("%v3","0($counter)"); # load iv (counter||nonce)
|
|
- l ("%r0","0($counter)"); # load counter
|
|
- vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack
|
|
-
|
|
- srlg ("%r1",$len,8);
|
|
- ltgr ("%r1","%r1");
|
|
- jz (".Lvx_4x_done");
|
|
-
|
|
-ALIGN (16); # process 4 64-byte blocks
|
|
-LABEL (".Lvx_4x");
|
|
- vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
|
|
- # state
|
|
- vl ("%v31","16(%r7)");
|
|
- vaf ("%v12","%v12","%v31"); # increment counter
|
|
-
|
|
- vlr (@v[$_],"%v$_") for (0..15); # copy initial state
|
|
-
|
|
- lhi ("%r6",10);
|
|
- j (".Loop_vx_4x");
|
|
-
|
|
-ALIGN (16);
|
|
-LABEL (".Loop_vx_4x");
|
|
- VX_ROUND( 0, 4, 8,12); # column round
|
|
- VX_ROUND( 0, 5,10,15); # diagonal round
|
|
- brct ("%r6",".Loop_vx_4x");
|
|
-
|
|
- vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
|
|
- # state (mod 32)
|
|
- vlm ("%v6","%v7","32(%r7)"); # load vperm operands
|
|
-
|
|
-for (0..3) { # blocks 1,2
|
|
- vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
|
|
- vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
|
|
- vperm ("%v".($_+ 8),"%v0","%v1","%v6");
|
|
- vperm ("%v".($_+12),"%v0","%v1","%v7");
|
|
-}
|
|
- vlm ("%v0","%v7","0($inp)"); # load in
|
|
- vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
|
|
- vstm ("%v0","%v7","0($out)"); # store out
|
|
-
|
|
- vlm ("%v6","%v7","32(%r7)"); # restore vperm operands
|
|
-
|
|
-for (0..3) { # blocks 2,3
|
|
- vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
|
|
- vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
|
|
- vperm ("%v".($_+ 8),"%v0","%v1","%v6");
|
|
- vperm ("%v".($_+12),"%v0","%v1","%v7");
|
|
-}
|
|
- vlm ("%v0","%v7","128($inp)"); # load in
|
|
- vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
|
|
- vstm ("%v0","%v7","128($out)"); # store out
|
|
-
|
|
- ahi ("%r0",4);
|
|
- st ("%r0","48+$off($sp)"); # update initial state
|
|
-
|
|
- la ($inp,"256($inp)");
|
|
- la ($out,"256($out)");
|
|
- brctg ("%r1",".Lvx_4x");
|
|
-
|
|
-ALIGN (16);
|
|
-LABEL (".Lvx_4x_done");
|
|
- lghi ("%r1",0xff);
|
|
- ngr ($len,"%r1");
|
|
- jnz (".Lvx_rem");
|
|
-
|
|
-ALIGN (16);
|
|
-LABEL (".Lvx_done");
|
|
- vzero ("%v$_") for (16..31); # wipe ks and key copy
|
|
- vstm ("%v16","%v17","16+$off($sp)");
|
|
- vlm ("%v8","%v15","8($sp)") if ($z);
|
|
-
|
|
- la ($sp,"$frame($sp)");
|
|
-&{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)");
|
|
-
|
|
-if (!$z) {
|
|
- ld ("%f4","16*$SIZE_T+2*8($sp)");
|
|
- ld ("%f6","16*$SIZE_T+3*8($sp)");
|
|
- vzero ("%v$_") for (8..15);
|
|
-}
|
|
- br ("%r14");
|
|
-ALIGN (16);
|
|
-LABEL (".Lvx_rem");
|
|
- lhi ("%r0",64);
|
|
-
|
|
- sr ($len,"%r0");
|
|
- brc (2,".Lvx_rem_g64"); # cc==2?
|
|
-
|
|
- lghi ("%r1",-$stdframe);
|
|
-
|
|
- la ($counter,"48+$off($sp)"); # load updated iv
|
|
- ar ($len,"%r0"); # restore len
|
|
-
|
|
- lgr ("%r7",$counter);
|
|
-&{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)");
|
|
- la ($sp,"0(%r1,$sp)");
|
|
-
|
|
- bras ("%r14","_s390x_chacha_novx");
|
|
-
|
|
- la ($sp,"$stdframe($sp)");
|
|
-&{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)");
|
|
- lgr ($counter,"%r7");
|
|
- j (".Lvx_done");
|
|
-
|
|
-ALIGN (16);
|
|
-LABEL (".Lvx_rem_g64");
|
|
- vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
|
|
- # state
|
|
- vl ("%v31","16(%r7)");
|
|
- vaf ("%v12","%v12","%v31"); # increment counter
|
|
-
|
|
- vlr (@v[$_],"%v$_") for (0..15); # state = initial state
|
|
-
|
|
- lhi ("%r6",10);
|
|
- j (".Loop_vx_rem");
|
|
-
|
|
-ALIGN (16);
|
|
-LABEL (".Loop_vx_rem");
|
|
- VX_ROUND( 0, 4, 8,12); # column round
|
|
- VX_ROUND( 0, 5,10,15); # diagonal round
|
|
- brct ("%r6",".Loop_vx_rem");
|
|
-
|
|
- vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
|
|
- # state (mod 32)
|
|
- vlm ("%v6","%v7","32(%r7)"); # load vperm operands
|
|
-
|
|
-for (0..3) { # blocks 1,2
|
|
- vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
|
|
- vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
|
|
- vperm ("%v".($_+8),"%v0","%v1","%v6");
|
|
- vperm ("%v".($_+12),"%v0","%v1","%v7");
|
|
-}
|
|
- vlm ("%v0","%v3","0($inp)"); # load in
|
|
- vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks
|
|
- vstm ("%v0","%v3","0($out)"); # store out
|
|
-
|
|
- la ($inp,"64($inp)");
|
|
- la ($out,"64($out)");
|
|
-
|
|
- sr ($len,"%r0");
|
|
- brc (4,".Lvx_tail"); # cc==4?
|
|
-
|
|
- vlm ("%v0","%v3","0($inp)"); # load in
|
|
- vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
|
|
- vstm ("%v0","%v3","0($out)"); # store out
|
|
- jz (".Lvx_done");
|
|
-
|
|
-for (0..3) { # blocks 3,4
|
|
- vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
|
|
- vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
|
|
- vperm ("%v".($_+12),"%v0","%v1","%v6");
|
|
- vperm ("%v".($_+8),"%v0","%v1","%v7");
|
|
-}
|
|
- la ($inp,"64($inp)");
|
|
- la ($out,"64($out)");
|
|
-
|
|
- sr ($len,"%r0");
|
|
- brc (4,".Lvx_tail"); # cc==4?
|
|
-
|
|
- vlm ("%v0","%v3","0($inp)"); # load in
|
|
- vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
|
|
- vstm ("%v0","%v3","0($out)"); # store out
|
|
- jz (".Lvx_done");
|
|
-
|
|
- la ($inp,"64($inp)");
|
|
- la ($out,"64($out)");
|
|
-
|
|
- sr ($len,"%r0");
|
|
- vlr ("%v".($_+4),"%v$_") for (8..11);
|
|
- j (".Lvx_tail");
|
|
-
|
|
-ALIGN (16);
|
|
-LABEL (".Lvx_tail");
|
|
- ar ($len,"%r0"); # restore $len
|
|
- ahi ($len,-1);
|
|
-
|
|
- lhi ("%r0",16);
|
|
-for (0..2) {
|
|
- vll ("%v0",$len,($_*16)."($inp)");
|
|
- vx ("%v0","%v0","%v".($_+12));
|
|
- vstl ("%v0",$len,($_*16)."($out)");
|
|
- sr ($len,"%r0");
|
|
- brc (4,".Lvx_done"); # cc==4?
|
|
-}
|
|
- vll ("%v0",$len,"3*16($inp)");
|
|
- vx ("%v0","%v0","%v15");
|
|
- vstl ("%v0",$len,"3*16($out)");
|
|
- j (".Lvx_done");
|
|
-SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
|
|
-}
|
|
-
|
|
-# NOVX CODE PATH
|
|
-{
|
|
-my $frame=$stdframe+4*20;
|
|
-
|
|
-TYPE ("_s390x_chacha_novx","\@function");
|
|
-ALIGN (32);
|
|
-LABEL ("_s390x_chacha_novx");
|
|
-&{$z? \<gr:\<r} ($len,$len); # $len==0?
|
|
- bzr ("%r14");
|
|
+LABEL (".Lshort");
|
|
&{$z? \&aghi:\&ahi} ($len,-64);
|
|
&{$z? \&lghi:\&lhi} ("%r1",-$frame);
|
|
&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
|
|
@@ -607,17 +427,629 @@ LABEL (".Loop_tail");
|
|
brct (@t[1],".Loop_tail");
|
|
|
|
j (".Ldone");
|
|
-SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx");
|
|
+SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
|
|
+}
|
|
+
|
|
+########################################################################
|
|
+# 4x"vertical" layout minimizes amount of instructions, but pipeline
|
|
+# runs underutilized [because of vector instructions' high latency].
|
|
+# On the other hand minimum amount of data it takes to fully utilize
|
|
+# the pipeline is higher, so that effectively, short inputs would be
|
|
+# processed slower. Hence this code path targeting <=256 bytes lengths.
|
|
+#
|
|
+{
|
|
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
|
|
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
|
|
+my @K=map("%v$_",(16..19));
|
|
+my $CTR="%v26";
|
|
+my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
|
|
+my $beperm="%v31";
|
|
+my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
|
|
+my $FRAME=$stdframe+4*16;
|
|
+
|
|
+ALIGN (32);
|
|
+LABEL ("ChaCha20_ctr32_4x");
|
|
+LABEL (".LChaCha20_ctr32_4x");
|
|
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
|
|
+if (!$z) {
|
|
+ std ("%f4","16*$SIZE_T+2*8($sp)");
|
|
+ std ("%f6","16*$SIZE_T+3*8($sp)");
|
|
+}
|
|
+&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
|
|
+ lgr ("%r0",$sp);
|
|
+ la ($sp,"0(%r1,$sp)");
|
|
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
|
|
+if ($z) {
|
|
+ std ("%f8","$stdframe+8*0($sp)");
|
|
+ std ("%f9","$stdframe+8*1($sp)");
|
|
+ std ("%f10","$stdframe+8*2($sp)");
|
|
+ std ("%f11","$stdframe+8*3($sp)");
|
|
+ std ("%f12","$stdframe+8*4($sp)");
|
|
+ std ("%f13","$stdframe+8*5($sp)");
|
|
+ std ("%f14","$stdframe+8*6($sp)");
|
|
+ std ("%f15","$stdframe+8*7($sp)");
|
|
+}
|
|
+ larl ("%r7",".Lsigma");
|
|
+ lhi ("%r0",10);
|
|
+ lhi ("%r1",0);
|
|
+
|
|
+ vl (@K[0],"0(%r7)"); # load sigma
|
|
+ vl (@K[1],"0($key)"); # load key
|
|
+ vl (@K[2],"16($key)");
|
|
+ vl (@K[3],"0($counter)"); # load counter
|
|
+
|
|
+ vl ($beperm,"0x40(%r7)");
|
|
+ vl ($xt1,"0x50(%r7)");
|
|
+ vrepf ($CTR,@K[3],0);
|
|
+ vlvgf (@K[3],"%r1",0); # clear @K[3].word[0]
|
|
+ vaf ($CTR,$CTR,$xt1);
|
|
+
|
|
+#LABEL (".Loop_outer_4x");
|
|
+ vlm ($xa0,$xa3,"0x60(%r7)"); # load [smashed] sigma
|
|
+
|
|
+ vrepf ($xb0,@K[1],0); # smash the key
|
|
+ vrepf ($xb1,@K[1],1);
|
|
+ vrepf ($xb2,@K[1],2);
|
|
+ vrepf ($xb3,@K[1],3);
|
|
+
|
|
+ vrepf ($xc0,@K[2],0);
|
|
+ vrepf ($xc1,@K[2],1);
|
|
+ vrepf ($xc2,@K[2],2);
|
|
+ vrepf ($xc3,@K[2],3);
|
|
+
|
|
+ vlr ($xd0,$CTR);
|
|
+ vrepf ($xd1,@K[3],1);
|
|
+ vrepf ($xd2,@K[3],2);
|
|
+ vrepf ($xd3,@K[3],3);
|
|
+
|
|
+LABEL (".Loop_4x");
|
|
+ VX_lane_ROUND(0, 4, 8,12);
|
|
+ VX_lane_ROUND(0, 5,10,15);
|
|
+ brct ("%r0",".Loop_4x");
|
|
+
|
|
+ vaf ($xd0,$xd0,$CTR);
|
|
+
|
|
+ vmrhf ($xt0,$xa0,$xa1); # transpose data
|
|
+ vmrhf ($xt1,$xa2,$xa3);
|
|
+ vmrlf ($xt2,$xa0,$xa1);
|
|
+ vmrlf ($xt3,$xa2,$xa3);
|
|
+ vpdi ($xa0,$xt0,$xt1,0b0000);
|
|
+ vpdi ($xa1,$xt0,$xt1,0b0101);
|
|
+ vpdi ($xa2,$xt2,$xt3,0b0000);
|
|
+ vpdi ($xa3,$xt2,$xt3,0b0101);
|
|
+
|
|
+ vmrhf ($xt0,$xb0,$xb1);
|
|
+ vmrhf ($xt1,$xb2,$xb3);
|
|
+ vmrlf ($xt2,$xb0,$xb1);
|
|
+ vmrlf ($xt3,$xb2,$xb3);
|
|
+ vpdi ($xb0,$xt0,$xt1,0b0000);
|
|
+ vpdi ($xb1,$xt0,$xt1,0b0101);
|
|
+ vpdi ($xb2,$xt2,$xt3,0b0000);
|
|
+ vpdi ($xb3,$xt2,$xt3,0b0101);
|
|
+
|
|
+ vmrhf ($xt0,$xc0,$xc1);
|
|
+ vmrhf ($xt1,$xc2,$xc3);
|
|
+ vmrlf ($xt2,$xc0,$xc1);
|
|
+ vmrlf ($xt3,$xc2,$xc3);
|
|
+ vpdi ($xc0,$xt0,$xt1,0b0000);
|
|
+ vpdi ($xc1,$xt0,$xt1,0b0101);
|
|
+ vpdi ($xc2,$xt2,$xt3,0b0000);
|
|
+ vpdi ($xc3,$xt2,$xt3,0b0101);
|
|
+
|
|
+ vmrhf ($xt0,$xd0,$xd1);
|
|
+ vmrhf ($xt1,$xd2,$xd3);
|
|
+ vmrlf ($xt2,$xd0,$xd1);
|
|
+ vmrlf ($xt3,$xd2,$xd3);
|
|
+ vpdi ($xd0,$xt0,$xt1,0b0000);
|
|
+ vpdi ($xd1,$xt0,$xt1,0b0101);
|
|
+ vpdi ($xd2,$xt2,$xt3,0b0000);
|
|
+ vpdi ($xd3,$xt2,$xt3,0b0101);
|
|
+
|
|
+ #vrepif ($xt0,4);
|
|
+ #vaf ($CTR,$CTR,$xt0); # next counter value
|
|
+
|
|
+ vaf ($xa0,$xa0,@K[0]);
|
|
+ vaf ($xb0,$xb0,@K[1]);
|
|
+ vaf ($xc0,$xc0,@K[2]);
|
|
+ vaf ($xd0,$xd0,@K[3]);
|
|
+
|
|
+ vperm ($xa0,$xa0,$xa0,$beperm);
|
|
+ vperm ($xb0,$xb0,$xb0,$beperm);
|
|
+ vperm ($xc0,$xc0,$xc0,$beperm);
|
|
+ vperm ($xd0,$xd0,$xd0,$beperm);
|
|
+
|
|
+ #&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ #jl (".Ltail_4x");
|
|
+
|
|
+ vlm ($xt0,$xt3,"0($inp)");
|
|
+
|
|
+ vx ($xt0,$xt0,$xa0);
|
|
+ vx ($xt1,$xt1,$xb0);
|
|
+ vx ($xt2,$xt2,$xc0);
|
|
+ vx ($xt3,$xt3,$xd0);
|
|
+
|
|
+ vstm ($xt0,$xt3,"0($out)");
|
|
+
|
|
+ la ($inp,"0x40($inp)");
|
|
+ la ($out,"0x40($out)");
|
|
+&{$z? \&aghi:\&ahi} ($len,-0x40);
|
|
+ #je (".Ldone_4x");
|
|
+
|
|
+ vaf ($xa0,$xa1,@K[0]);
|
|
+ vaf ($xb0,$xb1,@K[1]);
|
|
+ vaf ($xc0,$xc1,@K[2]);
|
|
+ vaf ($xd0,$xd1,@K[3]);
|
|
+
|
|
+ vperm ($xa0,$xa0,$xa0,$beperm);
|
|
+ vperm ($xb0,$xb0,$xb0,$beperm);
|
|
+ vperm ($xc0,$xc0,$xc0,$beperm);
|
|
+ vperm ($xd0,$xd0,$xd0,$beperm);
|
|
+
|
|
+&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ jl (".Ltail_4x");
|
|
+
|
|
+ vlm ($xt0,$xt3,"0($inp)");
|
|
+
|
|
+ vx ($xt0,$xt0,$xa0);
|
|
+ vx ($xt1,$xt1,$xb0);
|
|
+ vx ($xt2,$xt2,$xc0);
|
|
+ vx ($xt3,$xt3,$xd0);
|
|
+
|
|
+ vstm ($xt0,$xt3,"0($out)");
|
|
+
|
|
+ la ($inp,"0x40($inp)");
|
|
+ la ($out,"0x40($out)");
|
|
+&{$z? \&aghi:\&ahi} ($len,-0x40);
|
|
+ je (".Ldone_4x");
|
|
+
|
|
+ vaf ($xa0,$xa2,@K[0]);
|
|
+ vaf ($xb0,$xb2,@K[1]);
|
|
+ vaf ($xc0,$xc2,@K[2]);
|
|
+ vaf ($xd0,$xd2,@K[3]);
|
|
+
|
|
+ vperm ($xa0,$xa0,$xa0,$beperm);
|
|
+ vperm ($xb0,$xb0,$xb0,$beperm);
|
|
+ vperm ($xc0,$xc0,$xc0,$beperm);
|
|
+ vperm ($xd0,$xd0,$xd0,$beperm);
|
|
+
|
|
+&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ jl (".Ltail_4x");
|
|
+
|
|
+ vlm ($xt0,$xt3,"0($inp)");
|
|
+
|
|
+ vx ($xt0,$xt0,$xa0);
|
|
+ vx ($xt1,$xt1,$xb0);
|
|
+ vx ($xt2,$xt2,$xc0);
|
|
+ vx ($xt3,$xt3,$xd0);
|
|
+
|
|
+ vstm ($xt0,$xt3,"0($out)");
|
|
+
|
|
+ la ($inp,"0x40($inp)");
|
|
+ la ($out,"0x40($out)");
|
|
+&{$z? \&aghi:\&ahi} ($len,-0x40);
|
|
+ je (".Ldone_4x");
|
|
+
|
|
+ vaf ($xa0,$xa3,@K[0]);
|
|
+ vaf ($xb0,$xb3,@K[1]);
|
|
+ vaf ($xc0,$xc3,@K[2]);
|
|
+ vaf ($xd0,$xd3,@K[3]);
|
|
+
|
|
+ vperm ($xa0,$xa0,$xa0,$beperm);
|
|
+ vperm ($xb0,$xb0,$xb0,$beperm);
|
|
+ vperm ($xc0,$xc0,$xc0,$beperm);
|
|
+ vperm ($xd0,$xd0,$xd0,$beperm);
|
|
+
|
|
+&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ jl (".Ltail_4x");
|
|
+
|
|
+ vlm ($xt0,$xt3,"0($inp)");
|
|
+
|
|
+ vx ($xt0,$xt0,$xa0);
|
|
+ vx ($xt1,$xt1,$xb0);
|
|
+ vx ($xt2,$xt2,$xc0);
|
|
+ vx ($xt3,$xt3,$xd0);
|
|
+
|
|
+ vstm ($xt0,$xt3,"0($out)");
|
|
+
|
|
+ #la $inp,0x40($inp));
|
|
+ #la $out,0x40($out));
|
|
+ #lhi %r0,10);
|
|
+ #&{$z? \&aghi:\&ahi} $len,-0x40);
|
|
+ #jne .Loop_outer_4x);
|
|
+
|
|
+LABEL (".Ldone_4x");
|
|
+if (!$z) {
|
|
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
|
|
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
|
|
+} else {
|
|
+ ld ("%f8","$stdframe+8*0($sp)");
|
|
+ ld ("%f9","$stdframe+8*1($sp)");
|
|
+ ld ("%f10","$stdframe+8*2($sp)");
|
|
+ ld ("%f11","$stdframe+8*3($sp)");
|
|
+ ld ("%f12","$stdframe+8*4($sp)");
|
|
+ ld ("%f13","$stdframe+8*5($sp)");
|
|
+ ld ("%f14","$stdframe+8*6($sp)");
|
|
+ ld ("%f15","$stdframe+8*7($sp)");
|
|
+}
|
|
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
|
|
+ la ($sp,"$FRAME($sp)");
|
|
+ br ("%r14");
|
|
+
|
|
+ALIGN (16);
|
|
+LABEL (".Ltail_4x");
|
|
+if (!$z) {
|
|
+ vlr ($xt0,$xb0);
|
|
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
|
|
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
|
|
+
|
|
+ vst ($xa0,"$stdframe+0x00($sp)");
|
|
+ vst ($xt0,"$stdframe+0x10($sp)");
|
|
+ vst ($xc0,"$stdframe+0x20($sp)");
|
|
+ vst ($xd0,"$stdframe+0x30($sp)");
|
|
+} else {
|
|
+ vlr ($xt0,$xc0);
|
|
+ ld ("%f8","$stdframe+8*0($sp)");
|
|
+ ld ("%f9","$stdframe+8*1($sp)");
|
|
+ ld ("%f10","$stdframe+8*2($sp)");
|
|
+ ld ("%f11","$stdframe+8*3($sp)");
|
|
+ vlr ($xt1,$xd0);
|
|
+ ld ("%f12","$stdframe+8*4($sp)");
|
|
+ ld ("%f13","$stdframe+8*5($sp)");
|
|
+ ld ("%f14","$stdframe+8*6($sp)");
|
|
+ ld ("%f15","$stdframe+8*7($sp)");
|
|
+
|
|
+ vst ($xa0,"$stdframe+0x00($sp)");
|
|
+ vst ($xb0,"$stdframe+0x10($sp)");
|
|
+ vst ($xt0,"$stdframe+0x20($sp)");
|
|
+ vst ($xt1,"$stdframe+0x30($sp)");
|
|
}
|
|
+ lghi ("%r1",0);
|
|
+
|
|
+LABEL (".Loop_tail_4x");
|
|
+ llgc ("%r5","0(%r1,$inp)");
|
|
+ llgc ("%r6","$stdframe(%r1,$sp)");
|
|
+ xr ("%r6","%r5");
|
|
+ stc ("%r6","0(%r1,$out)");
|
|
+ la ("%r1","1(%r1)");
|
|
+ brct ($len,".Loop_tail_4x");
|
|
+
|
|
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
|
|
+ la ($sp,"$FRAME($sp)");
|
|
+ br ("%r14");
|
|
+SIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
|
|
+}
|
|
+
|
|
+########################################################################
|
|
+# 6x"horizontal" layout is optimal fit for the platform in its current
|
|
+# shape, more specifically for given vector instructions' latency. Well,
|
|
+# computational part of 8x"vertical" would be faster, but it consumes
|
|
+# all registers and dealing with that will diminish the return...
|
|
+#
|
|
+{
|
|
+my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
|
|
+ $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
|
|
+ $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
|
|
+my @K=map("%v$_",(27,24..26));
|
|
+my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
|
|
+my $beperm="%v31";
|
|
+my $FRAME=$stdframe + 4*16;
|
|
+
|
|
+GLOBL ("ChaCha20_ctr32_vx");
|
|
+ALIGN (32);
|
|
+LABEL ("ChaCha20_ctr32_vx");
|
|
+LABEL (".LChaCha20_ctr32_vx");
|
|
+&{$z? \&clgfi:\&clfi} ($len,256);
|
|
+ jle (".LChaCha20_ctr32_4x");
|
|
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
|
|
+if (!$z) {
|
|
+ std ("%f4","16*$SIZE_T+2*8($sp)");
|
|
+ std ("%f6","16*$SIZE_T+3*8($sp)");
|
|
+}
|
|
+&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
|
|
+ lgr ("%r0",$sp);
|
|
+ la ($sp,"0(%r1,$sp)");
|
|
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
|
|
+if ($z) {
|
|
+ std ("%f8","$FRAME-8*8($sp)");
|
|
+ std ("%f9","$FRAME-8*7($sp)");
|
|
+ std ("%f10","$FRAME-8*6($sp)");
|
|
+ std ("%f11","$FRAME-8*5($sp)");
|
|
+ std ("%f12","$FRAME-8*4($sp)");
|
|
+ std ("%f13","$FRAME-8*3($sp)");
|
|
+ std ("%f14","$FRAME-8*2($sp)");
|
|
+ std ("%f15","$FRAME-8*1($sp)");
|
|
+}
|
|
+ larl ("%r7",".Lsigma");
|
|
+ lhi ("%r0",10);
|
|
+
|
|
+ vlm (@K[1],@K[2],"0($key)"); # load key
|
|
+ vl (@K[3],"0($counter)"); # load counter
|
|
+
|
|
+ vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ...
|
|
+
|
|
+LABEL (".Loop_outer_vx");
|
|
+ vlr ($a0,@K[0]);
|
|
+ vlr ($b0,@K[1]);
|
|
+ vlr ($a1,@K[0]);
|
|
+ vlr ($b1,@K[1]);
|
|
+ vlr ($a2,@K[0]);
|
|
+ vlr ($b2,@K[1]);
|
|
+ vlr ($a3,@K[0]);
|
|
+ vlr ($b3,@K[1]);
|
|
+ vlr ($a4,@K[0]);
|
|
+ vlr ($b4,@K[1]);
|
|
+ vlr ($a5,@K[0]);
|
|
+ vlr ($b5,@K[1]);
|
|
+
|
|
+ vlr ($d0,@K[3]);
|
|
+ vaf ($d1,@K[3],$t1); # K[3]+1
|
|
+ vaf ($d2,@K[3],$t2); # K[3]+2
|
|
+ vaf ($d3,@K[3],$t3); # K[3]+3
|
|
+ vaf ($d4,$d2,$t2); # K[3]+4
|
|
+ vaf ($d5,$d2,$t3); # K[3]+5
|
|
+
|
|
+ vlr ($c0,@K[2]);
|
|
+ vlr ($c1,@K[2]);
|
|
+ vlr ($c2,@K[2]);
|
|
+ vlr ($c3,@K[2]);
|
|
+ vlr ($c4,@K[2]);
|
|
+ vlr ($c5,@K[2]);
|
|
+
|
|
+ vlr ($t1,$d1);
|
|
+ vlr ($t2,$d2);
|
|
+ vlr ($t3,$d3);
|
|
+
|
|
+ALIGN (4);
|
|
+LABEL (".Loop_vx");
|
|
+
|
|
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
|
|
+ $b0,$b1,$b2,$b3,$b4,$b5,
|
|
+ $c0,$c1,$c2,$c3,$c4,$c5,
|
|
+ $d0,$d1,$d2,$d3,$d4,$d5,
|
|
+ 0);
|
|
+
|
|
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
|
|
+ $b0,$b1,$b2,$b3,$b4,$b5,
|
|
+ $c0,$c1,$c2,$c3,$c4,$c5,
|
|
+ $d0,$d1,$d2,$d3,$d4,$d5,
|
|
+ 1);
|
|
+
|
|
+ brct ("%r0",".Loop_vx");
|
|
+
|
|
+ vaf ($a0,$a0,@K[0]);
|
|
+ vaf ($b0,$b0,@K[1]);
|
|
+ vaf ($c0,$c0,@K[2]);
|
|
+ vaf ($d0,$d0,@K[3]);
|
|
+ vaf ($a1,$a1,@K[0]);
|
|
+ vaf ($d1,$d1,$t1); # +K[3]+1
|
|
+
|
|
+ vperm ($a0,$a0,$a0,$beperm);
|
|
+ vperm ($b0,$b0,$b0,$beperm);
|
|
+ vperm ($c0,$c0,$c0,$beperm);
|
|
+ vperm ($d0,$d0,$d0,$beperm);
|
|
+
|
|
+&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ jl (".Ltail_vx");
|
|
+
|
|
+ vaf ($d2,$d2,$t2); # +K[3]+2
|
|
+ vaf ($d3,$d3,$t3); # +K[3]+3
|
|
+ vlm ($t0,$t3,"0($inp)");
|
|
+
|
|
+ vx ($a0,$a0,$t0);
|
|
+ vx ($b0,$b0,$t1);
|
|
+ vx ($c0,$c0,$t2);
|
|
+ vx ($d0,$d0,$t3);
|
|
+
|
|
+ vlm (@K[0],$t3,"0(%r7)"); # re-load sigma and increments
|
|
+
|
|
+ vstm ($a0,$d0,"0($out)");
|
|
+
|
|
+ la ($inp,"0x40($inp)");
|
|
+ la ($out,"0x40($out)");
|
|
+&{$z? \&aghi:\&ahi} ($len,-0x40);
|
|
+ je (".Ldone_vx");
|
|
+
|
|
+ vaf ($b1,$b1,@K[1]);
|
|
+ vaf ($c1,$c1,@K[2]);
|
|
+
|
|
+ vperm ($a0,$a1,$a1,$beperm);
|
|
+ vperm ($b0,$b1,$b1,$beperm);
|
|
+ vperm ($c0,$c1,$c1,$beperm);
|
|
+ vperm ($d0,$d1,$d1,$beperm);
|
|
+
|
|
+&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ jl (".Ltail_vx");
|
|
+
|
|
+ vlm ($a1,$d1,"0($inp)");
|
|
+
|
|
+ vx ($a0,$a0,$a1);
|
|
+ vx ($b0,$b0,$b1);
|
|
+ vx ($c0,$c0,$c1);
|
|
+ vx ($d0,$d0,$d1);
|
|
+
|
|
+ vstm ($a0,$d0,"0($out)");
|
|
+
|
|
+ la ($inp,"0x40($inp)");
|
|
+ la ($out,"0x40($out)");
|
|
+&{$z? \&aghi:\&ahi} ($len,-0x40);
|
|
+ je (".Ldone_vx");
|
|
+
|
|
+ vaf ($a2,$a2,@K[0]);
|
|
+ vaf ($b2,$b2,@K[1]);
|
|
+ vaf ($c2,$c2,@K[2]);
|
|
+
|
|
+ vperm ($a0,$a2,$a2,$beperm);
|
|
+ vperm ($b0,$b2,$b2,$beperm);
|
|
+ vperm ($c0,$c2,$c2,$beperm);
|
|
+ vperm ($d0,$d2,$d2,$beperm);
|
|
+
|
|
+&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ jl (".Ltail_vx");
|
|
+
|
|
+ vlm ($a1,$d1,"0($inp)");
|
|
+
|
|
+ vx ($a0,$a0,$a1);
|
|
+ vx ($b0,$b0,$b1);
|
|
+ vx ($c0,$c0,$c1);
|
|
+ vx ($d0,$d0,$d1);
|
|
+
|
|
+ vstm ($a0,$d0,"0($out)");
|
|
+
|
|
+ la ($inp,"0x40($inp)");
|
|
+ la ($out,"0x40($out)");
|
|
+&{$z? \&aghi:\&ahi} ($len,-0x40);
|
|
+ je (".Ldone_vx");
|
|
+
|
|
+ vaf ($a3,$a3,@K[0]);
|
|
+ vaf ($b3,$b3,@K[1]);
|
|
+ vaf ($c3,$c3,@K[2]);
|
|
+ vaf ($d2,@K[3],$t3); # K[3]+3
|
|
+
|
|
+ vperm ($a0,$a3,$a3,$beperm);
|
|
+ vperm ($b0,$b3,$b3,$beperm);
|
|
+ vperm ($c0,$c3,$c3,$beperm);
|
|
+ vperm ($d0,$d3,$d3,$beperm);
|
|
+
|
|
+&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ jl (".Ltail_vx");
|
|
+
|
|
+ vaf ($d3,$d2,$t1); # K[3]+4
|
|
+ vlm ($a1,$d1,"0($inp)");
|
|
+
|
|
+ vx ($a0,$a0,$a1);
|
|
+ vx ($b0,$b0,$b1);
|
|
+ vx ($c0,$c0,$c1);
|
|
+ vx ($d0,$d0,$d1);
|
|
+
|
|
+ vstm ($a0,$d0,"0($out)");
|
|
+
|
|
+ la ($inp,"0x40($inp)");
|
|
+ la ($out,"0x40($out)");
|
|
+&{$z? \&aghi:\&ahi} ($len,-0x40);
|
|
+ je (".Ldone_vx");
|
|
+
|
|
+ vaf ($a4,$a4,@K[0]);
|
|
+ vaf ($b4,$b4,@K[1]);
|
|
+ vaf ($c4,$c4,@K[2]);
|
|
+ vaf ($d4,$d4,$d3); # +K[3]+4
|
|
+ vaf ($d3,$d3,$t1); # K[3]+5
|
|
+ vaf (@K[3],$d2,$t3); # K[3]+=6
|
|
+
|
|
+ vperm ($a0,$a4,$a4,$beperm);
|
|
+ vperm ($b0,$b4,$b4,$beperm);
|
|
+ vperm ($c0,$c4,$c4,$beperm);
|
|
+ vperm ($d0,$d4,$d4,$beperm);
|
|
+
|
|
+&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ jl (".Ltail_vx");
|
|
+
|
|
+ vlm ($a1,$d1,"0($inp)");
|
|
+
|
|
+ vx ($a0,$a0,$a1);
|
|
+ vx ($b0,$b0,$b1);
|
|
+ vx ($c0,$c0,$c1);
|
|
+ vx ($d0,$d0,$d1);
|
|
+
|
|
+ vstm ($a0,$d0,"0($out)");
|
|
+
|
|
+ la ($inp,"0x40($inp)");
|
|
+ la ($out,"0x40($out)");
|
|
+&{$z? \&aghi:\&ahi} ($len,-0x40);
|
|
+ je (".Ldone_vx");
|
|
+
|
|
+ vaf ($a5,$a5,@K[0]);
|
|
+ vaf ($b5,$b5,@K[1]);
|
|
+ vaf ($c5,$c5,@K[2]);
|
|
+ vaf ($d5,$d5,$d3); # +K[3]+5
|
|
+
|
|
+ vperm ($a0,$a5,$a5,$beperm);
|
|
+ vperm ($b0,$b5,$b5,$beperm);
|
|
+ vperm ($c0,$c5,$c5,$beperm);
|
|
+ vperm ($d0,$d5,$d5,$beperm);
|
|
+
|
|
+&{$z? \&clgfi:\&clfi} ($len,0x40);
|
|
+ jl (".Ltail_vx");
|
|
+
|
|
+ vlm ($a1,$d1,"0($inp)");
|
|
+
|
|
+ vx ($a0,$a0,$a1);
|
|
+ vx ($b0,$b0,$b1);
|
|
+ vx ($c0,$c0,$c1);
|
|
+ vx ($d0,$d0,$d1);
|
|
+
|
|
+ vstm ($a0,$d0,"0($out)");
|
|
+
|
|
+ la ($inp,"0x40($inp)");
|
|
+ la ($out,"0x40($out)");
|
|
+ lhi ("%r0",10);
|
|
+&{$z? \&aghi:\&ahi} ($len,-0x40);
|
|
+ jne (".Loop_outer_vx");
|
|
+
|
|
+LABEL (".Ldone_vx");
|
|
+if (!$z) {
|
|
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
|
|
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
|
|
+} else {
|
|
+ ld ("%f8","$FRAME-8*8($sp)");
|
|
+ ld ("%f9","$FRAME-8*7($sp)");
|
|
+ ld ("%f10","$FRAME-8*6($sp)");
|
|
+ ld ("%f11","$FRAME-8*5($sp)");
|
|
+ ld ("%f12","$FRAME-8*4($sp)");
|
|
+ ld ("%f13","$FRAME-8*3($sp)");
|
|
+ ld ("%f14","$FRAME-8*2($sp)");
|
|
+ ld ("%f15","$FRAME-8*1($sp)");
|
|
+}
|
|
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
|
|
+ la ($sp,"$FRAME($sp)");
|
|
+ br ("%r14");
|
|
+
|
|
+ALIGN (16);
|
|
+LABEL (".Ltail_vx");
|
|
+if (!$z) {
|
|
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
|
|
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
|
|
+} else {
|
|
+ ld ("%f8","$FRAME-8*8($sp)");
|
|
+ ld ("%f9","$FRAME-8*7($sp)");
|
|
+ ld ("%f10","$FRAME-8*6($sp)");
|
|
+ ld ("%f11","$FRAME-8*5($sp)");
|
|
+ ld ("%f12","$FRAME-8*4($sp)");
|
|
+ ld ("%f13","$FRAME-8*3($sp)");
|
|
+ ld ("%f14","$FRAME-8*2($sp)");
|
|
+ ld ("%f15","$FRAME-8*1($sp)");
|
|
+}
|
|
+ vstm ($a0,$d0,"$stdframe($sp)");
|
|
+ lghi ("%r1",0);
|
|
+
|
|
+LABEL (".Loop_tail_vx");
|
|
+ llgc ("%r5","0(%r1,$inp)");
|
|
+ llgc ("%r6","$stdframe(%r1,$sp)");
|
|
+ xr ("%r6","%r5");
|
|
+ stc ("%r6","0(%r1,$out)");
|
|
+ la ("%r1","1(%r1)");
|
|
+ brct ($len,".Loop_tail_vx");
|
|
+
|
|
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
|
|
+ la ($sp,"$FRAME($sp)");
|
|
+ br ("%r14");
|
|
+SIZE ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
|
|
}
|
|
################
|
|
|
|
-ALIGN (64);
|
|
+ALIGN (32);
|
|
LABEL (".Lsigma");
|
|
LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
|
|
-LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment
|
|
-LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization
|
|
-LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization
|
|
+LONG (1,0,0,0);
|
|
+LONG (2,0,0,0);
|
|
+LONG (3,0,0,0);
|
|
+LONG (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c); # byte swap
|
|
+
|
|
+LONG (0,1,2,3);
|
|
+LONG (0x61707865,0x61707865,0x61707865,0x61707865); # smashed sigma
|
|
+LONG (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
|
|
+LONG (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
|
|
+LONG (0x6b206574,0x6b206574,0x6b206574,0x6b206574);
|
|
+
|
|
ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
|
|
ALIGN (4);
|
|
|
|
--
|
|
2.21.0
|
|
|