SHA256
1
0
forked from pool/openssl-1_1
openssl-1_1/0002-crypto-chacha-asm-chacha-s390x.pl-add-vx-code-path.patch

875 lines
23 KiB
Diff
Raw Normal View History

From f760137b2144740916afd9ff381451fa16c710de Mon Sep 17 00:00:00 2001
From: Patrick Steuer <patrick.steuer@de.ibm.com>
Date: Sat, 4 Aug 2018 00:10:06 +0200
Subject: [PATCH] crypto/chacha/asm/chacha-s390x.pl: add vx code path.
Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>
Reviewed-by: Tim Hudson <tjh@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6919)
---
crypto/chacha/asm/chacha-s390x.pl | 816 ++++++++++++++++++++----------
crypto/chacha/build.info | 1 +
2 files changed, 558 insertions(+), 259 deletions(-)
Accepting request 706506 from home:vitezslav_cizek:branches:o11 - Update openssl-fix_underflow_in_errstr_handling.patch to use upstream approved code * https://github.com/openssl/openssl/pull/8966 - update openssl.keyring to include Richard Levitte's key - Update to 1.1.1c * Prevent over long nonces in ChaCha20-Poly1305 (CVE-2019-1543) ChaCha20-Poly1305 is an AEAD cipher, and requires a unique nonce input for every encryption operation. RFC 7539 specifies that the nonce value (IV) should be 96 bits (12 bytes). OpenSSL allows a variable nonce length and front pads the nonce with 0 bytes if it is less than 12 bytes. However it also incorrectly allows a nonce to be set of up to 16 bytes. In this case only the last 12 bytes are significant and any additional leading bytes are ignored. * Add build tests for C++. These are generated files that only do one thing, to include one public OpenSSL head file each. This tests that the public header files can be usefully included in a C++ application. * Enable SHA3 pre-hashing for ECDSA and DSA. * Change the default RSA, DSA and DH size to 2048 bit instead of 1024. This changes the size when using the genpkey app when no size is given. It fixes an omission in earlier changes that changed all RSA, DSA and DH generation apps to use 2048 bits by default. * Reorganize the manual pages to consistently have RETURN VALUES, EXAMPLES, SEE ALSO and HISTORY come in that order, and adjust util/fix-doc-nits accordingly. * Add the missing accessor EVP_PKEY_get0_engine() * Have apps like 's_client' and 's_server' output the signature scheme along with other cipher suite parameters when debugging. * Make OPENSSL_config() error agnostic again. * Do the error handling in RSA decryption constant time. OBS-URL: https://build.opensuse.org/request/show/706506 OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=37
2019-05-30 14:30:15 +02:00
Index: openssl-1.1.1c/crypto/chacha/asm/chacha-s390x.pl
===================================================================
--- openssl-1.1.1c.orig/crypto/chacha/asm/chacha-s390x.pl 2019-06-06 12:15:57.271195550 +0200
+++ openssl-1.1.1c/crypto/chacha/asm/chacha-s390x.pl 2019-06-06 12:16:43.787489780 +0200
@@ -1,5 +1,5 @@
#! /usr/bin/env perl
-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
@@ -20,41 +20,46 @@
#
# 3 times faster than compiler-generated code.
-$flavour = shift;
+#
+# August 2018
+#
+# Add vx code path.
+#
+# Copyright IBM Corp. 2018
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin/../..";
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
+
+my $flavour = shift;
+my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
+ $z=0; # S/390 ABI
$SIZE_T=4;
- $g="";
} else {
+ $z=1; # zSeries ABI
$SIZE_T=8;
- $g="g";
}
+my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
- $code .= "\t$opcode\t".join(',',@_)."\n";
-}
my $sp="%r15";
-
my $stdframe=16*$SIZE_T+4*8;
-my $frame=$stdframe+4*20;
-
-my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
+my @v=map("%v$_",(16..31));
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_)=map("\"$_\"",@t);
-my @x=map("\"$_\"",@x);
+my ($xc,$xc_)=map("$_",@t);
# Consider order in which variables are addressed by their
# index:
@@ -78,249 +83,542 @@ my @x=map("\"$_\"",@x);
# 'c' stores and loads in the middle, but none in the beginning
# or end.
- (
- "&alr (@x[$a0],@x[$b0])", # Q1
- "&alr (@x[$a1],@x[$b1])", # Q2
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],16)",
- "&rll (@x[$d1],@x[$d1],16)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],12)",
- "&rll (@x[$b1],@x[$b1],12)",
-
- "&alr (@x[$a0],@x[$b0])",
- "&alr (@x[$a1],@x[$b1])",
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],8)",
- "&rll (@x[$d1],@x[$d1],8)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],7)",
- "&rll (@x[$b1],@x[$b1],7)",
-
- "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
- "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
-
- "&alr (@x[$a2],@x[$b2])", # Q3
- "&alr (@x[$a3],@x[$b3])", # Q4
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],16)",
- "&rll (@x[$d3],@x[$d3],16)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],12)",
- "&rll (@x[$b3],@x[$b3],12)",
-
- "&alr (@x[$a2],@x[$b2])",
- "&alr (@x[$a3],@x[$b3])",
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],8)",
- "&rll (@x[$d3],@x[$d3],8)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],7)",
- "&rll (@x[$b3],@x[$b3],7)"
- );
-}
-
-$code.=<<___;
-.text
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,\@function
-.align 32
-ChaCha20_ctr32:
- lt${g}r $len,$len # $len==0?
- bzr %r14
- a${g}hi $len,-64
- l${g}hi %r1,-$frame
- stm${g} %r6,%r15,`6*$SIZE_T`($sp)
- sl${g}r $out,$inp # difference
- la $len,0($inp,$len) # end of input minus 64
- larl %r7,.Lsigma
- lgr %r0,$sp
- la $sp,0(%r1,$sp)
- st${g} %r0,0($sp)
-
- lmg %r8,%r11,0($key) # load key
- lmg %r12,%r13,0($counter) # load counter
- lmg %r6,%r7,0(%r7) # load sigma constant
-
- la %r14,0($inp)
- st${g} $out,$frame+3*$SIZE_T($sp)
- st${g} $len,$frame+4*$SIZE_T($sp)
- stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
- srlg @x[12],%r12,32 # 32-bit counter value
- j .Loop_outer
-
-.align 16
-.Loop_outer:
- lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
- lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
- lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
- stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
- lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
- st @x[12],$stdframe+4*12($sp) # save counter
- st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
- lhi %r14,10
- j .Loop
-
-.align 4
-.Loop:
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- brct %r14,.Loop
-
- l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
- stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
- lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)
-
- al @x[0],$stdframe+4*0($sp) # accumulate key schedule
- al @x[1],$stdframe+4*1($sp)
- al @x[2],$stdframe+4*2($sp)
- al @x[3],$stdframe+4*3($sp)
- al @x[4],$stdframe+4*4($sp)
- al @x[5],$stdframe+4*5($sp)
- al @x[6],$stdframe+4*6($sp)
- al @x[7],$stdframe+4*7($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lrvr @x[4],@x[4]
- lrvr @x[5],@x[5]
- lrvr @x[6],@x[6]
- lrvr @x[7],@x[7]
- al @x[12],$stdframe+4*12($sp)
- al @x[13],$stdframe+4*13($sp)
- al @x[14],$stdframe+4*14($sp)
- al @x[15],$stdframe+4*15($sp)
- lrvr @x[12],@x[12]
- lrvr @x[13],@x[13]
- lrvr @x[14],@x[14]
- lrvr @x[15],@x[15]
-
- la @t[0],0(@t[0],%r14) # reconstruct output pointer
- cl${g}r %r14,@t[1]
- jh .Ltail
-
- x @x[0],4*0(%r14) # xor with input
- x @x[1],4*1(%r14)
- st @x[0],4*0(@t[0]) # store output
- x @x[2],4*2(%r14)
- st @x[1],4*1(@t[0])
- x @x[3],4*3(%r14)
- st @x[2],4*2(@t[0])
- x @x[4],4*4(%r14)
- st @x[3],4*3(@t[0])
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
- x @x[5],4*5(%r14)
- st @x[4],4*4(@t[0])
- x @x[6],4*6(%r14)
- al @x[0],$stdframe+4*8($sp)
- st @x[5],4*5(@t[0])
- x @x[7],4*7(%r14)
- al @x[1],$stdframe+4*9($sp)
- st @x[6],4*6(@t[0])
- x @x[12],4*12(%r14)
- al @x[2],$stdframe+4*10($sp)
- st @x[7],4*7(@t[0])
- x @x[13],4*13(%r14)
- al @x[3],$stdframe+4*11($sp)
- st @x[12],4*12(@t[0])
- x @x[14],4*14(%r14)
- st @x[13],4*13(@t[0])
- x @x[15],4*15(%r14)
- st @x[14],4*14(@t[0])
- lrvr @x[0],@x[0]
- st @x[15],4*15(@t[0])
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lhi @x[12],1
- x @x[0],4*8(%r14)
- al @x[12],$stdframe+4*12($sp) # increment counter
- x @x[1],4*9(%r14)
- st @x[0],4*8(@t[0])
- x @x[2],4*10(%r14)
- st @x[1],4*9(@t[0])
- x @x[3],4*11(%r14)
- st @x[2],4*10(@t[0])
- st @x[3],4*11(@t[0])
-
- cl${g}r %r14,@t[1] # done yet?
- la %r14,64(%r14)
- jl .Loop_outer
-
-.Ldone:
- xgr %r0,%r0
- xgr %r1,%r1
- xgr %r2,%r2
- xgr %r3,%r3
- stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
- stmg %r0,%r3,$stdframe+4*12($sp)
-
- lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
- br %r14
-
-.align 16
-.Ltail:
- la @t[1],64($t[1])
- stm @x[0],@x[7],$stdframe+4*0($sp)
- sl${g}r @t[1],%r14
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
- l${g}hi @x[6],0
- stm @x[12],@x[15],$stdframe+4*12($sp)
- al @x[0],$stdframe+4*8($sp)
- al @x[1],$stdframe+4*9($sp)
- al @x[2],$stdframe+4*10($sp)
- al @x[3],$stdframe+4*11($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- stm @x[0],@x[3],$stdframe+4*8($sp)
-
-.Loop_tail:
- llgc @x[4],0(@x[6],%r14)
- llgc @x[5],$stdframe(@x[6],$sp)
- xr @x[5],@x[4]
- stc @x[5],0(@x[6],@t[0])
- la @x[6],1(@x[6])
- brct @t[1],.Loop_tail
-
- j .Ldone
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-
-.align 32
-.Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
-.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.align 4
-___
+ alr (@x[$a0],@x[$b0]); # Q1
+ alr (@x[$a1],@x[$b1]); # Q2
+ xr (@x[$d0],@x[$a0]);
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],16);
+ rll (@x[$d1],@x[$d1],16);
+
+ alr ($xc,@x[$d0]);
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc);
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],12);
+ rll (@x[$b1],@x[$b1],12);
+
+ alr (@x[$a0],@x[$b0]);
+ alr (@x[$a1],@x[$b1]);
+ xr (@x[$d0],@x[$a0]);
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],8);
+ rll (@x[$d1],@x[$d1],8);
+
+ alr ($xc,@x[$d0]);
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc);
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],7);
+ rll (@x[$b1],@x[$b1],7);
+
+ stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's
+ lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
+
+ alr (@x[$a2],@x[$b2]); # Q3
+ alr (@x[$a3],@x[$b3]); # Q4
+ xr (@x[$d2],@x[$a2]);
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],16);
+ rll (@x[$d3],@x[$d3],16);
+
+ alr ($xc,@x[$d2]);
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc);
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],12);
+ rll (@x[$b3],@x[$b3],12);
+
+ alr (@x[$a2],@x[$b2]);
+ alr (@x[$a3],@x[$b3]);
+ xr (@x[$d2],@x[$a2]);
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],8);
+ rll (@x[$d3],@x[$d3],8);
+
+ alr ($xc,@x[$d2]);
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc);
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],7);
+ rll (@x[$b3],@x[$b3],7);
+}
+
+sub VX_ROUND { # one ChaCha round over @v: four quarter-rounds, issued step by step so the four are interleaved
+my ($a0,$b0,$c0,$d0)=@_; # @v indices of quarter-round 0; callers pass (0,4,8,12) for a column round, (0,5,10,15) for a diagonal round
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); # quarter-round 1: keep each index's group of four, advance its low two bits by 1 mod 4
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); # quarter-round 2: same step applied again
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); # quarter-round 3: same step applied again
+ # a += b; d ^= a; d <<<= 16 -- presumably vaf/vx/verllf emit 32-bit add / xor / rotate-left (confirm in perlasm::s390x)
+ vaf (@v[$a0],@v[$a0],@v[$b0]);
+ vaf (@v[$a1],@v[$a1],@v[$b1]);
+ vaf (@v[$a2],@v[$a2],@v[$b2]);
+ vaf (@v[$a3],@v[$a3],@v[$b3]);
+ vx (@v[$d0],@v[$d0],@v[$a0]);
+ vx (@v[$d1],@v[$d1],@v[$a1]);
+ vx (@v[$d2],@v[$d2],@v[$a2]);
+ vx (@v[$d3],@v[$d3],@v[$a3]);
+ verllf (@v[$d0],@v[$d0],16);
+ verllf (@v[$d1],@v[$d1],16);
+ verllf (@v[$d2],@v[$d2],16);
+ verllf (@v[$d3],@v[$d3],16);
+ # c += d; b ^= c; b <<<= 12
+ vaf (@v[$c0],@v[$c0],@v[$d0]);
+ vaf (@v[$c1],@v[$c1],@v[$d1]);
+ vaf (@v[$c2],@v[$c2],@v[$d2]);
+ vaf (@v[$c3],@v[$c3],@v[$d3]);
+ vx (@v[$b0],@v[$b0],@v[$c0]);
+ vx (@v[$b1],@v[$b1],@v[$c1]);
+ vx (@v[$b2],@v[$b2],@v[$c2]);
+ vx (@v[$b3],@v[$b3],@v[$c3]);
+ verllf (@v[$b0],@v[$b0],12);
+ verllf (@v[$b1],@v[$b1],12);
+ verllf (@v[$b2],@v[$b2],12);
+ verllf (@v[$b3],@v[$b3],12);
+ # a += b; d ^= a; d <<<= 8
+ vaf (@v[$a0],@v[$a0],@v[$b0]);
+ vaf (@v[$a1],@v[$a1],@v[$b1]);
+ vaf (@v[$a2],@v[$a2],@v[$b2]);
+ vaf (@v[$a3],@v[$a3],@v[$b3]);
+ vx (@v[$d0],@v[$d0],@v[$a0]);
+ vx (@v[$d1],@v[$d1],@v[$a1]);
+ vx (@v[$d2],@v[$d2],@v[$a2]);
+ vx (@v[$d3],@v[$d3],@v[$a3]);
+ verllf (@v[$d0],@v[$d0],8);
+ verllf (@v[$d1],@v[$d1],8);
+ verllf (@v[$d2],@v[$d2],8);
+ verllf (@v[$d3],@v[$d3],8);
+ # c += d; b ^= c; b <<<= 7
+ vaf (@v[$c0],@v[$c0],@v[$d0]);
+ vaf (@v[$c1],@v[$c1],@v[$d1]);
+ vaf (@v[$c2],@v[$c2],@v[$d2]);
+ vaf (@v[$c3],@v[$c3],@v[$d3]);
+ vx (@v[$b0],@v[$b0],@v[$c0]);
+ vx (@v[$b1],@v[$b1],@v[$c1]);
+ vx (@v[$b2],@v[$b2],@v[$c2]);
+ vx (@v[$b3],@v[$b3],@v[$c3]);
+ verllf (@v[$b0],@v[$b0],7);
+ verllf (@v[$b1],@v[$b1],7);
+ verllf (@v[$b2],@v[$b2],7);
+ verllf (@v[$b3],@v[$b3],7);
+}
+
+PERLASM_BEGIN($output);
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/ge;
+INCLUDE ("s390x_arch.h");
+TEXT ();
- print $_,"\n";
+################
+# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
+# const unsigned int key[8], const unsigned int counter[4])
+{
+my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
+
+# VX CODE PATH
+{
+my $off=$z*8*16+8; # offset(initial state)
+my $frame=$stdframe+4*16+$off;
+
+GLOBL ("ChaCha20_ctr32");
+TYPE ("ChaCha20_ctr32","\@function");
+ALIGN (32);
+LABEL ("ChaCha20_ctr32");
+ larl ("%r1","OPENSSL_s390xcap_P");
+
+ lghi ("%r0",64);
+&{$z? \&cgr:\&cr} ($len,"%r0");
+ jle ("_s390x_chacha_novx");
+
+ lg ("%r0","S390X_STFLE+16(%r1)");
+ tmhh ("%r0",0x4000); # check for vector facility
+ jz ("_s390x_chacha_novx");
+
+if (!$z) {
+ llgfr ($len,$len);
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+
+ lghi ("%r1",-$frame);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)"); # allocate stack frame
+
+ larl ("%r7",".Lsigma");
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain
+
+ vstm ("%v8","%v15","8($sp)") if ($z);
+
+ vlm ("%v1","%v2","0($key)"); # load key
+ vl ("%v0","0(%r7)"); # load sigma constant
+ vl ("%v3","0($counter)"); # load iv (counter||nonce)
+ l ("%r0","0($counter)"); # load counter
+ vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack
+
+ srlg ("%r1",$len,8);
+ ltgr ("%r1","%r1");
+ jz (".Lvx_4x_done");
+
+ALIGN (16); # process 4 64-byte blocks
+LABEL (".Lvx_4x");
+ vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
+ # state
+ vl ("%v31","16(%r7)");
+ vaf ("%v12","%v12","%v31"); # increment counter
+
+ vlr (@v[$_],"%v$_") for (0..15); # copy initial state
+
+ lhi ("%r6",10);
+ j (".Loop_vx_4x");
+
+ALIGN (16);
+LABEL (".Loop_vx_4x");
+ VX_ROUND( 0, 4, 8,12); # column round
+ VX_ROUND( 0, 5,10,15); # diagonal round
+ brct ("%r6",".Loop_vx_4x");
+
+ vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
+ # state (mod 2^32)
+ vlm ("%v6","%v7","32(%r7)"); # load vperm operands
+
+for (0..3) { # blocks 1,2
+ vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+ 8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v7","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
+ vstm ("%v0","%v7","0($out)"); # store out
+
+ vlm ("%v6","%v7","32(%r7)"); # restore vperm operands
+
+for (0..3) { # blocks 3,4
+ vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+ 8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v7","128($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
+ vstm ("%v0","%v7","128($out)"); # store out
+
+ ahi ("%r0",4);
+ st ("%r0","48+$off($sp)"); # update initial state
+
+ la ($inp,"256($inp)");
+ la ($out,"256($out)");
+ brctg ("%r1",".Lvx_4x");
+
+ALIGN (16);
+LABEL (".Lvx_4x_done");
+ lghi ("%r1",0xff);
+ ngr ($len,"%r1");
+ jnz (".Lvx_rem");
+
+ALIGN (16);
+LABEL (".Lvx_done");
+ vzero ("%v$_") for (16..31); # wipe ks and key copy
+ vstm ("%v16","%v17","16+$off($sp)");
+ vlm ("%v8","%v15","8($sp)") if ($z);
+
+ la ($sp,"$frame($sp)");
+&{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)");
+
+if (!$z) {
+ ld ("%f4","16*$SIZE_T+2*8($sp)");
+ ld ("%f6","16*$SIZE_T+3*8($sp)");
+ vzero ("%v$_") for (8..15);
+}
+ br ("%r14");
+ALIGN (16);
+LABEL (".Lvx_rem");
+ lhi ("%r0",64);
+
+ sr ($len,"%r0");
+ brc (2,".Lvx_rem_g64"); # cc==2?
+
+ lghi ("%r1",-$stdframe);
+
+ la ($counter,"48+$off($sp)"); # load updated iv
+ ar ($len,"%r0"); # restore len
+
+ lgr ("%r7",$counter);
+&{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)");
+ la ($sp,"0(%r1,$sp)");
+
+ bras ("%r14","_s390x_chacha_novx");
+
+ la ($sp,"$stdframe($sp)");
+&{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)");
+ lgr ($counter,"%r7");
+ j (".Lvx_done");
+
+ALIGN (16);
+LABEL (".Lvx_rem_g64");
+ vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
+ # state
+ vl ("%v31","16(%r7)");
+ vaf ("%v12","%v12","%v31"); # increment counter
+
+ vlr (@v[$_],"%v$_") for (0..15); # state = initial state
+
+ lhi ("%r6",10);
+ j (".Loop_vx_rem");
+
+ALIGN (16);
+LABEL (".Loop_vx_rem");
+ VX_ROUND( 0, 4, 8,12); # column round
+ VX_ROUND( 0, 5,10,15); # diagonal round
+ brct ("%r6",".Loop_vx_rem");
+
+ vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
+ # state (mod 2^32)
+ vlm ("%v6","%v7","32(%r7)"); # load vperm operands
+
+for (0..3) { # blocks 1,2
+ vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ brc (4,".Lvx_tail"); # mask 4: cc==1 (result < 0)?
+
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+ jz (".Lvx_done");
+
+for (0..3) { # blocks 3,4
+ vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+12),"%v0","%v1","%v6");
+ vperm ("%v".($_+8),"%v0","%v1","%v7");
+}
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ brc (4,".Lvx_tail"); # mask 4: cc==1 (result < 0)?
+
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+ jz (".Lvx_done");
+
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ vlr ("%v".($_+4),"%v$_") for (8..11);
+ j (".Lvx_tail");
+
+ALIGN (16);
+LABEL (".Lvx_tail");
+ ar ($len,"%r0"); # restore $len
+ ahi ($len,-1);
+
+ lhi ("%r0",16);
+for (0..2) {
+ vll ("%v0",$len,($_*16)."($inp)");
+ vx ("%v0","%v0","%v".($_+12));
+ vstl ("%v0",$len,($_*16)."($out)");
+ sr ($len,"%r0");
+ brc (4,".Lvx_done"); # mask 4: cc==1 (result < 0)?
+}
+ vll ("%v0",$len,"3*16($inp)");
+ vx ("%v0","%v0","%v15");
+ vstl ("%v0",$len,"3*16($out)");
+ j (".Lvx_done");
+SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
+}
+
+# NOVX CODE PATH
+{
+my $frame=$stdframe+4*20;
+
+TYPE ("_s390x_chacha_novx","\@function");
+ALIGN (32);
+LABEL ("_s390x_chacha_novx");
+&{$z? \&ltgr:\&ltr} ($len,$len); # $len==0?
+ bzr ("%r14");
+&{$z? \&aghi:\&ahi} ($len,-64);
+&{$z? \&lghi:\&lhi} ("%r1",-$frame);
+&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
+&{$z? \&slgr:\&slr} ($out,$inp); # difference
+ la ($len,"0($inp,$len)"); # end of input minus 64
+ larl ("%r7",".Lsigma");
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)");
+&{$z? \&stg:\&st} ("%r0","0($sp)");
+
+ lmg ("%r8","%r11","0($key)"); # load key
+ lmg ("%r12","%r13","0($counter)"); # load counter
+ lmg ("%r6","%r7","0(%r7)"); # load sigma constant
+
+ la ("%r14","0($inp)");
+&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)");
+&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)");
+ stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
+ srlg (@x[12],"%r12",32); # 32-bit counter value
+ j (".Loop_outer");
+
+ALIGN (16);
+LABEL (".Loop_outer");
+ lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
+ lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
+ lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
+ stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
+ lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
+ st (@x[12],"$stdframe+4*12($sp)"); # save counter
+&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
+ lhi ("%r14",10);
+ j (".Loop");
+
+ALIGN (4);
+LABEL (".Loop");
+ ROUND (0, 4, 8,12);
+ ROUND (0, 5,10,15);
+ brct ("%r14",".Loop");
+
+&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
+ stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
+&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
+
+ al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
+ al (@x[1],"$stdframe+4*1($sp)");
+ al (@x[2],"$stdframe+4*2($sp)");
+ al (@x[3],"$stdframe+4*3($sp)");
+ al (@x[4],"$stdframe+4*4($sp)");
+ al (@x[5],"$stdframe+4*5($sp)");
+ al (@x[6],"$stdframe+4*6($sp)");
+ al (@x[7],"$stdframe+4*7($sp)");
+ lrvr (@x[0],@x[0]);
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ lrvr (@x[4],@x[4]);
+ lrvr (@x[5],@x[5]);
+ lrvr (@x[6],@x[6]);
+ lrvr (@x[7],@x[7]);
+ al (@x[12],"$stdframe+4*12($sp)");
+ al (@x[13],"$stdframe+4*13($sp)");
+ al (@x[14],"$stdframe+4*14($sp)");
+ al (@x[15],"$stdframe+4*15($sp)");
+ lrvr (@x[12],@x[12]);
+ lrvr (@x[13],@x[13]);
+ lrvr (@x[14],@x[14]);
+ lrvr (@x[15],@x[15]);
+
+ la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
+&{$z? \&clgr:\&clr} ("%r14",@t[1]);
+ jh (".Ltail");
+
+ x (@x[0],"4*0(%r14)"); # xor with input
+ x (@x[1],"4*1(%r14)");
+ st (@x[0],"4*0(@t[0])"); # store output
+ x (@x[2],"4*2(%r14)");
+ st (@x[1],"4*1(@t[0])");
+ x (@x[3],"4*3(%r14)");
+ st (@x[2],"4*2(@t[0])");
+ x (@x[4],"4*4(%r14)");
+ st (@x[3],"4*3(@t[0])");
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
+ x (@x[5],"4*5(%r14)");
+ st (@x[4],"4*4(@t[0])");
+ x (@x[6],"4*6(%r14)");
+ al (@x[0],"$stdframe+4*8($sp)");
+ st (@x[5],"4*5(@t[0])");
+ x (@x[7],"4*7(%r14)");
+ al (@x[1],"$stdframe+4*9($sp)");
+ st (@x[6],"4*6(@t[0])");
+ x (@x[12],"4*12(%r14)");
+ al (@x[2],"$stdframe+4*10($sp)");
+ st (@x[7],"4*7(@t[0])");
+ x (@x[13],"4*13(%r14)");
+ al (@x[3],"$stdframe+4*11($sp)");
+ st (@x[12],"4*12(@t[0])");
+ x (@x[14],"4*14(%r14)");
+ st (@x[13],"4*13(@t[0])");
+ x (@x[15],"4*15(%r14)");
+ st (@x[14],"4*14(@t[0])");
+ lrvr (@x[0],@x[0]);
+ st (@x[15],"4*15(@t[0])");
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ lhi (@x[12],1);
+ x (@x[0],"4*8(%r14)");
+ al (@x[12],"$stdframe+4*12($sp)"); # increment counter
+ x (@x[1],"4*9(%r14)");
+ st (@x[0],"4*8(@t[0])");
+ x (@x[2],"4*10(%r14)");
+ st (@x[1],"4*9(@t[0])");
+ x (@x[3],"4*11(%r14)");
+ st (@x[2],"4*10(@t[0])");
+ st (@x[3],"4*11(@t[0])");
+
+&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
+ la ("%r14","64(%r14)");
+ jl (".Loop_outer");
+
+LABEL (".Ldone");
+ xgr ("%r0","%r0");
+ xgr ("%r1","%r1");
+ xgr ("%r2","%r2");
+ xgr ("%r3","%r3");
+ stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
+ stmg ("%r0","%r3","$stdframe+4*12($sp)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)");
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail");
+ la (@t[1],"64($t[1])");
+ stm (@x[0],@x[7],"$stdframe+4*0($sp)");
+&{$z? \&slgr:\&slr} (@t[1],"%r14");
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
+&{$z? \&lghi:\&lhi} (@x[6],0);
+ stm (@x[12],@x[15],"$stdframe+4*12($sp)");
+ al (@x[0],"$stdframe+4*8($sp)");
+ al (@x[1],"$stdframe+4*9($sp)");
+ al (@x[2],"$stdframe+4*10($sp)");
+ al (@x[3],"$stdframe+4*11($sp)");
+ lrvr (@x[0],@x[0]);
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ stm (@x[0],@x[3],"$stdframe+4*8($sp)");
+
+LABEL (".Loop_tail");
+ llgc (@x[4],"0(@x[6],%r14)");
+ llgc (@x[5],"$stdframe(@x[6],$sp)");
+ xr (@x[5],@x[4]);
+ stc (@x[5],"0(@x[6],@t[0])");
+ la (@x[6],"1(@x[6])");
+ brct (@t[1],".Loop_tail");
+
+ j (".Ldone");
+SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx");
+}
}
-close STDOUT;
+################
+
+ALIGN (64);
+LABEL (".Lsigma");
+LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
+LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment
+LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization
+LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization
+ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
+ALIGN (4);
+
+PERLASM_END();