SHA256
1
0
forked from pool/openssl-1_1
openssl-1_1/0002-crypto-chacha-asm-chacha-s390x.pl-add-vx-code-path.patch
Pedro Monreal Gonzalez 32ced036f1 Accepting request 786956 from home:vitezslav_cizek:branches:security:tls
- Update to 1.1.1e
  * Properly detect EOF while reading in libssl. Previously if we hit an EOF
    while reading in libssl then we would report an error back to the
    application (SSL_ERROR_SYSCALL) but errno would be 0. We now add
    an error to the stack (which means we instead return SSL_ERROR_SSL) and
    therefore give a hint as to what went wrong.
  * Check that ed25519 and ed448 are allowed by the security level. Previously
    signature algorithms not using an MD were not being checked that they were
    allowed by the security level.
  * Fixed SSL_get_servername() behaviour. The behaviour of SSL_get_servername()
    was not quite right. The behaviour was not consistent between resumption
    and normal handshakes, and also not quite consistent with historical
    behaviour. The behaviour in various scenarios has been clarified and
    it has been updated to make it match historical behaviour as closely as
    possible.
  * Corrected the documentation of the return values from the EVP_DigestSign*
    set of functions.  The documentation mentioned negative values for some
    errors, but this was never the case, so the mention of negative values
    was removed.
  * Added a new method to gather entropy on VMS, based on SYS$GET_ENTROPY.
    The presence of this system service is determined at run-time.
  * Added newline escaping functionality to a filename when using openssl dgst.
    This output format is to replicate the output format found in the '*sum'
    checksum programs. This aims to preserve backward compatibility.
  * Print all values for a PKCS#12 attribute with 'openssl pkcs12', not just
    the first value.
- Update a bunch of patches as the internal crypto headers got reorganized
- drop openssl-1_1-CVE-2019-1551.patch (upstream)

- openssl dgst: default to SHA256 only when called without a digest,

OBS-URL: https://build.opensuse.org/request/show/786956
OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=65
2020-03-20 17:43:35 +00:00

868 lines
23 KiB
Diff

From f760137b2144740916afd9ff381451fa16c710de Mon Sep 17 00:00:00 2001
From: Patrick Steuer <patrick.steuer@de.ibm.com>
Date: Sat, 4 Aug 2018 00:10:06 +0200
Subject: [PATCH] crypto/chacha/asm/chacha-s390x.pl: add vx code path.
Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>
Reviewed-by: Tim Hudson <tjh@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6919)
---
crypto/chacha/asm/chacha-s390x.pl | 816 ++++++++++++++++++++----------
crypto/chacha/build.info | 1 +
2 files changed, 558 insertions(+), 259 deletions(-)
Index: openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl
===================================================================
--- openssl-1.1.1e.orig/crypto/chacha/asm/chacha-s390x.pl 2020-03-19 11:43:25.650616856 +0100
+++ openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl 2020-03-19 11:43:40.614692484 +0100
@@ -20,41 +20,46 @@
#
# 3 times faster than compiler-generated code.
-$flavour = shift;
+#
+# August 2018
+#
+# Add vx code path.
+#
+# Copyright IBM Corp. 2018
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin/../..";
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
+
+my $flavour = shift;
+my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
+ $z=0; # S/390 ABI
$SIZE_T=4;
- $g="";
} else {
+ $z=1; # zSeries ABI
$SIZE_T=8;
- $g="g";
}
+my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
- $code .= "\t$opcode\t".join(',',@_)."\n";
-}
my $sp="%r15";
-
my $stdframe=16*$SIZE_T+4*8;
-my $frame=$stdframe+4*20;
-
-my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
+my @v=map("%v$_",(16..31));
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_)=map("\"$_\"",@t);
-my @x=map("\"$_\"",@x);
+my ($xc,$xc_)=map("$_",@t);
# Consider order in which variables are addressed by their
# index:
@@ -78,249 +83,542 @@ my @x=map("\"$_\"",@x);
# 'c' stores and loads in the middle, but none in the beginning
# or end.
- (
- "&alr (@x[$a0],@x[$b0])", # Q1
- "&alr (@x[$a1],@x[$b1])", # Q2
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],16)",
- "&rll (@x[$d1],@x[$d1],16)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],12)",
- "&rll (@x[$b1],@x[$b1],12)",
-
- "&alr (@x[$a0],@x[$b0])",
- "&alr (@x[$a1],@x[$b1])",
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],8)",
- "&rll (@x[$d1],@x[$d1],8)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],7)",
- "&rll (@x[$b1],@x[$b1],7)",
-
- "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
- "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
-
- "&alr (@x[$a2],@x[$b2])", # Q3
- "&alr (@x[$a3],@x[$b3])", # Q4
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],16)",
- "&rll (@x[$d3],@x[$d3],16)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],12)",
- "&rll (@x[$b3],@x[$b3],12)",
-
- "&alr (@x[$a2],@x[$b2])",
- "&alr (@x[$a3],@x[$b3])",
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],8)",
- "&rll (@x[$d3],@x[$d3],8)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],7)",
- "&rll (@x[$b3],@x[$b3],7)"
- );
-}
-
-$code.=<<___;
-.text
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,\@function
-.align 32
-ChaCha20_ctr32:
- lt${g}r $len,$len # $len==0?
- bzr %r14
- a${g}hi $len,-64
- l${g}hi %r1,-$frame
- stm${g} %r6,%r15,`6*$SIZE_T`($sp)
- sl${g}r $out,$inp # difference
- la $len,0($inp,$len) # end of input minus 64
- larl %r7,.Lsigma
- lgr %r0,$sp
- la $sp,0(%r1,$sp)
- st${g} %r0,0($sp)
-
- lmg %r8,%r11,0($key) # load key
- lmg %r12,%r13,0($counter) # load counter
- lmg %r6,%r7,0(%r7) # load sigma constant
-
- la %r14,0($inp)
- st${g} $out,$frame+3*$SIZE_T($sp)
- st${g} $len,$frame+4*$SIZE_T($sp)
- stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
- srlg @x[12],%r12,32 # 32-bit counter value
- j .Loop_outer
-
-.align 16
-.Loop_outer:
- lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
- lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
- lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
- stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
- lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
- st @x[12],$stdframe+4*12($sp) # save counter
- st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
- lhi %r14,10
- j .Loop
-
-.align 4
-.Loop:
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- brct %r14,.Loop
-
- l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
- stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
- lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)
-
- al @x[0],$stdframe+4*0($sp) # accumulate key schedule
- al @x[1],$stdframe+4*1($sp)
- al @x[2],$stdframe+4*2($sp)
- al @x[3],$stdframe+4*3($sp)
- al @x[4],$stdframe+4*4($sp)
- al @x[5],$stdframe+4*5($sp)
- al @x[6],$stdframe+4*6($sp)
- al @x[7],$stdframe+4*7($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lrvr @x[4],@x[4]
- lrvr @x[5],@x[5]
- lrvr @x[6],@x[6]
- lrvr @x[7],@x[7]
- al @x[12],$stdframe+4*12($sp)
- al @x[13],$stdframe+4*13($sp)
- al @x[14],$stdframe+4*14($sp)
- al @x[15],$stdframe+4*15($sp)
- lrvr @x[12],@x[12]
- lrvr @x[13],@x[13]
- lrvr @x[14],@x[14]
- lrvr @x[15],@x[15]
-
- la @t[0],0(@t[0],%r14) # reconstruct output pointer
- cl${g}r %r14,@t[1]
- jh .Ltail
-
- x @x[0],4*0(%r14) # xor with input
- x @x[1],4*1(%r14)
- st @x[0],4*0(@t[0]) # store output
- x @x[2],4*2(%r14)
- st @x[1],4*1(@t[0])
- x @x[3],4*3(%r14)
- st @x[2],4*2(@t[0])
- x @x[4],4*4(%r14)
- st @x[3],4*3(@t[0])
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
- x @x[5],4*5(%r14)
- st @x[4],4*4(@t[0])
- x @x[6],4*6(%r14)
- al @x[0],$stdframe+4*8($sp)
- st @x[5],4*5(@t[0])
- x @x[7],4*7(%r14)
- al @x[1],$stdframe+4*9($sp)
- st @x[6],4*6(@t[0])
- x @x[12],4*12(%r14)
- al @x[2],$stdframe+4*10($sp)
- st @x[7],4*7(@t[0])
- x @x[13],4*13(%r14)
- al @x[3],$stdframe+4*11($sp)
- st @x[12],4*12(@t[0])
- x @x[14],4*14(%r14)
- st @x[13],4*13(@t[0])
- x @x[15],4*15(%r14)
- st @x[14],4*14(@t[0])
- lrvr @x[0],@x[0]
- st @x[15],4*15(@t[0])
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lhi @x[12],1
- x @x[0],4*8(%r14)
- al @x[12],$stdframe+4*12($sp) # increment counter
- x @x[1],4*9(%r14)
- st @x[0],4*8(@t[0])
- x @x[2],4*10(%r14)
- st @x[1],4*9(@t[0])
- x @x[3],4*11(%r14)
- st @x[2],4*10(@t[0])
- st @x[3],4*11(@t[0])
-
- cl${g}r %r14,@t[1] # done yet?
- la %r14,64(%r14)
- jl .Loop_outer
-
-.Ldone:
- xgr %r0,%r0
- xgr %r1,%r1
- xgr %r2,%r2
- xgr %r3,%r3
- stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
- stmg %r0,%r3,$stdframe+4*12($sp)
-
- lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
- br %r14
-
-.align 16
-.Ltail:
- la @t[1],64($t[1])
- stm @x[0],@x[7],$stdframe+4*0($sp)
- sl${g}r @t[1],%r14
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
- l${g}hi @x[6],0
- stm @x[12],@x[15],$stdframe+4*12($sp)
- al @x[0],$stdframe+4*8($sp)
- al @x[1],$stdframe+4*9($sp)
- al @x[2],$stdframe+4*10($sp)
- al @x[3],$stdframe+4*11($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- stm @x[0],@x[3],$stdframe+4*8($sp)
-
-.Loop_tail:
- llgc @x[4],0(@x[6],%r14)
- llgc @x[5],$stdframe(@x[6],$sp)
- xr @x[5],@x[4]
- stc @x[5],0(@x[6],@t[0])
- la @x[6],1(@x[6])
- brct @t[1],.Loop_tail
-
- j .Ldone
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-
-.align 32
-.Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
-.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.align 4
-___
+ alr (@x[$a0],@x[$b0]); # Q1
+ alr (@x[$a1],@x[$b1]); # Q2
+ xr (@x[$d0],@x[$a0]);
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],16);
+ rll (@x[$d1],@x[$d1],16);
+
+ alr ($xc,@x[$d0]);
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc);
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],12);
+ rll (@x[$b1],@x[$b1],12);
+
+ alr (@x[$a0],@x[$b0]);
+ alr (@x[$a1],@x[$b1]);
+ xr (@x[$d0],@x[$a0]);
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],8);
+ rll (@x[$d1],@x[$d1],8);
+
+ alr ($xc,@x[$d0]);
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc);
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],7);
+ rll (@x[$b1],@x[$b1],7);
+
+ stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's
+ lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
+
+ alr (@x[$a2],@x[$b2]); # Q3
+ alr (@x[$a3],@x[$b3]); # Q4
+ xr (@x[$d2],@x[$a2]);
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],16);
+ rll (@x[$d3],@x[$d3],16);
+
+ alr ($xc,@x[$d2]);
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc);
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],12);
+ rll (@x[$b3],@x[$b3],12);
+
+ alr (@x[$a2],@x[$b2]);
+ alr (@x[$a3],@x[$b3]);
+ xr (@x[$d2],@x[$a2]);
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],8);
+ rll (@x[$d3],@x[$d3],8);
+
+ alr ($xc,@x[$d2]);
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc);
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],7);
+ rll (@x[$b3],@x[$b3],7);
+}
+
+sub VX_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+
+ vaf (@v[$a0],@v[$a0],@v[$b0]);
+ vaf (@v[$a1],@v[$a1],@v[$b1]);
+ vaf (@v[$a2],@v[$a2],@v[$b2]);
+ vaf (@v[$a3],@v[$a3],@v[$b3]);
+ vx (@v[$d0],@v[$d0],@v[$a0]);
+ vx (@v[$d1],@v[$d1],@v[$a1]);
+ vx (@v[$d2],@v[$d2],@v[$a2]);
+ vx (@v[$d3],@v[$d3],@v[$a3]);
+ verllf (@v[$d0],@v[$d0],16);
+ verllf (@v[$d1],@v[$d1],16);
+ verllf (@v[$d2],@v[$d2],16);
+ verllf (@v[$d3],@v[$d3],16);
+
+ vaf (@v[$c0],@v[$c0],@v[$d0]);
+ vaf (@v[$c1],@v[$c1],@v[$d1]);
+ vaf (@v[$c2],@v[$c2],@v[$d2]);
+ vaf (@v[$c3],@v[$c3],@v[$d3]);
+ vx (@v[$b0],@v[$b0],@v[$c0]);
+ vx (@v[$b1],@v[$b1],@v[$c1]);
+ vx (@v[$b2],@v[$b2],@v[$c2]);
+ vx (@v[$b3],@v[$b3],@v[$c3]);
+ verllf (@v[$b0],@v[$b0],12);
+ verllf (@v[$b1],@v[$b1],12);
+ verllf (@v[$b2],@v[$b2],12);
+ verllf (@v[$b3],@v[$b3],12);
+
+ vaf (@v[$a0],@v[$a0],@v[$b0]);
+ vaf (@v[$a1],@v[$a1],@v[$b1]);
+ vaf (@v[$a2],@v[$a2],@v[$b2]);
+ vaf (@v[$a3],@v[$a3],@v[$b3]);
+ vx (@v[$d0],@v[$d0],@v[$a0]);
+ vx (@v[$d1],@v[$d1],@v[$a1]);
+ vx (@v[$d2],@v[$d2],@v[$a2]);
+ vx (@v[$d3],@v[$d3],@v[$a3]);
+ verllf (@v[$d0],@v[$d0],8);
+ verllf (@v[$d1],@v[$d1],8);
+ verllf (@v[$d2],@v[$d2],8);
+ verllf (@v[$d3],@v[$d3],8);
+
+ vaf (@v[$c0],@v[$c0],@v[$d0]);
+ vaf (@v[$c1],@v[$c1],@v[$d1]);
+ vaf (@v[$c2],@v[$c2],@v[$d2]);
+ vaf (@v[$c3],@v[$c3],@v[$d3]);
+ vx (@v[$b0],@v[$b0],@v[$c0]);
+ vx (@v[$b1],@v[$b1],@v[$c1]);
+ vx (@v[$b2],@v[$b2],@v[$c2]);
+ vx (@v[$b3],@v[$b3],@v[$c3]);
+ verllf (@v[$b0],@v[$b0],7);
+ verllf (@v[$b1],@v[$b1],7);
+ verllf (@v[$b2],@v[$b2],7);
+ verllf (@v[$b3],@v[$b3],7);
+}
+
+PERLASM_BEGIN($output);
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/ge;
+INCLUDE ("s390x_arch.h");
+TEXT ();
- print $_,"\n";
+################
+# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
+# const unsigned int key[8], const unsigned int counter[4])
+{
+my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
+
+# VX CODE PATH
+{
+my $off=$z*8*16+8; # offset(initial state)
+my $frame=$stdframe+4*16+$off;
+
+GLOBL ("ChaCha20_ctr32");
+TYPE ("ChaCha20_ctr32","\@function");
+ALIGN (32);
+LABEL ("ChaCha20_ctr32");
+ larl ("%r1","OPENSSL_s390xcap_P");
+
+ lghi ("%r0",64);
+&{$z? \&cgr:\&cr} ($len,"%r0");
+ jle ("_s390x_chacha_novx");
+
+ lg ("%r0","S390X_STFLE+16(%r1)");
+ tmhh ("%r0",0x4000); # check for vector facility
+ jz ("_s390x_chacha_novx");
+
+if (!$z) {
+ llgfr ($len,$len);
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+
+ lghi ("%r1",-$frame);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)"); # allocate stack frame
+
+ larl ("%r7",".Lsigma");
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain
+
+ vstm ("%v8","%v15","8($sp)") if ($z);
+
+ vlm ("%v1","%v2","0($key)"); # load key
+ vl ("%v0","0(%r7)"); # load sigma constant
+ vl ("%v3","0($counter)"); # load iv (counter||nonce)
+ l ("%r0","0($counter)"); # load counter
+ vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack
+
+ srlg ("%r1",$len,8);
+ ltgr ("%r1","%r1");
+ jz (".Lvx_4x_done");
+
+ALIGN (16); # process 4 64-byte blocks
+LABEL (".Lvx_4x");
+ vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
+ # state
+ vl ("%v31","16(%r7)");
+ vaf ("%v12","%v12","%v31"); # increment counter
+
+ vlr (@v[$_],"%v$_") for (0..15); # copy initial state
+
+ lhi ("%r6",10);
+ j (".Loop_vx_4x");
+
+ALIGN (16);
+LABEL (".Loop_vx_4x");
+ VX_ROUND( 0, 4, 8,12); # column round
+ VX_ROUND( 0, 5,10,15); # diagonal round
+ brct ("%r6",".Loop_vx_4x");
+
+ vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
+ # state (mod 2^32)
+ vlm ("%v6","%v7","32(%r7)"); # load vperm operands
+
+for (0..3) { # blocks 1,2
+ vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+ 8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v7","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
+ vstm ("%v0","%v7","0($out)"); # store out
+
+ vlm ("%v6","%v7","32(%r7)"); # restore vperm operands
+
+for (0..3) { # blocks 3,4
+ vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+ 8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v7","128($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
+ vstm ("%v0","%v7","128($out)"); # store out
+
+ ahi ("%r0",4);
+ st ("%r0","48+$off($sp)"); # update initial state
+
+ la ($inp,"256($inp)");
+ la ($out,"256($out)");
+ brctg ("%r1",".Lvx_4x");
+
+ALIGN (16);
+LABEL (".Lvx_4x_done");
+ lghi ("%r1",0xff);
+ ngr ($len,"%r1");
+ jnz (".Lvx_rem");
+
+ALIGN (16);
+LABEL (".Lvx_done");
+ vzero ("%v$_") for (16..31); # wipe ks and key copy
+ vstm ("%v16","%v17","16+$off($sp)");
+ vlm ("%v8","%v15","8($sp)") if ($z);
+
+ la ($sp,"$frame($sp)");
+&{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)");
+
+if (!$z) {
+ ld ("%f4","16*$SIZE_T+2*8($sp)");
+ ld ("%f6","16*$SIZE_T+3*8($sp)");
+ vzero ("%v$_") for (8..15);
+}
+ br ("%r14");
+ALIGN (16);
+LABEL (".Lvx_rem");
+ lhi ("%r0",64);
+
+ sr ($len,"%r0");
+ brc (2,".Lvx_rem_g64"); # cc==2?
+
+ lghi ("%r1",-$stdframe);
+
+ la ($counter,"48+$off($sp)"); # load updated iv
+ ar ($len,"%r0"); # restore len
+
+ lgr ("%r7",$counter);
+&{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)");
+ la ($sp,"0(%r1,$sp)");
+
+ bras ("%r14","_s390x_chacha_novx");
+
+ la ($sp,"$stdframe($sp)");
+&{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)");
+ lgr ($counter,"%r7");
+ j (".Lvx_done");
+
+ALIGN (16);
+LABEL (".Lvx_rem_g64");
+ vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
+ # state
+ vl ("%v31","16(%r7)");
+ vaf ("%v12","%v12","%v31"); # increment counter
+
+ vlr (@v[$_],"%v$_") for (0..15); # state = initial state
+
+ lhi ("%r6",10);
+ j (".Loop_vx_rem");
+
+ALIGN (16);
+LABEL (".Loop_vx_rem");
+ VX_ROUND( 0, 4, 8,12); # column round
+ VX_ROUND( 0, 5,10,15); # diagonal round
+ brct ("%r6",".Loop_vx_rem");
+
+ vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
+ # state (mod 2^32)
+ vlm ("%v6","%v7","32(%r7)"); # load vperm operands
+
+for (0..3) { # blocks 1,2
+ vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ brc (4,".Lvx_tail"); # cc==1? (mask 4)
+
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+ jz (".Lvx_done");
+
+for (0..3) { # blocks 3,4
+ vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+12),"%v0","%v1","%v6");
+ vperm ("%v".($_+8),"%v0","%v1","%v7");
+}
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ brc (4,".Lvx_tail"); # cc==1? (mask 4)
+
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+ jz (".Lvx_done");
+
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ vlr ("%v".($_+4),"%v$_") for (8..11);
+ j (".Lvx_tail");
+
+ALIGN (16);
+LABEL (".Lvx_tail");
+ ar ($len,"%r0"); # restore $len
+ ahi ($len,-1);
+
+ lhi ("%r0",16);
+for (0..2) {
+ vll ("%v0",$len,($_*16)."($inp)");
+ vx ("%v0","%v0","%v".($_+12));
+ vstl ("%v0",$len,($_*16)."($out)");
+ sr ($len,"%r0");
+ brc (4,".Lvx_done"); # cc==1? (mask 4)
+}
+ vll ("%v0",$len,"3*16($inp)");
+ vx ("%v0","%v0","%v15");
+ vstl ("%v0",$len,"3*16($out)");
+ j (".Lvx_done");
+SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
+}
+
+# NOVX CODE PATH
+{
+my $frame=$stdframe+4*20;
+
+TYPE ("_s390x_chacha_novx","\@function");
+ALIGN (32);
+LABEL ("_s390x_chacha_novx");
+&{$z? \&ltgr:\&ltr} ($len,$len); # $len==0?
+ bzr ("%r14");
+&{$z? \&aghi:\&ahi} ($len,-64);
+&{$z? \&lghi:\&lhi} ("%r1",-$frame);
+&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
+&{$z? \&slgr:\&slr} ($out,$inp); # difference
+ la ($len,"0($inp,$len)"); # end of input minus 64
+ larl ("%r7",".Lsigma");
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)");
+&{$z? \&stg:\&st} ("%r0","0($sp)");
+
+ lmg ("%r8","%r11","0($key)"); # load key
+ lmg ("%r12","%r13","0($counter)"); # load counter
+ lmg ("%r6","%r7","0(%r7)"); # load sigma constant
+
+ la ("%r14","0($inp)");
+&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)");
+&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)");
+ stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
+ srlg (@x[12],"%r12",32); # 32-bit counter value
+ j (".Loop_outer");
+
+ALIGN (16);
+LABEL (".Loop_outer");
+ lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
+ lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
+ lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
+ stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
+ lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
+ st (@x[12],"$stdframe+4*12($sp)"); # save counter
+&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
+ lhi ("%r14",10);
+ j (".Loop");
+
+ALIGN (4);
+LABEL (".Loop");
+ ROUND (0, 4, 8,12);
+ ROUND (0, 5,10,15);
+ brct ("%r14",".Loop");
+
+&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
+ stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
+&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
+
+ al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
+ al (@x[1],"$stdframe+4*1($sp)");
+ al (@x[2],"$stdframe+4*2($sp)");
+ al (@x[3],"$stdframe+4*3($sp)");
+ al (@x[4],"$stdframe+4*4($sp)");
+ al (@x[5],"$stdframe+4*5($sp)");
+ al (@x[6],"$stdframe+4*6($sp)");
+ al (@x[7],"$stdframe+4*7($sp)");
+ lrvr (@x[0],@x[0]);
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ lrvr (@x[4],@x[4]);
+ lrvr (@x[5],@x[5]);
+ lrvr (@x[6],@x[6]);
+ lrvr (@x[7],@x[7]);
+ al (@x[12],"$stdframe+4*12($sp)");
+ al (@x[13],"$stdframe+4*13($sp)");
+ al (@x[14],"$stdframe+4*14($sp)");
+ al (@x[15],"$stdframe+4*15($sp)");
+ lrvr (@x[12],@x[12]);
+ lrvr (@x[13],@x[13]);
+ lrvr (@x[14],@x[14]);
+ lrvr (@x[15],@x[15]);
+
+ la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
+&{$z? \&clgr:\&clr} ("%r14",@t[1]);
+ jh (".Ltail");
+
+ x (@x[0],"4*0(%r14)"); # xor with input
+ x (@x[1],"4*1(%r14)");
+ st (@x[0],"4*0(@t[0])"); # store output
+ x (@x[2],"4*2(%r14)");
+ st (@x[1],"4*1(@t[0])");
+ x (@x[3],"4*3(%r14)");
+ st (@x[2],"4*2(@t[0])");
+ x (@x[4],"4*4(%r14)");
+ st (@x[3],"4*3(@t[0])");
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
+ x (@x[5],"4*5(%r14)");
+ st (@x[4],"4*4(@t[0])");
+ x (@x[6],"4*6(%r14)");
+ al (@x[0],"$stdframe+4*8($sp)");
+ st (@x[5],"4*5(@t[0])");
+ x (@x[7],"4*7(%r14)");
+ al (@x[1],"$stdframe+4*9($sp)");
+ st (@x[6],"4*6(@t[0])");
+ x (@x[12],"4*12(%r14)");
+ al (@x[2],"$stdframe+4*10($sp)");
+ st (@x[7],"4*7(@t[0])");
+ x (@x[13],"4*13(%r14)");
+ al (@x[3],"$stdframe+4*11($sp)");
+ st (@x[12],"4*12(@t[0])");
+ x (@x[14],"4*14(%r14)");
+ st (@x[13],"4*13(@t[0])");
+ x (@x[15],"4*15(%r14)");
+ st (@x[14],"4*14(@t[0])");
+ lrvr (@x[0],@x[0]);
+ st (@x[15],"4*15(@t[0])");
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ lhi (@x[12],1);
+ x (@x[0],"4*8(%r14)");
+ al (@x[12],"$stdframe+4*12($sp)"); # increment counter
+ x (@x[1],"4*9(%r14)");
+ st (@x[0],"4*8(@t[0])");
+ x (@x[2],"4*10(%r14)");
+ st (@x[1],"4*9(@t[0])");
+ x (@x[3],"4*11(%r14)");
+ st (@x[2],"4*10(@t[0])");
+ st (@x[3],"4*11(@t[0])");
+
+&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
+ la ("%r14","64(%r14)");
+ jl (".Loop_outer");
+
+LABEL (".Ldone");
+ xgr ("%r0","%r0");
+ xgr ("%r1","%r1");
+ xgr ("%r2","%r2");
+ xgr ("%r3","%r3");
+ stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
+ stmg ("%r0","%r3","$stdframe+4*12($sp)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)");
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail");
+ la (@t[1],"64($t[1])");
+ stm (@x[0],@x[7],"$stdframe+4*0($sp)");
+&{$z? \&slgr:\&slr} (@t[1],"%r14");
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
+&{$z? \&lghi:\&lhi} (@x[6],0);
+ stm (@x[12],@x[15],"$stdframe+4*12($sp)");
+ al (@x[0],"$stdframe+4*8($sp)");
+ al (@x[1],"$stdframe+4*9($sp)");
+ al (@x[2],"$stdframe+4*10($sp)");
+ al (@x[3],"$stdframe+4*11($sp)");
+ lrvr (@x[0],@x[0]);
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ stm (@x[0],@x[3],"$stdframe+4*8($sp)");
+
+LABEL (".Loop_tail");
+ llgc (@x[4],"0(@x[6],%r14)");
+ llgc (@x[5],"$stdframe(@x[6],$sp)");
+ xr (@x[5],@x[4]);
+ stc (@x[5],"0(@x[6],@t[0])");
+ la (@x[6],"1(@x[6])");
+ brct (@t[1],".Loop_tail");
+
+ j (".Ldone");
+SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx");
+}
}
-close STDOUT or die "error closing STDOUT: $!";
+################
+
+ALIGN (64);
+LABEL (".Lsigma");
+LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
+LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment
+LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization
+LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization
+ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
+ALIGN (4);
+
+PERLASM_END();