openssl-1_1/0003-crypto-poly1305-asm-poly1305-s390x.pl-add-vx-code-pa.patch
Pedro Monreal Gonzalez 32ced036f1 Accepting request 786956 from home:vitezslav_cizek:branches:security:tls
- Update to 1.1.1e
  * Properly detect EOF while reading in libssl. Previously if we hit an EOF
    while reading in libssl then we would report an error back to the
    application (SSL_ERROR_SYSCALL) but errno would be 0. We now add
    an error to the stack (which means we instead return SSL_ERROR_SSL) and
    therefore give a hint as to what went wrong.
  * Check that ed25519 and ed448 are allowed by the security level. Previously
    signature algorithms not using an MD were not being checked that they were
    allowed by the security level.
  * Fixed SSL_get_servername() behaviour. The behaviour of SSL_get_servername()
    was not quite right. The behaviour was not consistent between resumption
    and normal handshakes, and also not quite consistent with historical
    behaviour. The behaviour in various scenarios has been clarified and
    it has been updated to make it match historical behaviour as closely as
    possible.
  * Corrected the documentation of the return values from the EVP_DigestSign*
    set of functions.  The documentation mentioned negative values for some
    errors, but this was never the case, so the mention of negative values
    was removed.
  * Added a new method to gather entropy on VMS, based on SYS$GET_ENTROPY.
    The presence of this system service is determined at run-time.
  * Added newline escaping functionality to a filename when using openssl dgst.
    This output format is to replicate the output format found in the '*sum'
    checksum programs. This aims to preserve backward compatibility.
  * Print all values for a PKCS#12 attribute with 'openssl pkcs12', not just
    the first value.
- Update bunch of patches as the internal crypto headers got reorganized
- drop openssl-1_1-CVE-2019-1551.patch (upstream)

- openssl dgst: default to SHA256 only when called without a digest,

OBS-URL: https://build.opensuse.org/request/show/786956
OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=65
2020-03-20 17:43:35 +00:00

1007 lines
25 KiB
Diff

From d6f4b0a8bfbe901c72294d8923eb5b6f54ca7732 Mon Sep 17 00:00:00 2001
From: Patrick Steuer <patrick.steuer@de.ibm.com>
Date: Mon, 6 Feb 2017 10:54:54 +0100
Subject: [PATCH] crypto/poly1305/asm/poly1305-s390x.pl: add vx code path.
Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>
Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/7991)
---
crypto/poly1305/asm/poly1305-s390x.pl | 944 +++++++++++++++++++++-----
1 file changed, 780 insertions(+), 164 deletions(-)
diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl
index 21ca86055e..390f9eefe7 100755
--- a/crypto/poly1305/asm/poly1305-s390x.pl
+++ b/crypto/poly1305/asm/poly1305-s390x.pl
@@ -24,204 +24,820 @@
#
# On side note, z13 enables vector base 2^26 implementation...
-$flavour = shift;
+#
+# January 2019
+#
+# Add vx code path (base 2^26).
+#
+# Copyright IBM Corp. 2019
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin/../..";
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL);
+
+my $flavour = shift;
+
+my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
+ $z=0; # S/390 ABI
$SIZE_T=4;
- $g="";
} else {
+ $z=1; # zSeries ABI
$SIZE_T=8;
- $g="g";
}
+my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-$sp="%r15";
+my $sp="%r15";
+
+# novx code path ctx layout
+# ---------------------------------
+# var value base off
+# ---------------------------------
+# u64 h[3] hash 2^64 0
+# u32 pad[2]
+# u64 r[2] key 2^64 32
+
+# vx code path ctx layout
+# ---------------------------------
+# var value base off
+# ---------------------------------
+# u32 acc1[5] r^2-acc 2^26 0
+# u32 pad
+# u32 acc2[5] r-acc 2^26 24
+# u32 pad
+# u32 r1[5] r 2^26 48
+# u32 r15[5] 5*r 2^26 68
+# u32 r2[5] r^2 2^26 88
+# u32 r25[5] 5*r^2 2^26 108
+# u32 r4[5] r^4 2^26 128
+# u32 r45[5] 5*r^4 2^26 148
+
+PERLASM_BEGIN($output);
+
+TEXT ();
+
+################
+# static void poly1305_init(void *ctx, const unsigned char key[16])
+{
+my ($ctx,$key)=map("%r$_",(2..3));
+my ($r0,$r1,$r2)=map("%r$_",(9,11,13));
-my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
+sub MUL_RKEY { # r*=key
+my ($d0hi,$d0lo,$d1hi,$d1lo)=map("%r$_",(4..7));
+my ($t0,$t1,$s1)=map("%r$_",(8,10,12));
+
+ lg ("%r0","32($ctx)");
+ lg ("%r1","40($ctx)");
+
+ srlg ($s1,"%r1",2);
+ algr ($s1,"%r1");
+
+ lgr ($d0lo,$r0);
+ lgr ($d1lo,$r1);
+
+ mlgr ($d0hi,"%r0");
+ lgr ($r1,$d1lo);
+ mlgr ($d1hi,$s1);
+
+ mlgr ($t0,"%r1");
+ mlgr ($t1,"%r0");
+
+ algr ($d0lo,$d1lo);
+ lgr ($d1lo,$r2);
+ alcgr ($d0hi,$d1hi);
+ lghi ($d1hi,0);
+
+ algr ($r1,$r0);
+ alcgr ($t1,$t0);
+
+ msgr ($d1lo,$s1);
+ msgr ($r2,"%r0");
+
+ algr ($r1,$d1lo);
+ alcgr ($t1,$d1hi);
+
+ algr ($r1,$d0hi);
+ alcgr ($r2,$t1);
+
+ lghi ($r0,-4);
+ ngr ($r0,$r2);
+ srlg ($t0,$r2,2);
+ algr ($r0,$t0);
+ lghi ($t1,3);
+ ngr ($r2,$t1);
+
+ algr ($r0,$d0lo);
+ alcgr ($r1,$d1hi);
+ alcgr ($r2,$d1hi);
+}
+
+sub ST_R5R { # store r,5*r -> base 2^26
+my @d=map("%r$_",(4..8));
+my @off=@_;
+
+ lgr (@d[2],$r0);
+ lr ("%r1",@d[2]);
+ nilh ("%r1",1023);
+ lgr (@d[3],$r1);
+ lr (@d[0],"%r1");
+ srlg ("%r1",@d[2],52);
+ lgr (@d[4],$r2);
+ srlg ("%r0",@d[2],26);
+ sll (@d[4],24);
+ lr (@d[2],@d[3]);
+ nilh ("%r0",1023);
+ sll (@d[2],12);
+ lr (@d[1],"%r0");
+ &or (@d[2],"%r1");
+ srlg ("%r1",@d[3],40);
+ nilh (@d[2],1023);
+ &or (@d[4],"%r1");
+ srlg (@d[3],@d[3],14);
+ nilh (@d[4],1023);
+ nilh (@d[3],1023);
+
+ stm (@d[0],@d[4],"@off[0]($ctx)");
+ mhi (@d[$_],5) for (0..4);
+ stm (@d[0],@d[4],"@off[1]($ctx)");
+}
-$code.=<<___;
-.text
-
-.globl poly1305_init
-.type poly1305_init,\@function
-.align 16
-poly1305_init:
- lghi %r0,0
- lghi %r1,-1
- stg %r0,0($ctx) # zero hash value
- stg %r0,8($ctx)
- stg %r0,16($ctx)
-
- cl${g}r $inp,%r0
- je .Lno_key
-
- lrvg %r4,0($inp) # load little-endian key
- lrvg %r5,8($inp)
-
- nihl %r1,0xffc0 # 0xffffffc0ffffffff
- srlg %r0,%r1,4 # 0x0ffffffc0fffffff
- srlg %r1,%r1,4
- nill %r1,0xfffc # 0x0ffffffc0ffffffc
-
- ngr %r4,%r0
- ngr %r5,%r1
-
- stg %r4,32($ctx)
- stg %r5,40($ctx)
-
-.Lno_key:
- lghi %r2,0
- br %r14
-.size poly1305_init,.-poly1305_init
-___
+GLOBL ("poly1305_init");
+TYPE ("poly1305_init","\@function");
+ALIGN (16);
+LABEL ("poly1305_init");
+ lghi ("%r0",0);
+ lghi ("%r1",-1);
+ stg ("%r0","0($ctx)"); # zero hash value / acc1
+ stg ("%r0","8($ctx)");
+ stg ("%r0","16($ctx)");
+
+&{$z? \&clgr:\&clr} ($key,"%r0");
+ je (".Ldone");
+
+ lrvg ("%r4","0($key)"); # load little-endian key
+ lrvg ("%r5","8($key)");
+
+ nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
+ srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
+ srlg ("%r1","%r1",4);
+ nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
+
+ ngr ("%r4","%r0");
+ ngr ("%r5","%r1");
+
+ stg ("%r4","32($ctx)");
+ stg ("%r5","40($ctx)");
+
+ larl ("%r1","OPENSSL_s390xcap_P");
+ lg ("%r0","16(%r1)");
+ tmhh ("%r0",0x4000); # check for vector facility
+ jz (".Ldone");
+
+ larl ("%r4","poly1305_blocks_vx");
+ larl ("%r5","poly1305_emit_vx");
+
+&{$z? \&stmg:\&stm} ("%r6","%r13","6*$SIZE_T($sp)");
+&{$z? \&stmg:\&stm} ("%r4","%r5","4*$z+228($ctx)");
+
+ lg ($r0,"32($ctx)");
+ lg ($r1,"40($ctx)");
+ lghi ($r2,0);
+
+ ST_R5R (48,68); # store r,5*r
+
+ MUL_RKEY();
+ ST_R5R (88,108); # store r^2,5*r^2
+
+ MUL_RKEY();
+ MUL_RKEY();
+ ST_R5R (128,148); # store r^4,5*r^4
+
+ lghi ("%r0",0);
+ stg ("%r0","24($ctx)"); # zero acc2
+ stg ("%r0","32($ctx)");
+ stg ("%r0","40($ctx)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r13","6*$SIZE_T($sp)");
+ lghi ("%r2",1);
+ br ("%r14");
+
+LABEL (".Ldone");
+ lghi ("%r2",0);
+ br ("%r14");
+SIZE ("poly1305_init",".-poly1305_init");
+}
+
+# VX CODE PATH
{
-my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
-my ($r0,$r1,$s1) = map("%r$_",(0..2));
+my $frame=8*16;
+my @m01=map("%v$_",(0..4));
+my @m23=map("%v$_",(5..9));
+my @tmp=@m23;
+my @acc=map("%v$_",(10..14));
+my @r=map("%v$_",(15..19));
+my @r5=map("%v$_",(20..24));
+my $padvec="%v26";
+my $mask4="%v27";
+my @vperm=map("%v$_",(28..30));
+my $mask="%v31";
+
+sub REDUCE {
+ vesrlg (@tmp[0],@acc[0],26);
+ vesrlg (@tmp[3],@acc[3],26);
+ vn (@acc[0],@acc[0],$mask);
+ vn (@acc[3],@acc[3],$mask);
+ vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
+ vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
+
+ vesrlg (@tmp[1],@acc[1],26);
+ vesrlg (@tmp[4],@acc[4],26);
+ vn (@acc[1],@acc[1],$mask);
+ vn (@acc[4],@acc[4],$mask);
+ veslg (@tmp[0],@tmp[4],2);
+ vag (@tmp[4],@tmp[4],@tmp[0]); # h[4]*=5
+ vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
+ vag (@acc[0],@acc[0],@tmp[4]); # carry 4->0
+
+ vesrlg (@tmp[2],@acc[2],26);
+ vesrlg (@tmp[0],@acc[0],26);
+ vn (@acc[2],@acc[2],$mask);
+ vn (@acc[0],@acc[0],$mask);
+ vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
+ vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
+
+ vesrlg (@tmp[3],@acc[3],26);
+ vn (@acc[3],@acc[3],$mask);
+ vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
+}
-$code.=<<___;
-.globl poly1305_blocks
-.type poly1305_blocks,\@function
-.align 16
-poly1305_blocks:
- srl${g} $len,4 # fixed-up in 64-bit build
- lghi %r0,0
- cl${g}r $len,%r0
- je .Lno_data
+################
+# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
+# size_t len, u32 padbit)
+{
+my ($ctx,$inp,$len) = map("%r$_",(2..4));
+my $padbit="%r0";
+
+GLOBL ("poly1305_blocks_vx");
+TYPE ("poly1305_blocks_vx","\@function");
+ALIGN (16);
+LABEL ("poly1305_blocks_vx");
+if ($z) {
+ aghi ($sp,-$frame);
+ vstm ("%v8","%v15","0($sp)");
+} else {
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+ llgfr ($len,$len);
+}
+ llgfr ($padbit,"%r5");
+ vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1
+ larl ("%r5",".Lconst");
+ vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2
+ sllg ($padbit,$padbit,24);
+ vlm (@vperm[0],$mask,"0(%r5)"); # load vperm ops, mask
+ vgbm ($mask4,0x0707);
+ vlvgp ($padvec,$padbit,$padbit);
+
+ srlg ("%r1",$len,6);
+ ltgr ("%r1","%r1");
+ jz (".Lvx_4x_done");
+
+ALIGN (16);
+LABEL (".Lvx_4x");
+ vlm ("%v20","%v23","0($inp)"); # load m0,m1,m2,m3
+
+ # m01,m23 -> base 2^26
+
+ vperm (@m01[0],"%v20","%v21",@vperm[0]);
+ vperm (@m23[0],"%v22","%v23",@vperm[0]);
+ vperm (@m01[2],"%v20","%v21",@vperm[1]);
+ vperm (@m23[2],"%v22","%v23",@vperm[1]);
+ vperm (@m01[4],"%v20","%v21",@vperm[2]);
+ vperm (@m23[4],"%v22","%v23",@vperm[2]);
+
+ vesrlg (@m01[1],@m01[0],26);
+ vesrlg (@m23[1],@m23[0],26);
+ vesrlg (@m01[3],@m01[2],30);
+ vesrlg (@m23[3],@m23[2],30);
+ vesrlg (@m01[2],@m01[2],4);
+ vesrlg (@m23[2],@m23[2],4);
+
+ vn (@m01[4],@m01[4],$mask4);
+ vn (@m23[4],@m23[4],$mask4);
+for (0..3) {
+ vn (@m01[$_],@m01[$_],$mask);
+ vn (@m23[$_],@m23[$_],$mask);
+}
+ vaf (@m01[4],@m01[4],$padvec); # pad m01
+ vaf (@m23[4],@m23[4],$padvec); # pad m23
+
+ # acc = acc * r^4 + m01 * r^2 + m23
+
+ vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
+ vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
+
+ vmalof (@tmp[0],@m01[4],@r5[1],@m23[0]);
+ vmalof (@tmp[1],@m01[4],@r5[2],@m23[1]);
+ vmalof (@tmp[2],@m01[4],@r5[3],@m23[2]);
+ vmalof (@tmp[3],@m01[4],@r5[4],@m23[3]);
+ vmalof (@tmp[4],@m01[4],@r[0],@m23[4]);
+
+ vmalof (@tmp[0],@m01[3],@r5[2],@tmp[0]);
+ vmalof (@tmp[1],@m01[3],@r5[3],@tmp[1]);
+ vmalof (@tmp[2],@m01[3],@r5[4],@tmp[2]);
+ vmalof (@tmp[3],@m01[3],@r[0],@tmp[3]);
+ vmalof (@tmp[4],@m01[3],@r[1],@tmp[4]);
+
+ vmalof (@tmp[0],@m01[2],@r5[3],@tmp[0]);
+ vmalof (@tmp[1],@m01[2],@r5[4],@tmp[1]);
+ vmalof (@tmp[2],@m01[2],@r[0],@tmp[2]);
+ vmalof (@tmp[3],@m01[2],@r[1],@tmp[3]);
+ vmalof (@tmp[4],@m01[2],@r[2],@tmp[4]);
+
+ vmalof (@tmp[0],@m01[1],@r5[4],@tmp[0]);
+ vmalof (@tmp[1],@m01[1],@r[0],@tmp[1]);
+ vmalof (@tmp[2],@m01[1],@r[1],@tmp[2]);
+ vmalof (@tmp[3],@m01[1],@r[2],@tmp[3]);
+ vmalof (@tmp[4],@m01[1],@r[3],@tmp[4]);
+
+ vmalof (@tmp[0],@m01[0],@r[0],@tmp[0]);
+ vmalof (@tmp[1],@m01[0],@r[1],@tmp[1]);
+ vmalof (@tmp[2],@m01[0],@r[2],@tmp[2]);
+ vmalof (@tmp[3],@m01[0],@r[3],@tmp[3]);
+ vmalof (@tmp[4],@m01[0],@r[4],@tmp[4]);
+
+ vlrepf (@r5[$_],"4*$_+148($ctx)") for (0..4); # load 5*r^4
+ vlrepf (@r[$_],"4*$_+128($ctx)") for (0..4); # load r^4
+
+ vmalof (@tmp[0],@acc[4],@r5[1],@tmp[0]);
+ vmalof (@tmp[1],@acc[4],@r5[2],@tmp[1]);
+ vmalof (@tmp[2],@acc[4],@r5[3],@tmp[2]);
+ vmalof (@tmp[3],@acc[4],@r5[4],@tmp[3]);
+ vmalof (@tmp[4],@acc[4],@r[0],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
+
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
+
+ REDUCE ();
+
+ la ($inp,"64($inp)");
+ brctg ("%r1",".Lvx_4x");
+
+ALIGN (16);
+LABEL (".Lvx_4x_done");
+ tml ($len,32);
+ jz (".Lvx_2x_done");
+
+ vlm ("%v20","%v21","0($inp)"); # load m0,m1
+
+ # m01 -> base 2^26
+
+ vperm (@m01[0],"%v20","%v21",@vperm[0]);
+ vperm (@m01[2],"%v20","%v21",@vperm[1]);
+ vperm (@m01[4],"%v20","%v21",@vperm[2]);
+
+ vesrlg (@m01[1],@m01[0],26);
+ vesrlg (@m01[3],@m01[2],30);
+ vesrlg (@m01[2],@m01[2],4);
+
+ vn (@m01[4],@m01[4],$mask4);
+ vn (@m01[$_],@m01[$_],$mask) for (0..3);
+
+ vaf (@m01[4],@m01[4],$padvec); # pad m01
+
+ # acc = acc * r^2+ m01
+
+ vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
+ vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
+
+ vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
+ vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
+ vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
+ vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
+ vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
+
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
+
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
+
+ REDUCE ();
+
+ la ($inp,"32($inp)");
+
+ALIGN (16);
+LABEL (".Lvx_2x_done");
+ tml ($len,16);
+ jz (".Lvx_done");
+
+ vleig ($padvec,0,0);
+
+ vzero ("%v20");
+ vl ("%v21","0($inp)"); # load m0
+
+ # m0 -> base 2^26
+
+ vperm (@m01[0],"%v20","%v21",@vperm[0]);
+ vperm (@m01[2],"%v20","%v21",@vperm[1]);
+ vperm (@m01[4],"%v20","%v21",@vperm[2]);
+
+ vesrlg (@m01[1],@m01[0],26);
+ vesrlg (@m01[3],@m01[2],30);
+ vesrlg (@m01[2],@m01[2],4);
+
+ vn (@m01[4],@m01[4],$mask4);
+ vn (@m01[$_],@m01[$_],$mask) for (0..3);
+
+ vaf (@m01[4],@m01[4],$padvec); # pad m0
+
+ # acc = acc * r + m01
+
+ vlrepf (@r5[$_],"4*$_+68($ctx)") for (0..4); # load 5*r
+ vlrepf (@r[$_],"4*$_+48($ctx)") for (0..4); # load r
+
+ vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
+ vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
+ vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
+ vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
+ vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
+
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
+
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
+
+ REDUCE ();
+
+ALIGN (16);
+LABEL (".Lvx_done");
+ vstef (@acc[$_],"4*$_($ctx)",1) for (0..4); # store acc
+ vstef (@acc[$_],"24+4*$_($ctx)",3) for (0..4);
+
+if ($z) {
+ vlm ("%v8","%v15","0($sp)");
+ la ($sp,"$frame($sp)");
+} else {
+ ld ("%f4","16*$SIZE_T+2*8($sp)");
+ ld ("%f6","16*$SIZE_T+3*8($sp)");
+}
+ br ("%r14");
+SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
+}
- stm${g} %r6,%r14,`6*$SIZE_T`($sp)
+################
+# static void poly1305_emit_vx(void *ctx, unsigned char mac[16],
+# const u32 nonce[4])
+{
+my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
+
+GLOBL ("poly1305_emit_vx");
+TYPE ("poly1305_emit_vx","\@function");
+ALIGN (16);
+LABEL ("poly1305_emit_vx");
+if ($z) {
+ aghi ($sp,-$frame);
+ vstm ("%v8","%v15","0($sp)");
+} else {
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+ larl ("%r5",".Lconst");
- llgfr $padbit,$padbit # clear upper half, much needed with
+ vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1
+ vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2
+ vlef (@r5[$_],"108+4*$_($ctx)",1) for (0..4); # load 5*r^2
+ vlef (@r[$_],"88+4*$_($ctx)",1) for (0..4); # load r^2
+ vlef (@r5[$_],"68+4*$_($ctx)",3) for (0..4); # load 5*r
+ vlef (@r[$_],"48+4*$_($ctx)",3) for (0..4); # load r
+ vl ($mask,"48(%r5)"); # load mask
+
+ # acc = acc1 * r^2 + acc2 * r
+
+ vmlof (@tmp[0],@acc[4],@r5[1]);
+ vmlof (@tmp[1],@acc[4],@r5[2]);
+ vmlof (@tmp[2],@acc[4],@r5[3]);
+ vmlof (@tmp[3],@acc[4],@r5[4]);
+ vmlof (@tmp[4],@acc[4],@r[0]);
+
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
+
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
+
+ vzero ("%v27");
+ vsumqg (@acc[$_],@acc[$_],"%v27") for (0..4);
+
+ REDUCE ();
+
+ vesrlg (@tmp[1],@acc[1],26);
+ vn (@acc[1],@acc[1],$mask);
+ vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
+
+ vesrlg (@tmp[2],@acc[2],26);
+ vn (@acc[2],@acc[2],$mask);
+ vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
+
+ vesrlg (@tmp[3],@acc[3],26);
+ vn (@acc[3],@acc[3],$mask);
+ vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
+
+ # acc -> base 2^64
+ vleib ("%v30",6*8,7);
+ vleib ("%v29",13*8,7);
+ vleib ("%v28",3*8,7);
+
+ veslg (@acc[1],@acc[1],26);
+ veslg (@acc[3],@acc[3],26);
+ vo (@acc[0],@acc[0],@acc[1]);
+ vo (@acc[2],@acc[2],@acc[3]);
+
+ veslg (@acc[2],@acc[2],4);
+ vslb (@acc[2],@acc[2],"%v30"); # <<52
+ vo (@acc[0],@acc[0],@acc[2]);
+
+ vslb (@tmp[4],@acc[4],"%v29"); # <<104
+ vo (@acc[0],@acc[0],@tmp[4]);
+
+ vsrlb (@acc[1],@acc[4],"%v28"); # >>24
+
+ # acc %= 2^130-5
+ vone ("%v26");
+ vleig ("%v27",5,1);
+ vone ("%v29");
+ vleig ("%v26",-4,1);
+
+ vaq (@tmp[0],@acc[0],"%v27");
+ vaccq (@tmp[1],@acc[0],"%v27");
+
+ vaq (@tmp[1],@tmp[1],"%v26");
+ vaccq (@tmp[1],@tmp[1],@acc[1]);
+
+ vaq (@tmp[1],@tmp[1],"%v29");
+
+ vn (@tmp[2],@tmp[1],@acc[0]);
+ vnc (@tmp[3],@tmp[0],@tmp[1]);
+ vo (@acc[0],@tmp[2],@tmp[3]);
+
+ # acc += nonce
+ vl (@vperm[0],"64(%r5)");
+ vlef (@tmp[0],"4*$_($nonce)",3-$_) for (0..3);
+
+ vaq (@acc[0],@acc[0],@tmp[0]);
+
+ vperm (@acc[0],@acc[0],@acc[0],@vperm[0]);
+ vst (@acc[0],"0($mac)"); # store mac
+
+if ($z) {
+ vlm ("%v8","%v15","0($sp)");
+ la ($sp,"$frame($sp)");
+} else {
+ ld ("%f4","16*$SIZE_T+2*8($sp)");
+ ld ("%f6","16*$SIZE_T+3*8($sp)");
+}
+ br ("%r14");
+SIZE ("poly1305_emit_vx",".-poly1305_emit_vx");
+}
+}
+
+# NOVX CODE PATH
+{
+################
+# static void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
+# u32 padbit)
+{
+my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
+
+my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
+my ($r0,$r1,$s1) = map("%r$_",(0..2));
+GLOBL ("poly1305_blocks");
+TYPE ("poly1305_blocks","\@function");
+ALIGN (16);
+LABEL ("poly1305_blocks");
+$z? srlg ($len,$len,4) :srl ($len,4);
+ lghi ("%r0",0);
+&{$z? \&clgr:\&clr} ($len,"%r0");
+ je (".Lno_data");
+
+&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
+
+ llgfr ($padbit,$padbit); # clear upper half, much needed with
# non-64-bit ABI
- lg $r0,32($ctx) # load key
- lg $r1,40($ctx)
+ lg ($r0,"32($ctx)"); # load key
+ lg ($r1,"40($ctx)");
- lg $h0,0($ctx) # load hash value
- lg $h1,8($ctx)
- lg $h2,16($ctx)
+ lg ($h0,"0($ctx)"); # load hash value
+ lg ($h1,"8($ctx)");
+ lg ($h2,"16($ctx)");
- st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx
- srlg $s1,$r1,2
- algr $s1,$r1 # s1 = r1 + r1>>2
- j .Loop
+&{$z? \&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx
+ srlg ($s1,$r1,2);
+ algr ($s1,$r1); # s1 = r1 + r1>>2
+ j (".Loop");
-.align 16
-.Loop:
- lrvg $d0lo,0($inp) # load little-endian input
- lrvg $d1lo,8($inp)
- la $inp,16($inp)
+ALIGN (16);
+LABEL (".Loop");
+ lrvg ($d0lo,"0($inp)"); # load little-endian input
+ lrvg ($d1lo,"8($inp)");
+ la ($inp,"16($inp)");
- algr $d0lo,$h0 # accumulate input
- alcgr $d1lo,$h1
+ algr ($d0lo,$h0); # accumulate input
+ alcgr ($d1lo,$h1);
- lgr $h0,$d0lo
- mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo
- lgr $h1,$d1lo
- mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo
+ lgr ($h0,$d0lo);
+ mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo
+ lgr ($h1,$d1lo);
+ mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo
- mlgr $t0,$r1 # h0*r1 -> $t0:$h0
- mlgr $t1,$r0 # h1*r0 -> $t1:$h1
- alcgr $h2,$padbit
+ mlgr ($t0,$r1); # h0*r1 -> $t0:$h0
+ mlgr ($t1,$r0); # h1*r0 -> $t1:$h1
+ alcgr ($h2,$padbit);
- algr $d0lo,$d1lo
- lgr $d1lo,$h2
- alcgr $d0hi,$d1hi
- lghi $d1hi,0
+ algr ($d0lo,$d1lo);
+ lgr ($d1lo,$h2);
+ alcgr ($d0hi,$d1hi);
+ lghi ($d1hi,0);
- algr $h1,$h0
- alcgr $t1,$t0
+ algr ($h1,$h0);
+ alcgr ($t1,$t0);
- msgr $d1lo,$s1 # h2*s1
- msgr $h2,$r0 # h2*r0
+ msgr ($d1lo,$s1); # h2*s1
+ msgr ($h2,$r0); # h2*r0
- algr $h1,$d1lo
- alcgr $t1,$d1hi # $d1hi is zero
+ algr ($h1,$d1lo);
+ alcgr ($t1,$d1hi); # $d1hi is zero
- algr $h1,$d0hi
- alcgr $h2,$t1
+ algr ($h1,$d0hi);
+ alcgr ($h2,$t1);
- lghi $h0,-4 # final reduction step
- ngr $h0,$h2
- srlg $t0,$h2,2
- algr $h0,$t0
- lghi $t1,3
- ngr $h2,$t1
+ lghi ($h0,-4); # final reduction step
+ ngr ($h0,$h2);
+ srlg ($t0,$h2,2);
+ algr ($h0,$t0);
+ lghi ($t1,3);
+ ngr ($h2,$t1);
- algr $h0,$d0lo
- alcgr $h1,$d1hi # $d1hi is still zero
- alcgr $h2,$d1hi # $d1hi is still zero
+ algr ($h0,$d0lo);
+ alcgr ($h1,$d1hi); # $d1hi is still zero
+ alcgr ($h2,$d1hi); # $d1hi is still zero
- brct$g $len,.Loop
+&{$z? \&brctg:\&brct} ($len,".Loop");
- l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx
+&{$z? \&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx
- stg $h0,0($ctx) # store hash value
- stg $h1,8($ctx)
- stg $h2,16($ctx)
+ stg ($h0,"0($ctx)"); # store hash value
+ stg ($h1,"8($ctx)");
+ stg ($h2,"16($ctx)");
- lm${g} %r6,%r14,`6*$SIZE_T`($sp)
-.Lno_data:
- br %r14
-.size poly1305_blocks,.-poly1305_blocks
-___
+&{$z? \&lmg:\&lm} ("%r6","%r14","6*$SIZE_T($sp)");
+LABEL (".Lno_data");
+ br ("%r14");
+SIZE ("poly1305_blocks",".-poly1305_blocks");
}
+
+################
+# static void poly1305_emit(void *ctx, unsigned char mac[16],
+# const u32 nonce[4])
{
-my ($mac,$nonce)=($inp,$len);
+my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
-$code.=<<___;
-.globl poly1305_emit
-.type poly1305_emit,\@function
-.align 16
-poly1305_emit:
- stm${g} %r6,%r9,`6*$SIZE_T`($sp)
-
- lg $h0,0($ctx)
- lg $h1,8($ctx)
- lg $h2,16($ctx)
-
- lghi %r0,5
- lghi %r1,0
- lgr $d0,$h0
- lgr $d1,$h1
-
- algr $h0,%r0 # compare to modulus
- alcgr $h1,%r1
- alcgr $h2,%r1
-
- srlg $h2,$h2,2 # did it borrow/carry?
- slgr %r1,$h2 # 0-$h2>>2
- lg $h2,0($nonce) # load nonce
- lghi %r0,-1
- lg $ctx,8($nonce)
- xgr %r0,%r1 # ~%r1
-
- ngr $h0,%r1
- ngr $d0,%r0
- ngr $h1,%r1
- ngr $d1,%r0
- ogr $h0,$d0
- rllg $d0,$h2,32 # flip nonce words
- ogr $h1,$d1
- rllg $d1,$ctx,32
-
- algr $h0,$d0 # accumulate nonce
- alcgr $h1,$d1
-
- strvg $h0,0($mac) # write little-endian result
- strvg $h1,8($mac)
-
- lm${g} %r6,%r9,`6*$SIZE_T`($sp)
- br %r14
-.size poly1305_emit,.-poly1305_emit
-
-.string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-___
-}
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm;
-
-print $code;
-close STDOUT or die "error closing STDOUT: $!";
+GLOBL ("poly1305_emit");
+TYPE ("poly1305_emit","\@function");
+ALIGN (16);
+LABEL ("poly1305_emit");
+&{$z? \&stmg:\&stm} ("%r6","%r9","6*$SIZE_T($sp)");
+
+ lg ($h0,"0($ctx)");
+ lg ($h1,"8($ctx)");
+ lg ($h2,"16($ctx)");
+
+ lghi ("%r0",5);
+ lghi ("%r1",0);
+ lgr ($d0,$h0);
+ lgr ($d1,$h1);
+
+ algr ($h0,"%r0"); # compare to modulus
+ alcgr ($h1,"%r1");
+ alcgr ($h2,"%r1");
+
+ srlg ($h2,$h2,2); # did it borrow/carry?
+ slgr ("%r1",$h2); # 0-$h2>>2
+ lg ($h2,"0($nonce)"); # load nonce
+ lghi ("%r0",-1);
+ lg ($ctx,"8($nonce)");
+ xgr ("%r0","%r1"); # ~%r1
+
+ ngr ($h0,"%r1");
+ ngr ($d0,"%r0");
+ ngr ($h1,"%r1");
+ ngr ($d1,"%r0");
+ ogr ($h0,$d0);
+ rllg ($d0,$h2,32); # flip nonce words
+ ogr ($h1,$d1);
+ rllg ($d1,$ctx,32);
+
+ algr ($h0,$d0); # accumulate nonce
+ alcgr ($h1,$d1);
+
+ strvg ($h0,"0($mac)"); # write little-endian result
+ strvg ($h1,"8($mac)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r9","6*$SIZE_T($sp)");
+ br ("%r14");
+SIZE ("poly1305_emit",".-poly1305_emit");
+}
+}
+################
+
+ALIGN (128);
+LABEL (".Lconst");
+LONG (0x00060504,0x03020100,0x00161514,0x13121110); # vperm op[m[1],m[0]]
+LONG (0x000c0b0a,0x09080706,0x001c1b1a,0x19181716); # vperm op[m[3],m[2]]
+LONG (0x00000000,0x000f0e0d,0x00000000,0x001f1e1d); # vperm op[ - ,m[4]]
+LONG (0x00000000,0x03ffffff,0x00000000,0x03ffffff); # [0,2^26-1,0,2^26-1]
+LONG (0x0f0e0d0c,0x0b0a0908,0x07060504,0x03020100); # vperm op endian
+STRING ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
+
+PERLASM_END();
--
2.21.0