forked from pool/openssl-1_1
openssl-1_1/0006-s390x-assembly-pack-import-poly-from-cryptogams-repo.patch
Tomáš Chvátal (commit 949eaaafb4): Accepting request 708112 from home:vitezslav_cizek:branches:factory
- Use upstream patch for the locale crash (bsc#1135550)
- delete openssl-fix_underflow_in_errstr_handling.patch
- add 0001-build_SYS_str_reasons-Fix-a-crash-caused-by-overlong.patch

- Add s390x vectorized support for ChaCha20 and Poly1305
  (jsc#SLE-6126, jsc#SLE-6129)
  * 0001-s390x-assembly-pack-perlasm-support.patch
  * 0002-crypto-chacha-asm-chacha-s390x.pl-add-vx-code-path.patch
  * 0003-crypto-poly1305-asm-poly1305-s390x.pl-add-vx-code-pa.patch
  * 0004-s390x-assembly-pack-fix-formal-interface-bug-in-chac.patch
  * 0005-s390x-assembly-pack-import-chacha-from-cryptogams-re.patch
  * 0006-s390x-assembly-pack-import-poly-from-cryptogams-repo.patch
- Update to 1.1.1c (bsc#1133925, jsc#SLE-6430)
- drop upstreamed patches:
- update keyring by including Richard Levitte's key

OBS-URL: https://build.opensuse.org/request/show/708112
OBS-URL: https://build.opensuse.org/package/show/security:tls/openssl-1_1?expand=0&rev=38
2019-06-06 11:11:21 +00:00

From 2e6b615f795e8ca8ae830a00079c4ea064eaae42 Mon Sep 17 00:00:00 2001
From: Patrick Steuer <patrick.steuer@de.ibm.com>
Date: Sat, 23 Mar 2019 00:03:24 +0100
Subject: [PATCH] s390x assembly pack: import poly from cryptogams repo

>=20% faster than present code.

Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>
Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8560)
---
crypto/poly1305/asm/poly1305-s390x.pl | 1455 ++++++++++++++-----------
crypto/poly1305/build.info | 1 +
2 files changed, 799 insertions(+), 657 deletions(-)
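
The imported code avoids branches when choosing code paths: poly1305_init selects between the scalar .Lpoly1305_blocks and the vector .Lpoly1305_blocks_vx entry points with an xor/and/xor mask built from the vector-facility bit (lcgr turns the 0/1 bit into an all-zero or all-one mask), and poly1305_blocks_vx and poly1305_emit reuse the same pattern to "choose between radixes", i.e. between the base 2^26 and base 2^64 forms of the hash. A minimal Perl sketch of that select idiom, not part of the patch; select_ct and the sample values are illustrative only:

    # Illustrative only: branchless select, out = bit ? b : a.
    # The assembly builds the mask with lcgr (0 -> 0, 1 -> -1) and then
    # applies xgr/ngr/xgr; this mimics that sequence.
    use strict;
    use warnings;

    sub select_ct {
        my ($a, $b, $bit) = @_;                 # $bit is 0 or 1
        my $mask = (0 - $bit) & 0xffffffff;     # 0x0 or 0xffffffff, like lcgr
        return $a ^ (($a ^ $b) & $mask);
    }

    printf("%#x\n", select_ct(0x1000, 0x2000, 0));  # 0x1000, first operand
    printf("%#x\n", select_ct(0x1000, 0x2000, 1));  # 0x2000, second operand

With mask = 0 the xor pair cancels and the first operand passes through; with mask = all ones it swaps in the second operand.
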
Index: openssl-1.1.1c/crypto/poly1305/asm/poly1305-s390x.pl
===================================================================
--- openssl-1.1.1c.orig/crypto/poly1305/asm/poly1305-s390x.pl 2019-06-06 12:18:53.384309579 +0200
+++ openssl-1.1.1c/crypto/poly1305/asm/poly1305-s390x.pl 2019-06-06 12:18:54.556316994 +0200
@@ -32,10 +32,20 @@
# Copyright IBM Corp. 2019
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+#
+# January 2019
+#
+# Add vector base 2^26 implementation. It's problematic to accurately
+# measure performance, because reference system is hardly idle. But
+# it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's
+# >=20% faster than IBM's submission on long inputs, and much faster on
+# short ones, because calculation of key powers is postponed till we
+# know that input is long enough to justify the additional overhead.
+
use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
-use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL);
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
my $flavour = shift;
@@ -51,666 +61,98 @@ if ($flavour =~ /3[12]/) {
my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+my $stdframe=16*$SIZE_T+4*8;
my $sp="%r15";
-# novx code path ctx layout
-# ---------------------------------
-# var value base off
-# ---------------------------------
-# u64 h[3] hash 2^64 0
-# u32 pad[2]
-# u64 r[2] key 2^64 32
-
-# vx code path ctx layout
-# ---------------------------------
-# var value base off
-# ---------------------------------
-# u32 acc1[5] r^2-acc 2^26 0
-# u32 pad
-# u32 acc2[5] r-acc 2^26 24
-# u32 pad
-# u32 r1[5] r 2^26 48
-# u32 r15[5] 5*r 2^26 68
-# u32 r2[5] r^2 2^26 88
-# u32 r25[5] 5*r^2 2^26 108
-# u32 r4[5] r^4 2^26 128
-# u32 r45[5] 5*r^4 2^26 148
+my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
PERLASM_BEGIN($output);
+INCLUDE ("s390x_arch.h");
TEXT ();
################
# static void poly1305_init(void *ctx, const unsigned char key[16])
{
-my ($ctx,$key)=map("%r$_",(2..3));
-my ($r0,$r1,$r2)=map("%r$_",(9,11,13));
-
-sub MUL_RKEY { # r*=key
-my ($d0hi,$d0lo,$d1hi,$d1lo)=map("%r$_",(4..7));
-my ($t0,$t1,$s1)=map("%r$_",(8,10,12));
-
- lg ("%r0","32($ctx)");
- lg ("%r1","40($ctx)");
-
- srlg ($s1,"%r1",2);
- algr ($s1,"%r1");
-
- lgr ($d0lo,$r0);
- lgr ($d1lo,$r1);
-
- mlgr ($d0hi,"%r0");
- lgr ($r1,$d1lo);
- mlgr ($d1hi,$s1);
-
- mlgr ($t0,"%r1");
- mlgr ($t1,"%r0");
-
- algr ($d0lo,$d1lo);
- lgr ($d1lo,$r2);
- alcgr ($d0hi,$d1hi);
- lghi ($d1hi,0);
-
- algr ($r1,$r0);
- alcgr ($t1,$t0);
-
- msgr ($d1lo,$s1);
- msgr ($r2,"%r0");
-
- algr ($r1,$d1lo);
- alcgr ($t1,$d1hi);
-
- algr ($r1,$d0hi);
- alcgr ($r2,$t1);
-
- lghi ($r0,-4);
- ngr ($r0,$r2);
- srlg ($t0,$r2,2);
- algr ($r0,$t0);
- lghi ($t1,3);
- ngr ($r2,$t1);
-
- algr ($r0,$d0lo);
- alcgr ($r1,$d1hi);
- alcgr ($r2,$d1hi);
-}
-
-sub ST_R5R { # store r,5*r -> base 2^26
-my @d=map("%r$_",(4..8));
-my @off=@_;
-
- lgr (@d[2],$r0);
- lr ("%r1",@d[2]);
- nilh ("%r1",1023);
- lgr (@d[3],$r1);
- lr (@d[0],"%r1");
- srlg ("%r1",@d[2],52);
- lgr (@d[4],$r2);
- srlg ("%r0",@d[2],26);
- sll (@d[4],24);
- lr (@d[2],@d[3]);
- nilh ("%r0",1023);
- sll (@d[2],12);
- lr (@d[1],"%r0");
- &or (@d[2],"%r1");
- srlg ("%r1",@d[3],40);
- nilh (@d[2],1023);
- &or (@d[4],"%r1");
- srlg (@d[3],@d[3],14);
- nilh (@d[4],1023);
- nilh (@d[3],1023);
-
- stm (@d[0],@d[4],"@off[0]($ctx)");
- mhi (@d[$_],5) for (0..4);
- stm (@d[0],@d[4],"@off[1]($ctx)");
-}
-
GLOBL ("poly1305_init");
TYPE ("poly1305_init","\@function");
ALIGN (16);
LABEL ("poly1305_init");
lghi ("%r0",0);
lghi ("%r1",-1);
- stg ("%r0","0($ctx)"); # zero hash value / acc1
+ stg ("%r0","0($ctx)"); # zero hash value
stg ("%r0","8($ctx)");
stg ("%r0","16($ctx)");
+ st ("%r0","24($ctx)"); # clear is_base2_26
+ lgr ("%r5",$ctx); # reassign $ctx
+ lghi ("%r2",0);
-&{$z? \&clgr:\&clr} ($key,"%r0");
- je (".Ldone");
+&{$z? \&clgr:\&clr} ($inp,"%r0");
+ je (".Lno_key");
- lrvg ("%r4","0($key)"); # load little-endian key
- lrvg ("%r5","8($key)");
+ lrvg ("%r2","0($inp)"); # load little-endian key
+ lrvg ("%r3","8($inp)");
- nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
- srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
+ nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
+ srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
srlg ("%r1","%r1",4);
- nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
+ nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
- ngr ("%r4","%r0");
- ngr ("%r5","%r1");
+ ngr ("%r2","%r0");
+ ngr ("%r3","%r1");
- stg ("%r4","32($ctx)");
- stg ("%r5","40($ctx)");
+ stmg ("%r2","%r3","32(%r5)");
larl ("%r1","OPENSSL_s390xcap_P");
lg ("%r0","16(%r1)");
- tmhh ("%r0",0x4000); # check for vector facility
- jz (".Ldone");
-
- larl ("%r4","poly1305_blocks_vx");
- larl ("%r5","poly1305_emit_vx");
-
-&{$z? \&stmg:\&stm} ("%r6","%r13","6*$SIZE_T($sp)");
-&{$z? \&stmg:\&stm} ("%r4","%r5","4*$z+228($ctx)");
-
- lg ($r0,"32($ctx)");
- lg ($r1,"40($ctx)");
- lghi ($r2,0);
-
- ST_R5R (48,68); # store r,5*r
-
- MUL_RKEY();
- ST_R5R (88,108); # store r^2,5*r^2
-
- MUL_RKEY();
- MUL_RKEY();
- ST_R5R (128,148); # store r^4,5*r^4
-
- lghi ("%r0",0);
- stg ("%r0","24($ctx)"); # zero acc2
- stg ("%r0","32($ctx)");
- stg ("%r0","40($ctx)");
-
-&{$z? \&lmg:\&lm} ("%r6","%r13","6*$SIZE_T($sp)");
+ srlg ("%r0","%r0",62);
+ nill ("%r0",1); # extract vx bit
+ lcgr ("%r0","%r0");
+ larl ("%r1",".Lpoly1305_blocks");
+ larl ("%r2",".Lpoly1305_blocks_vx");
+ larl ("%r3",".Lpoly1305_emit");
+&{$z? \&xgr:\&xr} ("%r2","%r1"); # select between scalar and vector
+&{$z? \&ngr:\&nr} ("%r2","%r0");
+&{$z? \&xgr:\&xr} ("%r2","%r1");
+&{$z? \&stmg:\&stm} ("%r2","%r3","0(%r4)");
lghi ("%r2",1);
- br ("%r14");
-
-LABEL (".Ldone");
- lghi ("%r2",0);
+LABEL (".Lno_key");
br ("%r14");
SIZE ("poly1305_init",".-poly1305_init");
}
-# VX CODE PATH
-{
-my $frame=8*16;
-my @m01=map("%v$_",(0..4));
-my @m23=map("%v$_",(5..9));
-my @tmp=@m23;
-my @acc=map("%v$_",(10..14));
-my @r=map("%v$_",(15..19));
-my @r5=map("%v$_",(20..24));
-my $padvec="%v26";
-my $mask4="%v27";
-my @vperm=map("%v$_",(28..30));
-my $mask="%v31";
-
-sub REDUCE {
- vesrlg (@tmp[0],@acc[0],26);
- vesrlg (@tmp[3],@acc[3],26);
- vn (@acc[0],@acc[0],$mask);
- vn (@acc[3],@acc[3],$mask);
- vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
- vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
-
- vesrlg (@tmp[1],@acc[1],26);
- vesrlg (@tmp[4],@acc[4],26);
- vn (@acc[1],@acc[1],$mask);
- vn (@acc[4],@acc[4],$mask);
- veslg (@tmp[0],@tmp[4],2);
- vag (@tmp[4],@tmp[4],@tmp[0]); # h[4]*=5
- vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
- vag (@acc[0],@acc[0],@tmp[4]); # carry 4->0
-
- vesrlg (@tmp[2],@acc[2],26);
- vesrlg (@tmp[0],@acc[0],26);
- vn (@acc[2],@acc[2],$mask);
- vn (@acc[0],@acc[0],$mask);
- vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
- vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
-
- vesrlg (@tmp[3],@acc[3],26);
- vn (@acc[3],@acc[3],$mask);
- vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
-}
-
################
-# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
-# size_t len, u32 padbit)
+# static void poly1305_blocks(void *ctx, const unsigned char *inp,
+# size_t len, u32 padbit)
{
-my ($ctx,$inp,$len) = map("%r$_",(2..4));
-my $padbit="%r0";
-
-GLOBL ("poly1305_blocks_vx");
-TYPE ("poly1305_blocks_vx","\@function");
-ALIGN (16);
-LABEL ("poly1305_blocks_vx");
-if ($z) {
- aghi ($sp,-$frame);
- vstm ("%v8","%v15","0($sp)");
-} else {
- std ("%f4","16*$SIZE_T+2*8($sp)");
- std ("%f6","16*$SIZE_T+3*8($sp)");
- llgfr ($len,$len);
-}
- llgfr ($padbit,"%r5");
- vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1
- larl ("%r5",".Lconst");
- vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2
- sllg ($padbit,$padbit,24);
- vlm (@vperm[0],$mask,"0(%r5)"); # load vperm ops, mask
- vgbm ($mask4,0x0707);
- vlvgp ($padvec,$padbit,$padbit);
-
- srlg ("%r1",$len,6);
- ltgr ("%r1","%r1");
- jz (".Lvx_4x_done");
-
-ALIGN (16);
-LABEL (".Lvx_4x");
- vlm ("%v20","%v23","0($inp)"); # load m0,m1,m2,m3
-
- # m01,m23 -> base 2^26
-
- vperm (@m01[0],"%v20","%v21",@vperm[0]);
- vperm (@m23[0],"%v22","%v23",@vperm[0]);
- vperm (@m01[2],"%v20","%v21",@vperm[1]);
- vperm (@m23[2],"%v22","%v23",@vperm[1]);
- vperm (@m01[4],"%v20","%v21",@vperm[2]);
- vperm (@m23[4],"%v22","%v23",@vperm[2]);
-
- vesrlg (@m01[1],@m01[0],26);
- vesrlg (@m23[1],@m23[0],26);
- vesrlg (@m01[3],@m01[2],30);
- vesrlg (@m23[3],@m23[2],30);
- vesrlg (@m01[2],@m01[2],4);
- vesrlg (@m23[2],@m23[2],4);
-
- vn (@m01[4],@m01[4],$mask4);
- vn (@m23[4],@m23[4],$mask4);
-for (0..3) {
- vn (@m01[$_],@m01[$_],$mask);
- vn (@m23[$_],@m23[$_],$mask);
-}
- vaf (@m01[4],@m01[4],$padvec); # pad m01
- vaf (@m23[4],@m23[4],$padvec); # pad m23
-
- # acc = acc * r^4 + m01 * r^2 + m23
-
- vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
- vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
-
- vmalof (@tmp[0],@m01[4],@r5[1],@m23[0]);
- vmalof (@tmp[1],@m01[4],@r5[2],@m23[1]);
- vmalof (@tmp[2],@m01[4],@r5[3],@m23[2]);
- vmalof (@tmp[3],@m01[4],@r5[4],@m23[3]);
- vmalof (@tmp[4],@m01[4],@r[0],@m23[4]);
-
- vmalof (@tmp[0],@m01[3],@r5[2],@tmp[0]);
- vmalof (@tmp[1],@m01[3],@r5[3],@tmp[1]);
- vmalof (@tmp[2],@m01[3],@r5[4],@tmp[2]);
- vmalof (@tmp[3],@m01[3],@r[0],@tmp[3]);
- vmalof (@tmp[4],@m01[3],@r[1],@tmp[4]);
-
- vmalof (@tmp[0],@m01[2],@r5[3],@tmp[0]);
- vmalof (@tmp[1],@m01[2],@r5[4],@tmp[1]);
- vmalof (@tmp[2],@m01[2],@r[0],@tmp[2]);
- vmalof (@tmp[3],@m01[2],@r[1],@tmp[3]);
- vmalof (@tmp[4],@m01[2],@r[2],@tmp[4]);
-
- vmalof (@tmp[0],@m01[1],@r5[4],@tmp[0]);
- vmalof (@tmp[1],@m01[1],@r[0],@tmp[1]);
- vmalof (@tmp[2],@m01[1],@r[1],@tmp[2]);
- vmalof (@tmp[3],@m01[1],@r[2],@tmp[3]);
- vmalof (@tmp[4],@m01[1],@r[3],@tmp[4]);
-
- vmalof (@tmp[0],@m01[0],@r[0],@tmp[0]);
- vmalof (@tmp[1],@m01[0],@r[1],@tmp[1]);
- vmalof (@tmp[2],@m01[0],@r[2],@tmp[2]);
- vmalof (@tmp[3],@m01[0],@r[3],@tmp[3]);
- vmalof (@tmp[4],@m01[0],@r[4],@tmp[4]);
-
- vlrepf (@r5[$_],"4*$_+148($ctx)") for (0..4); # load 5*r^4
- vlrepf (@r[$_],"4*$_+128($ctx)") for (0..4); # load r^4
-
- vmalof (@tmp[0],@acc[4],@r5[1],@tmp[0]);
- vmalof (@tmp[1],@acc[4],@r5[2],@tmp[1]);
- vmalof (@tmp[2],@acc[4],@r5[3],@tmp[2]);
- vmalof (@tmp[3],@acc[4],@r5[4],@tmp[3]);
- vmalof (@tmp[4],@acc[4],@r[0],@tmp[4]);
-
- vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
- vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
- vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
- vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
- vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
-
- vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
- vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
- vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
- vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
- vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
-
- vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
- vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
- vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
- vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
- vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
-
- vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
- vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
- vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
- vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
- vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
-
- REDUCE ();
-
- la ($inp,"64($inp)");
- brctg ("%r1",".Lvx_4x");
-
-ALIGN (16);
-LABEL (".Lvx_4x_done");
- tml ($len,32);
- jz (".Lvx_2x_done");
-
- vlm ("%v20","%v21","0($inp)"); # load m0,m1
-
- # m01 -> base 2^26
-
- vperm (@m01[0],"%v20","%v21",@vperm[0]);
- vperm (@m01[2],"%v20","%v21",@vperm[1]);
- vperm (@m01[4],"%v20","%v21",@vperm[2]);
-
- vesrlg (@m01[1],@m01[0],26);
- vesrlg (@m01[3],@m01[2],30);
- vesrlg (@m01[2],@m01[2],4);
-
- vn (@m01[4],@m01[4],$mask4);
- vn (@m01[$_],@m01[$_],$mask) for (0..3);
-
- vaf (@m01[4],@m01[4],$padvec); # pad m01
-
- # acc = acc * r^2+ m01
-
- vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
- vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
-
- vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
- vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
- vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
- vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
- vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
-
- vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
- vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
- vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
- vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
- vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
-
- vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
- vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
- vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
- vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
- vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
-
- vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
- vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
- vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
- vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
- vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
-
- vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
- vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
- vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
- vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
- vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
-
- REDUCE ();
-
- la ($inp,"32($inp)");
-
-ALIGN (16);
-LABEL (".Lvx_2x_done");
- tml ($len,16);
- jz (".Lvx_done");
-
- vleig ($padvec,0,0);
-
- vzero ("%v20");
- vl ("%v21","0($inp)"); # load m0
-
- # m0 -> base 2^26
-
- vperm (@m01[0],"%v20","%v21",@vperm[0]);
- vperm (@m01[2],"%v20","%v21",@vperm[1]);
- vperm (@m01[4],"%v20","%v21",@vperm[2]);
-
- vesrlg (@m01[1],@m01[0],26);
- vesrlg (@m01[3],@m01[2],30);
- vesrlg (@m01[2],@m01[2],4);
-
- vn (@m01[4],@m01[4],$mask4);
- vn (@m01[$_],@m01[$_],$mask) for (0..3);
-
- vaf (@m01[4],@m01[4],$padvec); # pad m0
-
- # acc = acc * r + m01
-
- vlrepf (@r5[$_],"4*$_+68($ctx)") for (0..4); # load 5*r
- vlrepf (@r[$_],"4*$_+48($ctx)") for (0..4); # load r
-
- vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
- vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
- vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
- vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
- vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
-
- vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
- vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
- vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
- vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
- vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
-
- vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
- vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
- vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
- vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
- vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
-
- vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
- vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
- vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
- vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
- vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
-
- vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
- vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
- vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
- vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
- vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
-
- REDUCE ();
-
-ALIGN (16);
-LABEL (".Lvx_done");
- vstef (@acc[$_],"4*$_($ctx)",1) for (0..4); # store acc
- vstef (@acc[$_],"24+4*$_($ctx)",3) for (0..4);
-
-if ($z) {
- vlm ("%v8","%v15","0($sp)");
- la ($sp,"$frame($sp)");
-} else {
- ld ("%f4","16*$SIZE_T+2*8($sp)");
- ld ("%f6","16*$SIZE_T+3*8($sp)");
-}
- br ("%r14");
-SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
-}
-
-################
-# static void poly1305_emit_vx(void *ctx, unsigned char mac[16],
-# const u32 nonce[4])
-{
-my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
-
-GLOBL ("poly1305_emit_vx");
-TYPE ("poly1305_emit_vx","\@function");
-ALIGN (16);
-LABEL ("poly1305_emit_vx");
-if ($z) {
- aghi ($sp,-$frame);
- vstm ("%v8","%v15","0($sp)");
-} else {
- std ("%f4","16*$SIZE_T+2*8($sp)");
- std ("%f6","16*$SIZE_T+3*8($sp)");
-}
- larl ("%r5",".Lconst");
-
- vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1
- vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2
- vlef (@r5[$_],"108+4*$_($ctx)",1) for (0..4); # load 5*r^2
- vlef (@r[$_],"88+4*$_($ctx)",1) for (0..4); # load r^2
- vlef (@r5[$_],"68+4*$_($ctx)",3) for (0..4); # load 5*r
- vlef (@r[$_],"48+4*$_($ctx)",3) for (0..4); # load r
- vl ($mask,"48(%r5)"); # load mask
-
- # acc = acc1 * r^2 + acc2 * r
-
- vmlof (@tmp[0],@acc[4],@r5[1]);
- vmlof (@tmp[1],@acc[4],@r5[2]);
- vmlof (@tmp[2],@acc[4],@r5[3]);
- vmlof (@tmp[3],@acc[4],@r5[4]);
- vmlof (@tmp[4],@acc[4],@r[0]);
-
- vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
- vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
- vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
- vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
- vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
-
- vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
- vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
- vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
- vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
- vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
-
- vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
- vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
- vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
- vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
- vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
-
- vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
- vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
- vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
- vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
- vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
-
- vzero ("%v27");
- vsumqg (@acc[$_],@acc[$_],"%v27") for (0..4);
-
- REDUCE ();
-
- vesrlg (@tmp[1],@acc[1],26);
- vn (@acc[1],@acc[1],$mask);
- vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
-
- vesrlg (@tmp[2],@acc[2],26);
- vn (@acc[2],@acc[2],$mask);
- vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
-
- vesrlg (@tmp[3],@acc[3],26);
- vn (@acc[3],@acc[3],$mask);
- vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
-
- # acc -> base 2^64
- vleib ("%v30",6*8,7);
- vleib ("%v29",13*8,7);
- vleib ("%v28",3*8,7);
-
- veslg (@acc[1],@acc[1],26);
- veslg (@acc[3],@acc[3],26);
- vo (@acc[0],@acc[0],@acc[1]);
- vo (@acc[2],@acc[2],@acc[3]);
-
- veslg (@acc[2],@acc[2],4);
- vslb (@acc[2],@acc[2],"%v30"); # <<52
- vo (@acc[0],@acc[0],@acc[2]);
-
- vslb (@tmp[4],@acc[4],"%v29"); # <<104
- vo (@acc[0],@acc[0],@tmp[4]);
-
- vsrlb (@acc[1],@acc[4],"%v28"); # >>24
-
- # acc %= 2^130-5
- vone ("%v26");
- vleig ("%v27",5,1);
- vone ("%v29");
- vleig ("%v26",-4,1);
-
- vaq (@tmp[0],@acc[0],"%v27");
- vaccq (@tmp[1],@acc[0],"%v27");
-
- vaq (@tmp[1],@tmp[1],"%v26");
- vaccq (@tmp[1],@tmp[1],@acc[1]);
-
- vaq (@tmp[1],@tmp[1],"%v29");
-
- vn (@tmp[2],@tmp[1],@acc[0]);
- vnc (@tmp[3],@tmp[0],@tmp[1]);
- vo (@acc[0],@tmp[2],@tmp[3]);
-
- # acc += nonce
- vl (@vperm[0],"64(%r5)");
- vlef (@tmp[0],"4*$_($nonce)",3-$_) for (0..3);
-
- vaq (@acc[0],@acc[0],@tmp[0]);
-
- vperm (@acc[0],@acc[0],@acc[0],@vperm[0]);
- vst (@acc[0],"0($mac)"); # store mac
-
-if ($z) {
- vlm ("%v8","%v15","0($sp)");
- la ($sp,"$frame($sp)");
-} else {
- ld ("%f4","16*$SIZE_T+2*8($sp)");
- ld ("%f6","16*$SIZE_T+3*8($sp)");
-}
- br ("%r14");
-SIZE ("poly1305_emit_vx",".-poly1305_emit_vx");
-}
-}
-
-# NOVX CODE PATH
-{
-################
-# static void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
-# u32 padbit)
-{
-my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
-
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));
+
GLOBL ("poly1305_blocks");
TYPE ("poly1305_blocks","\@function");
ALIGN (16);
LABEL ("poly1305_blocks");
-$z? srlg ($len,$len,4) :srl ($len,4);
- lghi ("%r0",0);
-&{$z? \&clgr:\&clr} ($len,"%r0");
- je (".Lno_data");
+LABEL (".Lpoly1305_blocks");
+&{$z? \&ltgr:\&ltr} ("%r0",$len);
+ jz (".Lno_data");
&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
- llgfr ($padbit,$padbit); # clear upper half, much needed with
- # non-64-bit ABI
- lg ($r0,"32($ctx)"); # load key
- lg ($r1,"40($ctx)");
-
- lg ($h0,"0($ctx)"); # load hash value
+ lg ($h0,"0($ctx)"); # load hash value
lg ($h1,"8($ctx)");
lg ($h2,"16($ctx)");
+LABEL (".Lpoly1305_blocks_entry");
+if ($z) {
+ srlg ($len,$len,4);
+} else {
+ srl ($len,4);
+}
+ llgfr ($padbit,$padbit); # clear upper half, much needed with
+ # non-64-bit ABI
+ lg ($r0,"32($ctx)"); # load key
+ lg ($r1,"40($ctx)");
+
&{$z? \&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx
srlg ($s1,$r1,2);
algr ($s1,$r1); # s1 = r1 + r1>>2
@@ -718,21 +160,21 @@ $z? srlg ($len,$len,4) :srl ($len,4);
ALIGN (16);
LABEL (".Loop");
- lrvg ($d0lo,"0($inp)"); # load little-endian input
+ lrvg ($d0lo,"0($inp)"); # load little-endian input
lrvg ($d1lo,"8($inp)");
la ($inp,"16($inp)");
- algr ($d0lo,$h0); # accumulate input
+ algr ($d0lo,$h0); # accumulate input
alcgr ($d1lo,$h1);
+ alcgr ($h2,$padbit);
lgr ($h0,$d0lo);
- mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo
+ mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo
lgr ($h1,$d1lo);
- mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo
+ mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo
- mlgr ($t0,$r1); # h0*r1 -> $t0:$h0
- mlgr ($t1,$r0); # h1*r0 -> $t1:$h1
- alcgr ($h2,$padbit);
+ mlgr ($t0,$r1); # h0*r1 -> $t0:$h0
+ mlgr ($t1,$r0); # h1*r0 -> $t1:$h1
algr ($d0lo,$d1lo);
lgr ($d1lo,$h2);
@@ -742,16 +184,16 @@ LABEL (".Loop");
algr ($h1,$h0);
alcgr ($t1,$t0);
- msgr ($d1lo,$s1); # h2*s1
- msgr ($h2,$r0); # h2*r0
+ msgr ($d1lo,$s1); # h2*s1
+ msgr ($h2,$r0); # h2*r0
algr ($h1,$d1lo);
- alcgr ($t1,$d1hi); # $d1hi is zero
+ alcgr ($t1,$d1hi); # $d1hi is zero
algr ($h1,$d0hi);
alcgr ($h2,$t1);
- lghi ($h0,-4); # final reduction step
+ lghi ($h0,-4); # final reduction step
ngr ($h0,$h2);
srlg ($t0,$h2,2);
algr ($h0,$t0);
@@ -759,14 +201,14 @@ LABEL (".Loop");
ngr ($h2,$t1);
algr ($h0,$d0lo);
- alcgr ($h1,$d1hi); # $d1hi is still zero
- alcgr ($h2,$d1hi); # $d1hi is still zero
+ alcgr ($h1,$d1hi); # $d1hi is still zero
+ alcgr ($h2,$d1hi); # $d1hi is still zero
&{$z? \&brctg:\&brct} ($len,".Loop");
&{$z? \&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx
- stg ($h0,"0($ctx)"); # store hash value
+ stg ($h0,"0($ctx)"); # store hash value
stg ($h1,"8($ctx)");
stg ($h2,"16($ctx)");
@@ -777,67 +219,766 @@ SIZE ("poly1305_blocks",".-poly1305_bloc
}
################
+# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
+# size_t len, u32 padbit)
+{
+my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4));
+my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9));
+my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14));
+my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18));
+my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23));
+my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27));
+my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31));
+
+my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14));
+
+TYPE ("poly1305_blocks_vx","\@function");
+ALIGN (16);
+LABEL ("poly1305_blocks_vx");
+LABEL (".Lpoly1305_blocks_vx");
+&{$z? \&clgfi:\&clfi} ($len,128);
+ jhe ("__poly1305_blocks_vx");
+
+&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
+
+ lg ($d0,"0($ctx)");
+ lg ($d1,"8($ctx)");
+ lg ($d2,"16($ctx)");
+
+ llgfr ("%r0",$d0); # base 2^26 -> base 2^64
+ srlg ($h0,$d0,32);
+ llgfr ("%r1",$d1);
+ srlg ($h1,$d1,32);
+ srlg ($h2,$d2,32);
+
+ sllg ("%r0","%r0",26);
+ algr ($h0,"%r0");
+ sllg ("%r0",$h1,52);
+ srlg ($h1,$h1,12);
+ sllg ("%r1","%r1",14);
+ algr ($h0,"%r0");
+ alcgr ($h1,"%r1");
+ sllg ("%r0",$h2,40);
+ srlg ($h2,$h2,24);
+ lghi ("%r1",0);
+ algr ($h1,"%r0");
+ alcgr ($h2,"%r1");
+
+ llgf ("%r0","24($ctx)"); # is_base2_26
+ lcgr ("%r0","%r0");
+
+ xgr ($h0,$d0); # choose between radixes
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+ ngr ($h0,"%r0");
+ ngr ($h1,"%r0");
+ ngr ($h2,"%r0");
+ xgr ($h0,$d0);
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+
+ lhi ("%r0",0);
+ st ("%r0","24($ctx)"); # clear is_base2_26
+
+ j (".Lpoly1305_blocks_entry");
+SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
+
+TYPE ("__poly1305_mul","\@function");
+ALIGN (16);
+LABEL ("__poly1305_mul");
+ vmlof ($ACC0,$H0,$R0);
+ vmlof ($ACC1,$H0,$R1);
+ vmlof ($ACC2,$H0,$R2);
+ vmlof ($ACC3,$H0,$R3);
+ vmlof ($ACC4,$H0,$R4);
+
+ vmalof ($ACC0,$H1,$S4,$ACC0);
+ vmalof ($ACC1,$H1,$R0,$ACC1);
+ vmalof ($ACC2,$H1,$R1,$ACC2);
+ vmalof ($ACC3,$H1,$R2,$ACC3);
+ vmalof ($ACC4,$H1,$R3,$ACC4);
+
+ vmalof ($ACC0,$H2,$S3,$ACC0);
+ vmalof ($ACC1,$H2,$S4,$ACC1);
+ vmalof ($ACC2,$H2,$R0,$ACC2);
+ vmalof ($ACC3,$H2,$R1,$ACC3);
+ vmalof ($ACC4,$H2,$R2,$ACC4);
+
+ vmalof ($ACC0,$H3,$S2,$ACC0);
+ vmalof ($ACC1,$H3,$S3,$ACC1);
+ vmalof ($ACC2,$H3,$S4,$ACC2);
+ vmalof ($ACC3,$H3,$R0,$ACC3);
+ vmalof ($ACC4,$H3,$R1,$ACC4);
+
+ vmalof ($ACC0,$H4,$S1,$ACC0);
+ vmalof ($ACC1,$H4,$S2,$ACC1);
+ vmalof ($ACC2,$H4,$S3,$ACC2);
+ vmalof ($ACC3,$H4,$S4,$ACC3);
+ vmalof ($ACC4,$H4,$R0,$ACC4);
+
+ ################################################################
+ # lazy reduction
+
+ vesrlg ($H4,$ACC3,26);
+ vesrlg ($H1,$ACC0,26);
+ vn ($H3,$ACC3,$mask26);
+ vn ($H0,$ACC0,$mask26);
+ vag ($H4,$H4,$ACC4); # h3 -> h4
+ vag ($H1,$H1,$ACC1); # h0 -> h1
+
+ vesrlg ($ACC4,$H4,26);
+ vesrlg ($ACC1,$H1,26);
+ vn ($H4,$H4,$mask26);
+ vn ($H1,$H1,$mask26);
+ vag ($H0,$H0,$ACC4);
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
+
+ veslg ($ACC4,$ACC4,2); # <<2
+ vesrlg ($ACC2,$H2,26);
+ vn ($H2,$H2,$mask26);
+ vag ($H0,$H0,$ACC4); # h4 -> h0
+ vag ($H3,$H3,$ACC2); # h2 -> h3
+
+ vesrlg ($ACC0,$H0,26);
+ vesrlg ($ACC3,$H3,26);
+ vn ($H0,$H0,$mask26);
+ vn ($H3,$H3,$mask26);
+ vag ($H1,$H1,$ACC0); # h0 -> h1
+ vag ($H4,$H4,$ACC3); # h3 -> h4
+ br ("%r14");
+SIZE ("__poly1305_mul",".-__poly1305_mul");
+
+TYPE ("__poly1305_blocks_vx","\@function");
+ALIGN (16);
+LABEL ("__poly1305_blocks_vx");
+&{$z? \&lgr:\&lr} ("%r0",$sp);
+&{$z? \&stmg:\&stm} ("%r10","%r15","10*$SIZE_T($sp)");
+if (!$z) {
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+ ahi ($sp,-$stdframe);
+ st ("%r0","0($sp)"); # back-chain
+
+ llgfr ($len,$len); # so that srlg works on $len
+} else {
+ aghi ($sp,"-($stdframe+8*8)");
+ stg ("%r0","0($sp)"); # back-chain
+
+ std ("%f8","$stdframe+0*8($sp)");
+ std ("%f9","$stdframe+1*8($sp)");
+ std ("%f10","$stdframe+2*8($sp)");
+ std ("%f11","$stdframe+3*8($sp)");
+ std ("%f12","$stdframe+4*8($sp)");
+ std ("%f13","$stdframe+5*8($sp)");
+ std ("%f14","$stdframe+6*8($sp)");
+ std ("%f15","$stdframe+7*8($sp)");
+}
+ larl ("%r1",".Lconst");
+ vgmg ($mask26,38,63);
+ vlm ($bswaplo,$bswapmi,"16(%r1)");
+
+ &lt ("%r0","24($ctx)"); # is_base2_26?
+ jnz (".Lskip_init");
+
+ lg ($h0,"32($ctx)"); # load key base 2^64
+ lg ($h1,"40($ctx)");
+
+ risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26
+ srlg ($d1,$h0,52);
+ risbg ($h0,$h0,38,0x80+63,0);
+ vlvgg ($R0,$h0,0);
+ risbg ($d1,$h1,38,51,12);
+ vlvgg ($R1,$d0,0);
+ risbg ($d0,$h1,38,63,50);
+ vlvgg ($R2,$d1,0);
+ srlg ($d1,$h1,40);
+ vlvgg ($R3,$d0,0);
+ vlvgg ($R4,$d1,0);
+
+ veslg ($S1,$R1,2);
+ veslg ($S2,$R2,2);
+ veslg ($S3,$R3,2);
+ veslg ($S4,$R4,2);
+ vlr ($H0,$R0);
+ vlr ($H1,$R1);
+ vlr ($H2,$R2);
+ vlr ($H3,$R3);
+ vlr ($H4,$R4);
+ vag ($S1,$S1,$R1); # * 5
+ vag ($S2,$S2,$R2);
+ vag ($S3,$S3,$R3);
+ vag ($S4,$S4,$R4);
+
+ brasl ("%r14","__poly1305_mul"); # r^1:- * r^1:-
+
+ vpdi ($R0,$H0,$R0,0); # r^2:r^1
+ vpdi ($R1,$H1,$R1,0);
+ vpdi ($R2,$H2,$R2,0);
+ vpdi ($R3,$H3,$R3,0);
+ vpdi ($R4,$H4,$R4,0);
+ vpdi ($H0,$H0,$H0,0); # r^2:r^2
+ vpdi ($H1,$H1,$H1,0);
+ vpdi ($H2,$H2,$H2,0);
+ vpdi ($H3,$H3,$H3,0);
+ vpdi ($H4,$H4,$H4,0);
+ veslg ($S1,$R1,2);
+ veslg ($S2,$R2,2);
+ veslg ($S3,$R3,2);
+ veslg ($S4,$R4,2);
+ vag ($S1,$S1,$R1); # * 5
+ vag ($S2,$S2,$R2);
+ vag ($S3,$S3,$R3);
+ vag ($S4,$S4,$R4);
+
+ brasl ("%r14,__poly1305_mul"); # r^2:r^2 * r^2:r^1
+
+ vl ($I0,"0(%r1)"); # borrow $I0
+ vperm ($R0,$R0,$H0,$I0); # r^2:r^4:r^1:r^3
+ vperm ($R1,$R1,$H1,$I0);
+ vperm ($R2,$R2,$H2,$I0);
+ vperm ($R3,$R3,$H3,$I0);
+ vperm ($R4,$R4,$H4,$I0);
+ veslf ($S1,$R1,2);
+ veslf ($S2,$R2,2);
+ veslf ($S3,$R3,2);
+ veslf ($S4,$R4,2);
+ vaf ($S1,$S1,$R1); # * 5
+ vaf ($S2,$S2,$R2);
+ vaf ($S3,$S3,$R3);
+ vaf ($S4,$S4,$R4);
+
+ lg ($h0,"0($ctx)"); # load hash base 2^64
+ lg ($h1,"8($ctx)");
+ lg ($h2,"16($ctx)");
+
+ vzero ($H0);
+ vzero ($H1);
+ vzero ($H2);
+ vzero ($H3);
+ vzero ($H4);
+
+ risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26
+ srlg ($d1,$h0,52);
+ risbg ($h0,$h0,38,0x80+63,0);
+ vlvgg ($H0,$h0,0);
+ risbg ($d1,$h1,38,51,12);
+ vlvgg ($H1,$d0,0);
+ risbg ($d0,$h1,38,63,50);
+ vlvgg ($H2,$d1,0);
+ srlg ($d1,$h1,40);
+ vlvgg ($H3,$d0,0);
+ risbg ($d1,$h2,37,39,24);
+ vlvgg ($H4,$d1,0);
+
+ lhi ("%r0",1);
+ st ("%r0","24($ctx)"); # set is_base2_26
+
+ vstm ($R0,$S4,"48($ctx)"); # save key schedule base 2^26
+
+ vpdi ($R0,$R0,$R0,0); # broadcast r^2:r^4
+ vpdi ($R1,$R1,$R1,0);
+ vpdi ($S1,$S1,$S1,0);
+ vpdi ($R2,$R2,$R2,0);
+ vpdi ($S2,$S2,$S2,0);
+ vpdi ($R3,$R3,$R3,0);
+ vpdi ($S3,$S3,$S3,0);
+ vpdi ($R4,$R4,$R4,0);
+ vpdi ($S4,$S4,$S4,0);
+
+ j (".Loaded_hash");
+
+ALIGN (16);
+LABEL (".Lskip_init");
+ vllezf ($H0,"0($ctx)"); # load hash base 2^26
+ vllezf ($H1,"4($ctx)");
+ vllezf ($H2,"8($ctx)");
+ vllezf ($H3,"12($ctx)");
+ vllezf ($H4,"16($ctx)");
+
+ vlrepg ($R0,"0x30($ctx)"); # broadcast r^2:r^4
+ vlrepg ($R1,"0x40($ctx)");
+ vlrepg ($S1,"0x50($ctx)");
+ vlrepg ($R2,"0x60($ctx)");
+ vlrepg ($S2,"0x70($ctx)");
+ vlrepg ($R3,"0x80($ctx)");
+ vlrepg ($S3,"0x90($ctx)");
+ vlrepg ($R4,"0xa0($ctx)");
+ vlrepg ($S4,"0xb0($ctx)");
+
+LABEL (".Loaded_hash");
+ vzero ($I1);
+ vzero ($I3);
+
+ vlm ($T1,$T4,"0x00($inp)"); # load first input block
+ la ($inp,"0x40($inp)");
+ vgmg ($mask26,6,31);
+ vgmf ($I4,5,5); # padbit<<2
+
+ vperm ($I0,$T3,$T4,$bswaplo);
+ vperm ($I2,$T3,$T4,$bswapmi);
+ vperm ($T3,$T3,$T4,$bswaphi);
+
+ verimg ($I1,$I0,$mask26,6); # >>26
+ veslg ($I0,$I0,32);
+ veslg ($I2,$I2,28); # >>4
+ verimg ($I3,$T3,$mask26,18); # >>14
+ verimg ($I4,$T3,$mask26,58); # >>38
+ vn ($I0,$I0,$mask26);
+ vn ($I2,$I2,$mask26);
+ vesrlf ($I4,$I4,2); # >>2
+
+ vgmg ($mask26,38,63);
+ vperm ($T3,$T1,$T2,$bswaplo);
+ vperm ($T4,$T1,$T2,$bswaphi);
+ vperm ($T2,$T1,$T2,$bswapmi);
+
+ verimg ($I0,$T3,$mask26,0);
+ verimg ($I1,$T3,$mask26,38); # >>26
+ verimg ($I2,$T2,$mask26,60); # >>4
+ verimg ($I3,$T4,$mask26,50); # >>14
+ vesrlg ($T4,$T4,40);
+ vo ($I4,$I4,$T4);
+
+ srlg ("%r0",$len,6);
+&{$z? \&aghi:\&ahi} ("%r0",-1);
+
+ALIGN (16);
+LABEL (".Loop_vx");
+ vmlef ($ACC0,$I0,$R0);
+ vmlef ($ACC1,$I0,$R1);
+ vmlef ($ACC2,$I0,$R2);
+ vmlef ($ACC3,$I0,$R3);
+ vmlef ($ACC4,$I0,$R4);
+
+ vmalef ($ACC0,$I1,$S4,$ACC0);
+ vmalef ($ACC1,$I1,$R0,$ACC1);
+ vmalef ($ACC2,$I1,$R1,$ACC2);
+ vmalef ($ACC3,$I1,$R2,$ACC3);
+ vmalef ($ACC4,$I1,$R3,$ACC4);
+
+ vaf ($H2,$H2,$I2);
+ vaf ($H0,$H0,$I0);
+ vaf ($H3,$H3,$I3);
+ vaf ($H1,$H1,$I1);
+ vaf ($H4,$H4,$I4);
+
+ vmalef ($ACC0,$I2,$S3,$ACC0);
+ vmalef ($ACC1,$I2,$S4,$ACC1);
+ vmalef ($ACC2,$I2,$R0,$ACC2);
+ vmalef ($ACC3,$I2,$R1,$ACC3);
+ vmalef ($ACC4,$I2,$R2,$ACC4);
+
+ vlm ($T1,$T4,"0x00($inp)"); # load next input block
+ la ($inp,"0x40($inp)");
+ vgmg ($mask26,6,31);
+
+ vmalef ($ACC0,$I3,$S2,$ACC0);
+ vmalef ($ACC1,$I3,$S3,$ACC1);
+ vmalef ($ACC2,$I3,$S4,$ACC2);
+ vmalef ($ACC3,$I3,$R0,$ACC3);
+ vmalef ($ACC4,$I3,$R1,$ACC4);
+
+ vperm ($I0,$T3,$T4,$bswaplo);
+ vperm ($I2,$T3,$T4,$bswapmi);
+ vperm ($T3,$T3,$T4,$bswaphi);
+
+ vmalef ($ACC0,$I4,$S1,$ACC0);
+ vmalef ($ACC1,$I4,$S2,$ACC1);
+ vmalef ($ACC2,$I4,$S3,$ACC2);
+ vmalef ($ACC3,$I4,$S4,$ACC3);
+ vmalef ($ACC4,$I4,$R0,$ACC4);
+
+ verimg ($I1,$I0,$mask26,6); # >>26
+ veslg ($I0,$I0,32);
+ veslg ($I2,$I2,28); # >>4
+ verimg ($I3,$T3,$mask26,18); # >>14
+
+ vmalof ($ACC0,$H0,$R0,$ACC0);
+ vmalof ($ACC1,$H0,$R1,$ACC1);
+ vmalof ($ACC2,$H0,$R2,$ACC2);
+ vmalof ($ACC3,$H0,$R3,$ACC3);
+ vmalof ($ACC4,$H0,$R4,$ACC4);
+
+ vgmf ($I4,5,5); # padbit<<2
+ verimg ($I4,$T3,$mask26,58); # >>38
+ vn ($I0,$I0,$mask26);
+ vn ($I2,$I2,$mask26);
+ vesrlf ($I4,$I4,2); # >>2
+
+ vmalof ($ACC0,$H1,$S4,$ACC0);
+ vmalof ($ACC1,$H1,$R0,$ACC1);
+ vmalof ($ACC2,$H1,$R1,$ACC2);
+ vmalof ($ACC3,$H1,$R2,$ACC3);
+ vmalof ($ACC4,$H1,$R3,$ACC4);
+
+ vgmg ($mask26,38,63);
+ vperm ($T3,$T1,$T2,$bswaplo);
+ vperm ($T4,$T1,$T2,$bswaphi);
+ vperm ($T2,$T1,$T2,$bswapmi);
+
+ vmalof ($ACC0,$H2,$S3,$ACC0);
+ vmalof ($ACC1,$H2,$S4,$ACC1);
+ vmalof ($ACC2,$H2,$R0,$ACC2);
+ vmalof ($ACC3,$H2,$R1,$ACC3);
+ vmalof ($ACC4,$H2,$R2,$ACC4);
+
+ verimg ($I0,$T3,$mask26,0);
+ verimg ($I1,$T3,$mask26,38); # >>26
+ verimg ($I2,$T2,$mask26,60); # >>4
+
+ vmalof ($ACC0,$H3,$S2,$ACC0);
+ vmalof ($ACC1,$H3,$S3,$ACC1);
+ vmalof ($ACC2,$H3,$S4,$ACC2);
+ vmalof ($ACC3,$H3,$R0,$ACC3);
+ vmalof ($ACC4,$H3,$R1,$ACC4);
+
+ verimg ($I3,$T4,$mask26,50); # >>14
+ vesrlg ($T4,$T4,40);
+ vo ($I4,$I4,$T4);
+
+ vmalof ($ACC0,$H4,$S1,$ACC0);
+ vmalof ($ACC1,$H4,$S2,$ACC1);
+ vmalof ($ACC2,$H4,$S3,$ACC2);
+ vmalof ($ACC3,$H4,$S4,$ACC3);
+ vmalof ($ACC4,$H4,$R0,$ACC4);
+
+ ################################################################
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ # and P. Schwabe
+
+ vesrlg ($H4,$ACC3,26);
+ vesrlg ($H1,$ACC0,26);
+ vn ($H3,$ACC3,$mask26);
+ vn ($H0,$ACC0,$mask26);
+ vag ($H4,$H4,$ACC4); # h3 -> h4
+ vag ($H1,$H1,$ACC1); # h0 -> h1
+
+ vesrlg ($ACC4,$H4,26);
+ vesrlg ($ACC1,$H1,26);
+ vn ($H4,$H4,$mask26);
+ vn ($H1,$H1,$mask26);
+ vag ($H0,$H0,$ACC4);
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
+
+ veslg ($ACC4,$ACC4,2); # <<2
+ vesrlg ($ACC2,$H2,26);
+ vn ($H2,$H2,$mask26);
+ vag ($H0,$H0,$ACC4); # h4 -> h0
+ vag ($H3,$H3,$ACC2); # h2 -> h3
+
+ vesrlg ($ACC0,$H0,26);
+ vesrlg ($ACC3,$H3,26);
+ vn ($H0,$H0,$mask26);
+ vn ($H3,$H3,$mask26);
+ vag ($H1,$H1,$ACC0); # h0 -> h1
+ vag ($H4,$H4,$ACC3); # h3 -> h4
+
+&{$z? \&brctg:\&brct} ("%r0",".Loop_vx");
+
+ vlm ($R0,$S4,"48($ctx)"); # load all powers
+
+ lghi ("%r0",0x30);
+&{$z? \&lcgr:\&lcr} ($len,$len);
+&{$z? \&ngr:\&nr} ($len,"%r0");
+&{$z? \&slgr:\&slr} ($inp,$len);
+
+LABEL (".Last");
+ vmlef ($ACC0,$I0,$R0);
+ vmlef ($ACC1,$I0,$R1);
+ vmlef ($ACC2,$I0,$R2);
+ vmlef ($ACC3,$I0,$R3);
+ vmlef ($ACC4,$I0,$R4);
+
+ vmalef ($ACC0,$I1,$S4,$ACC0);
+ vmalef ($ACC1,$I1,$R0,$ACC1);
+ vmalef ($ACC2,$I1,$R1,$ACC2);
+ vmalef ($ACC3,$I1,$R2,$ACC3);
+ vmalef ($ACC4,$I1,$R3,$ACC4);
+
+ vaf ($H0,$H0,$I0);
+ vaf ($H1,$H1,$I1);
+ vaf ($H2,$H2,$I2);
+ vaf ($H3,$H3,$I3);
+ vaf ($H4,$H4,$I4);
+
+ vmalef ($ACC0,$I2,$S3,$ACC0);
+ vmalef ($ACC1,$I2,$S4,$ACC1);
+ vmalef ($ACC2,$I2,$R0,$ACC2);
+ vmalef ($ACC3,$I2,$R1,$ACC3);
+ vmalef ($ACC4,$I2,$R2,$ACC4);
+
+ vmalef ($ACC0,$I3,$S2,$ACC0);
+ vmalef ($ACC1,$I3,$S3,$ACC1);
+ vmalef ($ACC2,$I3,$S4,$ACC2);
+ vmalef ($ACC3,$I3,$R0,$ACC3);
+ vmalef ($ACC4,$I3,$R1,$ACC4);
+
+ vmalef ($ACC0,$I4,$S1,$ACC0);
+ vmalef ($ACC1,$I4,$S2,$ACC1);
+ vmalef ($ACC2,$I4,$S3,$ACC2);
+ vmalef ($ACC3,$I4,$S4,$ACC3);
+ vmalef ($ACC4,$I4,$R0,$ACC4);
+
+ vmalof ($ACC0,$H0,$R0,$ACC0);
+ vmalof ($ACC1,$H0,$R1,$ACC1);
+ vmalof ($ACC2,$H0,$R2,$ACC2);
+ vmalof ($ACC3,$H0,$R3,$ACC3);
+ vmalof ($ACC4,$H0,$R4,$ACC4);
+
+ vmalof ($ACC0,$H1,$S4,$ACC0);
+ vmalof ($ACC1,$H1,$R0,$ACC1);
+ vmalof ($ACC2,$H1,$R1,$ACC2);
+ vmalof ($ACC3,$H1,$R2,$ACC3);
+ vmalof ($ACC4,$H1,$R3,$ACC4);
+
+ vmalof ($ACC0,$H2,$S3,$ACC0);
+ vmalof ($ACC1,$H2,$S4,$ACC1);
+ vmalof ($ACC2,$H2,$R0,$ACC2);
+ vmalof ($ACC3,$H2,$R1,$ACC3);
+ vmalof ($ACC4,$H2,$R2,$ACC4);
+
+ vmalof ($ACC0,$H3,$S2,$ACC0);
+ vmalof ($ACC1,$H3,$S3,$ACC1);
+ vmalof ($ACC2,$H3,$S4,$ACC2);
+ vmalof ($ACC3,$H3,$R0,$ACC3);
+ vmalof ($ACC4,$H3,$R1,$ACC4);
+
+ vmalof ($ACC0,$H4,$S1,$ACC0);
+ vmalof ($ACC1,$H4,$S2,$ACC1);
+ vmalof ($ACC2,$H4,$S3,$ACC2);
+ vmalof ($ACC3,$H4,$S4,$ACC3);
+ vmalof ($ACC4,$H4,$R0,$ACC4);
+
+ ################################################################
+ # horizontal addition
+
+ vzero ($H0);
+ vsumqg ($ACC0,$ACC0,$H0);
+ vsumqg ($ACC1,$ACC1,$H0);
+ vsumqg ($ACC2,$ACC2,$H0);
+ vsumqg ($ACC3,$ACC3,$H0);
+ vsumqg ($ACC4,$ACC4,$H0);
+
+ ################################################################
+ # lazy reduction
+
+ vesrlg ($H4,$ACC3,26);
+ vesrlg ($H1,$ACC0,26);
+ vn ($H3,$ACC3,$mask26);
+ vn ($H0,$ACC0,$mask26);
+ vag ($H4,$H4,$ACC4); # h3 -> h4
+ vag ($H1,$H1,$ACC1); # h0 -> h1
+
+ vesrlg ($ACC4,$H4,26);
+ vesrlg ($ACC1,$H1,26);
+ vn ($H4,$H4,$mask26);
+ vn ($H1,$H1,$mask26);
+ vag ($H0,$H0,$ACC4);
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
+
+ veslg ($ACC4,$ACC4,2); # <<2
+ vesrlg ($ACC2,$H2,26);
+ vn ($H2,$H2,$mask26);
+ vag ($H0,$H0,$ACC4); # h4 -> h0
+ vag ($H3,$H3,$ACC2); # h2 -> h3
+
+ vesrlg ($ACC0,$H0,26);
+ vesrlg ($ACC3,$H3,26);
+ vn ($H0,$H0,$mask26);
+ vn ($H3,$H3,$mask26);
+ vag ($H1,$H1,$ACC0); # h0 -> h1
+ vag ($H4,$H4,$ACC3); # h3 -> h4
+
+&{$z? \&clgfi:\&clfi} ($len,0);
+ je (".Ldone");
+
+ vlm ($T1,$T4,"0x00($inp)"); # load last partial block
+ vgmg ($mask26,6,31);
+ vgmf ($I4,5,5); # padbit<<2
+
+ vperm ($I0,$T3,$T4,$bswaplo);
+ vperm ($I2,$T3,$T4,$bswapmi);
+ vperm ($T3,$T3,$T4,$bswaphi);
+
+ vl ($ACC0,"0x30($len,%r1)"); # borrow $ACC0,1
+ vl ($ACC1,"0x60($len,%r1)");
+
+ verimg ($I1,$I0,$mask26,6); # >>26
+ veslg ($I0,$I0,32);
+ veslg ($I2,$I2,28); # >>4
+ verimg ($I3,$T3,$mask26,18); # >>14
+ verimg ($I4,$T3,$mask26,58); # >>38
+ vn ($I0,$I0,$mask26);
+ vn ($I2,$I2,$mask26);
+ vesrlf ($I4,$I4,2); # >>2
+
+ vgmg ($mask26,38,63);
+ vperm ($T3,$T1,$T2,$bswaplo);
+ vperm ($T4,$T1,$T2,$bswaphi);
+ vperm ($T2,$T1,$T2,$bswapmi);
+
+ verimg ($I0,$T3,$mask26,0);
+ verimg ($I1,$T3,$mask26,38); # >>26
+ verimg ($I2,$T2,$mask26,60); # >>4
+ verimg ($I3,$T4,$mask26,50); # >>14
+ vesrlg ($T4,$T4,40);
+ vo ($I4,$I4,$T4);
+
+ vperm ($H0,$H0,$H0,$ACC0); # move hash to right lane
+ vn ($I0,$I0,$ACC1); # mask redundant lane[s]
+ vperm ($H1,$H1,$H1,$ACC0);
+ vn ($I1,$I1,$ACC1);
+ vperm ($H2,$H2,$H2,$ACC0);
+ vn ($I2,$I2,$ACC1);
+ vperm ($H3,$H3,$H3,$ACC0);
+ vn ($I3,$I3,$ACC1);
+ vperm ($H4,$H4,$H4,$ACC0);
+ vn ($I4,$I4,$ACC1);
+
+ vaf ($I0,$I0,$H0); # accumulate hash
+ vzero ($H0); # wipe hash value
+ vaf ($I1,$I1,$H1);
+ vzero ($H1);
+ vaf ($I2,$I2,$H2);
+ vzero ($H2);
+ vaf ($I3,$I3,$H3);
+ vzero ($H3);
+ vaf ($I4,$I4,$H4);
+ vzero ($H4);
+
+&{$z? \&lghi:\&lhi} ($len,0);
+ j (".Last");
+ # I don't bother to tell apart cases when only one multiplication
+ # pass is sufficient, because I argue that mispredicted branch
+ # penalties are comparable to overhead of sometimes redundant
+ # multiplication pass...
+
+LABEL (".Ldone");
+ vstef ($H0,"0($ctx)",3); # store hash base 2^26
+ vstef ($H1,"4($ctx)",3);
+ vstef ($H2,"8($ctx)",3);
+ vstef ($H3,"12($ctx)",3);
+ vstef ($H4,"16($ctx)",3);
+
+if ($z) {
+ ld ("%f8","$stdframe+0*8($sp)");
+ ld ("%f9","$stdframe+1*8($sp)");
+ ld ("%f10","$stdframe+2*8($sp)");
+ ld ("%f11","$stdframe+3*8($sp)");
+ ld ("%f12","$stdframe+4*8($sp)");
+ ld ("%f13","$stdframe+5*8($sp)");
+ ld ("%f14","$stdframe+6*8($sp)");
+ ld ("%f15","$stdframe+7*8($sp)");
+&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)");
+} else {
+ ld ("%f4","$stdframe+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$stdframe+16*$SIZE_T+3*8($sp)");
+&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+10*$SIZE_T($sp)");
+}
+ br ("%r14");
+SIZE ("__poly1305_blocks_vx",".-__poly1305_blocks_vx");
+}
+
+################
# static void poly1305_emit(void *ctx, unsigned char mac[16],
# const u32 nonce[4])
{
-my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
-my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
+my ($mac,$nonce)=($inp,$len);
+my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10));
GLOBL ("poly1305_emit");
TYPE ("poly1305_emit","\@function");
ALIGN (16);
LABEL ("poly1305_emit");
-&{$z? \&stmg:\&stm} ("%r6","%r9","6*$SIZE_T($sp)");
+LABEL (".Lpoly1305_emit");
+&{$z? \&stmg:\&stm} ("%r6","%r10","6*$SIZE_T($sp)");
- lg ($h0,"0($ctx)");
- lg ($h1,"8($ctx)");
- lg ($h2,"16($ctx)");
+ lg ($d0,"0($ctx)");
+ lg ($d1,"8($ctx)");
+ lg ($d2,"16($ctx)");
+
+ llgfr ("%r0",$d0); # base 2^26 -> base 2^64
+ srlg ($h0,$d0,32);
+ llgfr ("%r1",$d1);
+ srlg ($h1,$d1,32);
+ srlg ($h2,$d2,32);
+
+ sllg ("%r0","%r0",26);
+ algr ($h0,"%r0");
+ sllg ("%r0",$h1,52);
+ srlg ($h1,$h1,12);
+ sllg ("%r1","%r1",14);
+ algr ($h0,"%r0");
+ alcgr ($h1,"%r1");
+ sllg ("%r0",$h2,40);
+ srlg ($h2,$h2,24);
+ lghi ("%r1",0);
+ algr ($h1,"%r0");
+ alcgr ($h2,"%r1");
+
+ llgf ("%r0","24($ctx)"); # is_base2_26
+ lcgr ("%r0","%r0");
+
+ xgr ($h0,$d0); # choose between radixes
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+ ngr ($h0,"%r0");
+ ngr ($h1,"%r0");
+ ngr ($h2,"%r0");
+ xgr ($h0,$d0);
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
lghi ("%r0",5);
- lghi ("%r1",0);
lgr ($d0,$h0);
lgr ($d1,$h1);
- algr ($h0,"%r0"); # compare to modulus
+ algr ($h0,"%r0"); # compare to modulus
alcgr ($h1,"%r1");
alcgr ($h2,"%r1");
- srlg ($h2,$h2,2); # did it borrow/carry?
- slgr ("%r1",$h2); # 0-$h2>>2
- lg ($h2,"0($nonce)"); # load nonce
- lghi ("%r0",-1);
+ srlg ($h2,$h2,2); # did it borrow/carry?
+ slgr ("%r1",$h2); # 0-$h2>>2
+ lg ($d2,"0($nonce)"); # load nonce
lg ($ctx,"8($nonce)");
- xgr ("%r0","%r1"); # ~%r1
+ xgr ($h0,$d0);
+ xgr ($h1,$d1);
ngr ($h0,"%r1");
- ngr ($d0,"%r0");
ngr ($h1,"%r1");
- ngr ($d1,"%r0");
- ogr ($h0,$d0);
- rllg ($d0,$h2,32); # flip nonce words
- ogr ($h1,$d1);
+ xgr ($h0,$d0);
+ rllg ($d0,$d2,32); # flip nonce words
+ xgr ($h1,$d1);
rllg ($d1,$ctx,32);
- algr ($h0,$d0); # accumulate nonce
+ algr ($h0,$d0); # accumulate nonce
alcgr ($h1,$d1);
- strvg ($h0,"0($mac)"); # write little-endian result
+ strvg ($h0,"0($mac)"); # write little-endian result
strvg ($h1,"8($mac)");
-&{$z? \&lmg:\&lm} ("%r6","%r9","6*$SIZE_T($sp)");
+&{$z? \&lmg:\&lm} ("%r6","%r10","6*$SIZE_T($sp)");
br ("%r14");
SIZE ("poly1305_emit",".-poly1305_emit");
}
-}
+
################
-ALIGN (128);
+ALIGN (16);
LABEL (".Lconst");
-LONG (0x00060504,0x03020100,0x00161514,0x13121110); # vperm op[m[1],m[0]]
-LONG (0x000c0b0a,0x09080706,0x001c1b1a,0x19181716); # vperm op[m[3],m[2]]
-LONG (0x00000000,0x000f0e0d,0x00000000,0x001f1e1d); # vperm op[ - ,m[4]]
-LONG (0x00000000,0x03ffffff,0x00000000,0x03ffffff); # [0,2^26-1,0,2^26-1]
-LONG (0x0f0e0d0c,0x0b0a0908,0x07060504,0x03020100); # vperm op endian
+LONG (0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f); # merge odd
+LONG (0x07060504,0x03020100,0x17161514,0x13121110); # byte swap masks
+LONG (0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918);
+LONG (0x00000000,0x09080706,0x00000000,0x19181716);
+
+LONG (0x00000000,0x00000000,0x00000000,0x0c0d0e0f); # magic tail masks
+LONG (0x0c0d0e0f,0x00000000,0x00000000,0x00000000);
+LONG (0x00000000,0x00000000,0x0c0d0e0f,0x00000000);
+
+LONG (0xffffffff,0x00000000,0xffffffff,0xffffffff);
+LONG (0xffffffff,0x00000000,0xffffffff,0x00000000);
+LONG (0x00000000,0x00000000,0xffffffff,0x00000000);
+
STRING ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
PERLASM_END();
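
The header comment added near the top of this file describes the vector path's base 2^26 representation: the 130-bit hash is kept as five 26-bit limbs (stored as u32 h[5] at the start of the context while is_base2_26 is set), and the code above converts between that form and the scalar path's base 2^64 words with the risbg/srlg/sllg sequences. A minimal Perl sketch of that radix change, not part of the patch; to_base26 and from_base26 are illustrative names and only restate the arithmetic, not the instruction selection:

    # Illustrative only: split a value below 2^130 into five 26-bit limbs
    # and join them back, the conversion done around the vector code path.
    use strict;
    use warnings;
    use Math::BigInt;

    my $mask26 = Math::BigInt->new(1)->blsft(26)->bsub(1);         # 2^26 - 1

    sub to_base26 {         # value -> (h0, ..., h4), each below 2^26
        my ($v) = @_;
        return map { $v->copy()->brsft(26 * $_)->band($mask26) } 0 .. 4;
    }

    sub from_base26 {       # (h0, ..., h4) -> h0 + h1*2^26 + ... + h4*2^104
        my @h = @_;
        my $v = Math::BigInt->bzero();
        $v->badd($h[$_]->copy()->blsft(26 * $_)) for 0 .. 4;
        return $v;
    }

    my $h = Math::BigInt->new(2)->bpow(130)->bsub(5);   # 2^130-5, as a test value
    my @limbs = to_base26($h);
    printf("h%d = %s\n", $_, $limbs[$_]->as_hex()) for 0 .. 4;
    print("round trip ok\n") if from_base26(@limbs) == $h;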
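
The lazy reduction that __poly1305_mul and the .Loop_vx comments refer to is the usual base 2^26 Poly1305 schedule: a 5x5 schoolbook product in which partial products that would land at 2^130 or above are folded back with a factor of 5, because 2^130 is congruent to 5 modulo 2^130-5 (this is why the S1..S4 vectors hold 5*r1..5*r4), followed by a carry pass that only partially normalises the limbs. A minimal Perl sketch of one such multiplication, not part of the patch; split26, join26 and mul26 are illustrative names, and the carry chain here is a plain sequential one rather than the interleaved order the assembly uses:

    # Illustrative only: h = (h * r) mod 2^130-5 on five 26-bit limbs,
    # with s_i = 5*r_i supplying the wrap-around terms, then carries.
    use strict;
    use warnings;
    use Math::BigInt;

    my $P = Math::BigInt->new(2)->bpow(130) - 5;        # the Poly1305 prime

    sub split26 { my ($v) = @_; map { ($v >> (26 * $_)) & 0x3ffffff } 0 .. 4 }

    sub join26 {
        my $v = Math::BigInt->bzero();
        $v += $_[$_] << (26 * $_) for 0 .. 4;
        return $v;
    }

    sub mul26 {             # ([h0..h4], [r0..r4]) -> limbs of h*r mod P
        my ($h, $r) = @_;
        my @s = map { $r->[$_] * 5 } 1 .. 4;            # 5*r1 .. 5*r4
        my @d;
        $d[0] = $h->[0]*$r->[0] + $h->[1]*$s[3] + $h->[2]*$s[2] + $h->[3]*$s[1] + $h->[4]*$s[0];
        $d[1] = $h->[0]*$r->[1] + $h->[1]*$r->[0] + $h->[2]*$s[3] + $h->[3]*$s[2] + $h->[4]*$s[1];
        $d[2] = $h->[0]*$r->[2] + $h->[1]*$r->[1] + $h->[2]*$r->[0] + $h->[3]*$s[3] + $h->[4]*$s[2];
        $d[3] = $h->[0]*$r->[3] + $h->[1]*$r->[2] + $h->[2]*$r->[1] + $h->[3]*$r->[0] + $h->[4]*$s[3];
        $d[4] = $h->[0]*$r->[4] + $h->[1]*$r->[3] + $h->[2]*$r->[2] + $h->[3]*$r->[1] + $h->[4]*$r->[0];

        my $c = Math::BigInt->bzero();
        for my $i (0 .. 4) {                            # carry 0->1->2->3->4
            $d[$i] += $c;
            $c = $d[$i] >> 26;
            $d[$i] &= 0x3ffffff;
        }
        $d[0] += $c * 5;        # fold the 2^130 overflow back in: 2^130 == 5 (mod P)
        $d[1] += $d[0] >> 26;   # one more carry keeps limb 0 below 2^26
        $d[0] &= 0x3ffffff;
        return @d;
    }

    # self-check against plain big-integer arithmetic
    my $r = Math::BigInt->new('0x806d5400e52447c036d555408bed685');   # an arbitrary r < 2^130
    my $h = Math::BigInt->new('0x123456789abcdef0123456789abcdef');   # an arbitrary h < 2^130
    my @out = mul26([ split26($h) ], [ split26($r) ]);
    print("ok\n") if join26(@out) % $P == ($h * $r) % $P;
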
Index: openssl-1.1.1c/crypto/poly1305/build.info
===================================================================
--- openssl-1.1.1c.orig/crypto/poly1305/build.info 2019-06-06 12:18:54.556316994 +0200
+++ openssl-1.1.1c/crypto/poly1305/build.info 2019-06-06 12:19:24.232504722 +0200
@@ -17,6 +17,7 @@ GENERATE[poly1305-armv8.S]=asm/poly1305-
INCLUDE[poly1305-armv8.o]=..
GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME)
INCLUDE[poly1305-mips.o]=..
+INCLUDE[poly1305-s390x.o]=..
GENERATE[poly1305-s390x.S]=asm/poly1305-s390x.pl $(PERLASM_SCHEME)
BEGINRAW[Makefile(unix)]