From 966047ee13188e8634af25af348940acceb9316d Mon Sep 17 00:00:00 2001
From: Rohan McLure
Date: Wed, 31 May 2023 14:32:26 +1000
Subject: [PATCH] ec: powerpc64le: Add asm implementation of felem_{square,mul}

Add an assembly implementation of felem_{square,mul}, which will be
implemented whenever Altivec support is present and the core implements
ISA 3.0 (Power 9) or greater.

Signed-off-by: Rohan McLure

Reviewed-by: Paul Dale
Reviewed-by: Shane Lontis
Reviewed-by: Dmitry Belyavskiy
Reviewed-by: Todd Short
(Merged from https://github.com/openssl/openssl/pull/21471)
---
 crypto/ec/asm/ecp_nistp384-ppc64.pl | 337 ++++++++++++++++++++++++++++
 crypto/ec/build.info                |   6 +-
 crypto/ec/ecp_nistp384.c            |   9 +
 3 files changed, 350 insertions(+), 2 deletions(-)
 create mode 100755 crypto/ec/asm/ecp_nistp384-ppc64.pl

diff --git a/crypto/ec/asm/ecp_nistp384-ppc64.pl b/crypto/ec/asm/ecp_nistp384-ppc64.pl
new file mode 100755
index 000000000000..3f86b391af69
--- /dev/null
+++ b/crypto/ec/asm/ecp_nistp384-ppc64.pl
@@ -0,0 +1,337 @@
+#! /usr/bin/env perl
+# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Rohan McLure for the OpenSSL
+# project.
+# ====================================================================
+#
+# p384 lower-level primitives for PPC64 using vector instructions.
+#
+# Emits p384_felem_mul(out, in1, in2) and p384_felem_square(out, in).
+# Inputs are read as seven 64-bit limbs; the output is written as
+# thirteen 128-bit limbs.
+#
+# Neither routine sets up a stack frame. They only write VSRs 32..51
+# (v0..v19) and r8/r9, all volatile under the ELFv2 ABI, so nothing has
+# to be saved and r1 is returned to the caller unchanged.
+#
+
+use strict;
+use warnings;
+
+# Arguments: flavour first, then the first remaining argument that looks
+# like a file name is the output ("-" when there is none).
+my $flavour = shift;
+my $output = "";
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+if (!$output) {
+    $output = "-";
+}
+
+# Locate ppc-xlate.pl beside this script or in ../../perlasm. $dir is
+# empty when the script is invoked without a directory component.
+my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
+my $xlate;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+# Pipe everything printed on STDOUT through the translator.
+open OUT,"| \"$^X\" $xlate $flavour $output"
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+my $code = "";
+
+# r3 carries the output pointer, the first argument of both routines.
+my $outp = "r3";
+
+# All-zero vector, used as the initial vmsumudm accumulator.
+my $vzero = "v32";
+
+# startproc(name): emit the .globl/.align directives and the entry label.
+sub startproc($)
+{
+    my ($name) = @_;
+
+    $code.=<<___;
+    .globl      ${name}
+    .align      5
+${name}:
+
+___
+}
+
+# endproc(name): emit the return instruction and the symbol size.
+sub endproc($)
+{
+    my ($name) = @_;
+
+    $code.=<<___;
+    blr
+    .size       ${name},.-${name}
+
+___
+}
+
+# load_vrs(pointer, reg_list): load seven consecutive doublewords starting
+# at pointer, one into each of the first seven registers of reg_list.
+sub load_vrs($$)
+{
+    my ($pointer, $reg_list) = @_;
+
+    for (my $i = 0; $i <= 6; $i++) {
+        my $offset = $i * 8;
+        $code.=<<___;
+    lxsd        $reg_list->[$i],$offset($pointer)
+___
+    }
+
+    $code.=<<___;
+
+___
+}
+
+# store_vrs(pointer, reg_list): store thirteen registers of reg_list to
+# consecutive 16-byte slots starting at pointer. Currently unused.
+sub store_vrs($$)
+{
+    my ($pointer, $reg_list) = @_;
+
+    for (my $i = 0; $i <= 12; $i++) {
+        my $offset = $i * 16;
+        $code.=<<___;
+    stxv        $reg_list->[$i],$offset($pointer)
+___
+    }
+
+    $code.=<<___;
+
+___
+}
+
+$code.=<<___;
+.machine    "any"
+.text
+
+___
+
+{
+    # mul/square common
+    my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v42", "v43");
+    my ($zero, $one) = ("r8", "r9");
+    my $out = "v51";
+
+    {
+        #
+        # p384_felem_mul
+        #
+        # out[k] = sum of in1[i]*in2[j] over i+j == k, for k = 0..12.
+        # Limbs are paired with xxpermdi so that each vmsumudm adds two
+        # partial products to the accumulator.
+        #
+
+        my ($in1p, $in2p) = ("r4", "r5");
+        my @in1 = map("v$_",(44..50));
+        my @in2 = map("v$_",(35..41));
+
+        startproc("p384_felem_mul");
+
+        $code.=<<___;
+    vspltisw    $vzero,0
+
+___
+
+        load_vrs($in1p, \@in1);
+        load_vrs($in2p, \@in2);
+
+        $code.=<<___;
+    vmsumudm    $out,$in1[0],$in2[0],$vzero
+    stxv        $out,0($outp)
+
+    xxpermdi    $t1,$in1[0],$in1[1],0b00
+    xxpermdi    $t2,$in2[1],$in2[0],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    stxv        $out,16($outp)
+
+    xxpermdi    $t2,$in2[2],$in2[1],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    vmsumudm    $out,$in1[2],$in2[0],$out
+    stxv        $out,32($outp)
+
+    xxpermdi    $t2,$in2[1],$in2[0],0b00
+    xxpermdi    $t3,$in1[2],$in1[3],0b00
+    xxpermdi    $t4,$in2[3],$in2[2],0b00
+    vmsumudm    $out,$t1,$t4,$vzero
+    vmsumudm    $out,$t3,$t2,$out
+    stxv        $out,48($outp)
+
+    xxpermdi    $t2,$in2[4],$in2[3],0b00
+    xxpermdi    $t4,$in2[2],$in2[1],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    vmsumudm    $out,$t3,$t4,$out
+    vmsumudm    $out,$in1[4],$in2[0],$out
+    stxv        $out,64($outp)
+
+    xxpermdi    $t2,$in2[5],$in2[4],0b00
+    xxpermdi    $t4,$in2[3],$in2[2],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    vmsumudm    $out,$t3,$t4,$out
+    xxpermdi    $t4,$in2[1],$in2[0],0b00
+    xxpermdi    $t1,$in1[4],$in1[5],0b00
+    vmsumudm    $out,$t1,$t4,$out
+    stxv        $out,80($outp)
+
+    xxpermdi    $t1,$in1[0],$in1[1],0b00
+    xxpermdi    $t2,$in2[6],$in2[5],0b00
+    xxpermdi    $t4,$in2[4],$in2[3],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    vmsumudm    $out,$t3,$t4,$out
+    xxpermdi    $t2,$in2[2],$in2[1],0b00
+    xxpermdi    $t1,$in1[4],$in1[5],0b00
+    vmsumudm    $out,$t1,$t2,$out
+    vmsumudm    $out,$in1[6],$in2[0],$out
+    stxv        $out,96($outp)
+
+    xxpermdi    $t1,$in1[1],$in1[2],0b00
+    xxpermdi    $t2,$in2[6],$in2[5],0b00
+    xxpermdi    $t3,$in1[3],$in1[4],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    vmsumudm    $out,$t3,$t4,$out
+    xxpermdi    $t3,$in2[2],$in2[1],0b00
+    xxpermdi    $t1,$in1[5],$in1[6],0b00
+    vmsumudm    $out,$t1,$t3,$out
+    stxv        $out,112($outp)
+
+    xxpermdi    $t1,$in1[2],$in1[3],0b00
+    xxpermdi    $t3,$in1[4],$in1[5],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    vmsumudm    $out,$t3,$t4,$out
+    vmsumudm    $out,$in1[6],$in2[2],$out
+    stxv        $out,128($outp)
+
+    xxpermdi    $t1,$in1[3],$in1[4],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    xxpermdi    $t1,$in1[5],$in1[6],0b00
+    vmsumudm    $out,$t1,$t4,$out
+    stxv        $out,144($outp)
+
+    vmsumudm    $out,$t3,$t2,$vzero
+    vmsumudm    $out,$in1[6],$in2[4],$out
+    stxv        $out,160($outp)
+
+    vmsumudm    $out,$t1,$t2,$vzero
+    stxv        $out,176($outp)
+
+    vmsumudm    $out,$in1[6],$in2[6],$vzero
+    stxv        $out,192($outp)
+___
+
+        endproc("p384_felem_mul");
+    }
+
+    {
+        #
+        # p384_felem_square
+        #
+        # Same sums as the multiply with in1 == in2; cross terms are formed
+        # once against the doubled limbs in \@inx2.
+        #
+
+        my ($inp) = ("r4");
+        my @in = map("v$_",(44..50));
+        my @inx2 = map("v$_",(35..41));
+
+        startproc("p384_felem_square");
+
+        $code.=<<___;
+    vspltisw    $vzero,0
+
+___
+
+        load_vrs($inp, \@in);
+
+        # Build a shift-count vector from r9 = 1 and r8 = 0, then use vsld
+        # to form the doubled limbs in \@inx2.
+        $code.=<<___;
+    li          $zero,0
+    li          $one,1
+    mtvsrdd     $t1,$one,$zero
+___
+
+        for (my $i = 0; $i <= 6; $i++) {
+            $code.=<<___;
+    vsld        $inx2[$i],$in[$i],$t1
+___
+        }
+
+        $code.=<<___;
+    vmsumudm    $out,$in[0],$in[0],$vzero
+    stxv        $out,0($outp)
+
+    vmsumudm    $out,$in[0],$inx2[1],$vzero
+    stxv        $out,16($outp)
+
+    vmsumudm    $out,$in[0],$inx2[2],$vzero
+    vmsumudm    $out,$in[1],$in[1],$out
+    stxv        $out,32($outp)
+
+    xxpermdi    $t1,$in[0],$in[1],0b00
+    xxpermdi    $t2,$inx2[3],$inx2[2],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    stxv        $out,48($outp)
+
+    xxpermdi    $t4,$inx2[4],$inx2[3],0b00
+    vmsumudm    $out,$t1,$t4,$vzero
+    vmsumudm    $out,$in[2],$in[2],$out
+    stxv        $out,64($outp)
+
+    xxpermdi    $t2,$inx2[5],$inx2[4],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    vmsumudm    $out,$in[2],$inx2[3],$out
+    stxv        $out,80($outp)
+
+    xxpermdi    $t2,$inx2[6],$inx2[5],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    vmsumudm    $out,$in[2],$inx2[4],$out
+    vmsumudm    $out,$in[3],$in[3],$out
+    stxv        $out,96($outp)
+
+    xxpermdi    $t3,$in[1],$in[2],0b00
+    vmsumudm    $out,$t3,$t2,$vzero
+    vmsumudm    $out,$in[3],$inx2[4],$out
+    stxv        $out,112($outp)
+
+    xxpermdi    $t1,$in[2],$in[3],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    vmsumudm    $out,$in[4],$in[4],$out
+    stxv        $out,128($outp)
+
+    xxpermdi    $t1,$in[3],$in[4],0b00
+    vmsumudm    $out,$t1,$t2,$vzero
+    stxv        $out,144($outp)
+
+    vmsumudm    $out,$in[4],$inx2[6],$vzero
+    vmsumudm    $out,$in[5],$in[5],$out
+    stxv        $out,160($outp)
+
+    vmsumudm    $out,$in[5],$inx2[6],$vzero
+    stxv        $out,176($outp)
+
+    vmsumudm    $out,$in[6],$in[6],$vzero
+    stxv        $out,192($outp)
+___
+
+        endproc("p384_felem_square");
+    }
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/ec/build.info b/crypto/ec/build.info
index 1fa60a1deddd..4077bead7bdb 100644
--- a/crypto/ec/build.info
+++ b/crypto/ec/build.info
@@ -39,8 +39,9 @@ IF[{- !$disabled{asm} -}]
   $ECASM_ppc64=ecp_nistz256.c ecp_ppc.c ecp_nistz256-ppc64.s x25519-ppc64.s
   $ECDEF_ppc64=ECP_NISTZ256_ASM X25519_ASM
   IF[{- !$disabled{'ec_nistp_64_gcc_128'} -}]
-    $ECASM_ppc64=$ECASM_ppc64 ecp_nistp521-ppc64.s
-    $ECDEF_ppc64=$ECDEF_ppc64 ECP_NISTP521_ASM
+    $ECASM_ppc64=$ECASM_ppc64 ecp_nistp384-ppc64.s ecp_nistp521-ppc64.s
+    $ECDEF_ppc64=$ECDEF_ppc64 ECP_NISTP384_ASM ECP_NISTP521_ASM
+    INCLUDE[ecp_nistp384.o]=..
     INCLUDE[ecp_nistp521.o]=..
   ENDIF
 
@@ -119,6 +120,7 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_nistz256-armv8.pl
 INCLUDE[ecp_nistz256-armv8.o]=..
 
 GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl
+GENERATE[ecp_nistp384-ppc64.s]=asm/ecp_nistp384-ppc64.pl
 GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl
 
 GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl
diff --git a/crypto/ec/ecp_nistp384.c b/crypto/ec/ecp_nistp384.c
index a0559487ed4e..14f9530d07c6 100644
--- a/crypto/ec/ecp_nistp384.c
+++ b/crypto/ec/ecp_nistp384.c
@@ -691,6 +691,15 @@ void p384_felem_mul(widefelem out, const felem in1, const felem in2);
 
 static void felem_select(void)
 {
+# if defined(_ARCH_PPC64)
+    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
+        felem_square_p = p384_felem_square;
+        felem_mul_p = p384_felem_mul;
+
+        return;
+    }
+# endif
+
     /* Default */
     felem_square_p = felem_square_ref;
     felem_mul_p = felem_mul_ref;