--- Configure.orig +++ Configure @@ -123,11 +123,11 @@ my $tlib="-lnsl -lsocket"; my $bits1="THIRTY_TWO_BIT "; my $bits2="SIXTY_FOUR_BIT "; -my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o"; +my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-586.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o"; my $x86_elf_asm="$x86_asm:elf"; -my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o"; +my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::void"; my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void"; @@ -491,7 +491,7 @@ my %table=( # # Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64 "VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ias:win32", -"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:auto:win32", +"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:auto:win32", "debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ias:win32", "debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:auto:win32", # x86 Win32 target defaults to ANSI API, if you want UNICODE, complement @@ -1410,6 +1410,7 @@ if ($rmd160_obj =~ /\.o$/) if ($aes_obj =~ /\.o$/) { $cflags.=" -DAES_ASM"; + $aes_obj =~ s/\s*aesni\-x86\.o// if ($no_sse2); } else { $aes_obj=$aes_enc; --- /dev/null +++ crypto/aes/asm/aesni-x86.pl @@ -0,0 +1,765 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for +# details]. + +$PREFIX="aesni"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-586.pl:-) + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0); + +$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups"); + +$len="eax"; +$rounds="ecx"; +$key="edx"; +$inp="esi"; +$out="edi"; +$rounds_="ebx"; # backup copy for $rounds +$key_="ebp"; # backup copy for $key + +$inout0="xmm0"; +$inout1="xmm1"; +$inout2="xmm2"; +$rndkey0="xmm3"; +$rndkey1="xmm4"; +$ivec="xmm5"; +$in0="xmm6"; +$in1="xmm7"; $inout3="xmm7"; + +# Inline version of internal aesni_[en|de]crypt1 +sub aesni_inline_generate1 +{ my $p=shift; + + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &lea ($key,&DWP(32,$key)); + &pxor ($inout0,$rndkey0); + &set_label("${p}1_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + &dec ($rounds); + &$movekey ($rndkey1,&QWP(0,$key)); + &lea ($key,&DWP(16,$key)); + &jnz (&label("${p}1_loop")); + eval"&aes${p}last ($inout0,$rndkey1)"; +} + +sub aesni_generate1 # fully unrolled loop +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt1"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(0x10,$key)); + &cmp ($rounds,11); + &pxor ($inout0,$rndkey0); + &$movekey ($rndkey0,&QWP(0x20,$key)); + &lea ($key,&DWP(0x30,$key)); + &jb (&label("${p}128")); + &lea ($key,&DWP(0x20,$key)); + &je (&label("${p}192")); + &lea ($key,&DWP(0x20,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x40,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x30,$key)); + &set_label("${p}192"); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x20,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x10,$key)); + &set_label("${p}128"); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x10,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x20,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x30,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x40,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x50,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x60,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x70,$key)); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt1"); +} + +# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); +# &aesni_generate1("dec"); +&function_begin_B("${PREFIX}_encrypt"); + &mov ("eax",&wparam(0)); + &mov ($key,&wparam(2)); + &movups ($inout0,&QWP(0,"eax")); + &mov ($rounds,&DWP(240,$key)); + &mov ("eax",&wparam(1)); + &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); + &movups (&QWP(0,"eax"),$inout0); + &ret (); +&function_end_B("${PREFIX}_encrypt"); + +# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); +# &aesni_generate1("dec"); +&function_begin_B("${PREFIX}_decrypt"); + &mov ("eax",&wparam(0)); + &mov ($key,&wparam(2)); + &movups ($inout0,&QWP(0,"eax")); + &mov ($rounds,&DWP(240,$key)); + &mov ("eax",&wparam(1)); + &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1"); + &movups (&QWP(0,"eax"),$inout0); + &ret (); +&function_end_B("${PREFIX}_decrypt"); + +# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave +# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] +# latency is 6, it turned out that it can be scheduled only every +# *second* cycle. Thus 3x interleave is the one providing optimal +# utilization, i.e. when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it makes no sense to implement 2x subroutine. As soon +# as/if Intel improves throughput by making it possible to schedule +# the instructions in question *every* cycles I would have to +# implement 6x interleave and use it in loop... +sub aesni_generate3 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt3"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key)); + &lea ($key,&DWP(32,$key)); + &pxor ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &jmp (&label("${p}3_loop")); + &set_label("${p}3_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + &jnz (&label("${p}3_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt3"); +} + +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... +sub aesni_generate4 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt4"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &shr ($rounds,1); + &lea ($key,&DWP(32,$key)); + &pxor ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &jmp (&label("${p}3_loop")); + &set_label("${p}3_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + &jnz (&label("${p}3_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt4"); +} +&aesni_generate3("enc") if ($PREFIX eq "aesni"); +&aesni_generate3("dec"); +&aesni_generate4("enc") if ($PREFIX eq "aesni"); +&aesni_generate4("dec"); + +if ($PREFIX eq "aesni") { +# void aesni_ecb_encrypt (const void *in, void *out, +# size_t length, const AES_KEY *key, +# int enc); +&function_begin("aesni_ecb_encrypt"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds,&wparam(4)); + &cmp ($len,16); + &jb (&label("ecb_ret")); + &and ($len,-16); + &test ($rounds,$rounds) + &mov ($rounds,&DWP(240,$key)); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &jz (&label("ecb_decrypt")); + + &sub ($len,0x40); + &jbe (&label("ecb_enc_tail")); + &jmp (&label("ecb_enc_loop3")); + +&set_label("ecb_enc_loop3",16); + &movups ($inout0,&QWP(0,$inp)); + &movups ($inout1,&QWP(0x10,$inp)); + &movups ($inout2,&QWP(0x20,$inp)); + &call ("_aesni_encrypt3"); + &sub ($len,0x30); + &lea ($inp,&DWP(0x30,$inp)); + &lea ($out,&DWP(0x30,$out)); + &movups (&QWP(-0x30,$out),$inout0); + &mov ($key,$key_); # restore $key + &movups (&QWP(-0x20,$out),$inout1); + &mov ($rounds,$rounds_); # restore $rounds + &movups (&QWP(-0x10,$out),$inout2); + &ja (&label("ecb_enc_loop3")); + +&set_label("ecb_enc_tail"); + &add ($len,0x40); + &jz (&label("ecb_ret")); + + &cmp ($len,0x10); + &movups ($inout0,&QWP(0,$inp)); + &je (&label("ecb_enc_one")); + &cmp ($len,0x20); + &movups ($inout1,&QWP(0x10,$inp)); + &je (&label("ecb_enc_two")); + &cmp ($len,0x30); + &movups ($inout2,&QWP(0x20,$inp)); + &je (&label("ecb_enc_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &call ("_aesni_encrypt4"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + jmp (&label("ecb_ret")); + +&set_label("ecb_enc_one",16); + &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); + &movups (&QWP(0,$out),$inout0); + &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_two",16); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_three",16); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ecb_ret")); + +&set_label("ecb_decrypt",16); + &sub ($len,0x40); + &jbe (&label("ecb_dec_tail")); + &jmp (&label("ecb_dec_loop3")); + +&set_label("ecb_dec_loop3",16); + &movups ($inout0,&QWP(0,$inp)); + &movups ($inout1,&QWP(0x10,$inp)); + &movups ($inout2,&QWP(0x20,$inp)); + &call ("_aesni_decrypt3"); + &sub ($len,0x30); + &lea ($inp,&DWP(0x30,$inp)); + &lea ($out,&DWP(0x30,$out)); + &movups (&QWP(-0x30,$out),$inout0); + &mov ($key,$key_); # restore $key + &movups (&QWP(-0x20,$out),$inout1); + &mov ($rounds,$rounds_); # restore $rounds + &movups (&QWP(-0x10,$out),$inout2); + &ja (&label("ecb_dec_loop3")); + +&set_label("ecb_dec_tail"); + &add ($len,0x40); + &jz (&label("ecb_ret")); + + &cmp ($len,0x10); + &movups ($inout0,&QWP(0,$inp)); + &je (&label("ecb_dec_one")); + &cmp ($len,0x20); + &movups ($inout1,&QWP(0x10,$inp)); + &je (&label("ecb_dec_two")); + &cmp ($len,0x30); + &movups ($inout2,&QWP(0x20,$inp)); + &je (&label("ecb_dec_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &call ("_aesni_decrypt4"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_one",16); + &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_two",16); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_three",16); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + +&set_label("ecb_ret"); +&function_end("aesni_ecb_encrypt"); +} + +# void $PREFIX_cbc_encrypt (const void *inp, void *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +&function_begin("${PREFIX}_cbc_encrypt"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &test ($len,$len); + &mov ($key_,&wparam(4)); + &jz (&label("cbc_ret")); + + &cmp (&wparam(5),0); + &movups ($ivec,&QWP(0,$key_)); # load IV + &mov ($rounds,&DWP(240,$key)); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &je (&label("cbc_decrypt")); + + &movaps ($inout0,$ivec); + &cmp ($len,16); + &jb (&label("cbc_enc_tail")); + &sub ($len,16); + &jmp (&label("cbc_enc_loop")); + +&set_label("cbc_enc_loop",16); + &movups ($ivec,&QWP(0,$inp)); + &lea ($inp,&DWP(16,$inp)); + &pxor ($inout0,$ivec); + &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt3"); + &sub ($len,16); + &lea ($out,&DWP(16,$out)); + &mov ($rounds,$rounds_); # restore $rounds + &mov ($key,$key_); # restore $key + &movups (&QWP(-16,$out),$inout0); + &jnc (&label("cbc_enc_loop")); + &add ($len,16); + &jnz (&label("cbc_enc_tail")); + &movaps ($ivec,$inout0); + &jmp (&label("cbc_ret")); + +&set_label("cbc_enc_tail"); + &mov ("ecx",$len); # zaps $rounds + &data_word(0xA4F3F689); # rep movsb + &mov ("ecx",16); # zero tail + &sub ("ecx",$len); + &xor ("eax","eax"); # zaps $len + &data_word(0xAAF3F689); # rep stosb + &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block + &mov ($rounds,$rounds_); # restore $rounds + &mov ($inp,$out); # $inp and $out are the same + &mov ($key,$key_); # restore $key + &jmp (&label("cbc_enc_loop")); + +&set_label("cbc_decrypt",16); + &sub ($len,0x40); + &jbe (&label("cbc_dec_tail")); + &jmp (&label("cbc_dec_loop3")); + +&set_label("cbc_dec_loop3",16); + &movups ($inout0,&QWP(0,$inp)); + &movups ($inout1,&QWP(0x10,$inp)); + &movups ($inout2,&QWP(0x20,$inp)); + &movaps ($in0,$inout0); + &movaps ($in1,$inout1); + &call ("_aesni_decrypt3"); + &sub ($len,0x30); + &lea ($inp,&DWP(0x30,$inp)); + &lea ($out,&DWP(0x30,$out)); + &pxor ($inout0,$ivec); + &pxor ($inout1,$in0); + &movups ($ivec,&QWP(-0x10,$inp)); + &pxor ($inout2,$in1); + &movups (&QWP(-0x30,$out),$inout0); + &mov ($rounds,$rounds_) # restore $rounds + &movups (&QWP(-0x20,$out),$inout1); + &mov ($key,$key_); # restore $key + &movups (&QWP(-0x10,$out),$inout2); + &ja (&label("cbc_dec_loop3")); + +&set_label("cbc_dec_tail"); + &add ($len,0x40); + &jz (&label("cbc_ret")); + + &movups ($inout0,&QWP(0,$inp)); + &cmp ($len,0x10); + &movaps ($in0,$inout0); + &jbe (&label("cbc_dec_one")); + &movups ($inout1,&QWP(0x10,$inp)); + &cmp ($len,0x20); + &movaps ($in1,$inout1); + &jbe (&label("cbc_dec_two")); + &movups ($inout2,&QWP(0x20,$inp)); + &cmp ($len,0x30); + &jbe (&label("cbc_dec_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &call ("_aesni_decrypt4"); + &movups ($rndkey0,&QWP(0x10,$inp)); + &movups ($rndkey1,&QWP(0x20,$inp)); + &pxor ($inout0,$ivec); + &pxor ($inout1,$in0); + &movups ($ivec,&QWP(0x30,$inp)); + &movups (&QWP(0,$out),$inout0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movaps ($inout0,$inout3); + &lea ($out,&DWP(0x30,$out)); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_one"); + &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3"); + &pxor ($inout0,$ivec); + &movaps ($ivec,$in0); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_two"); + &call ("_aesni_decrypt3"); + &pxor ($inout0,$ivec); + &pxor ($inout1,$in0); + &movups (&QWP(0,$out),$inout0); + &movaps ($inout0,$inout1); + &movaps ($ivec,$in1); + &lea ($out,&DWP(0x10,$out)); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_three"); + &call ("_aesni_decrypt3"); + &pxor ($inout0,$ivec); + &pxor ($inout1,$in0); + &pxor ($inout2,$in1); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movaps ($inout0,$inout2); + &movups ($ivec,&QWP(0x20,$inp)); + &lea ($out,&DWP(0x20,$out)); + +&set_label("cbc_dec_tail_collected"); + &and ($len,15); + &jnz (&label("cbc_dec_tail_partial")); + &movups (&QWP(0,$out),$inout0); + &jmp (&label("cbc_ret")); + +&set_label("cbc_dec_tail_partial"); + &mov ($key_,"esp"); + &sub ("esp",16); + &and ("esp",-16); + &movaps (&QWP(0,"esp"),$inout0); + &mov ($inp,"esp"); + &mov ("ecx",$len); + &data_word(0xA4F3F689); # rep movsb + &mov ("esp",$key_); + +&set_label("cbc_ret"); + &mov ($key_,&wparam(4)); + &movups (&QWP(0,$key_),$ivec); # output IV +&function_end("${PREFIX}_cbc_encrypt"); + +# Mechanical port from aesni-x86_64.pl. +# +# _aesni_set_encrypt_key is private interface, +# input: +# "eax" const unsigned char *userKey +# $rounds int bits +# $key AES_KEY *key +# output: +# "eax" return code +# $round rounds + +&function_begin_B("_aesni_set_encrypt_key"); + &test ("eax","eax"); + &jz (&label("bad_pointer")); + &test ($key,$key); + &jz (&label("bad_pointer")); + + &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey + &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &lea ($key,&DWP(16,$key)); + &cmp ($rounds,256); + &je (&label("14rounds")); + &cmp ($rounds,192); + &je (&label("12rounds")); + &cmp ($rounds,128); + &jne (&label("bad_keybits")); + +&set_label("10rounds",16); + &mov ($rounds,9); + &$movekey (&QWP(-16,$key),"xmm0"); # round 0 + &aeskeygenassist("xmm1","xmm0",0x01); # round 1 + &call (&label("key_128_cold")); + &aeskeygenassist("xmm1","xmm0",0x2); # round 2 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 3 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 4 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 5 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 6 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x40); # round 7 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x80); # round 8 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x36); # round 10 + &call (&label("key_128")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(80,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_128",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_128_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &pxor ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100,); + &pxor ("xmm0","xmm4"); + &pshufd ("xmm1","xmm1",0b11111111); # critical path + &pxor ("xmm0","xmm1"); + &ret(); + +&set_label("12rounds",16); + &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey + &mov ($rounds,11); + &$movekey (&QWP(-16,$key),"xmm0") # round 0 + &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 + &call (&label("key_192a_cold")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 + &call (&label("key_192b")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(48,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_192a",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_192a_cold",16); + &movaps ("xmm5","xmm2"); +&set_label("key_192b_warm"); + &shufps ("xmm4","xmm0",0b00010000); + &movaps ("xmm3","xmm2"); + &pxor ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &pslldq ("xmm3",4); + &pxor ("xmm0","xmm4"); + &pshufd ("xmm1","xmm1",0b01010101); # critical path + &pxor ("xmm2","xmm3"); + &pxor ("xmm0","xmm1"); + &pshufd ("xmm3","xmm0",0b11111111); + &pxor ("xmm2","xmm3"); + &ret(); + +&set_label("key_192b",16); + &movaps ("xmm3","xmm0"); + &shufps ("xmm5","xmm0",0b01000100); + &$movekey (&QWP(0,$key),"xmm5"); + &shufps ("xmm3","xmm2",0b01001110); + &$movekey (&QWP(16,$key),"xmm3"); + &lea ($key,&DWP(32,$key)); + &jmp (&label("key_192b_warm")); + +&set_label("14rounds",16); + &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey + &mov ($rounds,13); + &lea ($key,&DWP(16,$key)); + &$movekey (&QWP(-32,$key),"xmm0"); # round 0 + &$movekey (&QWP(-16,$key),"xmm2"); # round 1 + &aeskeygenassist("xmm1","xmm2",0x01); # round 2 + &call (&label("key_256a_cold")); + &aeskeygenassist("xmm1","xmm0",0x01); # round 3 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 4 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x02); # round 5 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 6 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 7 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 8 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 9 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 10 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 11 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 12 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 13 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 14 + &call (&label("key_256a")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(16,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_256a",16); + &$movekey (&QWP(0,$key),"xmm2"); + &lea ($key,&DWP(16,$key)); +&set_label("key_256a_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &pxor ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &pxor ("xmm0","xmm4"); + &pshufd ("xmm1","xmm1",0b11111111); # critical path + &pxor ("xmm0","xmm1"); + &ret(); + +&set_label("key_256b",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); + + &shufps ("xmm4","xmm2",0b00010000); + &pxor ("xmm2","xmm4"); + &shufps ("xmm4","xmm2",0b10001100); + &pxor ("xmm2","xmm4"); + &pshufd ("xmm1","xmm1",0b10101010); # critical path + &pxor ("xmm2","xmm1"); + &ret(); + +&set_label("bad_pointer",4); + &mov ("eax",-1); + &ret (); +&set_label("bad_keybits",4); + &mov ("eax",-2); + &ret (); +&function_end_B("_aesni_set_encrypt_key"); + +# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_encrypt_key"); + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &call ("_aesni_set_encrypt_key"); + &ret (); +&function_end_B("${PREFIX}_set_encrypt_key"); + +# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_decrypt_key"); + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &call ("_aesni_set_encrypt_key"); + &mov ($key,&wparam(2)); + &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key + &test ("eax","eax"); + &jnz (&label("dec_key_ret")); + &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule + + &$movekey ("xmm0",&QWP(0,$key)); # just swap + &$movekey ("xmm1",&QWP(0,"eax")); + &$movekey (&QWP(0,"eax"),"xmm0"); + &$movekey (&QWP(0,$key),"xmm1"); + &lea ($key,&DWP(16,$key)); + &lea ("eax",&DWP(-16,"eax")); + +&set_label("dec_key_inverse"); + &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse + &$movekey ("xmm1",&QWP(0,"eax")); + &aesimc ("xmm0","xmm0"); + &aesimc ("xmm1","xmm1"); + &lea ($key,&DWP(16,$key)); + &lea ("eax",&DWP(-16,"eax")); + &cmp ("eax",$key); + &$movekey (&QWP(16,"eax"),"xmm0"); + &$movekey (&QWP(-16,$key),"xmm1"); + &ja (&label("dec_key_inverse")); + + &$movekey ("xmm0",&QWP(0,$key)); # inverse middle + &aesimc ("xmm0","xmm0"); + &$movekey (&QWP(0,$key),"xmm0"); + + &xor ("eax","eax"); # return success +&set_label("dec_key_ret"); + &ret (); +&function_end_B("${PREFIX}_set_decrypt_key"); +&asciz("AES for Intel AES-NI, CRYPTOGAMS by "); + +&asm_finish(); --- /dev/null +++ crypto/aes/asm/aesni-x86_64.pl @@ -0,0 +1,991 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for +# details]. + +$PREFIX="aesni"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-x86_64.pl:-) + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +$movkey = $PREFIX eq "aesni" ? "movaps" : "movups"; +@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +$code=".text\n"; + +$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! +# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... +$inp="%rdi"; +$out="%rsi"; +$len="%rdx"; +$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! +$ivp="%r8"; # cbc + +$rnds_="%r10d"; # backup copy for $rounds +$key_="%r11"; # backup copy for $key + +# %xmm register layout +$inout0="%xmm0"; $inout1="%xmm1"; +$inout2="%xmm2"; $inout3="%xmm3"; +$rndkey0="%xmm4"; $rndkey1="%xmm5"; + +$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt +$in1="%xmm8"; $in2="%xmm9"; + +# Inline version of internal aesni_[en|de]crypt1. +# +# Why folded loop? Because aes[enc|dec] is slow enough to accommodate +# cycles which take care of loop variables... +{ my $sn; +sub aesni_generate1 { +my ($p,$key,$rounds)=@_; +++$sn; +$code.=<<___; + $movkey ($key),$rndkey0 + $movkey 16($key),$rndkey1 + lea 32($key),$key + pxor $rndkey0,$inout0 +.Loop_${p}1_$sn: + aes${p} $rndkey1,$inout0 + dec $rounds + $movkey ($key),$rndkey1 + lea 16($key),$key + jnz .Loop_${p}1_$sn # loop body is 16 bytes + aes${p}last $rndkey1,$inout0 +___ +}} +# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); +# +{ my ($inp,$out,$key) = @_4args; + +$code.=<<___; +.globl ${PREFIX}_encrypt +.type ${PREFIX}_encrypt,\@abi-omnipotent +.align 16 +${PREFIX}_encrypt: + movups ($inp),$inout0 # load input + mov 240($key),$rounds # pull $rounds +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + movups $inout0,($out) # output + ret +.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt + +.globl ${PREFIX}_decrypt +.type ${PREFIX}_decrypt,\@abi-omnipotent +.align 16 +${PREFIX}_decrypt: + movups ($inp),$inout0 # load input + mov 240($key),$rounds # pull $rounds +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + movups $inout0,($out) # output + ret +.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt +___ +} + +# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave +# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] +# latency is 6, it turned out that it can be scheduled only every +# *second* cycle. Thus 3x interleave is the one providing optimal +# utilization, i.e. when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it makes no sense to implement 2x subroutine. As soon +# as/if Intel improves throughput by making it possible to schedule +# the instructions in question *every* cycles I would have to +# implement 6x interleave and use it in loop... +sub aesni_generate3 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-2] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt3,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt3: + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + lea 32($key),$key + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + pxor $rndkey0,$inout2 + +.L${dir}_loop3: + aes${dir} $rndkey1,$inout0 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey0,$inout0 + $movkey 16($key),$rndkey1 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + jnz .L${dir}_loop3 + + aes${dir} $rndkey1,$inout0 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + ret +.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 +___ +} +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... +sub aesni_generate4 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-3] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt4,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt4: + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + lea 32($key),$key + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + pxor $rndkey0,$inout2 + pxor $rndkey0,$inout3 + +.L${dir}_loop4: + aes${dir} $rndkey1,$inout0 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey0,$inout0 + $movkey 16($key),$rndkey1 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + jnz .L${dir}_loop4 + + aes${dir} $rndkey1,$inout0 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + ret +.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 +___ +} +&aesni_generate3("enc") if ($PREFIX eq "aesni"); +&aesni_generate3("dec"); +&aesni_generate4("enc") if ($PREFIX eq "aesni"); +&aesni_generate4("dec"); + +if ($PREFIX eq "aesni") { +# void aesni_ecb_encrypt (const void *in, void *out, +# size_t length, const AES_KEY *key, +# int enc); +$code.=<<___; +.globl aesni_ecb_encrypt +.type aesni_ecb_encrypt,\@function,5 +.align 16 +aesni_ecb_encrypt: + cmp \$16,$len # check length + jb .Lecb_ret + + mov 240($key),$rounds # pull $rounds + and \$-16,$len + mov $key,$key_ # backup $key + test %r8d,%r8d # 5th argument + mov $rounds,$rnds_ # backup $rounds + jz .Lecb_decrypt +#--------------------------- ECB ENCRYPT ------------------------------# + sub \$0x40,$len + jbe .Lecb_enc_tail + jmp .Lecb_enc_loop3 +.align 16 +.Lecb_enc_loop3: + movups ($inp),$inout0 + movups 0x10($inp),$inout1 + movups 0x20($inp),$inout2 + call _aesni_encrypt3 + sub \$0x30,$len + lea 0x30($inp),$inp + lea 0x30($out),$out + movups $inout0,-0x30($out) + mov $rnds_,$rounds # restore $rounds + movups $inout1,-0x20($out) + mov $key_,$key # restore $key + movups $inout2,-0x10($out) + ja .Lecb_enc_loop3 + +.Lecb_enc_tail: + add \$0x40,$len + jz .Lecb_ret + + cmp \$0x10,$len + movups ($inp),$inout0 + je .Lecb_enc_one + cmp \$0x20,$len + movups 0x10($inp),$inout1 + je .Lecb_enc_two + cmp \$0x30,$len + movups 0x20($inp),$inout2 + je .Lecb_enc_three + movups 0x30($inp),$inout3 + call _aesni_encrypt4 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_one: +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + movups $inout0,($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_two: + call _aesni_encrypt3 + movups $inout0,($out) + movups $inout1,0x10($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_three: + call _aesni_encrypt3 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + jmp .Lecb_ret + #--------------------------- ECB DECRYPT ------------------------------# +.align 16 +.Lecb_decrypt: + sub \$0x40,$len + jbe .Lecb_dec_tail + jmp .Lecb_dec_loop3 +.align 16 +.Lecb_dec_loop3: + movups ($inp),$inout0 + movups 0x10($inp),$inout1 + movups 0x20($inp),$inout2 + call _aesni_decrypt3 + sub \$0x30,$len + lea 0x30($inp),$inp + lea 0x30($out),$out + movups $inout0,-0x30($out) + mov $rnds_,$rounds # restore $rounds + movups $inout1,-0x20($out) + mov $key_,$key # restore $key + movups $inout2,-0x10($out) + ja .Lecb_dec_loop3 + +.Lecb_dec_tail: + add \$0x40,$len + jz .Lecb_ret + + cmp \$0x10,$len + movups ($inp),$inout0 + je .Lecb_dec_one + cmp \$0x20,$len + movups 0x10($inp),$inout1 + je .Lecb_dec_two + cmp \$0x30,$len + movups 0x20($inp),$inout2 + je .Lecb_dec_three + movups 0x30($inp),$inout3 + call _aesni_decrypt4 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_one: +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + movups $inout0,($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_two: + call _aesni_decrypt3 + movups $inout0,($out) + movups $inout1,0x10($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_three: + call _aesni_decrypt3 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + +.Lecb_ret: + ret +.size aesni_ecb_encrypt,.-aesni_ecb_encrypt +___ +} + +# void $PREFIX_cbc_encrypt (const void *inp, void *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +$reserved = $win64?0x40:-0x18; # used in decrypt +$code.=<<___; +.globl ${PREFIX}_cbc_encrypt +.type ${PREFIX}_cbc_encrypt,\@function,6 +.align 16 +${PREFIX}_cbc_encrypt: + test $len,$len # check length + jz .Lcbc_ret + + mov 240($key),$rnds_ # pull $rounds + mov $key,$key_ # backup $key + test %r9d,%r9d # 6th argument + jz .Lcbc_decrypt +#--------------------------- CBC ENCRYPT ------------------------------# + movups ($ivp),$inout0 # load iv as initial state + cmp \$16,$len + mov $rnds_,$rounds + jb .Lcbc_enc_tail + sub \$16,$len + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movups ($inp),$inout1 # load input + lea 16($inp),$inp + pxor $inout1,$inout0 +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + sub \$16,$len + lea 16($out),$out + mov $rnds_,$rounds # restore $rounds + mov $key_,$key # restore $key + movups $inout0,-16($out) # store output + jnc .Lcbc_enc_loop + add \$16,$len + jnz .Lcbc_enc_tail + movups $inout0,($ivp) + jmp .Lcbc_ret + +.Lcbc_enc_tail: + mov $len,%rcx # zaps $key + xchg $inp,$out # $inp is %rsi and $out is %rdi now + .long 0x9066A4F3 # rep movsb + mov \$16,%ecx # zero tail + sub $len,%rcx + xor %eax,%eax + .long 0x9066AAF3 # rep stosb + lea -16(%rdi),%rdi # rewind $out by 1 block + mov $rnds_,$rounds # restore $rounds + mov %rdi,%rsi # $inp and $out are the same + mov $key_,$key # restore $key + xor $len,$len # len=16 + jmp .Lcbc_enc_loop # one more spin + #--------------------------- CBC DECRYPT ------------------------------# +.align 16 +.Lcbc_decrypt: +___ +$code.=<<___ if ($win64); + lea -0x58(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) +.Lcbc_decrypt_body: +___ +$code.=<<___; + movups ($ivp),$iv + sub \$0x40,$len + mov $rnds_,$rounds + jbe .Lcbc_dec_tail + jmp .Lcbc_dec_loop3 +.align 16 +.Lcbc_dec_loop3: + movups ($inp),$inout0 + movups 0x10($inp),$inout1 + movups 0x20($inp),$inout2 + movaps $inout0,$in0 + movaps $inout1,$in1 + movaps $inout2,$in2 + call _aesni_decrypt3 + sub \$0x30,$len + lea 0x30($inp),$inp + lea 0x30($out),$out + pxor $iv,$inout0 + pxor $in0,$inout1 + movaps $in2,$iv + pxor $in1,$inout2 + movups $inout0,-0x30($out) + mov $rnds_,$rounds # restore $rounds + movups $inout1,-0x20($out) + mov $key_,$key # restore $key + movups $inout2,-0x10($out) + ja .Lcbc_dec_loop3 + +.Lcbc_dec_tail: + add \$0x40,$len + movups $iv,($ivp) + jz .Lcbc_dec_ret + + movups ($inp),$inout0 + cmp \$0x10,$len + movaps $inout0,$in0 + jbe .Lcbc_dec_one + movups 0x10($inp),$inout1 + cmp \$0x20,$len + movaps $inout1,$in1 + jbe .Lcbc_dec_two + movups 0x20($inp),$inout2 + cmp \$0x30,$len + movaps $inout2,$in2 + jbe .Lcbc_dec_three + movups 0x30($inp),$inout3 + call _aesni_decrypt4 + pxor $iv,$inout0 + movups 0x30($inp),$iv + pxor $in0,$inout1 + movups $inout0,($out) + pxor $in1,$inout2 + movups $inout1,0x10($out) + pxor $in2,$inout3 + movups $inout2,0x20($out) + movaps $inout3,$inout0 + lea 0x30($out),$out + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_one: +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + pxor $iv,$inout0 + movaps $in0,$iv + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_two: + call _aesni_decrypt3 + pxor $iv,$inout0 + pxor $in0,$inout1 + movups $inout0,($out) + movaps $in1,$iv + movaps $inout1,$inout0 + lea 0x10($out),$out + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_three: + call _aesni_decrypt3 + pxor $iv,$inout0 + pxor $in0,$inout1 + movups $inout0,($out) + pxor $in1,$inout2 + movups $inout1,0x10($out) + movaps $in2,$iv + movaps $inout2,$inout0 + lea 0x20($out),$out + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_tail_collected: + and \$15,$len + movups $iv,($ivp) + jnz .Lcbc_dec_tail_partial + movups $inout0,($out) + jmp .Lcbc_dec_ret +.Lcbc_dec_tail_partial: + movaps $inout0,$reserved(%rsp) + mov $out,%rdi + mov $len,%rcx + lea $reserved(%rsp),%rsi + .long 0x9066A4F3 # rep movsb + +.Lcbc_dec_ret: +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + lea 0x58(%rsp),%rsp +___ +$code.=<<___; +.Lcbc_ret: + ret +.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt +___ + +# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, +# int bits, AES_KEY *key) +{ my ($inp,$bits,$key) = @_4args; + $bits =~ s/%r/%e/; + +$code.=<<___; +.globl ${PREFIX}_set_decrypt_key +.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent +.align 16 +${PREFIX}_set_decrypt_key: + .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 + call _aesni_set_encrypt_key + shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key + test %eax,%eax + jnz .Ldec_key_ret + lea 16($key,$bits),$inp # points at the end of key schedule + + $movkey ($key),%xmm0 # just swap + $movkey ($inp),%xmm1 + $movkey %xmm0,($inp) + $movkey %xmm1,($key) + lea 16($key),$key + lea -16($inp),$inp + +.Ldec_key_inverse: + $movkey ($key),%xmm0 # swap and inverse + $movkey ($inp),%xmm1 + aesimc %xmm0,%xmm0 + aesimc %xmm1,%xmm1 + lea 16($key),$key + lea -16($inp),$inp + cmp $key,$inp + $movkey %xmm0,16($inp) + $movkey %xmm1,-16($key) + ja .Ldec_key_inverse + + $movkey ($key),%xmm0 # inverse middle + aesimc %xmm0,%xmm0 + $movkey %xmm0,($inp) +.Ldec_key_ret: + add \$8,%rsp + ret +.LSEH_end_set_decrypt_key: +.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key +___ + +# This is based on submission by +# +# Huang Ying +# Vinodh Gopal +# Kahraman Akdemir +# +# Agressively optimized in respect to aeskeygenassist's critical path +# and is contained in %xmm0-5 to meet Win64 ABI requirement. +# +$code.=<<___; +.globl ${PREFIX}_set_encrypt_key +.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent +.align 16 +${PREFIX}_set_encrypt_key: +_aesni_set_encrypt_key: + .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 + test $inp,$inp + mov \$-1,%rax + jz .Lenc_key_ret + test $key,$key + jz .Lenc_key_ret + + movups ($inp),%xmm0 # pull first 128 bits of *userKey + pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0 + lea 16($key),%rax + cmp \$256,$bits + je .L14rounds + cmp \$192,$bits + je .L12rounds + cmp \$128,$bits + jne .Lbad_keybits + +.L10rounds: + mov \$9,$bits # 10 rounds for 128-bit key + $movkey %xmm0,($key) # round 0 + aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 + call .Lkey_expansion_128_cold + aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 + call .Lkey_expansion_128 + aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 + call .Lkey_expansion_128 + aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 + call .Lkey_expansion_128 + aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 + call .Lkey_expansion_128 + aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 + call .Lkey_expansion_128 + aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 + call .Lkey_expansion_128 + aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 + call .Lkey_expansion_128 + aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 + call .Lkey_expansion_128 + aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 + call .Lkey_expansion_128 + $movkey %xmm0,(%rax) + mov $bits,80(%rax) # 240(%rdx) + xor %eax,%eax + jmp .Lenc_key_ret + +.align 16 +.L12rounds: + movq 16($inp),%xmm2 # remaining 1/3 of *userKey + mov \$11,$bits # 12 rounds for 192 + $movkey %xmm0,($key) # round 0 + aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 + call .Lkey_expansion_192a_cold + aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 + call .Lkey_expansion_192b + aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 + call .Lkey_expansion_192a + aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 + call .Lkey_expansion_192b + aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 + call .Lkey_expansion_192a + aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 + call .Lkey_expansion_192b + aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 + call .Lkey_expansion_192a + aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 + call .Lkey_expansion_192b + $movkey %xmm0,(%rax) + mov $bits,48(%rax) # 240(%rdx) + xor %rax, %rax + jmp .Lenc_key_ret + +.align 16 +.L14rounds: + movups 16($inp),%xmm2 # remaning half of *userKey + mov \$13,$bits # 14 rounds for 256 + lea 16(%rax),%rax + $movkey %xmm0,($key) # round 0 + $movkey %xmm2,16($key) # round 1 + aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 + call .Lkey_expansion_256a_cold + aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 + call .Lkey_expansion_256b + aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 + call .Lkey_expansion_256a + aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 + call .Lkey_expansion_256b + aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 + call .Lkey_expansion_256a + aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 + call .Lkey_expansion_256b + aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 + call .Lkey_expansion_256a + aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 + call .Lkey_expansion_256b + aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 + call .Lkey_expansion_256a + aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 + call .Lkey_expansion_256b + aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 + call .Lkey_expansion_256a + aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 + call .Lkey_expansion_256b + aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 + call .Lkey_expansion_256a + $movkey %xmm0,(%rax) + mov $bits,16(%rax) # 240(%rdx) + xor %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + mov \$-2,%rax +.Lenc_key_ret: + add \$8,%rsp + ret +.LSEH_end_set_encrypt_key: + +.align 16 +.Lkey_expansion_128: + $movkey %xmm0,(%rax) + lea 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps \$0b00010000,%xmm0,%xmm4 + pxor %xmm4, %xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + pxor %xmm4, %xmm0 + pshufd \$0b11111111,%xmm1,%xmm1 # critical path + pxor %xmm1,%xmm0 + ret + +.align 16 +.Lkey_expansion_192a: + $movkey %xmm0,(%rax) + lea 16(%rax),%rax +.Lkey_expansion_192a_cold: + movaps %xmm2, %xmm5 +.Lkey_expansion_192b_warm: + shufps \$0b00010000,%xmm0,%xmm4 + movaps %xmm2,%xmm3 + pxor %xmm4,%xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + pslldq \$4,%xmm3 + pxor %xmm4,%xmm0 + pshufd \$0b01010101,%xmm1,%xmm1 # critical path + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd \$0b11111111,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret + +.align 16 +.Lkey_expansion_192b: + movaps %xmm0,%xmm3 + shufps \$0b01000100,%xmm0,%xmm5 + $movkey %xmm5,(%rax) + shufps \$0b01001110,%xmm2,%xmm3 + $movkey %xmm3,16(%rax) + lea 32(%rax),%rax + jmp .Lkey_expansion_192b_warm + +.align 16 +.Lkey_expansion_256a: + $movkey %xmm2,(%rax) + lea 16(%rax),%rax +.Lkey_expansion_256a_cold: + shufps \$0b00010000,%xmm0,%xmm4 + pxor %xmm4,%xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + pxor %xmm4,%xmm0 + pshufd \$0b11111111,%xmm1,%xmm1 # critical path + pxor %xmm1,%xmm0 + ret + +.align 16 +.Lkey_expansion_256b: + $movkey %xmm0,(%rax) + lea 16(%rax),%rax + + shufps \$0b00010000,%xmm2,%xmm4 + pxor %xmm4,%xmm2 + shufps \$0b10001100,%xmm2,%xmm4 + pxor %xmm4,%xmm2 + pshufd \$0b10101010,%xmm1,%xmm1 # critical path + pxor %xmm1,%xmm2 + ret +.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key +___ +} + +$code.=<<___; +.asciz "AES for Intel AES-NI, CRYPTOGAMS by " +.align 64 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type cbc_se_handler,\@abi-omnipotent +.align 16 +cbc_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 152($context),%rax # pull context->Rsp + mov 248($context),%rbx # pull context->Rip + + lea .Lcbc_decrypt(%rip),%r10 + cmp %r10,%rbx # context->Rip<"prologue" label + jb .Lin_prologue + + lea .Lcbc_decrypt_body(%rip),%r10 + cmp %r10,%rbx # context->RipRip>="epilogue" label + jae .Lin_prologue + + lea 0(%rax),%rsi # top of stack + lea 512($context),%rdi # &context.Xmm6 + mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0x58(%rax),%rax # adjust stack pointer + jmp .Lin_prologue + +.Lrestore_rax: + mov 120($context),%rax +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + jmp .Lcommon_seh_exit +.size cbc_se_handler,.-cbc_se_handler + +.type ecb_se_handler,\@abi-omnipotent +.align 16 +ecb_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 152($context),%rax # pull context->Rsp + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + +.Lcommon_seh_exit: + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size cbc_se_handler,.-cbc_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_${PREFIX}_ecb_encrypt + .rva .LSEH_end_${PREFIX}_ecb_encrypt + .rva .LSEH_info_ecb + + .rva .LSEH_begin_${PREFIX}_cbc_encrypt + .rva .LSEH_end_${PREFIX}_cbc_encrypt + .rva .LSEH_info_cbc + + .rva ${PREFIX}_set_decrypt_key + .rva .LSEH_end_set_decrypt_key + .rva .LSEH_info_key + + .rva ${PREFIX}_set_encrypt_key + .rva .LSEH_end_set_encrypt_key + .rva .LSEH_info_key +.section .xdata +.align 8 +.LSEH_info_ecb: + .byte 9,0,0,0 + .rva ecb_se_handler +.LSEH_info_cbc: + .byte 9,0,0,0 + .rva cbc_se_handler +.LSEH_info_key: + .byte 0x01,0x04,0x01,0x00 + .byte 0x04,0x02,0x00,0x00 +___ +} + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + + if ($dst>=8 || $src>=8) { + $rex=0x40; + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex; + } +} + +sub aesni { + my $line=shift; + my @opcode=(0x66); + + if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + rex(\@opcode,$4,$3); + push @opcode,0x0f,0x3a,0xdf; + push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M + my $c=$2; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesimc" => 0xdb, + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } + return $line; +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; + +print $code; + +close STDOUT; --- crypto/aes/Makefile.orig +++ crypto/aes/Makefile @@ -50,9 +50,13 @@ aes-ia64.s: asm/aes-ia64.S aes-586.s: asm/aes-586.pl ../perlasm/x86asm.pl $(PERL) asm/aes-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ +aesni-x86.s: asm/aesni-x86.pl ../perlasm/x86asm.pl + $(PERL) asm/aesni-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ aes-x86_64.s: asm/aes-x86_64.pl $(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@ +aesni-x86_64.s: asm/aesni-x86_64.pl + $(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@ aes-sparcv9.s: asm/aes-sparcv9.pl $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@ --- /dev/null +++ crypto/engine/eng_aesni.c @@ -0,0 +1,413 @@ +/* + * Support for Intel AES-NI intruction set + * Author: Huang Ying + * + * Intel AES-NI is a new set of Single Instruction Multiple Data + * (SIMD) instructions that are going to be introduced in the next + * generation of Intel processor, as of 2009. These instructions + * enable fast and secure data encryption and decryption, using the + * Advanced Encryption Standard (AES), defined by FIPS Publication + * number 197. The architecture introduces six instructions that + * offer full hardware support for AES. Four of them support high + * performance data encryption and decryption, and the other two + * instructions support the AES key expansion procedure. + * + * The white paper can be downloaded from: + * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf + * + * This file is based on engines/e_padlock.c + */ + +/* ==================================================================== + * Copyright (c) 1999-2001 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + +#include + +#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AES_NI) && !defined(OPENSSL_NO_AES) + +#include +#include "cryptlib.h" +#include +#include +#include +#include +#include +#include + +/* AES-NI is available *ONLY* on some x86 CPUs. Not only that it + doesn't exist elsewhere, but it even can't be compiled on other + platforms! */ +#undef COMPILE_HW_AESNI +#if (defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64) || \ + defined(OPENSSL_IA32_SSE2)) && !defined(OPENSSL_NO_ASM) +#define COMPILE_HW_AESNI +static ENGINE *ENGINE_aesni (void); +#endif + +void ENGINE_load_aesni (void) +{ +/* On non-x86 CPUs it just returns. */ +#ifdef COMPILE_HW_AESNI + ENGINE *toadd = ENGINE_aesni(); + if (!toadd) + return; + if(ENGINE_add (toadd)) + ENGINE_register_complete (toadd); + ENGINE_free (toadd); + ERR_clear_error (); +#endif +} + +#ifdef COMPILE_HW_AESNI +int aesni_set_encrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); +int aesni_set_decrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); + +void aesni_encrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void aesni_decrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); + +void aesni_ecb_encrypt(const unsigned char *in, + unsigned char *out, + size_t length, + const AES_KEY *key, + int enc); +void aesni_cbc_encrypt(const unsigned char *in, + unsigned char *out, + size_t length, + const AES_KEY *key, + unsigned char *ivec, int enc); + +/* Function for ENGINE detection and control */ +static int aesni_init(ENGINE *e); + +/* Cipher Stuff */ +static int aesni_ciphers(ENGINE *e, const EVP_CIPHER **cipher, + const int **nids, int nid); + +#define AESNI_MIN_ALIGN 16 +#define AESNI_ALIGN(x) \ + ((void *)(((unsigned long)(x)+AESNI_MIN_ALIGN-1)&~(AESNI_MIN_ALIGN-1))) + +/* Engine names */ +static const char aesni_id[] = "aesni", + aesni_name[] = "Intel AES-NI engine", + no_aesni_name[] = "Intel AES-NI engine (no-aesni)"; + +/* ===== Engine "management" functions ===== */ + +#if defined(_WIN32) +typedef unsigned __int64 IA32CAP; +#else +typedef unsigned long long IA32CAP; +#endif + +/* Prepare the ENGINE structure for registration */ +static int +aesni_bind_helper(ENGINE *e) +{ + int engage; + if (sizeof(OPENSSL_ia32cap_P) > 4) { + engage = (OPENSSL_ia32cap_P >> 57) & 1; + } else { + IA32CAP OPENSSL_ia32_cpuid(void); + engage = (OPENSSL_ia32_cpuid() >> 57) & 1; + } + + /* Register everything or return with an error */ + if (!ENGINE_set_id(e, aesni_id) || + !ENGINE_set_name(e, engage ? aesni_name : no_aesni_name) || + + !ENGINE_set_init_function(e, aesni_init) || + (engage && !ENGINE_set_ciphers (e, aesni_ciphers)) + ) + return 0; + + /* Everything looks good */ + return 1; +} + +/* Constructor */ +static ENGINE * +ENGINE_aesni(void) +{ + ENGINE *eng = ENGINE_new(); + + if (!eng) { + return NULL; + } + + if (!aesni_bind_helper(eng)) { + ENGINE_free(eng); + return NULL; + } + + return eng; +} + +/* Check availability of the engine */ +static int +aesni_init(ENGINE *e) +{ + return 1; +} + +#if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb) +#define NID_aes_128_cfb NID_aes_128_cfb128 +#endif + +#if defined(NID_aes_128_ofb128) && ! defined (NID_aes_128_ofb) +#define NID_aes_128_ofb NID_aes_128_ofb128 +#endif + +#if defined(NID_aes_192_cfb128) && ! defined (NID_aes_192_cfb) +#define NID_aes_192_cfb NID_aes_192_cfb128 +#endif + +#if defined(NID_aes_192_ofb128) && ! defined (NID_aes_192_ofb) +#define NID_aes_192_ofb NID_aes_192_ofb128 +#endif + +#if defined(NID_aes_256_cfb128) && ! defined (NID_aes_256_cfb) +#define NID_aes_256_cfb NID_aes_256_cfb128 +#endif + +#if defined(NID_aes_256_ofb128) && ! defined (NID_aes_256_ofb) +#define NID_aes_256_ofb NID_aes_256_ofb128 +#endif + +/* List of supported ciphers. */ +static int aesni_cipher_nids[] = { + NID_aes_128_ecb, + NID_aes_128_cbc, + NID_aes_128_cfb, + NID_aes_128_ofb, + + NID_aes_192_ecb, + NID_aes_192_cbc, + NID_aes_192_cfb, + NID_aes_192_ofb, + + NID_aes_256_ecb, + NID_aes_256_cbc, + NID_aes_256_cfb, + NID_aes_256_ofb, +}; +static int aesni_cipher_nids_num = + (sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0])); + +typedef struct +{ + AES_KEY ks; + unsigned int _pad1[3]; +} AESNI_KEY; + +static int +aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key, + const unsigned char *iv, int enc) +{ + int ret; + AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + + if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE + || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE + || enc) + ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key); + else + ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key); + + if(ret < 0) { + EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED); + return 0; + } + + return 1; +} + +static int aesni_cipher_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t inl) +{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + aesni_ecb_encrypt(in, out, inl, key, ctx->encrypt); + return 1; +} +static int aesni_cipher_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t inl) +{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + aesni_cbc_encrypt(in, out, inl, key, + ctx->iv, ctx->encrypt); + return 1; +} +static int aesni_cipher_cfb(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t inl) +{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + CRYPTO_cfb128_encrypt(in, out, inl, key, ctx->iv, + &ctx->num, ctx->encrypt, + (block128_f)aesni_encrypt); + return 1; +} +static int aesni_cipher_ofb(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t inl) +{ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); + CRYPTO_ofb128_encrypt(in, out, inl, key, ctx->iv, + &ctx->num, (block128_f)aesni_encrypt); + return 1; +} + +#define AES_BLOCK_SIZE 16 + +#define EVP_CIPHER_block_size_ECB AES_BLOCK_SIZE +#define EVP_CIPHER_block_size_CBC AES_BLOCK_SIZE +#define EVP_CIPHER_block_size_OFB 1 +#define EVP_CIPHER_block_size_CFB 1 + +/* Declaring so many ciphers by hand would be a pain. + Instead introduce a bit of preprocessor magic :-) */ +#define DECLARE_AES_EVP(ksize,lmode,umode) \ +static const EVP_CIPHER aesni_##ksize##_##lmode = { \ + NID_aes_##ksize##_##lmode, \ + EVP_CIPHER_block_size_##umode, \ + ksize / 8, \ + AES_BLOCK_SIZE, \ + 0 | EVP_CIPH_##umode##_MODE, \ + aesni_init_key, \ + aesni_cipher_##lmode, \ + NULL, \ + sizeof(AESNI_KEY), \ + EVP_CIPHER_set_asn1_iv, \ + EVP_CIPHER_get_asn1_iv, \ + NULL, \ + NULL \ +} + +DECLARE_AES_EVP(128,ecb,ECB); +DECLARE_AES_EVP(128,cbc,CBC); +DECLARE_AES_EVP(128,cfb,CFB); +DECLARE_AES_EVP(128,ofb,OFB); + +DECLARE_AES_EVP(192,ecb,ECB); +DECLARE_AES_EVP(192,cbc,CBC); +DECLARE_AES_EVP(192,cfb,CFB); +DECLARE_AES_EVP(192,ofb,OFB); + +DECLARE_AES_EVP(256,ecb,ECB); +DECLARE_AES_EVP(256,cbc,CBC); +DECLARE_AES_EVP(256,cfb,CFB); +DECLARE_AES_EVP(256,ofb,OFB); + +static int +aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher, + const int **nids, int nid) +{ + /* No specific cipher => return a list of supported nids ... */ + if (!cipher) { + *nids = aesni_cipher_nids; + return aesni_cipher_nids_num; + } + + /* ... or the requested "cipher" otherwise */ + switch (nid) { + case NID_aes_128_ecb: + *cipher = &aesni_128_ecb; + break; + case NID_aes_128_cbc: + *cipher = &aesni_128_cbc; + break; + case NID_aes_128_cfb: + *cipher = &aesni_128_cfb; + break; + case NID_aes_128_ofb: + *cipher = &aesni_128_ofb; + break; + + case NID_aes_192_ecb: + *cipher = &aesni_192_ecb; + break; + case NID_aes_192_cbc: + *cipher = &aesni_192_cbc; + break; + case NID_aes_192_cfb: + *cipher = &aesni_192_cfb; + break; + case NID_aes_192_ofb: + *cipher = &aesni_192_ofb; + break; + + case NID_aes_256_ecb: + *cipher = &aesni_256_ecb; + break; + case NID_aes_256_cbc: + *cipher = &aesni_256_cbc; + break; + case NID_aes_256_cfb: + *cipher = &aesni_256_cfb; + break; + case NID_aes_256_ofb: + *cipher = &aesni_256_ofb; + break; + + default: + /* Sorry, we don't support this NID */ + *cipher = NULL; + return 0; + } + + return 1; +} + +#endif /* COMPILE_HW_AESNI */ +#endif /* !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AESNI) && !defined(OPENSSL_NO_AES) */ --- crypto/engine/eng_all.c.orig +++ crypto/engine/eng_all.c @@ -71,6 +71,9 @@ void ENGINE_load_builtin_engines(void) #if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV)) ENGINE_load_cryptodev(); #endif +#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AESNI) + ENGINE_load_aesni(); +#endif ENGINE_load_dynamic(); #ifndef OPENSSL_NO_STATIC_ENGINE #ifndef OPENSSL_NO_HW --- crypto/engine/engine.h.orig +++ crypto/engine/engine.h @@ -344,6 +344,7 @@ void ENGINE_load_gost(void); #endif #endif void ENGINE_load_cryptodev(void); +void ENGINE_load_aesni(void); void ENGINE_load_builtin_engines(void); /* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation --- crypto/engine/Makefile.orig +++ crypto/engine/Makefile @@ -21,12 +21,14 @@ LIBSRC= eng_err.c eng_lib.c eng_list.c e eng_table.c eng_pkey.c eng_fat.c eng_all.c \ tb_rsa.c tb_dsa.c tb_ecdsa.c tb_dh.c tb_ecdh.c tb_rand.c tb_store.c \ tb_cipher.c tb_digest.c tb_pkmeth.c tb_asnmth.c \ - eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c + eng_openssl.c eng_cnf.c eng_dyn.c eng_cryptodev.c \ + eng_aesni.c LIBOBJ= eng_err.o eng_lib.o eng_list.o eng_init.o eng_ctrl.o \ eng_table.o eng_pkey.o eng_fat.o eng_all.o \ tb_rsa.o tb_dsa.o tb_ecdsa.o tb_dh.o tb_ecdh.o tb_rand.o tb_store.o \ tb_cipher.o tb_digest.o tb_pkmeth.o tb_asnmth.o \ - eng_openssl.o eng_cnf.o eng_dyn.o eng_cryptodev.o + eng_openssl.o eng_cnf.o eng_dyn.o eng_cryptodev.o \ + eng_aesni.o SRC= $(LIBSRC) --- crypto/evp/evp_err.c.orig +++ crypto/evp/evp_err.c @@ -1,6 +1,6 @@ /* crypto/evp/evp_err.c */ /* ==================================================================== - * Copyright (c) 1999-2008 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -70,6 +70,7 @@ static ERR_STRING_DATA EVP_str_functs[]= { +{ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"}, {ERR_FUNC(EVP_F_AES_INIT_KEY), "AES_INIT_KEY"}, {ERR_FUNC(EVP_F_CAMELLIA_INIT_KEY), "CAMELLIA_INIT_KEY"}, {ERR_FUNC(EVP_F_D2I_PKEY), "D2I_PKEY"}, @@ -86,7 +87,7 @@ static ERR_STRING_DATA EVP_str_functs[]= {ERR_FUNC(EVP_F_EVP_DIGESTINIT_EX), "EVP_DigestInit_ex"}, {ERR_FUNC(EVP_F_EVP_ENCRYPTFINAL_EX), "EVP_EncryptFinal_ex"}, {ERR_FUNC(EVP_F_EVP_MD_CTX_COPY_EX), "EVP_MD_CTX_copy_ex"}, -{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_SIZE"}, +{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_size"}, {ERR_FUNC(EVP_F_EVP_OPENINIT), "EVP_OpenInit"}, {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD), "EVP_PBE_alg_add"}, {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD_TYPE), "EVP_PBE_alg_add_type"}, --- crypto/evp/evp.h.orig +++ crypto/evp/evp.h @@ -1190,6 +1190,7 @@ void ERR_load_EVP_strings(void); /* Error codes for the EVP functions. */ /* Function codes. */ +#define EVP_F_AESNI_INIT_KEY 163 #define EVP_F_AES_INIT_KEY 133 #define EVP_F_CAMELLIA_INIT_KEY 159 #define EVP_F_D2I_PKEY 100 --- /dev/null +++ test/test_aesni @@ -0,0 +1,69 @@ +#!/bin/sh + +PROG=$1 + +if [ -x $PROG ]; then + if expr "x`$PROG version`" : "xOpenSSL" > /dev/null; then + : + else + echo "$PROG is not OpenSSL executable" + exit 1 + fi +else + echo "$PROG is not executable" + exit 1; +fi + +if $PROG engine aesni | grep -v no-aesni; then + + HASH=`cat $PROG | $PROG dgst -hex` + + AES_ALGS=" aes-128-ecb aes-192-ecb aes-256-ecb \ + aes-128-cbc aes-192-cbc aes-256-cbc \ + aes-128-cfb aes-192-cfb aes-256-cfb \ + aes-128-ofb aes-192-ofb aes-256-ofb" + BUFSIZE="16 32 48 64 80 96 128 144 999" + + nerr=0 + + for alg in $AES_ALGS; do + echo $alg + for bufsize in $BUFSIZE; do + TEST=`( cat $PROG | \ + $PROG enc -e -k "$HASH" -$alg -bufsize $bufsize -engine aesni | \ + $PROG enc -d -k "$HASH" -$alg | \ + $PROG dgst -hex ) 2>/dev/null` + if [ "$TEST" != "$HASH" ]; then + echo "-$alg/$bufsize encrypt test failed" + nerr=`expr $nerr + 1` + fi + done + for bufsize in $BUFSIZE; do + TEST=`( cat $PROG | \ + $PROG enc -e -k "$HASH" -$alg | \ + $PROG enc -d -k "$HASH" -$alg -bufsize $bufsize -engine aesni | \ + $PROG dgst -hex ) 2>/dev/null` + if [ "$TEST" != "$HASH" ]; then + echo "-$alg/$bufsize decrypt test failed" + nerr=`expr $nerr + 1` + fi + done + TEST=`( cat $PROG | \ + $PROG enc -e -k "$HASH" -$alg -engine aesni | \ + $PROG enc -d -k "$HASH" -$alg -engine aesni | \ + $PROG dgst -hex ) 2>/dev/null` + if [ "$TEST" != "$HASH" ]; then + echo "-$alg en/decrypt test failed" + nerr=`expr $nerr + 1` + fi + done + + if [ $nerr -gt 0 ]; then + echo "AESNI engine test failed." + exit 1; + fi +else + echo "AESNI engine is not available" +fi + +exit 0