#!/usr/bin/env perl # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements support for Intel AES-NI extension. In # OpenSSL context it's used with Intel engine, but can also be used as # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for # details]. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-586.pl:-) $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],$0); $movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups"); $len="eax"; $rounds="ecx"; $key="edx"; $inp="esi"; $out="edi"; $rounds_="ebx"; # backup copy for $rounds $key_="ebp"; # backup copy for $key $inout0="xmm0"; $inout1="xmm1"; $inout2="xmm2"; $rndkey0="xmm3"; $rndkey1="xmm4"; $ivec="xmm5"; $in0="xmm6"; $in1="xmm7"; $inout3="xmm7"; # Inline version of internal aesni_[en|de]crypt1 sub aesni_inline_generate1 { my $p=shift; &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &set_label("${p}1_loop"); eval"&aes${p} ($inout0,$rndkey1)"; &dec ($rounds); &$movekey ($rndkey1,&QWP(0,$key)); &lea ($key,&DWP(16,$key)); &jnz (&label("${p}1_loop")); eval"&aes${p}last ($inout0,$rndkey1)"; } sub aesni_generate1 # fully unrolled loop { my $p=shift; &function_begin_B("_aesni_${p}rypt1"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(0x10,$key)); &cmp ($rounds,11); &pxor ($inout0,$rndkey0); &$movekey ($rndkey0,&QWP(0x20,$key)); &lea ($key,&DWP(0x30,$key)); &jb (&label("${p}128")); &lea ($key,&DWP(0x20,$key)); &je (&label("${p}192")); &lea ($key,&DWP(0x20,$key)); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x40,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x30,$key)); &set_label("${p}192"); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey1,&QWP(-0x20,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &$movekey ($rndkey0,&QWP(-0x10,$key)); &set_label("${p}128"); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey1,&QWP(0,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x10,$key)); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x20,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x30,$key)); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x40,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x50,$key)); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey1,&QWP(0x60,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &$movekey ($rndkey0,&QWP(0x70,$key)); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt1"); } # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); # &aesni_generate1("dec"); &function_begin_B("${PREFIX}_encrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_encrypt"); # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); # &aesni_generate1("dec"); &function_begin_B("${PREFIX}_decrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1"); &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave # factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] # latency is 6, it turned out that it can be scheduled only every # *second* cycle. Thus 3x interleave is the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. # This is why it makes no sense to implement 2x subroutine. As soon # as/if Intel improves throughput by making it possible to schedule # the instructions in question *every* cycles I would have to # implement 6x interleave and use it in loop... sub aesni_generate3 { my $p=shift; &function_begin_B("_aesni_${p}rypt3"); &$movekey ($rndkey0,&QWP(0,$key)); &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; &$movekey ($rndkey1,&QWP(16,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt3"); } # 4x interleave is implemented to improve small block performance, # most notably [and naturally] 4 block by ~30%. One can argue that one # should have implemented 5x as well, but improvement would be <20%, # so it's not worth it... sub aesni_generate4 { my $p=shift; &function_begin_B("_aesni_${p}rypt4"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); &shr ($rounds,1); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; &dec ($rounds); eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; &$movekey ($rndkey1,&QWP(16,$key)); eval"&aes${p} ($inout0,$rndkey0)"; &lea ($key,&DWP(32,$key)); eval"&aes${p} ($inout1,$rndkey0)"; eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; &jnz (&label("${p}3_loop")); eval"&aes${p} ($inout0,$rndkey1)"; &$movekey ($rndkey0,&QWP(0,$key)); eval"&aes${p} ($inout1,$rndkey1)"; eval"&aes${p} ($inout2,$rndkey1)"; eval"&aes${p} ($inout3,$rndkey1)"; eval"&aes${p}last ($inout0,$rndkey0)"; eval"&aes${p}last ($inout1,$rndkey0)"; eval"&aes${p}last ($inout2,$rndkey0)"; eval"&aes${p}last ($inout3,$rndkey0)"; &ret(); &function_end_B("_aesni_${p}rypt4"); } &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); if ($PREFIX eq "aesni") { # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); &function_begin("aesni_ecb_encrypt"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &mov ($rounds,&wparam(4)); &cmp ($len,16); &jb (&label("ecb_ret")); &and ($len,-16); &test ($rounds,$rounds) &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); # backup $key &mov ($rounds_,$rounds); # backup $rounds &jz (&label("ecb_decrypt")); &sub ($len,0x40); &jbe (&label("ecb_enc_tail")); &jmp (&label("ecb_enc_loop3")); &set_label("ecb_enc_loop3",16); &movups ($inout0,&QWP(0,$inp)); &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_encrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(-0x20,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_enc_loop3")); &set_label("ecb_enc_tail"); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); &je (&label("ecb_enc_one")); &cmp ($len,0x20); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_enc_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &je (&label("ecb_enc_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_encrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); jmp (&label("ecb_ret")); &set_label("ecb_enc_one",16); &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); &movups (&QWP(0,$out),$inout0); &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); &set_label("ecb_enc_three",16); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ecb_ret")); &set_label("ecb_decrypt",16); &sub ($len,0x40); &jbe (&label("ecb_dec_tail")); &jmp (&label("ecb_dec_loop3")); &set_label("ecb_dec_loop3",16); &movups ($inout0,&QWP(0,$inp)); &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_decrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &movups (&QWP(-0x30,$out),$inout0); &mov ($key,$key_); # restore $key &movups (&QWP(-0x20,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds &movups (&QWP(-0x10,$out),$inout2); &ja (&label("ecb_dec_loop3")); &set_label("ecb_dec_tail"); &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); &je (&label("ecb_dec_one")); &cmp ($len,0x20); &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_dec_two")); &cmp ($len,0x30); &movups ($inout2,&QWP(0x20,$inp)); &je (&label("ecb_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); &jmp (&label("ecb_ret")); &set_label("ecb_dec_one",16); &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ecb_ret")); &set_label("ecb_dec_three",16); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &set_label("ecb_ret"); &function_end("aesni_ecb_encrypt"); } # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); &function_begin("${PREFIX}_cbc_encrypt"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); &test ($len,$len); &mov ($key_,&wparam(4)); &jz (&label("cbc_ret")); &cmp (&wparam(5),0); &movups ($ivec,&QWP(0,$key_)); # load IV &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); # backup $key &mov ($rounds_,$rounds); # backup $rounds &je (&label("cbc_decrypt")); &movaps ($inout0,$ivec); &cmp ($len,16); &jb (&label("cbc_enc_tail")); &sub ($len,16); &jmp (&label("cbc_enc_loop")); &set_label("cbc_enc_loop",16); &movups ($ivec,&QWP(0,$inp)); &lea ($inp,&DWP(16,$inp)); &pxor ($inout0,$ivec); &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt3"); &sub ($len,16); &lea ($out,&DWP(16,$out)); &mov ($rounds,$rounds_); # restore $rounds &mov ($key,$key_); # restore $key &movups (&QWP(-16,$out),$inout0); &jnc (&label("cbc_enc_loop")); &add ($len,16); &jnz (&label("cbc_enc_tail")); &movaps ($ivec,$inout0); &jmp (&label("cbc_ret")); &set_label("cbc_enc_tail"); &mov ("ecx",$len); # zaps $rounds &data_word(0xA4F3F689); # rep movsb &mov ("ecx",16); # zero tail &sub ("ecx",$len); &xor ("eax","eax"); # zaps $len &data_word(0xAAF3F689); # rep stosb &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block &mov ($rounds,$rounds_); # restore $rounds &mov ($inp,$out); # $inp and $out are the same &mov ($key,$key_); # restore $key &jmp (&label("cbc_enc_loop")); &set_label("cbc_decrypt",16); &sub ($len,0x40); &jbe (&label("cbc_dec_tail")); &jmp (&label("cbc_dec_loop3")); &set_label("cbc_dec_loop3",16); &movups ($inout0,&QWP(0,$inp)); &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &movaps ($in0,$inout0); &movaps ($in1,$inout1); &call ("_aesni_decrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); &lea ($out,&DWP(0x30,$out)); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups ($ivec,&QWP(-0x10,$inp)); &pxor ($inout2,$in1); &movups (&QWP(-0x30,$out),$inout0); &mov ($rounds,$rounds_) # restore $rounds &movups (&QWP(-0x20,$out),$inout1); &mov ($key,$key_); # restore $key &movups (&QWP(-0x10,$out),$inout2); &ja (&label("cbc_dec_loop3")); &set_label("cbc_dec_tail"); &add ($len,0x40); &jz (&label("cbc_ret")); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x10); &movaps ($in0,$inout0); &jbe (&label("cbc_dec_one")); &movups ($inout1,&QWP(0x10,$inp)); &cmp ($len,0x20); &movaps ($in1,$inout1); &jbe (&label("cbc_dec_two")); &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x30); &jbe (&label("cbc_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); &call ("_aesni_decrypt4"); &movups ($rndkey0,&QWP(0x10,$inp)); &movups ($rndkey1,&QWP(0x20,$inp)); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups ($ivec,&QWP(0x30,$inp)); &movups (&QWP(0,$out),$inout0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey1); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movaps ($inout0,$inout3); &lea ($out,&DWP(0x30,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_one"); &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &movaps ($ivec,$in0); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_two"); &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups (&QWP(0,$out),$inout0); &movaps ($inout0,$inout1); &movaps ($ivec,$in1); &lea ($out,&DWP(0x10,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_three"); &call ("_aesni_decrypt3"); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &pxor ($inout2,$in1); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movaps ($inout0,$inout2); &movups ($ivec,&QWP(0x20,$inp)); &lea ($out,&DWP(0x20,$out)); &set_label("cbc_dec_tail_collected"); &and ($len,15); &jnz (&label("cbc_dec_tail_partial")); &movups (&QWP(0,$out),$inout0); &jmp (&label("cbc_ret")); &set_label("cbc_dec_tail_partial"); &mov ($key_,"esp"); &sub ("esp",16); &and ("esp",-16); &movaps (&QWP(0,"esp"),$inout0); &mov ($inp,"esp"); &mov ("ecx",$len); &data_word(0xA4F3F689); # rep movsb &mov ("esp",$key_); &set_label("cbc_ret"); &mov ($key_,&wparam(4)); &movups (&QWP(0,$key_),$ivec); # output IV &function_end("${PREFIX}_cbc_encrypt"); # Mechanical port from aesni-x86_64.pl. # # _aesni_set_encrypt_key is private interface, # input: # "eax" const unsigned char *userKey # $rounds int bits # $key AES_KEY *key # output: # "eax" return code # $round rounds &function_begin_B("_aesni_set_encrypt_key"); &test ("eax","eax"); &jz (&label("bad_pointer")); &test ($key,$key); &jz (&label("bad_pointer")); &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 &lea ($key,&DWP(16,$key)); &cmp ($rounds,256); &je (&label("14rounds")); &cmp ($rounds,192); &je (&label("12rounds")); &cmp ($rounds,128); &jne (&label("bad_keybits")); &set_label("10rounds",16); &mov ($rounds,9); &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 &call (&label("key_128_cold")); &aeskeygenassist("xmm1","xmm0",0x2); # round 2 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x04); # round 3 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x08); # round 4 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x10); # round 5 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x20); # round 6 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x40); # round 7 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x80); # round 8 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 &call (&label("key_128")); &aeskeygenassist("xmm1","xmm0",0x36); # round 10 &call (&label("key_128")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(80,$key),$rounds); &xor ("eax","eax"); &ret(); &set_label("key_128",16); &$movekey (&QWP(0,$key),"xmm0"); &lea ($key,&DWP(16,$key)); &set_label("key_128_cold"); &shufps ("xmm4","xmm0",0b00010000); &pxor ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100,); &pxor ("xmm0","xmm4"); &pshufd ("xmm1","xmm1",0b11111111); # critical path &pxor ("xmm0","xmm1"); &ret(); &set_label("12rounds",16); &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey &mov ($rounds,11); &$movekey (&QWP(-16,$key),"xmm0") # round 0 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 &call (&label("key_192a_cold")); &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 &call (&label("key_192b")); &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 &call (&label("key_192a")); &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 &call (&label("key_192b")); &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 &call (&label("key_192a")); &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 &call (&label("key_192b")); &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 &call (&label("key_192a")); &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 &call (&label("key_192b")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(48,$key),$rounds); &xor ("eax","eax"); &ret(); &set_label("key_192a",16); &$movekey (&QWP(0,$key),"xmm0"); &lea ($key,&DWP(16,$key)); &set_label("key_192a_cold",16); &movaps ("xmm5","xmm2"); &set_label("key_192b_warm"); &shufps ("xmm4","xmm0",0b00010000); &movaps ("xmm3","xmm2"); &pxor ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100); &pslldq ("xmm3",4); &pxor ("xmm0","xmm4"); &pshufd ("xmm1","xmm1",0b01010101); # critical path &pxor ("xmm2","xmm3"); &pxor ("xmm0","xmm1"); &pshufd ("xmm3","xmm0",0b11111111); &pxor ("xmm2","xmm3"); &ret(); &set_label("key_192b",16); &movaps ("xmm3","xmm0"); &shufps ("xmm5","xmm0",0b01000100); &$movekey (&QWP(0,$key),"xmm5"); &shufps ("xmm3","xmm2",0b01001110); &$movekey (&QWP(16,$key),"xmm3"); &lea ($key,&DWP(32,$key)); &jmp (&label("key_192b_warm")); &set_label("14rounds",16); &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey &mov ($rounds,13); &lea ($key,&DWP(16,$key)); &$movekey (&QWP(-32,$key),"xmm0"); # round 0 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 &call (&label("key_256a_cold")); &aeskeygenassist("xmm1","xmm0",0x01); # round 3 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x02); # round 4 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x02); # round 5 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x04); # round 6 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x04); # round 7 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x08); # round 8 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x08); # round 9 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x10); # round 10 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x10); # round 11 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x20); # round 12 &call (&label("key_256a")); &aeskeygenassist("xmm1","xmm0",0x20); # round 13 &call (&label("key_256b")); &aeskeygenassist("xmm1","xmm2",0x40); # round 14 &call (&label("key_256a")); &$movekey (&QWP(0,$key),"xmm0"); &mov (&DWP(16,$key),$rounds); &xor ("eax","eax"); &ret(); &set_label("key_256a",16); &$movekey (&QWP(0,$key),"xmm2"); &lea ($key,&DWP(16,$key)); &set_label("key_256a_cold"); &shufps ("xmm4","xmm0",0b00010000); &pxor ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100); &pxor ("xmm0","xmm4"); &pshufd ("xmm1","xmm1",0b11111111); # critical path &pxor ("xmm0","xmm1"); &ret(); &set_label("key_256b",16); &$movekey (&QWP(0,$key),"xmm0"); &lea ($key,&DWP(16,$key)); &shufps ("xmm4","xmm2",0b00010000); &pxor ("xmm2","xmm4"); &shufps ("xmm4","xmm2",0b10001100); &pxor ("xmm2","xmm4"); &pshufd ("xmm1","xmm1",0b10101010); # critical path &pxor ("xmm2","xmm1"); &ret(); &set_label("bad_pointer",4); &mov ("eax",-1); &ret (); &set_label("bad_keybits",4); &mov ("eax",-2); &ret (); &function_end_B("_aesni_set_encrypt_key"); # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, # AES_KEY *key) &function_begin_B("${PREFIX}_set_encrypt_key"); &mov ("eax",&wparam(0)); &mov ($rounds,&wparam(1)); &mov ($key,&wparam(2)); &call ("_aesni_set_encrypt_key"); &ret (); &function_end_B("${PREFIX}_set_encrypt_key"); # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, # AES_KEY *key) &function_begin_B("${PREFIX}_set_decrypt_key"); &mov ("eax",&wparam(0)); &mov ($rounds,&wparam(1)); &mov ($key,&wparam(2)); &call ("_aesni_set_encrypt_key"); &mov ($key,&wparam(2)); &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key &test ("eax","eax"); &jnz (&label("dec_key_ret")); &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule &$movekey ("xmm0",&QWP(0,$key)); # just swap &$movekey ("xmm1",&QWP(0,"eax")); &$movekey (&QWP(0,"eax"),"xmm0"); &$movekey (&QWP(0,$key),"xmm1"); &lea ($key,&DWP(16,$key)); &lea ("eax",&DWP(-16,"eax")); &set_label("dec_key_inverse"); &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse &$movekey ("xmm1",&QWP(0,"eax")); &aesimc ("xmm0","xmm0"); &aesimc ("xmm1","xmm1"); &lea ($key,&DWP(16,$key)); &lea ("eax",&DWP(-16,"eax")); &cmp ("eax",$key); &$movekey (&QWP(16,"eax"),"xmm0"); &$movekey (&QWP(-16,$key),"xmm1"); &ja (&label("dec_key_inverse")); &$movekey ("xmm0",&QWP(0,$key)); # inverse middle &aesimc ("xmm0","xmm0"); &$movekey (&QWP(0,$key),"xmm0"); &xor ("eax","eax"); # return success &set_label("dec_key_ret"); &ret (); &function_end_B("${PREFIX}_set_decrypt_key"); &asciz("AES for Intel AES-NI, CRYPTOGAMS by "); &asm_finish();