VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.7/crypto/aes/asm/aesni-x86.pl@ 98103

最後變更 在這個檔案從98103是 97372,由 vboxsync 提交於 2 年 前

libs: Switch to openssl-3.0.7, bugref:10317

檔案大小: 99.8 KB
 
1#! /usr/bin/env perl
2# Copyright 2009-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20# details].
21#
22# Performance.
23#
24# To start with see corresponding paragraph in aesni-x86_64.pl...
25# Instead of filling table similar to one found there I've chosen to
26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27# The simplified table below represents 32-bit performance relative
28# to 64-bit one in every given point. Ratios vary for different
29# encryption modes, therefore interval values.
30#
31# 16-byte 64-byte 256-byte 1-KB 8-KB
32# 53-67% 67-84% 91-94% 95-98% 97-99.5%
33#
34# Lower ratios for smaller block sizes are perfectly understandable,
35# because function call overhead is higher in 32-bit mode. Largest
36# 8-KB block performance is virtually same: 32-bit code is less than
37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39# January 2011
40#
41# See aesni-x86_64.pl for details. Unlike x86_64 version this module
42# interleaves at most 6 aes[enc|dec] instructions, because there are
43# not enough registers for 8x interleave [which should be optimal for
44# Sandy Bridge]. Actually, performance results for 6x interleave
45# factor presented in aesni-x86_64.pl (except for CTR) are for this
46# module.
47
48# April 2011
49#
50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
52
53# November 2015
54#
55# Add aesni_ocb_[en|de]crypt.
56
57######################################################################
58# Current large-block performance in cycles per byte processed with
59# 128-bit key (less is better).
60#
61# CBC en-/decrypt CTR XTS ECB OCB
62# Westmere 3.77/1.37 1.37 1.52 1.27
63# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
64# Haswell 4.44/0.80 0.97 1.03 0.72 0.76
65# Skylake 2.68/0.65 0.65 0.66 0.64 0.66
66# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
67# Goldmont 3.84/1.39 1.39 1.63 1.31 1.70
68# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
69
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Work out the directory this script was started from, so x86asm.pl can
# be found either next to it or in ../../perlasm. If $0 carries no
# directory component the prefix is empty and the paths stay relative.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present, names the output file.
# Three-argument open stops the file name from being parsed for mode
# characters, and a failed open now aborts the run instead of silently
# losing the generated code.
$output = pop;
if ($output) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Round keys are loaded with movups whatever $PREFIX is set to.
$movekey=\&movups;
# General-purpose register roles.
($len,$rounds,$key,$inp,$out)=("eax","ecx","edx","esi","edi");
($rounds_,$key_)=("ebx","ebp");	# backup copies for $rounds and $key

# XMM register roles. xmm5, xmm6 and xmm7 each carry a second name
# ($in1, $in0 and $ivec respectively).
($rndkey0,$rndkey1)=("xmm0","xmm1");
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5)=map { "xmm$_" } (2..7);
($in1,$in0,$ivec)=($inout3,$inout4,$inout5);
106# AESNI extension
# Emit the raw encoding of "aeskeygenassist $dst,$src,$imm".
# Only the register-to-register form with xmm0-xmm7 on both sides is
# encoded; any other operand pair produces no output.
sub aeskeygenassist
{ my ($dst,$src,$imm)=@_;
  my @regno=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    if (@regno) {
	# ModR/M byte: mod=11b, reg=destination, r/m=source
	my $modrm=0xc0|($regno[0]<<3)|$regno[1];
	&data_byte(0x66,0x0f,0x3a,0xdf,$modrm,$imm);
    }
}
# Shared encoder for the two-operand AES-NI instructions: emits
# 66 0F 38 <opcodelet> <ModR/M> for an xmm0-xmm7 register pair.
# Any other operand pair produces no output.
sub aescommon
{ my ($opcodelet,$dst,$src)=@_;
  my @regno=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    if (@regno) {
	# ModR/M byte: mod=11b, reg=destination, r/m=source
	my $modrm=0xc0|($regno[0]<<3)|$regno[1];
	&data_byte(0x66,0x0f,0x38,$opcodelet,$modrm);
    }
}
# One thin wrapper per AES-NI instruction; each only supplies the
# third opcode byte and forwards its operands to aescommon.
sub aesimc	{ return aescommon(0xdb,@_); }
sub aesenc	{ return aescommon(0xdc,@_); }
sub aesenclast	{ return aescommon(0xdd,@_); }
sub aesdec	{ return aescommon(0xde,@_); }
sub aesdeclast	{ return aescommon(0xdf,@_); }
122
123
# Inline version of internal aesni_[en|de]crypt1
#
# Arguments: $p is "enc" or "dec"; $inout is the register holding the
# block (defaults to $inout0); $ivec, when given, is a register that is
# xored with round key 0 and then into $inout in place of the plain
# round-key-0 xor. Every call bumps a private counter so that the loop
# label emitted by each expansion is unique.
{ my $serial;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_;
  my $round=\&{"aes${p}"};		# aesenc or aesdec
  my $final=\&{"aes${p}last"};		# aesenclast or aesdeclast
  my $loop;

    $inout=$inout0 unless (defined($inout));
    $loop="${p}1_loop_".(++$serial);

	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	if (defined($ivec)) {
	    &xorps	($ivec,$rndkey0);
	    &lea	($key,&DWP(32,$key));
	    &xorps	($inout,$ivec);
	} else {
	    &lea	($key,&DWP(32,$key));
	    &xorps	($inout,$rndkey0);
	}
    &set_label($loop);
	&$round		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
	&jnz		(&label($loop));
	&$final		($inout,$rndkey1);
}}
144
# Emit _aesni_[en|de]crypt1 as a fully unrolled single-block routine.
# $p is "enc" or "dec"; $inout defaults to $inout0. The emitted code
# compares $rounds with 11: below jumps to the "${p}128" entry, equal
# jumps to "${p}192", and above falls through all unrolled rounds.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @rk=($rndkey1,$rndkey0);	# key registers alternate, $rndkey1 first
  my $slot=0;

    $inout=$inout0 unless (defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));

	# twelve round/reload pairs; the two entry labels sit in front of
	# the pairs that reload from offsets -0x20 and 0 respectively
	for (my $off=-0x40; $off<=0x70; $off+=0x10) {
	    &set_label("${p}192")	if ($off==-0x20);
	    &set_label("${p}128")	if ($off==0);
	    &$round	($inout,$rk[$slot]);
	    &$movekey	($rk[$slot],&QWP($off,$key));
	    $slot^=1;
	}
	&$round		($inout,$rndkey1);
	&$final		($inout,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt1");
}
190
191
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block encryption. The out-of-line helper is emitted only when
# inlining is switched off.
unless ($inline) { &aesni_generate1("enc"); }
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# out
	if ($inline) {
		&aesni_inline_generate1("enc");
	} else {
		&call	("_aesni_encrypt1");
	}
	# clear register bank
	foreach my $reg ($rndkey0,$rndkey1) {
		&pxor	($reg,$reg);
	}
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");
210
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block decryption. The out-of-line helper is emitted only when
# inlining is switched off.
unless ($inline) { &aesni_generate1("dec"); }
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# out
	if ($inline) {
		&aesni_inline_generate1("dec");
	} else {
		&call	("_aesni_decrypt1");
	}
	# clear register bank
	foreach my $reg ($rndkey0,$rndkey1) {
		&pxor	($reg,$reg);
	}
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");
229
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why was the 3x subroutine originally used in loops? Even
# though aes[enc|dec] latency was originally 6, it could be scheduled
# only every *2nd* cycle. Thus 3x interleave was the one providing
# optimal utilization, i.e. when the subroutine's throughput is
# virtually the same as that of the non-interleaved subroutine [for
# number of input blocks up to 3]. This is why it originally made no
# sense to implement a 2x subroutine. But times change and it became
# appropriate to spend an extra 192 bytes on a 2x subroutine on Atom
# Silvermont's account. For processors that can schedule aes[enc|dec]
# every cycle the optimal interleave factor equals the latency of the
# corresponding instruction. 8x is optimal for * Bridge, but it's
# unfeasible to accommodate such an implementation in the XMM registers
# addressable in 32-bit mode and therefore a maximum of 6x is used
# instead...
244
# Emit _aesni_[en|de]crypt2: two blocks ($inout0,$inout1) processed in
# parallel. $p is "enc" or "dec".
sub aesni_generate2
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blk=($inout0,$inout1);

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	# xor round key 0 into every block: xorps for the first, pxor after
	&xorps		($blk[0],$rndkey0);
	foreach my $x (@blk[1..$#blk]) { &pxor($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	foreach my $x (@blk) { &$round($x,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $x (@blk) { &$round($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));

	foreach my $x (@blk) { &$round($x,$rndkey1); }
	foreach my $x (@blk) { &$final($x,$rndkey0); }
	&ret();
    &function_end_B("_aesni_${p}rypt2");
}
275
# Emit _aesni_[en|de]crypt3: three blocks ($inout0..$inout2) processed
# in parallel. $p is "enc" or "dec".
sub aesni_generate3
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blk=($inout0,$inout1,$inout2);

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	# xor round key 0 into every block: xorps for the first, pxor after
	&xorps		($blk[0],$rndkey0);
	foreach my $x (@blk[1..$#blk]) { &pxor($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	foreach my $x (@blk) { &$round($x,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $x (@blk) { &$round($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));

	foreach my $x (@blk) { &$round($x,$rndkey1); }
	foreach my $x (@blk) { &$final($x,$rndkey0); }
	&ret();
    &function_end_B("_aesni_${p}rypt3");
}
311
312# 4x interleave is implemented to improve small block performance,
313# most notably [and naturally] 4 block by ~30%. One can argue that one
314# should have implemented 5x as well, but improvement would be <20%,
315# so it's not worth it...
# Emit _aesni_[en|de]crypt4: four blocks ($inout0..$inout3) processed
# in parallel. $p is "enc" or "dec".
sub aesni_generate4
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blk=($inout0,$inout1,$inout2,$inout3);

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	# xor round key 0 into every block: xorps for the first, pxor after
	&xorps		($blk[0],$rndkey0);
	foreach my $x (@blk[1..$#blk]) { &pxor($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);	# four raw bytes emitted verbatim
	&add		($rounds,16);

    &set_label("${p}4_loop");
	foreach my $x (@blk) { &$round($x,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $x (@blk) { &$round($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}4_loop"));

	foreach my $x (@blk) { &$round($x,$rndkey1); }
	foreach my $x (@blk) { &$final($x,$rndkey0); }
	&ret();
    &function_end_B("_aesni_${p}rypt4");
}
358
# Emit _aesni_[en|de]crypt6: six blocks ($inout0..$inout5) processed in
# parallel. $p is "enc" or "dec". Besides the function itself, the
# static label "_aesni_${p}rypt6_enter" is placed inside the loop body,
# just before the $rndkey1 reload.
sub aesni_generate6
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
  my @lower=@blk[0..2];
  my @upper=@blk[3..5];

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	# prologue: the round-key-0 xors are interleaved with the first
	# round on the lower three blocks
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($blk[0],$rndkey0);
	&pxor		($blk[1],$rndkey0);	# pxor does better here
	&pxor		($blk[2],$rndkey0);
	&$round		($blk[0],$rndkey1);
	&pxor		($blk[3],$rndkey0);
	&pxor		($blk[4],$rndkey0);
	&$round		($blk[1],$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$round		($blk[2],$rndkey1);
	&pxor		($blk[5],$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	foreach my $x (@lower) { &$round($x,$rndkey1); }
    &set_label("_aesni_${p}rypt6_inner");
	foreach my $x (@upper) { &$round($x,$rndkey1); }
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $x (@blk) { &$round($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}6_loop"));

	foreach my $x (@blk) { &$round($x,$rndkey1); }
	foreach my $x (@blk) { &$final($x,$rndkey0); }
	&ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the 2x, 3x, 4x and 6x helpers in that order. The decrypt flavour
# is always emitted; the encrypt flavour only when $PREFIX is "aesni".
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
		      \&aesni_generate4,\&aesni_generate6) {
	&$generate("enc") if ($PREFIX eq "aesni");
	&$generate("dec");
}
425
426
427if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# The encrypt and decrypt halves have the same shape, so one loop emits
# both: first the "enc" half, then the "dec" half behind the
# "ecb_decrypt" label. Length is rounded down to a multiple of 16;
# 0x60 bytes (six blocks) are handled per main-loop iteration and the
# remaining one to five blocks by the tail code.
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

foreach my $op ("enc","dec") {
    my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
    my @tail=("one","two","three","four");
    # store the first $n blocks to consecutive 16-byte slots at $out
    my $store=sub {
	my $n=shift;
	foreach my $i (0..$n-1) {
	    &movups	(&QWP(0x10*$i,$out),$blk[$i]);
	}
    };

    ##################################################################
    &set_label("ecb_decrypt",16)	if ($op eq "dec");
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_${op}_tail"));

	foreach my $i (0..5) {
	    &movdqu	($blk[$i],&QWP(0x10*$i,$inp));
	}
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_${op}_loop6_enter"));

    &set_label("ecb_${op}_loop6",16);
	# write out the previous six results while loading the next six
	foreach my $i (0..4) {
	    &movups	(&QWP(0x10*$i,$out),$blk[$i]);
	    &movdqu	($blk[$i],&QWP(0x10*$i,$inp));
	}
	&movups	(&QWP(0x50,$out),$blk[5]);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($blk[5],&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
    &set_label("ecb_${op}_loop6_enter");

	&call	("_aesni_${op}rypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_${op}_loop6"));

	&$store	(6);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

    &set_label("ecb_${op}_tail");
	# load blocks one at a time, branching out as soon as the
	# remaining length is accounted for
	&movups	($blk[0],&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_${op}_one"));
	&movups	($blk[1],&QWP(0x10,$inp));
	&je	(&label("ecb_${op}_two"));
	&movups	($blk[2],&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_${op}_three"));
	&movups	($blk[3],&QWP(0x30,$inp));
	&je	(&label("ecb_${op}_four"));
	&movups	($blk[4],&QWP(0x40,$inp));
	&xorps	($blk[5],$blk[5]);
	&call	("_aesni_${op}rypt6");
	&$store	(5);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${op}_one",16);
	if ($inline) {
	    &aesni_inline_generate1($op);
	} else {
	    &call	("_aesni_${op}rypt1");
	}
	&$store	(1);
	&jmp	(&label("ecb_ret"));

    foreach my $n (2..4) {
	&set_label("ecb_${op}_$tail[$n-1]",16);
	&call	("_aesni_${op}rypt$n");
	&$store	($n);
	# the very last case falls through into ecb_ret
	&jmp	(&label("ecb_ret"))	unless ($op eq "dec" && $n==4);
    }
}

&set_label("ecb_ret");
	# clear register bank
	foreach my $i (0..7) {
	    &pxor	("xmm$i","xmm$i");
	}
&function_end("aesni_ecb_encrypt");
652
653
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Both functions share their entry sequence, the double-round loop
# and their exit sequence; these are emitted by the three closures
# below.
#
{ my $cmac=$inout1;

  # Load arguments, carve out an aligned stack frame and fill it:
  #	 0(%esp)	byte-swap mask for pshufb
  #	16(%esp)	counter increment vector (1,0,0,0)
  #	48(%esp)	saved %esp
  my $prologue=sub {
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	my @mask=(0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203);
	foreach my $i (0..3) {
	    &mov	(&DWP(4*$i,"esp"),$mask[$i]);
	}

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	foreach my $off (20,24,28) {
	    &mov	(&DWP($off,"esp"),$key_);
	}
  };

  # Two rounds per iteration on the counter block and the CMAC block.
  my $round_loop=sub {
	my $name=shift;
    &set_label($name);
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($name));
  };

  # Restore %esp, write the CMAC back and wipe all XMM registers.
  my $epilogue=sub {
	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	# clear register bank
	foreach my $i (0..7) {
	    &pxor	("xmm$i","xmm$i");
	}
  };

&function_begin("aesni_ccm64_encrypt_blocks");
	&$prologue();

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

	&$round_loop("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&$epilogue();
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
	&$prologue();

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	if ($inline) {
	    &aesni_inline_generate1("enc");
	} else {
	    &call	("_aesni_encrypt1");
	}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

	&$round_loop("ccm64_dec2_loop");
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp		(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline) {
	    &aesni_inline_generate1("enc",$cmac,$in0);
	} else {
	    &call	("_aesni_encrypt1",$cmac);
	}

	&$epilogue();
&function_end("aesni_ccm64_decrypt_blocks");
}
861
862
863######################################################################
864# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
865# size_t blocks, const AES_KEY *key,
866# const char *ivec);
867#
868# Handles only complete blocks, operates on 32-bit counter and
869# does not update *ivec! (see crypto/modes/ctr128.c for details)
870#
871# stack layout:
872# 0 pshufb mask
873# 16 vector addend: 0,6,6,6
874# 32 counter-less ivec
875# 48 1st triplet of counter vector
876# 64 2nd triplet of counter vector
877# 80 saved %esp
878
879&function_begin("aesni_ctr32_encrypt_blocks");
880 &mov ($inp,&wparam(0));
881 &mov ($out,&wparam(1));
882 &mov ($len,&wparam(2));
883 &mov ($key,&wparam(3));
884 &mov ($rounds_,&wparam(4));
885 &mov ($key_,"esp");
886 &sub ("esp",88);
887 &and ("esp",-16); # align stack
888 &mov (&DWP(80,"esp"),$key_);
889
890 &cmp ($len,1);
891 &je (&label("ctr32_one_shortcut"));
892
893 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
894
895 # compose byte-swap control mask for pshufb on stack
896 &mov (&DWP(0,"esp"),0x0c0d0e0f);
897 &mov (&DWP(4,"esp"),0x08090a0b);
898 &mov (&DWP(8,"esp"),0x04050607);
899 &mov (&DWP(12,"esp"),0x00010203);
900
901 # compose counter increment vector on stack
902 &mov ($rounds,6);
903 &xor ($key_,$key_);
904 &mov (&DWP(16,"esp"),$rounds);
905 &mov (&DWP(20,"esp"),$rounds);
906 &mov (&DWP(24,"esp"),$rounds);
907 &mov (&DWP(28,"esp"),$key_);
908
909 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
910 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
911
912 &mov ($rounds,&DWP(240,$key)); # key->rounds
913
914 # compose 2 vectors of 3x32-bit counters
915 &bswap ($rounds_);
916 &pxor ($rndkey0,$rndkey0);
917 &pxor ($rndkey1,$rndkey1);
918 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
919 &pinsrd ($rndkey0,$rounds_,0);
920 &lea ($key_,&DWP(3,$rounds_));
921 &pinsrd ($rndkey1,$key_,0);
922 &inc ($rounds_);
923 &pinsrd ($rndkey0,$rounds_,1);
924 &inc ($key_);
925 &pinsrd ($rndkey1,$key_,1);
926 &inc ($rounds_);
927 &pinsrd ($rndkey0,$rounds_,2);
928 &inc ($key_);
929 &pinsrd ($rndkey1,$key_,2);
930 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
931 &pshufb ($rndkey0,$inout0); # byte swap
932 &movdqu ($inout4,&QWP(0,$key)); # key[0]
933 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
934 &pshufb ($rndkey1,$inout0); # byte swap
935
936 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
937 &pshufd ($inout1,$rndkey0,2<<6);
938 &cmp ($len,6);
939 &jb (&label("ctr32_tail"));
940 &pxor ($inout5,$inout4); # counter-less ivec^key[0]
941 &shl ($rounds,4);
942 &mov ($rounds_,16);
943 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
944 &mov ($key_,$key); # backup $key
945 &sub ($rounds_,$rounds); # backup twisted $rounds
946 &lea ($key,&DWP(32,$key,$rounds));
947 &sub ($len,6);
948 &jmp (&label("ctr32_loop6"));
949
950&set_label("ctr32_loop6",16);
951 # inlining _aesni_encrypt6's prologue gives ~6% improvement...
952 &pshufd ($inout2,$rndkey0,1<<6);
953 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
954 &pshufd ($inout3,$rndkey1,3<<6);
955 &pxor ($inout0,$rndkey0); # merge counter-less ivec
956 &pshufd ($inout4,$rndkey1,2<<6);
957 &pxor ($inout1,$rndkey0);
958 &pshufd ($inout5,$rndkey1,1<<6);
959 &$movekey ($rndkey1,&QWP(16,$key_));
960 &pxor ($inout2,$rndkey0);
961 &pxor ($inout3,$rndkey0);
962 &aesenc ($inout0,$rndkey1);
963 &pxor ($inout4,$rndkey0);
964 &pxor ($inout5,$rndkey0);
965 &aesenc ($inout1,$rndkey1);
966 &$movekey ($rndkey0,&QWP(32,$key_));
967 &mov ($rounds,$rounds_);
968 &aesenc ($inout2,$rndkey1);
969 &aesenc ($inout3,$rndkey1);
970 &aesenc ($inout4,$rndkey1);
971 &aesenc ($inout5,$rndkey1);
972
973 &call (&label("_aesni_encrypt6_enter"));
974
975 &movups ($rndkey1,&QWP(0,$inp));
976 &movups ($rndkey0,&QWP(0x10,$inp));
977 &xorps ($inout0,$rndkey1);
978 &movups ($rndkey1,&QWP(0x20,$inp));
979 &xorps ($inout1,$rndkey0);
980 &movups (&QWP(0,$out),$inout0);
981 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
982 &xorps ($inout2,$rndkey1);
983 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
984 &movups (&QWP(0x10,$out),$inout1);
985 &movups (&QWP(0x20,$out),$inout2);
986
987 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
988 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
989 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
990
991 &movups ($inout1,&QWP(0x30,$inp));
992 &movups ($inout2,&QWP(0x40,$inp));
993 &xorps ($inout3,$inout1);
994 &movups ($inout1,&QWP(0x50,$inp));
995 &lea ($inp,&DWP(0x60,$inp));
996 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
997 &pshufb ($rndkey0,$inout0); # byte swap
998 &xorps ($inout4,$inout2);
999 &movups (&QWP(0x30,$out),$inout3);
1000 &xorps ($inout5,$inout1);
1001 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
1002 &pshufb ($rndkey1,$inout0); # byte swap
1003 &movups (&QWP(0x40,$out),$inout4);
1004 &pshufd ($inout0,$rndkey0,3<<6);
1005 &movups (&QWP(0x50,$out),$inout5);
1006 &lea ($out,&DWP(0x60,$out));
1007
1008 &pshufd ($inout1,$rndkey0,2<<6);
1009 &sub ($len,6);
1010 &jnc (&label("ctr32_loop6"));
1011
1012 &add ($len,6);
1013 &jz (&label("ctr32_ret"));
1014 &movdqu ($inout5,&QWP(0,$key_));
1015 &mov ($key,$key_);
1016 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
1017 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1018
1019&set_label("ctr32_tail");
1020 &por ($inout0,$inout5);
1021 &cmp ($len,2);
1022 &jb (&label("ctr32_one"));
1023
1024 &pshufd ($inout2,$rndkey0,1<<6);
1025 &por ($inout1,$inout5);
1026 &je (&label("ctr32_two"));
1027
1028 &pshufd ($inout3,$rndkey1,3<<6);
1029 &por ($inout2,$inout5);
1030 &cmp ($len,4);
1031 &jb (&label("ctr32_three"));
1032
1033 &pshufd ($inout4,$rndkey1,2<<6);
1034 &por ($inout3,$inout5);
1035 &je (&label("ctr32_four"));
1036
1037 &por ($inout4,$inout5);
1038 &call ("_aesni_encrypt6");
1039 &movups ($rndkey1,&QWP(0,$inp));
1040 &movups ($rndkey0,&QWP(0x10,$inp));
1041 &xorps ($inout0,$rndkey1);
1042 &movups ($rndkey1,&QWP(0x20,$inp));
1043 &xorps ($inout1,$rndkey0);
1044 &movups ($rndkey0,&QWP(0x30,$inp));
1045 &xorps ($inout2,$rndkey1);
1046 &movups ($rndkey1,&QWP(0x40,$inp));
1047 &xorps ($inout3,$rndkey0);
1048 &movups (&QWP(0,$out),$inout0);
1049 &xorps ($inout4,$rndkey1);
1050 &movups (&QWP(0x10,$out),$inout1);
1051 &movups (&QWP(0x20,$out),$inout2);
1052 &movups (&QWP(0x30,$out),$inout3);
1053 &movups (&QWP(0x40,$out),$inout4);
1054 &jmp (&label("ctr32_ret"));
1055
1056&set_label("ctr32_one_shortcut",16);
1057 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
1058 &mov ($rounds,&DWP(240,$key));
1059
1060&set_label("ctr32_one");
1061 if ($inline)
1062 { &aesni_inline_generate1("enc"); }
1063 else
1064 { &call ("_aesni_encrypt1"); }
1065 &movups ($in0,&QWP(0,$inp));
1066 &xorps ($in0,$inout0);
1067 &movups (&QWP(0,$out),$in0);
1068 &jmp (&label("ctr32_ret"));
1069
1070&set_label("ctr32_two",16);
1071 &call ("_aesni_encrypt2");
1072 &movups ($inout3,&QWP(0,$inp));
1073 &movups ($inout4,&QWP(0x10,$inp));
1074 &xorps ($inout0,$inout3);
1075 &xorps ($inout1,$inout4);
1076 &movups (&QWP(0,$out),$inout0);
1077 &movups (&QWP(0x10,$out),$inout1);
1078 &jmp (&label("ctr32_ret"));
1079
1080&set_label("ctr32_three",16);
1081 &call ("_aesni_encrypt3");
1082 &movups ($inout3,&QWP(0,$inp));
1083 &movups ($inout4,&QWP(0x10,$inp));
1084 &xorps ($inout0,$inout3);
1085 &movups ($inout5,&QWP(0x20,$inp));
1086 &xorps ($inout1,$inout4);
1087 &movups (&QWP(0,$out),$inout0);
1088 &xorps ($inout2,$inout5);
1089 &movups (&QWP(0x10,$out),$inout1);
1090 &movups (&QWP(0x20,$out),$inout2);
1091 &jmp (&label("ctr32_ret"));
1092
1093&set_label("ctr32_four",16);
1094 &call ("_aesni_encrypt4");
1095 &movups ($inout4,&QWP(0,$inp));
1096 &movups ($inout5,&QWP(0x10,$inp));
1097 &movups ($rndkey1,&QWP(0x20,$inp));
1098 &xorps ($inout0,$inout4);
1099 &movups ($rndkey0,&QWP(0x30,$inp));
1100 &xorps ($inout1,$inout5);
1101 &movups (&QWP(0,$out),$inout0);
1102 &xorps ($inout2,$rndkey1);
1103 &movups (&QWP(0x10,$out),$inout1);
1104 &xorps ($inout3,$rndkey0);
1105 &movups (&QWP(0x20,$out),$inout2);
1106 &movups (&QWP(0x30,$out),$inout3);
1107
1108&set_label("ctr32_ret");
1109 &pxor ("xmm0","xmm0"); # clear register bank
1110 &pxor ("xmm1","xmm1");
1111 &pxor ("xmm2","xmm2");
1112 &pxor ("xmm3","xmm3");
1113 &pxor ("xmm4","xmm4");
1114 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
1115 &pxor ("xmm5","xmm5");
1116 &movdqa (&QWP(48,"esp"),"xmm0");
1117 &pxor ("xmm6","xmm6");
1118 &movdqa (&QWP(64,"esp"),"xmm0");
1119 &pxor ("xmm7","xmm7");
1120 &mov ("esp",&DWP(80,"esp"));
1121&function_end("aesni_ctr32_encrypt_blocks");
1122
1123
1124######################################################################
1125# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1126# const AES_KEY *key1, const AES_KEY *key2,
1127# const unsigned char iv[16]);
1128#
1129{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1130
# aesni_xts_encrypt: emits the XTS encryption routine (arguments as listed
# in the prototype comment above: inp, out, len, key1, key2, iv).
#
# Outline of the generated code:
#   1. Encrypt the 16-byte clear-text tweak with key2 -> initial tweak.
#   2. Build a 16-byte aligned scratch frame on the stack.
#   3. xts_enc_loop6: process six blocks per iteration.
#   4. xts_enc_short: process the remaining 1..5 whole blocks.
#   5. xts_enc_steal: if len%16 != 0, swap the trailing bytes with the
#      last output block and re-encrypt it (ciphertext stealing).
#   6. xts_enc_ret: wipe XMM registers and the tweak slots, restore %esp.
#
# Frame layout after alignment (offsets from %esp):
#   16*0..16*5  per-block tweaks of the batch in flight
#   16*6        dword mask {0x87,0,1,0} used when doubling the tweak
#   16*7+0      original $len, later overwritten with $len%16
#   16*7+4      caller's %esp
#
# Tweak doubling idiom used throughout: pcmpgtd against zero turns every
# dword of $tweak into all-ones when its top bit is set; pshufd 0x13 moves
# the dword-3 result to dword 0 and the dword-1 result to dword 2; pand
# with the mask keeps 0x87 (if bit 127 was set) and 1 (if bit 63 was set);
# paddq shifts each 64-bit half left by one; pxor folds the two in.
1131&function_begin("aesni_xts_encrypt");
1132 &mov ($key,&wparam(4)); # key2
1133 &mov ($inp,&wparam(5)); # clear-text tweak
1134
1135 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1136 &movups ($inout0,&QWP(0,$inp));
# single-block encryption of the clear-text tweak with key2; the result
# stays in $inout0 and is copied to $tweak below
1137 if ($inline)
1138 { &aesni_inline_generate1("enc"); }
1139 else
1140 { &call ("_aesni_encrypt1"); }
1141
1142 &mov ($inp,&wparam(0));
1143 &mov ($out,&wparam(1));
1144 &mov ($len,&wparam(2));
1145 &mov ($key,&wparam(3)); # key1
1146
# $key_ temporarily carries the caller's %esp until it is stored at 16*7+4
1147 &mov ($key_,"esp");
1148 &sub ("esp",16*7+8);
1149 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1150 &and ("esp",-16); # align stack
1151
1152 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1153 &mov (&DWP(16*6+4,"esp"),0);
1154 &mov (&DWP(16*6+8,"esp"),1);
1155 &mov (&DWP(16*6+12,"esp"),0);
1156 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1157 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1158
1159 &movdqa ($tweak,$inout0);
1160 &pxor ($twtmp,$twtmp);
1161 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1162 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1163
# $len is rounded down to whole blocks and biased by -16*6 so that the
# carry/borrow of each subtraction tells whether six more blocks remain
1164 &and ($len,-16);
1165 &mov ($key_,$key); # backup $key
1166 &mov ($rounds_,$rounds); # backup $rounds
1167 &sub ($len,16*6);
1168 &jc (&label("xts_enc_short"));
1169
# six-block path: $rounds_ = 16-16*rounds and $key = key1+32+16*rounds,
# presumably the form _aesni_encrypt6_enter expects (it is defined
# outside this section)
1170 &shl ($rounds,4);
1171 &mov ($rounds_,16);
1172 &sub ($rounds_,$rounds);
1173 &lea ($key,&DWP(32,$key,$rounds));
1174 &jmp (&label("xts_enc_loop6"));
1175
1176&set_label("xts_enc_loop6",16);
# Perl-time unrolling: tweaks 0..3 are stored at 16*0..16*3(%esp), each
# followed by a doubling step that produces the next tweak
1177 for ($i=0;$i<4;$i++) {
1178 &pshufd ($twres,$twtmp,0x13);
1179 &pxor ($twtmp,$twtmp);
1180 &movdqa (&QWP(16*$i,"esp"),$tweak);
1181 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1182 &pand ($twres,$twmask); # isolate carry and residue
1183 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1184 &pxor ($tweak,$twres);
1185 }
# $i is 4 here: tweak 4 goes to 16*4(%esp) and $i becomes 5; tweak 5 is
# formed directly in $inout5 and saved at 16*5(%esp) further down
1186 &pshufd ($inout5,$twtmp,0x13);
1187 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1188 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1189 &$movekey ($rndkey0,&QWP(0,$key_));
1190 &pand ($inout5,$twmask); # isolate carry and residue
1191 &movups ($inout0,&QWP(0,$inp)); # load input
1192 &pxor ($inout5,$tweak);
1193
1194 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1195 &mov ($rounds,$rounds_); # restore $rounds
1196 &movdqu ($inout1,&QWP(16*1,$inp));
1197 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1198 &movdqu ($inout2,&QWP(16*2,$inp));
1199 &pxor ($inout1,$rndkey0);
1200 &movdqu ($inout3,&QWP(16*3,$inp));
1201 &pxor ($inout2,$rndkey0);
1202 &movdqu ($inout4,&QWP(16*4,$inp));
1203 &pxor ($inout3,$rndkey0);
# $rndkey1 (aliased as $tweak) is borrowed to hold the sixth input block
1204 &movdqu ($rndkey1,&QWP(16*5,$inp));
1205 &pxor ($inout4,$rndkey0);
1206 &lea ($inp,&DWP(16*6,$inp));
1207 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1208 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1209 &pxor ($inout5,$rndkey1);
1210
1211 &$movekey ($rndkey1,&QWP(16,$key_));
1212 &pxor ($inout1,&QWP(16*1,"esp"));
1213 &pxor ($inout2,&QWP(16*2,"esp"));
1214 &aesenc ($inout0,$rndkey1);
1215 &pxor ($inout3,&QWP(16*3,"esp"));
1216 &pxor ($inout4,&QWP(16*4,"esp"));
1217 &aesenc ($inout1,$rndkey1);
1218 &pxor ($inout5,$rndkey0);
1219 &$movekey ($rndkey0,&QWP(32,$key_));
1220 &aesenc ($inout2,$rndkey1);
1221 &aesenc ($inout3,$rndkey1);
1222 &aesenc ($inout4,$rndkey1);
1223 &aesenc ($inout5,$rndkey1);
1224 &call (&label("_aesni_encrypt6_enter"));
1225
# post-whitening and stores are interleaved with the start of the next
# tweak computation, which continues from the last tweak at 16*5(%esp)
1226 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1227 &pxor ($twtmp,$twtmp);
1228 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1229 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1230 &xorps ($inout1,&QWP(16*1,"esp"));
1231 &movups (&QWP(16*0,$out),$inout0); # write output
1232 &xorps ($inout2,&QWP(16*2,"esp"));
1233 &movups (&QWP(16*1,$out),$inout1);
1234 &xorps ($inout3,&QWP(16*3,"esp"));
1235 &movups (&QWP(16*2,$out),$inout2);
1236 &xorps ($inout4,&QWP(16*4,"esp"));
1237 &movups (&QWP(16*3,$out),$inout3);
1238 &xorps ($inout5,$tweak);
1239 &movups (&QWP(16*4,$out),$inout4);
1240 &pshufd ($twres,$twtmp,0x13);
1241 &movups (&QWP(16*5,$out),$inout5);
1242 &lea ($out,&DWP(16*6,$out));
# $twmask shares a register with $inout1, so the mask is reloaded here
1243 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1244
1245 &pxor ($twtmp,$twtmp);
1246 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1247 &pand ($twres,$twmask); # isolate carry and residue
1248 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1249 &pxor ($tweak,$twres);
1250
1251 &sub ($len,16*6);
1252 &jnc (&label("xts_enc_loop6"));
1253
1254 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1255 &mov ($key,$key_); # restore $key
1256 &mov ($rounds_,$rounds);
1257
1258&set_label("xts_enc_short");
# undo the -16*6 bias; $len is now the byte count of the 0..5 whole blocks
# left. The cmp results below are consumed by jb/je pairs separated only
# by SSE instructions, which leave EFLAGS untouched.
1259 &add ($len,16*6);
1260 &jz (&label("xts_enc_done6x"));
1261
1262 &movdqa ($inout3,$tweak); # put aside previous tweak
1263 &cmp ($len,0x20);
1264 &jb (&label("xts_enc_one"));
1265
1266 &pshufd ($twres,$twtmp,0x13);
1267 &pxor ($twtmp,$twtmp);
1268 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1269 &pand ($twres,$twmask); # isolate carry and residue
1270 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1271 &pxor ($tweak,$twres);
1272 &je (&label("xts_enc_two"));
1273
1274 &pshufd ($twres,$twtmp,0x13);
1275 &pxor ($twtmp,$twtmp);
1276 &movdqa ($inout4,$tweak); # put aside previous tweak
1277 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1278 &pand ($twres,$twmask); # isolate carry and residue
1279 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1280 &pxor ($tweak,$twres);
1281 &cmp ($len,0x40);
1282 &jb (&label("xts_enc_three"));
1283
1284 &pshufd ($twres,$twtmp,0x13);
1285 &pxor ($twtmp,$twtmp);
1286 &movdqa ($inout5,$tweak); # put aside previous tweak
1287 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1288 &pand ($twres,$twmask); # isolate carry and residue
1289 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1290 &pxor ($tweak,$twres);
1291 &movdqa (&QWP(16*0,"esp"),$inout3);
1292 &movdqa (&QWP(16*1,"esp"),$inout4);
1293 &je (&label("xts_enc_four"));
1294
# five blocks left: tweaks 0..4 end up at 16*0..16*4(%esp); the six-lane
# helper is called but only five results are stored
1295 &movdqa (&QWP(16*2,"esp"),$inout5);
1296 &pshufd ($inout5,$twtmp,0x13);
1297 &movdqa (&QWP(16*3,"esp"),$tweak);
1298 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1299 &pand ($inout5,$twmask); # isolate carry and residue
1300 &pxor ($inout5,$tweak);
1301
1302 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1303 &movdqu ($inout1,&QWP(16*1,$inp));
1304 &movdqu ($inout2,&QWP(16*2,$inp));
1305 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1306 &movdqu ($inout3,&QWP(16*3,$inp));
1307 &pxor ($inout1,&QWP(16*1,"esp"));
1308 &movdqu ($inout4,&QWP(16*4,$inp));
1309 &pxor ($inout2,&QWP(16*2,"esp"));
1310 &lea ($inp,&DWP(16*5,$inp));
1311 &pxor ($inout3,&QWP(16*3,"esp"));
1312 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1313 &pxor ($inout4,$inout5);
1314
1315 &call ("_aesni_encrypt6");
1316
1317 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1318 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1319 &xorps ($inout1,&QWP(16*1,"esp"));
1320 &xorps ($inout2,&QWP(16*2,"esp"));
1321 &movups (&QWP(16*0,$out),$inout0); # write output
1322 &xorps ($inout3,&QWP(16*3,"esp"));
1323 &movups (&QWP(16*1,$out),$inout1);
1324 &xorps ($inout4,$tweak);
1325 &movups (&QWP(16*2,$out),$inout2);
1326 &movups (&QWP(16*3,$out),$inout3);
1327 &movups (&QWP(16*4,$out),$inout4);
1328 &lea ($out,&DWP(16*5,$out));
1329 &jmp (&label("xts_enc_done"));
1330
# one block left: its tweak was put aside in $inout3
1331&set_label("xts_enc_one",16);
1332 &movups ($inout0,&QWP(16*0,$inp)); # load input
1333 &lea ($inp,&DWP(16*1,$inp));
1334 &xorps ($inout0,$inout3); # input^=tweak
1335 if ($inline)
1336 { &aesni_inline_generate1("enc"); }
1337 else
1338 { &call ("_aesni_encrypt1"); }
1339 &xorps ($inout0,$inout3); # output^=tweak
1340 &movups (&QWP(16*0,$out),$inout0); # write output
1341 &lea ($out,&DWP(16*1,$out));
1342
1343 &movdqa ($tweak,$inout3); # last tweak
1344 &jmp (&label("xts_enc_done"));
1345
# two blocks left: tweaks in $inout3 and $inout4
1346&set_label("xts_enc_two",16);
1347 &movaps ($inout4,$tweak); # put aside last tweak
1348
1349 &movups ($inout0,&QWP(16*0,$inp)); # load input
1350 &movups ($inout1,&QWP(16*1,$inp));
1351 &lea ($inp,&DWP(16*2,$inp));
1352 &xorps ($inout0,$inout3); # input^=tweak
1353 &xorps ($inout1,$inout4);
1354
1355 &call ("_aesni_encrypt2");
1356
1357 &xorps ($inout0,$inout3); # output^=tweak
1358 &xorps ($inout1,$inout4);
1359 &movups (&QWP(16*0,$out),$inout0); # write output
1360 &movups (&QWP(16*1,$out),$inout1);
1361 &lea ($out,&DWP(16*2,$out));
1362
1363 &movdqa ($tweak,$inout4); # last tweak
1364 &jmp (&label("xts_enc_done"));
1365
# three blocks left: tweaks in $inout3, $inout4 and $inout5
1366&set_label("xts_enc_three",16);
1367 &movaps ($inout5,$tweak); # put aside last tweak
1368 &movups ($inout0,&QWP(16*0,$inp)); # load input
1369 &movups ($inout1,&QWP(16*1,$inp));
1370 &movups ($inout2,&QWP(16*2,$inp));
1371 &lea ($inp,&DWP(16*3,$inp));
1372 &xorps ($inout0,$inout3); # input^=tweak
1373 &xorps ($inout1,$inout4);
1374 &xorps ($inout2,$inout5);
1375
1376 &call ("_aesni_encrypt3");
1377
1378 &xorps ($inout0,$inout3); # output^=tweak
1379 &xorps ($inout1,$inout4);
1380 &xorps ($inout2,$inout5);
1381 &movups (&QWP(16*0,$out),$inout0); # write output
1382 &movups (&QWP(16*1,$out),$inout1);
1383 &movups (&QWP(16*2,$out),$inout2);
1384 &lea ($out,&DWP(16*3,$out));
1385
1386 &movdqa ($tweak,$inout5); # last tweak
1387 &jmp (&label("xts_enc_done"));
1388
# four blocks left: tweaks 0 and 1 were spilled to 16*0/16*1(%esp) before
# the branch, tweak 2 is in $inout5 and tweak 3 is copied to $inout4
1389&set_label("xts_enc_four",16);
1390 &movaps ($inout4,$tweak); # put aside last tweak
1391
1392 &movups ($inout0,&QWP(16*0,$inp)); # load input
1393 &movups ($inout1,&QWP(16*1,$inp));
1394 &movups ($inout2,&QWP(16*2,$inp));
1395 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1396 &movups ($inout3,&QWP(16*3,$inp));
1397 &lea ($inp,&DWP(16*4,$inp));
1398 &xorps ($inout1,&QWP(16*1,"esp"));
1399 &xorps ($inout2,$inout5);
1400 &xorps ($inout3,$inout4);
1401
1402 &call ("_aesni_encrypt4");
1403
1404 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1405 &xorps ($inout1,&QWP(16*1,"esp"));
1406 &xorps ($inout2,$inout5);
1407 &movups (&QWP(16*0,$out),$inout0); # write output
1408 &xorps ($inout3,$inout4);
1409 &movups (&QWP(16*1,$out),$inout1);
1410 &movups (&QWP(16*2,$out),$inout2);
1411 &movups (&QWP(16*3,$out),$inout3);
1412 &lea ($out,&DWP(16*4,$out));
1413
1414 &movdqa ($tweak,$inout4); # last tweak
1415 &jmp (&label("xts_enc_done"));
1416
# reached when no whole blocks remain after the six-block loop (or at all);
# $tweak already holds the next unused tweak, so no doubling is needed
1417&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1418 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1419 &and ($len,15);
1420 &jz (&label("xts_enc_ret"));
1421 &movdqa ($inout3,$tweak);
1422 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1423 &jmp (&label("xts_enc_steal"));
1424
# reached from the 1..5 block paths with $tweak = tweak of the last block
# processed; the next tweak is derived into $inout3 only if a partial
# trailing block exists
1425&set_label("xts_enc_done",16);
1426 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1427 &pxor ($twtmp,$twtmp);
1428 &and ($len,15);
1429 &jz (&label("xts_enc_ret"));
1430
1431 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1432 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1433 &pshufd ($inout3,$twtmp,0x13);
1434 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1435 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1436 &pxor ($inout3,$tweak);
1437
# byte loop over the $len%16 trailing bytes: each input byte replaces the
# corresponding byte of the last output block (at $out-16), and the byte
# it displaces is emitted at $out; $rounds and $key serve as scratch
1438&set_label("xts_enc_steal");
1439 &movz ($rounds,&BP(0,$inp));
1440 &movz ($key,&BP(-16,$out));
1441 &lea ($inp,&DWP(1,$inp));
1442 &mov (&BP(-16,$out),&LB($rounds));
1443 &mov (&BP(0,$out),&LB($key));
1444 &lea ($out,&DWP(1,$out));
1445 &sub ($len,1);
1446 &jnz (&label("xts_enc_steal"));
1447
1448 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1449 &mov ($key,$key_); # restore $key
1450 &mov ($rounds,$rounds_); # restore $rounds
1451
# re-encrypt the patched block at $out-16 in place with the tweak in $inout3
1452 &movups ($inout0,&QWP(-16,$out)); # load input
1453 &xorps ($inout0,$inout3); # input^=tweak
1454 if ($inline)
1455 { &aesni_inline_generate1("enc"); }
1456 else
1457 { &call ("_aesni_encrypt1"); }
1458 &xorps ($inout0,$inout3); # output^=tweak
1459 &movups (&QWP(-16,$out),$inout0); # write output
1460
# common exit: zero xmm0..xmm7 and the six tweak slots (the mask slot at
# 16*6 is left as is), then reload the caller's %esp from the frame
1461&set_label("xts_enc_ret");
1462 &pxor ("xmm0","xmm0"); # clear register bank
1463 &pxor ("xmm1","xmm1");
1464 &pxor ("xmm2","xmm2");
1465 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1466 &pxor ("xmm3","xmm3");
1467 &movdqa (&QWP(16*1,"esp"),"xmm0");
1468 &pxor ("xmm4","xmm4");
1469 &movdqa (&QWP(16*2,"esp"),"xmm0");
1470 &pxor ("xmm5","xmm5");
1471 &movdqa (&QWP(16*3,"esp"),"xmm0");
1472 &pxor ("xmm6","xmm6");
1473 &movdqa (&QWP(16*4,"esp"),"xmm0");
1474 &pxor ("xmm7","xmm7");
1475 &movdqa (&QWP(16*5,"esp"),"xmm0");
1476 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1477&function_end("aesni_xts_encrypt");
1478
# aesni_xts_decrypt: emits the XTS decryption routine (arguments as listed
# in the prototype comment above: inp, out, len, key1, key2, iv).
#
# Outline of the generated code:
#   1. Encrypt the 16-byte clear-text tweak with key2 -> initial tweak
#      (the "enc" direction is used here even though data is decrypted).
#   2. Build a 16-byte aligned scratch frame; if len%16 != 0, one whole
#      block is held back ($len -= 16) for the stealing step.
#   3. xts_dec_loop6: process six blocks per iteration.
#   4. xts_dec_short: process the remaining 1..5 whole blocks.
#   5. xts_dec_only_one_more / xts_dec_steal: when a partial trailing
#      block exists, decrypt the held-back block with the *later* tweak,
#      swap the trailing bytes, then decrypt the patched block with the
#      *earlier* tweak.
#   6. xts_dec_ret: wipe XMM registers and the tweak slots, restore %esp.
#
# Frame layout after alignment (offsets from %esp):
#   16*0..16*5  per-block tweaks of the batch in flight
#   16*6        dword mask {0x87,0,1,0} used when doubling the tweak
#   16*7+0      adjusted $len, later overwritten with $len%16
#   16*7+4      caller's %esp
#
# Tweak doubling idiom used throughout: pcmpgtd against zero turns every
# dword of $tweak into all-ones when its top bit is set; pshufd 0x13 moves
# the dword-3 result to dword 0 and the dword-1 result to dword 2; pand
# with the mask keeps 0x87 (if bit 127 was set) and 1 (if bit 63 was set);
# paddq shifts each 64-bit half left by one; pxor folds the two in.
1479&function_begin("aesni_xts_decrypt");
1480 &mov ($key,&wparam(4)); # key2
1481 &mov ($inp,&wparam(5)); # clear-text tweak
1482
1483 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1484 &movups ($inout0,&QWP(0,$inp));
# single-block encryption of the clear-text tweak with key2; the result
# stays in $inout0 and is copied to $tweak below
1485 if ($inline)
1486 { &aesni_inline_generate1("enc"); }
1487 else
1488 { &call ("_aesni_encrypt1"); }
1489
1490 &mov ($inp,&wparam(0));
1491 &mov ($out,&wparam(1));
1492 &mov ($len,&wparam(2));
1493 &mov ($key,&wparam(3)); # key1
1494
# $key_ temporarily carries the caller's %esp until it is stored at 16*7+4
1495 &mov ($key_,"esp");
1496 &sub ("esp",16*7+8);
1497 &and ("esp",-16); # align stack
1498
# branch-free: $rounds_ = (len&15) ? 16 : 0, then subtracted from $len
1499 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1500 &test ($len,15);
1501 &setnz (&LB($rounds_));
1502 &shl ($rounds_,4);
1503 &sub ($len,$rounds_);
1504
1505 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1506 &mov (&DWP(16*6+4,"esp"),0);
1507 &mov (&DWP(16*6+8,"esp"),1);
1508 &mov (&DWP(16*6+12,"esp"),0);
# note: the value saved here is $len after the adjustment above; its low
# four bits still equal the caller's len%16
1509 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1510 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1511
1512 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1513 &mov ($key_,$key); # backup $key
1514 &mov ($rounds_,$rounds); # backup $rounds
1515
1516 &movdqa ($tweak,$inout0);
1517 &pxor ($twtmp,$twtmp);
1518 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1519 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1520
# $len is rounded down to whole blocks and biased by -16*6 so that the
# carry/borrow of each subtraction tells whether six more blocks remain
1521 &and ($len,-16);
1522 &sub ($len,16*6);
1523 &jc (&label("xts_dec_short"));
1524
# six-block path: $rounds_ = 16-16*rounds and $key = key1+32+16*rounds,
# presumably the form _aesni_decrypt6_enter expects (it is defined
# outside this section)
1525 &shl ($rounds,4);
1526 &mov ($rounds_,16);
1527 &sub ($rounds_,$rounds);
1528 &lea ($key,&DWP(32,$key,$rounds));
1529 &jmp (&label("xts_dec_loop6"));
1530
1531&set_label("xts_dec_loop6",16);
# Perl-time unrolling: tweaks 0..3 are stored at 16*0..16*3(%esp), each
# followed by a doubling step that produces the next tweak
1532 for ($i=0;$i<4;$i++) {
1533 &pshufd ($twres,$twtmp,0x13);
1534 &pxor ($twtmp,$twtmp);
1535 &movdqa (&QWP(16*$i,"esp"),$tweak);
1536 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1537 &pand ($twres,$twmask); # isolate carry and residue
1538 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1539 &pxor ($tweak,$twres);
1540 }
# $i is 4 here: tweak 4 goes to 16*4(%esp) and $i becomes 5; tweak 5 is
# formed directly in $inout5 and saved at 16*5(%esp) further down
1541 &pshufd ($inout5,$twtmp,0x13);
1542 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1543 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1544 &$movekey ($rndkey0,&QWP(0,$key_));
1545 &pand ($inout5,$twmask); # isolate carry and residue
1546 &movups ($inout0,&QWP(0,$inp)); # load input
1547 &pxor ($inout5,$tweak);
1548
1549 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1550 &mov ($rounds,$rounds_);
1551 &movdqu ($inout1,&QWP(16*1,$inp));
1552 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1553 &movdqu ($inout2,&QWP(16*2,$inp));
1554 &pxor ($inout1,$rndkey0);
1555 &movdqu ($inout3,&QWP(16*3,$inp));
1556 &pxor ($inout2,$rndkey0);
1557 &movdqu ($inout4,&QWP(16*4,$inp));
1558 &pxor ($inout3,$rndkey0);
# $rndkey1 (aliased as $tweak) is borrowed to hold the sixth input block
1559 &movdqu ($rndkey1,&QWP(16*5,$inp));
1560 &pxor ($inout4,$rndkey0);
1561 &lea ($inp,&DWP(16*6,$inp));
1562 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1563 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1564 &pxor ($inout5,$rndkey1);
1565
1566 &$movekey ($rndkey1,&QWP(16,$key_));
1567 &pxor ($inout1,&QWP(16*1,"esp"));
1568 &pxor ($inout2,&QWP(16*2,"esp"));
1569 &aesdec ($inout0,$rndkey1);
1570 &pxor ($inout3,&QWP(16*3,"esp"));
1571 &pxor ($inout4,&QWP(16*4,"esp"));
1572 &aesdec ($inout1,$rndkey1);
1573 &pxor ($inout5,$rndkey0);
1574 &$movekey ($rndkey0,&QWP(32,$key_));
1575 &aesdec ($inout2,$rndkey1);
1576 &aesdec ($inout3,$rndkey1);
1577 &aesdec ($inout4,$rndkey1);
1578 &aesdec ($inout5,$rndkey1);
1579 &call (&label("_aesni_decrypt6_enter"));
1580
# post-whitening and stores are interleaved with the start of the next
# tweak computation, which continues from the last tweak at 16*5(%esp)
1581 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1582 &pxor ($twtmp,$twtmp);
1583 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1584 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1585 &xorps ($inout1,&QWP(16*1,"esp"));
1586 &movups (&QWP(16*0,$out),$inout0); # write output
1587 &xorps ($inout2,&QWP(16*2,"esp"));
1588 &movups (&QWP(16*1,$out),$inout1);
1589 &xorps ($inout3,&QWP(16*3,"esp"));
1590 &movups (&QWP(16*2,$out),$inout2);
1591 &xorps ($inout4,&QWP(16*4,"esp"));
1592 &movups (&QWP(16*3,$out),$inout3);
1593 &xorps ($inout5,$tweak);
1594 &movups (&QWP(16*4,$out),$inout4);
1595 &pshufd ($twres,$twtmp,0x13);
1596 &movups (&QWP(16*5,$out),$inout5);
1597 &lea ($out,&DWP(16*6,$out));
# $twmask shares a register with $inout1, so the mask is reloaded here
1598 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1599
1600 &pxor ($twtmp,$twtmp);
1601 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1602 &pand ($twres,$twmask); # isolate carry and residue
1603 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1604 &pxor ($tweak,$twres);
1605
1606 &sub ($len,16*6);
1607 &jnc (&label("xts_dec_loop6"));
1608
1609 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1610 &mov ($key,$key_); # restore $key
1611 &mov ($rounds_,$rounds);
1612
1613&set_label("xts_dec_short");
# undo the -16*6 bias; $len is now the byte count of the 0..5 whole blocks
# left. The cmp results below are consumed by jb/je pairs separated only
# by SSE instructions, which leave EFLAGS untouched.
1614 &add ($len,16*6);
1615 &jz (&label("xts_dec_done6x"));
1616
1617 &movdqa ($inout3,$tweak); # put aside previous tweak
1618 &cmp ($len,0x20);
1619 &jb (&label("xts_dec_one"));
1620
1621 &pshufd ($twres,$twtmp,0x13);
1622 &pxor ($twtmp,$twtmp);
1623 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1624 &pand ($twres,$twmask); # isolate carry and residue
1625 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1626 &pxor ($tweak,$twres);
1627 &je (&label("xts_dec_two"));
1628
1629 &pshufd ($twres,$twtmp,0x13);
1630 &pxor ($twtmp,$twtmp);
1631 &movdqa ($inout4,$tweak); # put aside previous tweak
1632 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1633 &pand ($twres,$twmask); # isolate carry and residue
1634 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1635 &pxor ($tweak,$twres);
1636 &cmp ($len,0x40);
1637 &jb (&label("xts_dec_three"));
1638
1639 &pshufd ($twres,$twtmp,0x13);
1640 &pxor ($twtmp,$twtmp);
1641 &movdqa ($inout5,$tweak); # put aside previous tweak
1642 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1643 &pand ($twres,$twmask); # isolate carry and residue
1644 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1645 &pxor ($tweak,$twres);
1646 &movdqa (&QWP(16*0,"esp"),$inout3);
1647 &movdqa (&QWP(16*1,"esp"),$inout4);
1648 &je (&label("xts_dec_four"));
1649
# five blocks left: tweaks 0..4 end up at 16*0..16*4(%esp); the six-lane
# helper is called but only five results are stored
1650 &movdqa (&QWP(16*2,"esp"),$inout5);
1651 &pshufd ($inout5,$twtmp,0x13);
1652 &movdqa (&QWP(16*3,"esp"),$tweak);
1653 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1654 &pand ($inout5,$twmask); # isolate carry and residue
1655 &pxor ($inout5,$tweak);
1656
1657 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1658 &movdqu ($inout1,&QWP(16*1,$inp));
1659 &movdqu ($inout2,&QWP(16*2,$inp));
1660 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1661 &movdqu ($inout3,&QWP(16*3,$inp));
1662 &pxor ($inout1,&QWP(16*1,"esp"));
1663 &movdqu ($inout4,&QWP(16*4,$inp));
1664 &pxor ($inout2,&QWP(16*2,"esp"));
1665 &lea ($inp,&DWP(16*5,$inp));
1666 &pxor ($inout3,&QWP(16*3,"esp"));
1667 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1668 &pxor ($inout4,$inout5);
1669
1670 &call ("_aesni_decrypt6");
1671
1672 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1673 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1674 &xorps ($inout1,&QWP(16*1,"esp"));
1675 &xorps ($inout2,&QWP(16*2,"esp"));
1676 &movups (&QWP(16*0,$out),$inout0); # write output
1677 &xorps ($inout3,&QWP(16*3,"esp"));
1678 &movups (&QWP(16*1,$out),$inout1);
1679 &xorps ($inout4,$tweak);
1680 &movups (&QWP(16*2,$out),$inout2);
1681 &movups (&QWP(16*3,$out),$inout3);
1682 &movups (&QWP(16*4,$out),$inout4);
1683 &lea ($out,&DWP(16*5,$out));
1684 &jmp (&label("xts_dec_done"));
1685
# one block left: its tweak was put aside in $inout3
1686&set_label("xts_dec_one",16);
1687 &movups ($inout0,&QWP(16*0,$inp)); # load input
1688 &lea ($inp,&DWP(16*1,$inp));
1689 &xorps ($inout0,$inout3); # input^=tweak
1690 if ($inline)
1691 { &aesni_inline_generate1("dec"); }
1692 else
1693 { &call ("_aesni_decrypt1"); }
1694 &xorps ($inout0,$inout3); # output^=tweak
1695 &movups (&QWP(16*0,$out),$inout0); # write output
1696 &lea ($out,&DWP(16*1,$out));
1697
1698 &movdqa ($tweak,$inout3); # last tweak
1699 &jmp (&label("xts_dec_done"));
1700
# two blocks left: tweaks in $inout3 and $inout4
1701&set_label("xts_dec_two",16);
1702 &movaps ($inout4,$tweak); # put aside last tweak
1703
1704 &movups ($inout0,&QWP(16*0,$inp)); # load input
1705 &movups ($inout1,&QWP(16*1,$inp));
1706 &lea ($inp,&DWP(16*2,$inp));
1707 &xorps ($inout0,$inout3); # input^=tweak
1708 &xorps ($inout1,$inout4);
1709
1710 &call ("_aesni_decrypt2");
1711
1712 &xorps ($inout0,$inout3); # output^=tweak
1713 &xorps ($inout1,$inout4);
1714 &movups (&QWP(16*0,$out),$inout0); # write output
1715 &movups (&QWP(16*1,$out),$inout1);
1716 &lea ($out,&DWP(16*2,$out));
1717
1718 &movdqa ($tweak,$inout4); # last tweak
1719 &jmp (&label("xts_dec_done"));
1720
# three blocks left: tweaks in $inout3, $inout4 and $inout5
1721&set_label("xts_dec_three",16);
1722 &movaps ($inout5,$tweak); # put aside last tweak
1723 &movups ($inout0,&QWP(16*0,$inp)); # load input
1724 &movups ($inout1,&QWP(16*1,$inp));
1725 &movups ($inout2,&QWP(16*2,$inp));
1726 &lea ($inp,&DWP(16*3,$inp));
1727 &xorps ($inout0,$inout3); # input^=tweak
1728 &xorps ($inout1,$inout4);
1729 &xorps ($inout2,$inout5);
1730
1731 &call ("_aesni_decrypt3");
1732
1733 &xorps ($inout0,$inout3); # output^=tweak
1734 &xorps ($inout1,$inout4);
1735 &xorps ($inout2,$inout5);
1736 &movups (&QWP(16*0,$out),$inout0); # write output
1737 &movups (&QWP(16*1,$out),$inout1);
1738 &movups (&QWP(16*2,$out),$inout2);
1739 &lea ($out,&DWP(16*3,$out));
1740
1741 &movdqa ($tweak,$inout5); # last tweak
1742 &jmp (&label("xts_dec_done"));
1743
# four blocks left: tweaks 0 and 1 were spilled to 16*0/16*1(%esp) before
# the branch, tweak 2 is in $inout5 and tweak 3 is copied to $inout4
1744&set_label("xts_dec_four",16);
1745 &movaps ($inout4,$tweak); # put aside last tweak
1746
1747 &movups ($inout0,&QWP(16*0,$inp)); # load input
1748 &movups ($inout1,&QWP(16*1,$inp));
1749 &movups ($inout2,&QWP(16*2,$inp));
1750 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1751 &movups ($inout3,&QWP(16*3,$inp));
1752 &lea ($inp,&DWP(16*4,$inp));
1753 &xorps ($inout1,&QWP(16*1,"esp"));
1754 &xorps ($inout2,$inout5);
1755 &xorps ($inout3,$inout4);
1756
1757 &call ("_aesni_decrypt4");
1758
1759 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1760 &xorps ($inout1,&QWP(16*1,"esp"));
1761 &xorps ($inout2,$inout5);
1762 &movups (&QWP(16*0,$out),$inout0); # write output
1763 &xorps ($inout3,$inout4);
1764 &movups (&QWP(16*1,$out),$inout1);
1765 &movups (&QWP(16*2,$out),$inout2);
1766 &movups (&QWP(16*3,$out),$inout3);
1767 &lea ($out,&DWP(16*4,$out));
1768
1769 &movdqa ($tweak,$inout4); # last tweak
1770 &jmp (&label("xts_dec_done"));
1771
# reached when no whole blocks remain after the six-block loop (or at all);
# $tweak is the next unused tweak and $twtmp/$twmask still hold the sign
# broadcast and mask computed for it, which xts_dec_only_one_more reuses
1772&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1773 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1774 &and ($len,15);
1775 &jz (&label("xts_dec_ret"));
1776 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1777 &jmp (&label("xts_dec_only_one_more"));
1778
# reached from the 1..5 block paths with $tweak = tweak of the last block
# processed; when a partial trailing block exists, advance $tweak by one
# step and refresh $twtmp/$twmask for the step after it
1779&set_label("xts_dec_done",16);
1780 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1781 &pxor ($twtmp,$twtmp);
1782 &and ($len,15);
1783 &jz (&label("xts_dec_ret"));
1784
1785 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1786 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1787 &pshufd ($twres,$twtmp,0x13);
1788 &pxor ($twtmp,$twtmp);
1789 &movdqa ($twmask,&QWP(16*6,"esp"));
1790 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1791 &pand ($twres,$twmask); # isolate carry and residue
1792 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1793 &pxor ($tweak,$twres);
1794
# $inout4 keeps the current tweak for the final (patched) block, while
# $inout3 receives the following tweak, used first on the held-back block
1795&set_label("xts_dec_only_one_more");
1796 &pshufd ($inout3,$twtmp,0x13);
1797 &movdqa ($inout4,$tweak); # put aside previous tweak
1798 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1799 &pand ($inout3,$twmask); # isolate carry and residue
1800 &pxor ($inout3,$tweak);
1801
1802 &mov ($key,$key_); # restore $key
1803 &mov ($rounds,$rounds_); # restore $rounds
1804
1805 &movups ($inout0,&QWP(0,$inp)); # load input
1806 &xorps ($inout0,$inout3); # input^=tweak
1807 if ($inline)
1808 { &aesni_inline_generate1("dec"); }
1809 else
1810 { &call ("_aesni_decrypt1"); }
1811 &xorps ($inout0,$inout3); # output^=tweak
1812 &movups (&QWP(0,$out),$inout0); # write output
1813
# byte loop over the $len%16 trailing bytes: each trailing input byte (at
# $inp+16) replaces the corresponding byte of the block just written at
# $out, and the byte it displaces is emitted at $out+16; $rounds and $key
# serve as scratch
1814&set_label("xts_dec_steal");
1815 &movz ($rounds,&BP(16,$inp));
1816 &movz ($key,&BP(0,$out));
1817 &lea ($inp,&DWP(1,$inp));
1818 &mov (&BP(0,$out),&LB($rounds));
1819 &mov (&BP(16,$out),&LB($key));
1820 &lea ($out,&DWP(1,$out));
1821 &sub ($len,1);
1822 &jnz (&label("xts_dec_steal"));
1823
1824 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1825 &mov ($key,$key_); # restore $key
1826 &mov ($rounds,$rounds_); # restore $rounds
1827
# decrypt the patched block at $out in place with the tweak kept in $inout4
1828 &movups ($inout0,&QWP(0,$out)); # load input
1829 &xorps ($inout0,$inout4); # input^=tweak
1830 if ($inline)
1831 { &aesni_inline_generate1("dec"); }
1832 else
1833 { &call ("_aesni_decrypt1"); }
1834 &xorps ($inout0,$inout4); # output^=tweak
1835 &movups (&QWP(0,$out),$inout0); # write output
1836
# common exit: zero xmm0..xmm7 and the six tweak slots (the mask slot at
# 16*6 is left as is), then reload the caller's %esp from the frame
1837&set_label("xts_dec_ret");
1838 &pxor ("xmm0","xmm0"); # clear register bank
1839 &pxor ("xmm1","xmm1");
1840 &pxor ("xmm2","xmm2");
1841 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1842 &pxor ("xmm3","xmm3");
1843 &movdqa (&QWP(16*1,"esp"),"xmm0");
1844 &pxor ("xmm4","xmm4");
1845 &movdqa (&QWP(16*2,"esp"),"xmm0");
1846 &pxor ("xmm5","xmm5");
1847 &movdqa (&QWP(16*3,"esp"),"xmm0");
1848 &pxor ("xmm6","xmm6");
1849 &movdqa (&QWP(16*4,"esp"),"xmm0");
1850 &pxor ("xmm7","xmm7");
1851 &movdqa (&QWP(16*5,"esp"),"xmm0");
1852 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1853&function_end("aesni_xts_decrypt");
1854}
1855
1856
1857######################################################################
1858# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1859# const AES_KEY *key, unsigned int start_block_num,
1860# unsigned char offset_i[16], const unsigned char L_[][16],
1861# unsigned char checksum[16]);
1862#
1863{
1864# offsets within stack frame
1865my $checksum = 16*6;
1866my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1867
1868# reassigned registers
1869my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1870# $l_, $block, $inp, $key are permanently allocated in registers;
1871# remaining non-volatile ones are offloaded to stack, and even
1872# stay invariant once written there.
1873
# aesni_ocb_encrypt
#
# Emits the 32-bit x86 routine that OCB-encrypts a run of 16-byte blocks.
# Stack arguments, by &wparam index:
#   0 inp, 1 out, 2 blocks (block count), 3 key (AES_KEY *),
#   4 start_block_num, 5 offset_i[16] (in/out), 6 L_[][16], 7 checksum[16]
#   (in/out).
# Throughout, $rndkey0 carries the last offset_i and $rndkey1 the running
# checksum between sections; both are written back to the caller at "done".
# The checksum is accumulated over the input (plaintext) blocks.
&function_begin("aesni_ocb_encrypt");
    &mov ($rounds,&wparam(5));  # &offset_i
    &mov ($rounds_,&wparam(7));  # &checksum

    &mov ($inp,&wparam(0));
    &mov ($out,&wparam(1));
    &mov ($len,&wparam(2));
    &mov ($key,&wparam(3));
    &movdqu ($rndkey0,&QWP(0,$rounds));  # load offset_i
    &mov ($block,&wparam(4));  # start_block_num
    &movdqu ($rndkey1,&QWP(0,$rounds_));  # load checksum
    &mov ($l_,&wparam(6));  # L_

    # Build the local frame; the caller's %esp is kept in $rounds until it
    # can be stored at $esp_off.
    &mov ($rounds,"esp");
    &sub ("esp",$esp_off+4);  # alloca
    &and ("esp",-16);  # align stack

    # $out becomes (out - inp) so output can be addressed as $out+$inp;
    # $len becomes a byte count and then the loop bound.
    &sub ($out,$inp);
    &shl ($len,4);
    &lea ($len,&DWP(-16*6,$inp,$len));  # end of input - 16*6
    &mov (&DWP($out_off,"esp"),$out);
    &mov (&DWP($end_off,"esp"),$len);
    &mov (&DWP($esp_off,"esp"),$rounds);

    &mov ($rounds,&DWP(240,$key));

    # The multi-block paths below use L_[0] for blocks $block, $block+2 and
    # $block+4, i.e. they expect $block to be odd.  If the starting block
    # number is even, process one block here first.
    &test ($block,1);
    &jnz (&label("odd"));

    &bsf ($i3,$block);  # index of lowest set bit selects L_[]
    &add ($block,1);
    &shl ($i3,4);
    &movdqu ($inout5,&QWP(0,$l_,$i3));
    &mov ($i3,$key);  # put aside key

    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &lea ($inp,&DWP(16,$inp));

    &pxor ($inout5,$rndkey0);  # ^ last offset_i
    &pxor ($rndkey1,$inout0);  # checksum
    &pxor ($inout0,$inout5);  # ^ offset_i

    &movdqa ($inout4,$rndkey1);  # park checksum across the cipher call
    if ($inline)
    {   &aesni_inline_generate1("enc");   }
    else
    {   &call ("_aesni_encrypt1");   }

    &xorps ($inout0,$inout5);  # ^ offset_i
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &movdqa ($rndkey1,$inout4);  # pass the checksum

    &movups (&QWP(-16,$out,$inp),$inout0);  # store output

    &mov ($rounds,&DWP(240,$i3));
    &mov ($key,$i3);  # restore key
    &mov ($len,&DWP($end_off,"esp"));  # $i3 shares $len's register

&set_label("odd");
    # With $rounds scaled by 16, ($key + twisted rounds - 48) below
    # addresses the first round key, -32 the second and -16 the third.
    &shl ($rounds,4);
    &mov ($out,16);
    &sub ($out,$rounds);  # twisted rounds
    &mov (&DWP($key_off,"esp"),$key);
    &lea ($key,&DWP(32,$key,$rounds));  # end of key schedule
    &mov (&DWP($rounds_off,"esp"),$out);

    &cmp ($inp,$len);
    &ja (&label("short"));  # fewer than six blocks remain
    &jmp (&label("grandloop"));

# Main loop: six blocks per iteration.
&set_label("grandloop",32);
    &lea ($i1,&DWP(1,$block));
    &lea ($i3,&DWP(3,$block));
    &lea ($i5,&DWP(5,$block));
    &add ($block,6);
    &bsf ($i1,$i1);
    &bsf ($i3,$i3);
    &bsf ($i5,$i5);
    &shl ($i1,4);
    &shl ($i3,4);
    &shl ($i5,4);
    &movdqu ($inout0,&QWP(0,$l_));
    &movdqu ($inout1,&QWP(0,$l_,$i1));
    &mov ($rounds,&DWP($rounds_off,"esp"));  # $i1 shared $rounds' register
    &movdqa ($inout2,$inout0);
    &movdqu ($inout3,&QWP(0,$l_,$i3));
    &movdqa ($inout4,$inout0);
    &movdqu ($inout5,&QWP(0,$l_,$i5));

    # Chain the six offsets from the last offset_i and save them in the
    # frame; they are needed again after the cipher call.
    &pxor ($inout0,$rndkey0);  # ^ last offset_i
    &pxor ($inout1,$inout0);
    &movdqa (&QWP(16*0,"esp"),$inout0);
    &pxor ($inout2,$inout1);
    &movdqa (&QWP(16*1,"esp"),$inout1);
    &pxor ($inout3,$inout2);
    &movdqa (&QWP(16*2,"esp"),$inout2);
    &pxor ($inout4,$inout3);
    &movdqa (&QWP(16*3,"esp"),$inout3);
    &pxor ($inout5,$inout4);
    &movdqa (&QWP(16*4,"esp"),$inout4);
    &movdqa (&QWP(16*5,"esp"),$inout5);

    &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &movdqu ($inout2,&QWP(16*2,$inp));
    &movdqu ($inout3,&QWP(16*3,$inp));
    &movdqu ($inout4,&QWP(16*4,$inp));
    &movdqu ($inout5,&QWP(16*5,$inp));
    &lea ($inp,&DWP(16*6,$inp));

    &pxor ($rndkey1,$inout0);  # checksum
    &pxor ($inout0,$rndkey0);  # ^ roundkey[0]
    &pxor ($rndkey1,$inout1);
    &pxor ($inout1,$rndkey0);
    &pxor ($rndkey1,$inout2);
    &pxor ($inout2,$rndkey0);
    &pxor ($rndkey1,$inout3);
    &pxor ($inout3,$rndkey0);
    &pxor ($rndkey1,$inout4);
    &pxor ($inout4,$rndkey0);
    &pxor ($rndkey1,$inout5);
    &pxor ($inout5,$rndkey0);
    &movdqa (&QWP($checksum,"esp"),$rndkey1);  # $rndkey1 is reused below

    &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,&QWP(16*4,"esp"));
    &pxor ($inout5,&QWP(16*5,"esp"));

    # First round is issued here; the remaining rounds run in the helper.
    &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
    &aesenc ($inout0,$rndkey1);
    &aesenc ($inout1,$rndkey1);
    &aesenc ($inout2,$rndkey1);
    &aesenc ($inout3,$rndkey1);
    &aesenc ($inout4,$rndkey1);
    &aesenc ($inout5,$rndkey1);

    # $i5/$i3 shared the registers of $out/$len; reload both.
    &mov ($out,&DWP($out_off,"esp"));
    &mov ($len,&DWP($end_off,"esp"));
    &call ("_aesni_encrypt6_enter");

    &movdqa ($rndkey0,&QWP(16*5,"esp"));  # pass last offset_i
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,&QWP(16*4,"esp"));
    &pxor ($inout5,$rndkey0);
    &movdqa ($rndkey1,&QWP($checksum,"esp"));  # pass the checksum

    &movdqu (&QWP(-16*6,$out,$inp),$inout0);  # store output
    &movdqu (&QWP(-16*5,$out,$inp),$inout1);
    &movdqu (&QWP(-16*4,$out,$inp),$inout2);
    &movdqu (&QWP(-16*3,$out,$inp),$inout3);
    &movdqu (&QWP(-16*2,$out,$inp),$inout4);
    &movdqu (&QWP(-16*1,$out,$inp),$inout5);
    &cmp ($inp,$len);  # done yet?
    &jbe (&label("grandloop"));

# Tail: zero to five blocks remain.  $len becomes the remaining byte count
# and selects the matching section; five blocks fall through.
&set_label("short");
    &add ($len,16*6);
    &sub ($len,$inp);
    &jz (&label("done"));

    &cmp ($len,16*2);
    &jb (&label("one"));
    &je (&label("two"));

    &cmp ($len,16*4);
    &jb (&label("three"));
    &je (&label("four"));

    # Five blocks: same scheme as the main loop, with the sixth lane zeroed
    # and its result never stored.
    &lea ($i1,&DWP(1,$block));
    &lea ($i3,&DWP(3,$block));
    &bsf ($i1,$i1);
    &bsf ($i3,$i3);
    &shl ($i1,4);
    &shl ($i3,4);
    &movdqu ($inout0,&QWP(0,$l_));
    &movdqu ($inout1,&QWP(0,$l_,$i1));
    &mov ($rounds,&DWP($rounds_off,"esp"));
    &movdqa ($inout2,$inout0);
    &movdqu ($inout3,&QWP(0,$l_,$i3));
    &movdqa ($inout4,$inout0);

    &pxor ($inout0,$rndkey0);  # ^ last offset_i
    &pxor ($inout1,$inout0);
    &movdqa (&QWP(16*0,"esp"),$inout0);
    &pxor ($inout2,$inout1);
    &movdqa (&QWP(16*1,"esp"),$inout1);
    &pxor ($inout3,$inout2);
    &movdqa (&QWP(16*2,"esp"),$inout2);
    &pxor ($inout4,$inout3);
    &movdqa (&QWP(16*3,"esp"),$inout3);
    # NOTE(review): the result of this pxor is not read before $inout5 is
    # zeroed below; kept as-is to leave the emitted code unchanged.
    &pxor ($inout5,$inout4);
    &movdqa (&QWP(16*4,"esp"),$inout4);

    &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &movdqu ($inout2,&QWP(16*2,$inp));
    &movdqu ($inout3,&QWP(16*3,$inp));
    &movdqu ($inout4,&QWP(16*4,$inp));
    &pxor ($inout5,$inout5);

    &pxor ($rndkey1,$inout0);  # checksum
    &pxor ($inout0,$rndkey0);  # ^ roundkey[0]
    &pxor ($rndkey1,$inout1);
    &pxor ($inout1,$rndkey0);
    &pxor ($rndkey1,$inout2);
    &pxor ($inout2,$rndkey0);
    &pxor ($rndkey1,$inout3);
    &pxor ($inout3,$rndkey0);
    &pxor ($rndkey1,$inout4);
    &pxor ($inout4,$rndkey0);
    &movdqa (&QWP($checksum,"esp"),$rndkey1);

    &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,&QWP(16*4,"esp"));

    &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
    &aesenc ($inout0,$rndkey1);
    &aesenc ($inout1,$rndkey1);
    &aesenc ($inout2,$rndkey1);
    &aesenc ($inout3,$rndkey1);
    &aesenc ($inout4,$rndkey1);
    &aesenc ($inout5,$rndkey1);

    &mov ($out,&DWP($out_off,"esp"));
    &call ("_aesni_encrypt6_enter");

    &movdqa ($rndkey0,&QWP(16*4,"esp"));  # pass last offset_i
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,$rndkey0);
    &movdqa ($rndkey1,&QWP($checksum,"esp"));  # pass the checksum

    &movdqu (&QWP(16*0,$out,$inp),$inout0);  # store output
    &movdqu (&QWP(16*1,$out,$inp),$inout1);
    &movdqu (&QWP(16*2,$out,$inp),$inout2);
    &movdqu (&QWP(16*3,$out,$inp),$inout3);
    &movdqu (&QWP(16*4,$out,$inp),$inout4);

    &jmp (&label("done"));

# One block: offset in $inout5, checksum parked in $inout4 across the call.
&set_label("one",16);
    &movdqu ($inout5,&QWP(0,$l_));
    &mov ($key,&DWP($key_off,"esp"));  # restore key

    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &mov ($rounds,&DWP(240,$key));

    &pxor ($inout5,$rndkey0);  # ^ last offset_i
    &pxor ($rndkey1,$inout0);  # checksum
    &pxor ($inout0,$inout5);  # ^ offset_i

    &movdqa ($inout4,$rndkey1);
    &mov ($out,&DWP($out_off,"esp"));
    if ($inline)
    {   &aesni_inline_generate1("enc");   }
    else
    {   &call ("_aesni_encrypt1");   }

    &xorps ($inout0,$inout5);  # ^ offset_i
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &movdqa ($rndkey1,$inout4);  # pass the checksum
    &movups (&QWP(0,$out,$inp),$inout0);

    &jmp (&label("done"));

# Two blocks: offsets in $inout4/$inout5, checksum parked in $inout3.
&set_label("two",16);
    &lea ($i1,&DWP(1,$block));
    &mov ($key,&DWP($key_off,"esp"));  # restore key
    &bsf ($i1,$i1);
    &shl ($i1,4);
    &movdqu ($inout4,&QWP(0,$l_));
    &movdqu ($inout5,&QWP(0,$l_,$i1));

    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &mov ($rounds,&DWP(240,$key));

    &pxor ($inout4,$rndkey0);  # ^ last offset_i
    &pxor ($inout5,$inout4);

    &pxor ($rndkey1,$inout0);  # checksum
    &pxor ($inout0,$inout4);  # ^ offset_i
    &pxor ($rndkey1,$inout1);
    &pxor ($inout1,$inout5);

    &movdqa ($inout3,$rndkey1);
    &mov ($out,&DWP($out_off,"esp"));
    &call ("_aesni_encrypt2");

    &xorps ($inout0,$inout4);  # ^ offset_i
    &xorps ($inout1,$inout5);
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &movdqa ($rndkey1,$inout3);  # pass the checksum
    &movups (&QWP(16*0,$out,$inp),$inout0);  # store output
    &movups (&QWP(16*1,$out,$inp),$inout1);

    &jmp (&label("done"));

# Three blocks: offsets in $inout3..$inout5, checksum parked in the frame.
&set_label("three",16);
    &lea ($i1,&DWP(1,$block));
    &mov ($key,&DWP($key_off,"esp"));  # restore key
    &bsf ($i1,$i1);
    &shl ($i1,4);
    &movdqu ($inout3,&QWP(0,$l_));
    &movdqu ($inout4,&QWP(0,$l_,$i1));
    &movdqa ($inout5,$inout3);

    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &movdqu ($inout2,&QWP(16*2,$inp));
    &mov ($rounds,&DWP(240,$key));

    &pxor ($inout3,$rndkey0);  # ^ last offset_i
    &pxor ($inout4,$inout3);
    &pxor ($inout5,$inout4);

    &pxor ($rndkey1,$inout0);  # checksum
    &pxor ($inout0,$inout3);  # ^ offset_i
    &pxor ($rndkey1,$inout1);
    &pxor ($inout1,$inout4);
    &pxor ($rndkey1,$inout2);
    &pxor ($inout2,$inout5);

    &movdqa (&QWP($checksum,"esp"),$rndkey1);
    &mov ($out,&DWP($out_off,"esp"));
    &call ("_aesni_encrypt3");

    &xorps ($inout0,$inout3);  # ^ offset_i
    &xorps ($inout1,$inout4);
    &xorps ($inout2,$inout5);
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &movdqa ($rndkey1,&QWP($checksum,"esp"));  # pass the checksum
    &movups (&QWP(16*0,$out,$inp),$inout0);  # store output
    &movups (&QWP(16*1,$out,$inp),$inout1);
    &movups (&QWP(16*2,$out,$inp),$inout2);

    &jmp (&label("done"));

# Four blocks: the first two offsets live in the frame, the last two in
# $inout4/$inout5; the checksum is parked in the frame.
&set_label("four",16);
    &lea ($i1,&DWP(1,$block));
    &lea ($i3,&DWP(3,$block));
    &bsf ($i1,$i1);
    &bsf ($i3,$i3);
    &mov ($key,&DWP($key_off,"esp"));  # restore key
    &shl ($i1,4);
    &shl ($i3,4);
    &movdqu ($inout2,&QWP(0,$l_));
    &movdqu ($inout3,&QWP(0,$l_,$i1));
    &movdqa ($inout4,$inout2);
    &movdqu ($inout5,&QWP(0,$l_,$i3));

    &pxor ($inout2,$rndkey0);  # ^ last offset_i
    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &pxor ($inout3,$inout2);
    &movdqu ($inout1,&QWP(16*1,$inp));
    &pxor ($inout4,$inout3);
    &movdqa (&QWP(16*0,"esp"),$inout2);
    &pxor ($inout5,$inout4);
    &movdqa (&QWP(16*1,"esp"),$inout3);
    &movdqu ($inout2,&QWP(16*2,$inp));
    &movdqu ($inout3,&QWP(16*3,$inp));
    &mov ($rounds,&DWP(240,$key));

    &pxor ($rndkey1,$inout0);  # checksum
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &pxor ($rndkey1,$inout1);
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($rndkey1,$inout2);
    &pxor ($inout2,$inout4);
    &pxor ($rndkey1,$inout3);
    &pxor ($inout3,$inout5);

    &movdqa (&QWP($checksum,"esp"),$rndkey1);
    &mov ($out,&DWP($out_off,"esp"));
    &call ("_aesni_encrypt4");

    &xorps ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &xorps ($inout1,&QWP(16*1,"esp"));
    &xorps ($inout2,$inout4);
    &movups (&QWP(16*0,$out,$inp),$inout0);  # store output
    &xorps ($inout3,$inout5);
    &movups (&QWP(16*1,$out,$inp),$inout1);
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &movups (&QWP(16*2,$out,$inp),$inout2);
    &movdqa ($rndkey1,&QWP($checksum,"esp"));  # pass the checksum
    &movups (&QWP(16*3,$out,$inp),$inout3);

# Exit: wipe the data registers and the seven 16-byte frame slots, restore
# %esp, then hand the final offset_i and checksum back to the caller.
&set_label("done");
    &mov ($key,&DWP($esp_off,"esp"));  # caller's %esp
    &pxor ($inout0,$inout0);  # clear register bank
    &pxor ($inout1,$inout1);
    &movdqa (&QWP(16*0,"esp"),$inout0);  # clear stack
    &pxor ($inout2,$inout2);
    &movdqa (&QWP(16*1,"esp"),$inout0);
    &pxor ($inout3,$inout3);
    &movdqa (&QWP(16*2,"esp"),$inout0);
    &pxor ($inout4,$inout4);
    &movdqa (&QWP(16*3,"esp"),$inout0);
    &pxor ($inout5,$inout5);
    &movdqa (&QWP(16*4,"esp"),$inout0);
    &movdqa (&QWP(16*5,"esp"),$inout0);
    &movdqa (&QWP(16*6,"esp"),$inout0);

    # %esp is restored before the &wparam reads below
    # (presumably &wparam is %esp-relative; confirm in x86asm.pl).
    &lea ("esp",&DWP(0,$key));
    &mov ($rounds,&wparam(5));  # &offset_i
    &mov ($rounds_,&wparam(7));  # &checksum
    &movdqu (&QWP(0,$rounds),$rndkey0);
    &pxor ($rndkey0,$rndkey0);
    &movdqu (&QWP(0,$rounds_),$rndkey1);
    &pxor ($rndkey1,$rndkey1);
&function_end("aesni_ocb_encrypt");

# aesni_ocb_decrypt
#
# Emits the 32-bit x86 routine that OCB-decrypts a run of 16-byte blocks.
# Stack arguments, by &wparam index:
#   0 inp, 1 out, 2 blocks (block count), 3 key (AES_KEY *),
#   4 start_block_num, 5 offset_i[16] (in/out), 6 L_[][16], 7 checksum[16]
#   (in/out).
# Structure mirrors the encrypt routine; the difference is that the
# checksum is accumulated over the decrypted output, i.e. after each
# cipher call rather than before it.
&function_begin("aesni_ocb_decrypt");
    &mov ($rounds,&wparam(5));  # &offset_i
    &mov ($rounds_,&wparam(7));  # &checksum

    &mov ($inp,&wparam(0));
    &mov ($out,&wparam(1));
    &mov ($len,&wparam(2));
    &mov ($key,&wparam(3));
    &movdqu ($rndkey0,&QWP(0,$rounds));  # load offset_i
    &mov ($block,&wparam(4));  # start_block_num
    &movdqu ($rndkey1,&QWP(0,$rounds_));  # load checksum
    &mov ($l_,&wparam(6));  # L_

    # Build the local frame; the caller's %esp is kept in $rounds until it
    # can be stored at $esp_off.
    &mov ($rounds,"esp");
    &sub ("esp",$esp_off+4);  # alloca
    &and ("esp",-16);  # align stack

    # $out becomes (out - inp) so output can be addressed as $out+$inp;
    # $len becomes a byte count and then the loop bound.
    &sub ($out,$inp);
    &shl ($len,4);
    &lea ($len,&DWP(-16*6,$inp,$len));  # end of input - 16*6
    &mov (&DWP($out_off,"esp"),$out);
    &mov (&DWP($end_off,"esp"),$len);
    &mov (&DWP($esp_off,"esp"),$rounds);

    &mov ($rounds,&DWP(240,$key));

    # The multi-block paths below use L_[0] for blocks $block, $block+2 and
    # $block+4, i.e. they expect $block to be odd.  If the starting block
    # number is even, process one block here first.
    &test ($block,1);
    &jnz (&label("odd"));

    &bsf ($i3,$block);  # index of lowest set bit selects L_[]
    &add ($block,1);
    &shl ($i3,4);
    &movdqu ($inout5,&QWP(0,$l_,$i3));
    &mov ($i3,$key);  # put aside key

    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &lea ($inp,&DWP(16,$inp));

    &pxor ($inout5,$rndkey0);  # ^ last offset_i
    &pxor ($inout0,$inout5);  # ^ offset_i

    &movdqa ($inout4,$rndkey1);  # park checksum across the cipher call
    if ($inline)
    {   &aesni_inline_generate1("dec");   }
    else
    {   &call ("_aesni_decrypt1");   }

    &xorps ($inout0,$inout5);  # ^ offset_i
    &movaps ($rndkey1,$inout4);  # pass the checksum
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &xorps ($rndkey1,$inout0);  # checksum
    &movups (&QWP(-16,$out,$inp),$inout0);  # store output

    &mov ($rounds,&DWP(240,$i3));
    &mov ($key,$i3);  # restore key
    &mov ($len,&DWP($end_off,"esp"));  # $i3 shares $len's register

&set_label("odd");
    # With $rounds scaled by 16, ($key + twisted rounds - 48) below
    # addresses the first round key, -32 the second and -16 the third.
    &shl ($rounds,4);
    &mov ($out,16);
    &sub ($out,$rounds);  # twisted rounds
    &mov (&DWP($key_off,"esp"),$key);
    &lea ($key,&DWP(32,$key,$rounds));  # end of key schedule
    &mov (&DWP($rounds_off,"esp"),$out);

    &cmp ($inp,$len);
    &ja (&label("short"));  # fewer than six blocks remain
    &jmp (&label("grandloop"));

# Main loop: six blocks per iteration.
&set_label("grandloop",32);
    &lea ($i1,&DWP(1,$block));
    &lea ($i3,&DWP(3,$block));
    &lea ($i5,&DWP(5,$block));
    &add ($block,6);
    &bsf ($i1,$i1);
    &bsf ($i3,$i3);
    &bsf ($i5,$i5);
    &shl ($i1,4);
    &shl ($i3,4);
    &shl ($i5,4);
    &movdqu ($inout0,&QWP(0,$l_));
    &movdqu ($inout1,&QWP(0,$l_,$i1));
    &mov ($rounds,&DWP($rounds_off,"esp"));  # $i1 shared $rounds' register
    &movdqa ($inout2,$inout0);
    &movdqu ($inout3,&QWP(0,$l_,$i3));
    &movdqa ($inout4,$inout0);
    &movdqu ($inout5,&QWP(0,$l_,$i5));

    # Chain the six offsets from the last offset_i and save them in the
    # frame; they are needed again after the cipher call.
    &pxor ($inout0,$rndkey0);  # ^ last offset_i
    &pxor ($inout1,$inout0);
    &movdqa (&QWP(16*0,"esp"),$inout0);
    &pxor ($inout2,$inout1);
    &movdqa (&QWP(16*1,"esp"),$inout1);
    &pxor ($inout3,$inout2);
    &movdqa (&QWP(16*2,"esp"),$inout2);
    &pxor ($inout4,$inout3);
    &movdqa (&QWP(16*3,"esp"),$inout3);
    &pxor ($inout5,$inout4);
    &movdqa (&QWP(16*4,"esp"),$inout4);
    &movdqa (&QWP(16*5,"esp"),$inout5);

    &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &movdqu ($inout2,&QWP(16*2,$inp));
    &movdqu ($inout3,&QWP(16*3,$inp));
    &movdqu ($inout4,&QWP(16*4,$inp));
    &movdqu ($inout5,&QWP(16*5,$inp));
    &lea ($inp,&DWP(16*6,$inp));

    &movdqa (&QWP($checksum,"esp"),$rndkey1);  # $rndkey1 is reused below
    &pxor ($inout0,$rndkey0);  # ^ roundkey[0]
    &pxor ($inout1,$rndkey0);
    &pxor ($inout2,$rndkey0);
    &pxor ($inout3,$rndkey0);
    &pxor ($inout4,$rndkey0);
    &pxor ($inout5,$rndkey0);

    &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,&QWP(16*4,"esp"));
    &pxor ($inout5,&QWP(16*5,"esp"));

    # First round is issued here; the remaining rounds run in the helper.
    &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
    &aesdec ($inout0,$rndkey1);
    &aesdec ($inout1,$rndkey1);
    &aesdec ($inout2,$rndkey1);
    &aesdec ($inout3,$rndkey1);
    &aesdec ($inout4,$rndkey1);
    &aesdec ($inout5,$rndkey1);

    # $i5/$i3 shared the registers of $out/$len; reload both.
    &mov ($out,&DWP($out_off,"esp"));
    &mov ($len,&DWP($end_off,"esp"));
    &call ("_aesni_decrypt6_enter");

    &movdqa ($rndkey0,&QWP(16*5,"esp"));  # pass last offset_i
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &movdqa ($rndkey1,&QWP($checksum,"esp"));
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,&QWP(16*4,"esp"));
    &pxor ($inout5,$rndkey0);

    &pxor ($rndkey1,$inout0);  # checksum
    &movdqu (&QWP(-16*6,$out,$inp),$inout0);  # store output
    &pxor ($rndkey1,$inout1);
    &movdqu (&QWP(-16*5,$out,$inp),$inout1);
    &pxor ($rndkey1,$inout2);
    &movdqu (&QWP(-16*4,$out,$inp),$inout2);
    &pxor ($rndkey1,$inout3);
    &movdqu (&QWP(-16*3,$out,$inp),$inout3);
    &pxor ($rndkey1,$inout4);
    &movdqu (&QWP(-16*2,$out,$inp),$inout4);
    &pxor ($rndkey1,$inout5);
    &movdqu (&QWP(-16*1,$out,$inp),$inout5);
    &cmp ($inp,$len);  # done yet?
    &jbe (&label("grandloop"));

# Tail: zero to five blocks remain.  $len becomes the remaining byte count
# and selects the matching section; five blocks fall through.
&set_label("short");
    &add ($len,16*6);
    &sub ($len,$inp);
    &jz (&label("done"));

    &cmp ($len,16*2);
    &jb (&label("one"));
    &je (&label("two"));

    &cmp ($len,16*4);
    &jb (&label("three"));
    &je (&label("four"));

    # Five blocks: same scheme as the main loop, with the sixth lane zeroed
    # and its result never stored.
    &lea ($i1,&DWP(1,$block));
    &lea ($i3,&DWP(3,$block));
    &bsf ($i1,$i1);
    &bsf ($i3,$i3);
    &shl ($i1,4);
    &shl ($i3,4);
    &movdqu ($inout0,&QWP(0,$l_));
    &movdqu ($inout1,&QWP(0,$l_,$i1));
    &mov ($rounds,&DWP($rounds_off,"esp"));
    &movdqa ($inout2,$inout0);
    &movdqu ($inout3,&QWP(0,$l_,$i3));
    &movdqa ($inout4,$inout0);

    &pxor ($inout0,$rndkey0);  # ^ last offset_i
    &pxor ($inout1,$inout0);
    &movdqa (&QWP(16*0,"esp"),$inout0);
    &pxor ($inout2,$inout1);
    &movdqa (&QWP(16*1,"esp"),$inout1);
    &pxor ($inout3,$inout2);
    &movdqa (&QWP(16*2,"esp"),$inout2);
    &pxor ($inout4,$inout3);
    &movdqa (&QWP(16*3,"esp"),$inout3);
    # NOTE(review): the result of this pxor is not read before $inout5 is
    # zeroed below; kept as-is to leave the emitted code unchanged.
    &pxor ($inout5,$inout4);
    &movdqa (&QWP(16*4,"esp"),$inout4);

    &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &movdqu ($inout2,&QWP(16*2,$inp));
    &movdqu ($inout3,&QWP(16*3,$inp));
    &movdqu ($inout4,&QWP(16*4,$inp));
    &pxor ($inout5,$inout5);

    &movdqa (&QWP($checksum,"esp"),$rndkey1);
    &pxor ($inout0,$rndkey0);  # ^ roundkey[0]
    &pxor ($inout1,$rndkey0);
    &pxor ($inout2,$rndkey0);
    &pxor ($inout3,$rndkey0);
    &pxor ($inout4,$rndkey0);

    &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,&QWP(16*4,"esp"));

    &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
    &aesdec ($inout0,$rndkey1);
    &aesdec ($inout1,$rndkey1);
    &aesdec ($inout2,$rndkey1);
    &aesdec ($inout3,$rndkey1);
    &aesdec ($inout4,$rndkey1);
    &aesdec ($inout5,$rndkey1);

    &mov ($out,&DWP($out_off,"esp"));
    &call ("_aesni_decrypt6_enter");

    &movdqa ($rndkey0,&QWP(16*4,"esp"));  # pass last offset_i
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &movdqa ($rndkey1,&QWP($checksum,"esp"));
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,&QWP(16*2,"esp"));
    &pxor ($inout3,&QWP(16*3,"esp"));
    &pxor ($inout4,$rndkey0);

    &pxor ($rndkey1,$inout0);  # checksum
    &movdqu (&QWP(16*0,$out,$inp),$inout0);  # store output
    &pxor ($rndkey1,$inout1);
    &movdqu (&QWP(16*1,$out,$inp),$inout1);
    &pxor ($rndkey1,$inout2);
    &movdqu (&QWP(16*2,$out,$inp),$inout2);
    &pxor ($rndkey1,$inout3);
    &movdqu (&QWP(16*3,$out,$inp),$inout3);
    &pxor ($rndkey1,$inout4);
    &movdqu (&QWP(16*4,$out,$inp),$inout4);

    &jmp (&label("done"));

# One block: offset in $inout5, checksum parked in $inout4 across the call.
&set_label("one",16);
    &movdqu ($inout5,&QWP(0,$l_));
    &mov ($key,&DWP($key_off,"esp"));  # restore key

    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &mov ($rounds,&DWP(240,$key));

    &pxor ($inout5,$rndkey0);  # ^ last offset_i
    &pxor ($inout0,$inout5);  # ^ offset_i

    &movdqa ($inout4,$rndkey1);
    &mov ($out,&DWP($out_off,"esp"));
    if ($inline)
    {   &aesni_inline_generate1("dec");   }
    else
    {   &call ("_aesni_decrypt1");   }

    &xorps ($inout0,$inout5);  # ^ offset_i
    &movaps ($rndkey1,$inout4);  # pass the checksum
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &xorps ($rndkey1,$inout0);  # checksum
    &movups (&QWP(0,$out,$inp),$inout0);

    &jmp (&label("done"));

# Two blocks: offsets in $inout4/$inout5, checksum carried in $inout3.
&set_label("two",16);
    &lea ($i1,&DWP(1,$block));
    &mov ($key,&DWP($key_off,"esp"));  # restore key
    &bsf ($i1,$i1);
    &shl ($i1,4);
    &movdqu ($inout4,&QWP(0,$l_));
    &movdqu ($inout5,&QWP(0,$l_,$i1));

    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &mov ($rounds,&DWP(240,$key));

    &movdqa ($inout3,$rndkey1);
    &pxor ($inout4,$rndkey0);  # ^ last offset_i
    &pxor ($inout5,$inout4);

    &pxor ($inout0,$inout4);  # ^ offset_i
    &pxor ($inout1,$inout5);

    &mov ($out,&DWP($out_off,"esp"));
    &call ("_aesni_decrypt2");

    &xorps ($inout0,$inout4);  # ^ offset_i
    &xorps ($inout1,$inout5);
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &xorps ($inout3,$inout0);  # checksum
    &movups (&QWP(16*0,$out,$inp),$inout0);  # store output
    &xorps ($inout3,$inout1);
    &movups (&QWP(16*1,$out,$inp),$inout1);
    &movaps ($rndkey1,$inout3);  # pass the checksum

    &jmp (&label("done"));

# Three blocks: offsets in $inout3..$inout5, checksum parked in the frame.
&set_label("three",16);
    &lea ($i1,&DWP(1,$block));
    &mov ($key,&DWP($key_off,"esp"));  # restore key
    &bsf ($i1,$i1);
    &shl ($i1,4);
    &movdqu ($inout3,&QWP(0,$l_));
    &movdqu ($inout4,&QWP(0,$l_,$i1));
    &movdqa ($inout5,$inout3);

    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &movdqu ($inout1,&QWP(16*1,$inp));
    &movdqu ($inout2,&QWP(16*2,$inp));
    &mov ($rounds,&DWP(240,$key));

    &movdqa (&QWP($checksum,"esp"),$rndkey1);
    &pxor ($inout3,$rndkey0);  # ^ last offset_i
    &pxor ($inout4,$inout3);
    &pxor ($inout5,$inout4);

    &pxor ($inout0,$inout3);  # ^ offset_i
    &pxor ($inout1,$inout4);
    &pxor ($inout2,$inout5);

    &mov ($out,&DWP($out_off,"esp"));
    &call ("_aesni_decrypt3");

    &movdqa ($rndkey1,&QWP($checksum,"esp"));  # pass the checksum
    &xorps ($inout0,$inout3);  # ^ offset_i
    &xorps ($inout1,$inout4);
    &xorps ($inout2,$inout5);
    &movups (&QWP(16*0,$out,$inp),$inout0);  # store output
    &pxor ($rndkey1,$inout0);  # checksum
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &movups (&QWP(16*1,$out,$inp),$inout1);
    &pxor ($rndkey1,$inout1);
    &movups (&QWP(16*2,$out,$inp),$inout2);
    &pxor ($rndkey1,$inout2);

    &jmp (&label("done"));

# Four blocks: the first two offsets live in the frame, the last two in
# $inout4/$inout5; the checksum is parked in the frame.
&set_label("four",16);
    &lea ($i1,&DWP(1,$block));
    &lea ($i3,&DWP(3,$block));
    &bsf ($i1,$i1);
    &bsf ($i3,$i3);
    &mov ($key,&DWP($key_off,"esp"));  # restore key
    &shl ($i1,4);
    &shl ($i3,4);
    &movdqu ($inout2,&QWP(0,$l_));
    &movdqu ($inout3,&QWP(0,$l_,$i1));
    &movdqa ($inout4,$inout2);
    &movdqu ($inout5,&QWP(0,$l_,$i3));

    &pxor ($inout2,$rndkey0);  # ^ last offset_i
    &movdqu ($inout0,&QWP(16*0,$inp));  # load input
    &pxor ($inout3,$inout2);
    &movdqu ($inout1,&QWP(16*1,$inp));
    &pxor ($inout4,$inout3);
    &movdqa (&QWP(16*0,"esp"),$inout2);
    &pxor ($inout5,$inout4);
    &movdqa (&QWP(16*1,"esp"),$inout3);
    &movdqu ($inout2,&QWP(16*2,$inp));
    &movdqu ($inout3,&QWP(16*3,$inp));
    &mov ($rounds,&DWP(240,$key));

    &movdqa (&QWP($checksum,"esp"),$rndkey1);
    &pxor ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &pxor ($inout1,&QWP(16*1,"esp"));
    &pxor ($inout2,$inout4);
    &pxor ($inout3,$inout5);

    &mov ($out,&DWP($out_off,"esp"));
    &call ("_aesni_decrypt4");

    &movdqa ($rndkey1,&QWP($checksum,"esp"));  # pass the checksum
    &xorps ($inout0,&QWP(16*0,"esp"));  # ^ offset_i
    &xorps ($inout1,&QWP(16*1,"esp"));
    &xorps ($inout2,$inout4);
    &movups (&QWP(16*0,$out,$inp),$inout0);  # store output
    &pxor ($rndkey1,$inout0);  # checksum
    &xorps ($inout3,$inout5);
    &movups (&QWP(16*1,$out,$inp),$inout1);
    &pxor ($rndkey1,$inout1);
    &movdqa ($rndkey0,$inout5);  # pass last offset_i
    &movups (&QWP(16*2,$out,$inp),$inout2);
    &pxor ($rndkey1,$inout2);
    &movups (&QWP(16*3,$out,$inp),$inout3);
    &pxor ($rndkey1,$inout3);

# Exit: wipe the data registers and the seven 16-byte frame slots, restore
# %esp, then hand the final offset_i and checksum back to the caller.
&set_label("done");
    &mov ($key,&DWP($esp_off,"esp"));  # caller's %esp
    &pxor ($inout0,$inout0);  # clear register bank
    &pxor ($inout1,$inout1);
    &movdqa (&QWP(16*0,"esp"),$inout0);  # clear stack
    &pxor ($inout2,$inout2);
    &movdqa (&QWP(16*1,"esp"),$inout0);
    &pxor ($inout3,$inout3);
    &movdqa (&QWP(16*2,"esp"),$inout0);
    &pxor ($inout4,$inout4);
    &movdqa (&QWP(16*3,"esp"),$inout0);
    &pxor ($inout5,$inout5);
    &movdqa (&QWP(16*4,"esp"),$inout0);
    &movdqa (&QWP(16*5,"esp"),$inout0);
    &movdqa (&QWP(16*6,"esp"),$inout0);

    # %esp is restored before the &wparam reads below
    # (presumably &wparam is %esp-relative; confirm in x86asm.pl).
    &lea ("esp",&DWP(0,$key));
    &mov ($rounds,&wparam(5));  # &offset_i
    &mov ($rounds_,&wparam(7));  # &checksum
    &movdqu (&QWP(0,$rounds),$rndkey0);
    &pxor ($rndkey0,$rndkey0);
    &movdqu (&QWP(0,$rounds_),$rndkey1);
    &pxor ($rndkey1,$rndkey1);
&function_end("aesni_ocb_decrypt");
2726}
2727}
2728
2729
2730######################################################################
2731# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2732# size_t length, const AES_KEY *key,
2733# unsigned char *ivp,const int enc);
2734&function_begin("${PREFIX}_cbc_encrypt");
2735 &mov ($inp,&wparam(0));
2736 &mov ($rounds_,"esp");
2737 &mov ($out,&wparam(1));
2738 &sub ($rounds_,24);
2739 &mov ($len,&wparam(2));
2740 &and ($rounds_,-16);
2741 &mov ($key,&wparam(3));
2742 &mov ($key_,&wparam(4));
2743 &test ($len,$len);
2744 &jz (&label("cbc_abort"));
2745
2746 &cmp (&wparam(5),0);
2747 &xchg ($rounds_,"esp"); # alloca
2748 &movups ($ivec,&QWP(0,$key_)); # load IV
2749 &mov ($rounds,&DWP(240,$key));
2750 &mov ($key_,$key); # backup $key
2751 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
2752 &mov ($rounds_,$rounds); # backup $rounds
2753 &je (&label("cbc_decrypt"));
2754
2755 &movaps ($inout0,$ivec);
2756 &cmp ($len,16);
2757 &jb (&label("cbc_enc_tail"));
2758 &sub ($len,16);
2759 &jmp (&label("cbc_enc_loop"));
2760
2761&set_label("cbc_enc_loop",16);
2762 &movups ($ivec,&QWP(0,$inp)); # input actually
2763 &lea ($inp,&DWP(16,$inp));
2764 if ($inline)
2765 { &aesni_inline_generate1("enc",$inout0,$ivec); }
2766 else
2767 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
2768 &mov ($rounds,$rounds_); # restore $rounds
2769 &mov ($key,$key_); # restore $key
2770 &movups (&QWP(0,$out),$inout0); # store output
2771 &lea ($out,&DWP(16,$out));
2772 &sub ($len,16);
2773 &jnc (&label("cbc_enc_loop"));
2774 &add ($len,16);
2775 &jnz (&label("cbc_enc_tail"));
2776 &movaps ($ivec,$inout0);
2777 &pxor ($inout0,$inout0);
2778 &jmp (&label("cbc_ret"));
2779
2780&set_label("cbc_enc_tail");
2781 &mov ("ecx",$len); # zaps $rounds
2782 &data_word(0xA4F3F689); # rep movsb
2783 &mov ("ecx",16); # zero tail
2784 &sub ("ecx",$len);
2785 &xor ("eax","eax"); # zaps $len
2786 &data_word(0xAAF3F689); # rep stosb
2787 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
2788 &mov ($rounds,$rounds_); # restore $rounds
2789 &mov ($inp,$out); # $inp and $out are the same
2790 &mov ($key,$key_); # restore $key
2791 &jmp (&label("cbc_enc_loop"));
2792######################################################################
2793&set_label("cbc_decrypt",16);
2794 &cmp ($len,0x50);
2795 &jbe (&label("cbc_dec_tail"));
2796 &movaps (&QWP(0,"esp"),$ivec); # save IV
2797 &sub ($len,0x50);
2798 &jmp (&label("cbc_dec_loop6_enter"));
2799
2800&set_label("cbc_dec_loop6",16);
2801 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
2802 &movups (&QWP(0,$out),$inout5);
2803 &lea ($out,&DWP(0x10,$out));
2804&set_label("cbc_dec_loop6_enter");
2805 &movdqu ($inout0,&QWP(0,$inp));
2806 &movdqu ($inout1,&QWP(0x10,$inp));
2807 &movdqu ($inout2,&QWP(0x20,$inp));
2808 &movdqu ($inout3,&QWP(0x30,$inp));
2809 &movdqu ($inout4,&QWP(0x40,$inp));
2810 &movdqu ($inout5,&QWP(0x50,$inp));
2811
2812 &call ("_aesni_decrypt6");
2813
2814 &movups ($rndkey1,&QWP(0,$inp));
2815 &movups ($rndkey0,&QWP(0x10,$inp));
2816 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
2817 &xorps ($inout1,$rndkey1);
2818 &movups ($rndkey1,&QWP(0x20,$inp));
2819 &xorps ($inout2,$rndkey0);
2820 &movups ($rndkey0,&QWP(0x30,$inp));
2821 &xorps ($inout3,$rndkey1);
2822 &movups ($rndkey1,&QWP(0x40,$inp));
2823 &xorps ($inout4,$rndkey0);
2824 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
2825 &xorps ($inout5,$rndkey1);
2826 &movups (&QWP(0,$out),$inout0);
2827 &movups (&QWP(0x10,$out),$inout1);
2828 &lea ($inp,&DWP(0x60,$inp));
2829 &movups (&QWP(0x20,$out),$inout2);
2830 &mov ($rounds,$rounds_); # restore $rounds
2831 &movups (&QWP(0x30,$out),$inout3);
2832 &mov ($key,$key_); # restore $key
2833 &movups (&QWP(0x40,$out),$inout4);
2834 &lea ($out,&DWP(0x50,$out));
2835 &sub ($len,0x60);
2836 &ja (&label("cbc_dec_loop6"));
2837
2838 &movaps ($inout0,$inout5);
2839 &movaps ($ivec,$rndkey0);
2840 &add ($len,0x50);
2841 &jle (&label("cbc_dec_clear_tail_collected"));
2842 &movups (&QWP(0,$out),$inout0);
2843 &lea ($out,&DWP(0x10,$out));
2844&set_label("cbc_dec_tail");
2845 &movups ($inout0,&QWP(0,$inp));
2846 &movaps ($in0,$inout0);
2847 &cmp ($len,0x10);
2848 &jbe (&label("cbc_dec_one"));
2849
2850 &movups ($inout1,&QWP(0x10,$inp));
2851 &movaps ($in1,$inout1);
2852 &cmp ($len,0x20);
2853 &jbe (&label("cbc_dec_two"));
2854
2855 &movups ($inout2,&QWP(0x20,$inp));
2856 &cmp ($len,0x30);
2857 &jbe (&label("cbc_dec_three"));
2858
2859 &movups ($inout3,&QWP(0x30,$inp));
2860 &cmp ($len,0x40);
2861 &jbe (&label("cbc_dec_four"));
2862
2863 &movups ($inout4,&QWP(0x40,$inp));
2864 &movaps (&QWP(0,"esp"),$ivec); # save IV
2865 &movups ($inout0,&QWP(0,$inp));
2866 &xorps ($inout5,$inout5);
2867 &call ("_aesni_decrypt6");
2868 &movups ($rndkey1,&QWP(0,$inp));
2869 &movups ($rndkey0,&QWP(0x10,$inp));
2870 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
2871 &xorps ($inout1,$rndkey1);
2872 &movups ($rndkey1,&QWP(0x20,$inp));
2873 &xorps ($inout2,$rndkey0);
2874 &movups ($rndkey0,&QWP(0x30,$inp));
2875 &xorps ($inout3,$rndkey1);
2876 &movups ($ivec,&QWP(0x40,$inp)); # IV
2877 &xorps ($inout4,$rndkey0);
2878 &movups (&QWP(0,$out),$inout0);
2879 &movups (&QWP(0x10,$out),$inout1);
2880 &pxor ($inout1,$inout1);
2881 &movups (&QWP(0x20,$out),$inout2);
2882 &pxor ($inout2,$inout2);
2883 &movups (&QWP(0x30,$out),$inout3);
2884 &pxor ($inout3,$inout3);
2885 &lea ($out,&DWP(0x40,$out));
2886 &movaps ($inout0,$inout4);
2887 &pxor ($inout4,$inout4);
2888 &sub ($len,0x50);
2889 &jmp (&label("cbc_dec_tail_collected"));
2890
2891&set_label("cbc_dec_one",16);
2892 if ($inline)
2893 { &aesni_inline_generate1("dec"); }
2894 else
2895 { &call ("_aesni_decrypt1"); }
2896 &xorps ($inout0,$ivec);
2897 &movaps ($ivec,$in0);
2898 &sub ($len,0x10);
2899 &jmp (&label("cbc_dec_tail_collected"));
2900
2901&set_label("cbc_dec_two",16);
2902 &call ("_aesni_decrypt2");
2903 &xorps ($inout0,$ivec);
2904 &xorps ($inout1,$in0);
2905 &movups (&QWP(0,$out),$inout0);
2906 &movaps ($inout0,$inout1);
2907 &pxor ($inout1,$inout1);
2908 &lea ($out,&DWP(0x10,$out));
2909 &movaps ($ivec,$in1);
2910 &sub ($len,0x20);
2911 &jmp (&label("cbc_dec_tail_collected"));
2912
2913&set_label("cbc_dec_three",16);
2914 &call ("_aesni_decrypt3");
2915 &xorps ($inout0,$ivec);
2916 &xorps ($inout1,$in0);
2917 &xorps ($inout2,$in1);
2918 &movups (&QWP(0,$out),$inout0);
2919 &movaps ($inout0,$inout2);
2920 &pxor ($inout2,$inout2);
2921 &movups (&QWP(0x10,$out),$inout1);
2922 &pxor ($inout1,$inout1);
2923 &lea ($out,&DWP(0x20,$out));
2924 &movups ($ivec,&QWP(0x20,$inp));
2925 &sub ($len,0x30);
2926 &jmp (&label("cbc_dec_tail_collected"));
2927
2928&set_label("cbc_dec_four",16);
2929 &call ("_aesni_decrypt4");
2930 &movups ($rndkey1,&QWP(0x10,$inp));
2931 &movups ($rndkey0,&QWP(0x20,$inp));
2932 &xorps ($inout0,$ivec);
2933 &movups ($ivec,&QWP(0x30,$inp));
2934 &xorps ($inout1,$in0);
2935 &movups (&QWP(0,$out),$inout0);
2936 &xorps ($inout2,$rndkey1);
2937 &movups (&QWP(0x10,$out),$inout1);
2938 &pxor ($inout1,$inout1);
2939 &xorps ($inout3,$rndkey0);
2940 &movups (&QWP(0x20,$out),$inout2);
2941 &pxor ($inout2,$inout2);
2942 &lea ($out,&DWP(0x30,$out));
2943 &movaps ($inout0,$inout3);
2944 &pxor ($inout3,$inout3);
2945 &sub ($len,0x40);
2946 &jmp (&label("cbc_dec_tail_collected"));
2947
2948&set_label("cbc_dec_clear_tail_collected",16);
2949 &pxor ($inout1,$inout1);
2950 &pxor ($inout2,$inout2);
2951 &pxor ($inout3,$inout3);
2952 &pxor ($inout4,$inout4);
2953&set_label("cbc_dec_tail_collected");
2954 &and ($len,15);
2955 &jnz (&label("cbc_dec_tail_partial"));
2956 &movups (&QWP(0,$out),$inout0);
2957 &pxor ($rndkey0,$rndkey0);
2958 &jmp (&label("cbc_ret"));
2959
2960&set_label("cbc_dec_tail_partial",16);
2961 &movaps (&QWP(0,"esp"),$inout0);
2962 &pxor ($rndkey0,$rndkey0);
2963 &mov ("ecx",16);
2964 &mov ($inp,"esp");
2965 &sub ("ecx",$len);
2966 &data_word(0xA4F3F689); # rep movsb
2967 &movdqa (&QWP(0,"esp"),$inout0);
2968
2969&set_label("cbc_ret");
2970 &mov ("esp",&DWP(16,"esp")); # pull original %esp
2971 &mov ($key_,&wparam(4));
2972 &pxor ($inout0,$inout0);
2973 &pxor ($rndkey1,$rndkey1);
2974 &movups (&QWP(0,$key_),$ivec); # output IV
2975 &pxor ($ivec,$ivec);
2976&set_label("cbc_abort");
2977&function_end("${PREFIX}_cbc_encrypt");
2978
2979
2980######################################################################
2981# Mechanical port from aesni-x86_64.pl.
2982#
2983# _aesni_set_encrypt_key is private interface,
2984# input:
2985# "eax" const unsigned char *userKey
2986# $rounds int bits
2987# $key AES_KEY *key
2988# output:
2989# "eax" return code
2990# $rounds rounds-1 (9, 11 or 13)
2991
# _aesni_set_encrypt_key -- private AES key-expansion routine.
#
# Register interface: "eax" = userKey pointer, $rounds = key length in
# bits, $key = pointer to the key schedule being filled.
# Returns in "eax": 0 on success, -1 if either pointer is NULL, -2 if the
# bit length is not 128, 192 or 256.  On success $rounds is left holding
# 9, 11 or 13, and that value is also stored at byte offset 240 from the
# start of the key schedule.
#
# Each key size has two code paths: one built on aeskeygenassist and an
# "_alt" one built on pshufb/aesenclast with constants from key_const.
# The "_alt" path is taken when the masked capability word equals 1<<28
# (AVX bit set, XOP bit clear, per the comment on the mask below).
2992 &function_begin_B("_aesni_set_encrypt_key");
# ebp and ebx are scratch below; every exit path pops them again.
2993 &push ("ebp");
2994 &push ("ebx");
# NULL checks on userKey ("eax") and on the key schedule pointer.
2995 &test ("eax","eax");
2996 &jz (&label("bad_pointer"));
2997 &test ($key,$key);
2998 &jz (&label("bad_pointer"));
2999
# Position-independent addressing: the call targets the very next label,
# blindpop fetches that address into ebx, and the lea rebases ebx so it
# points at key_const.
3000 &call (&label("pic"));
3001&set_label("pic");
3002 &blindpop("ebx");
3003 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
3004
# NOTE(review): picmeup is defined outside this chunk; presumably it
# leaves the address of OPENSSL_ia32cap_P in ebp -- confirm.
3005 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
3006 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
3007 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
# Second dword of the capability vector; masked just below.
3008 &mov ("ebp",&DWP(4,"ebp"));
# Bias $key by 16 so that round key 0 is written at -16($key).
3009 &lea ($key,&DWP(16,$key));
3010 &and ("ebp",1<<28|1<<11); # AVX and XOP bits
# Dispatch on the requested key length in bits.
3011 &cmp ($rounds,256);
3012 &je (&label("14rounds"));
3013 &cmp ($rounds,192);
3014 &je (&label("12rounds"));
3015 &cmp ($rounds,128);
3016 &jne (&label("bad_keybits"));
3017
# ---- 128-bit key, aeskeygenassist path --------------------------------
3018&set_label("10rounds",16);
3019 &cmp ("ebp",1<<28);
3020 &je (&label("10rounds_alt"));
3021
3022 &mov ($rounds,9);
3023 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
# The immediates 0x01..0x36 are the per-round constants.  The first call
# enters at key_128_cold because round 0 has just been stored explicitly.
3024 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
3025 &call (&label("key_128_cold"));
3026 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
3027 &call (&label("key_128"));
3028 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
3029 &call (&label("key_128"));
3030 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
3031 &call (&label("key_128"));
3032 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
3033 &call (&label("key_128"));
3034 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
3035 &call (&label("key_128"));
3036 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
3037 &call (&label("key_128"));
3038 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
3039 &call (&label("key_128"));
3040 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
3041 &call (&label("key_128"));
3042 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
3043 &call (&label("key_128"));
# Store the final round key, then the round count (9) 80 bytes past it.
3044 &$movekey (&QWP(0,$key),"xmm0");
3045 &mov (&DWP(80,$key),$rounds);
3046
3047 &jmp (&label("good_key"));
3048
# Local helper: store the round key held in xmm0, advance $key by 16, then
# derive the next round key into xmm0.  key_128_cold is a second entry
# point that skips the store/advance.
3049&set_label("key_128",16);
3050 &$movekey (&QWP(0,$key),"xmm0");
3051 &lea ($key,&DWP(16,$key));
3052&set_label("key_128_cold");
# The two shufps/xorps pairs turn xmm0 into the running (prefix) XOR of its
# own dwords, using xmm4 as scratch whose low dword stays zero.
3053 &shufps ("xmm4","xmm0",0b00010000);
3054 &xorps ("xmm0","xmm4");
3055 &shufps ("xmm4","xmm0",0b10001100);
3056 &xorps ("xmm0","xmm4");
# Broadcast dword 3 of the aeskeygenassist result and fold it in.
3057 &shufps ("xmm1","xmm1",0b11111111); # critical path
3058 &xorps ("xmm0","xmm1");
3059 &ret();
3060
# ---- 128-bit key, pshufb/aesenclast path ------------------------------
# xmm5 = byte-shuffle mask at key_const+0x00, xmm4 = round constant
# starting at key_const+0x20, xmm2 = copy of the previous round key.
3061&set_label("10rounds_alt",16);
3062 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3063 &mov ($rounds,8);
3064 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3065 &movdqa ("xmm2","xmm0");
3066 &movdqu (&QWP(-16,$key),"xmm0");
3067
# Eight iterations produce round keys 1..8; pslld doubles the round
# constant each time.
3068&set_label("loop_key128");
3069 &pshufb ("xmm0","xmm5");
3070 &aesenclast ("xmm0","xmm4");
3071 &pslld ("xmm4",1);
3072 &lea ($key,&DWP(16,$key));
3073
# Prefix XOR of the previous key's dwords, built from three byte shifts.
3074 &movdqa ("xmm3","xmm2");
3075 &pslldq ("xmm2",4);
3076 &pxor ("xmm3","xmm2");
3077 &pslldq ("xmm2",4);
3078 &pxor ("xmm3","xmm2");
3079 &pslldq ("xmm2",4);
3080 &pxor ("xmm2","xmm3");
3081
3082 &pxor ("xmm0","xmm2");
3083 &movdqu (&QWP(-16,$key),"xmm0");
3084 &movdqa ("xmm2","xmm0");
3085
3086 &dec ($rounds);
3087 &jnz (&label("loop_key128"));
3088
# Rounds 9 and 10 are unrolled: the round constant is reloaded from
# key_const+0x30 (0x1b) and doubled once more for the last round.
3089 &movdqa ("xmm4",&QWP(0x30,"ebx"));
3090
3091 &pshufb ("xmm0","xmm5");
3092 &aesenclast ("xmm0","xmm4");
3093 &pslld ("xmm4",1);
3094
3095 &movdqa ("xmm3","xmm2");
3096 &pslldq ("xmm2",4);
3097 &pxor ("xmm3","xmm2");
3098 &pslldq ("xmm2",4);
3099 &pxor ("xmm3","xmm2");
3100 &pslldq ("xmm2",4);
3101 &pxor ("xmm2","xmm3");
3102
3103 &pxor ("xmm0","xmm2");
3104 &movdqu (&QWP(0,$key),"xmm0");
3105
3106 &movdqa ("xmm2","xmm0");
3107 &pshufb ("xmm0","xmm5");
3108 &aesenclast ("xmm0","xmm4");
3109
3110 &movdqa ("xmm3","xmm2");
3111 &pslldq ("xmm2",4);
3112 &pxor ("xmm3","xmm2");
3113 &pslldq ("xmm2",4);
3114 &pxor ("xmm3","xmm2");
3115 &pslldq ("xmm2",4);
3116 &pxor ("xmm2","xmm3");
3117
3118 &pxor ("xmm0","xmm2");
3119 &movdqu (&QWP(16,$key),"xmm0");
3120
# $rounds was consumed as the loop counter; set the real value and store it.
3121 &mov ($rounds,9);
3122 &mov (&DWP(96,$key),$rounds);
3123
3124 &jmp (&label("good_key"));
3125
# ---- 192-bit key, aeskeygenassist path --------------------------------
3126&set_label("12rounds",16);
3127 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
3128 &cmp ("ebp",1<<28);
3129 &je (&label("12rounds_alt"));
3130
3131 &mov ($rounds,11);
3132 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
# key_192a and key_192b alternate: "a" stores one 16-byte slot per call,
# "b" stores two (see the helpers below).
3133 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
3134 &call (&label("key_192a_cold"));
3135 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
3136 &call (&label("key_192b"));
3137 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
3138 &call (&label("key_192a"));
3139 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
3140 &call (&label("key_192b"));
3141 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
3142 &call (&label("key_192a"));
3143 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
3144 &call (&label("key_192b"));
3145 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
3146 &call (&label("key_192a"));
3147 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
3148 &call (&label("key_192b"));
# Store the final round key, then the round count (11) 48 bytes past it.
3149 &$movekey (&QWP(0,$key),"xmm0");
3150 &mov (&DWP(48,$key),$rounds);
3151
3152 &jmp (&label("good_key"));
3153
# Local helper "a": store xmm0, advance $key by 16, then fall into the
# shared update.  key_192a_cold skips the store/advance.  Both entries
# save xmm2 in xmm5 for the following key_192b call.
3154&set_label("key_192a",16);
3155 &$movekey (&QWP(0,$key),"xmm0");
3156 &lea ($key,&DWP(16,$key));
3157&set_label("key_192a_cold",16);
3158 &movaps ("xmm5","xmm2");
# Shared update of the 192-bit state held in xmm0 (four dwords) and xmm2.
3159&set_label("key_192b_warm");
3160 &shufps ("xmm4","xmm0",0b00010000);
3161 &movdqa ("xmm3","xmm2");
3162 &xorps ("xmm0","xmm4");
3163 &shufps ("xmm4","xmm0",0b10001100);
3164 &pslldq ("xmm3",4);
3165 &xorps ("xmm0","xmm4");
3166 &pshufd ("xmm1","xmm1",0b01010101); # critical path
3167 &pxor ("xmm2","xmm3");
3168 &pxor ("xmm0","xmm1");
3169 &pshufd ("xmm3","xmm0",0b11111111);
3170 &pxor ("xmm2","xmm3");
3171 &ret();
3172
# Local helper "b": repack xmm5/xmm0/xmm2 into two 16-byte slots, store
# both, advance $key by 32, then run the shared update.
3173&set_label("key_192b",16);
3174 &movaps ("xmm3","xmm0");
3175 &shufps ("xmm5","xmm0",0b01000100);
3176 &$movekey (&QWP(0,$key),"xmm5");
3177 &shufps ("xmm3","xmm2",0b01001110);
3178 &$movekey (&QWP(16,$key),"xmm3");
3179 &lea ($key,&DWP(32,$key));
3180 &jmp (&label("key_192b_warm"));
3181
# ---- 192-bit key, pshufb/aesenclast path ------------------------------
# Uses the shuffle mask at key_const+0x10 and the round constant starting
# at key_const+0x20.  Each of the 8 iterations advances $key by 24 bytes.
3182&set_label("12rounds_alt",16);
3183 &movdqa ("xmm5",&QWP(0x10,"ebx"));
3184 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3185 &mov ($rounds,8);
3186 &movdqu (&QWP(-16,$key),"xmm0");
3187
3188&set_label("loop_key192");
3189 &movq (&QWP(0,$key),"xmm2");
3190 &movdqa ("xmm1","xmm2");
3191 &pshufb ("xmm2","xmm5");
3192 &aesenclast ("xmm2","xmm4");
3193 &pslld ("xmm4",1);
3194 &lea ($key,&DWP(24,$key));
3195
3196 &movdqa ("xmm3","xmm0");
3197 &pslldq ("xmm0",4);
3198 &pxor ("xmm3","xmm0");
3199 &pslldq ("xmm0",4);
3200 &pxor ("xmm3","xmm0");
3201 &pslldq ("xmm0",4);
3202 &pxor ("xmm0","xmm3");
3203
3204 &pshufd ("xmm3","xmm0",0xff);
3205 &pxor ("xmm3","xmm1");
3206 &pslldq ("xmm1",4);
3207 &pxor ("xmm3","xmm1");
3208
3209 &pxor ("xmm0","xmm2");
3210 &pxor ("xmm2","xmm3");
3211 &movdqu (&QWP(-16,$key),"xmm0");
3212
3213 &dec ($rounds);
3214 &jnz (&label("loop_key192"));
3215
# $rounds was consumed as the loop counter; set the real value and store it.
3216 &mov ($rounds,11);
3217 &mov (&DWP(32,$key),$rounds);
3218
3219 &jmp (&label("good_key"));
3220
# ---- 256-bit key, aeskeygenassist path --------------------------------
# $key is advanced a second time, so rounds 0 and 1 (the raw user key) land
# at -32($key) and -16($key).
3221&set_label("14rounds",16);
3222 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
3223 &lea ($key,&DWP(16,$key));
3224 &cmp ("ebp",1<<28);
3225 &je (&label("14rounds_alt"));
3226
3227 &mov ($rounds,13);
3228 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
3229 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
# key_256a updates xmm0 (even rounds), key_256b updates xmm2 (odd rounds).
3230 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
3231 &call (&label("key_256a_cold"));
3232 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
3233 &call (&label("key_256b"));
3234 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
3235 &call (&label("key_256a"));
3236 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
3237 &call (&label("key_256b"));
3238 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
3239 &call (&label("key_256a"));
3240 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
3241 &call (&label("key_256b"));
3242 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
3243 &call (&label("key_256a"));
3244 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
3245 &call (&label("key_256b"));
3246 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
3247 &call (&label("key_256a"));
3248 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
3249 &call (&label("key_256b"));
3250 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
3251 &call (&label("key_256a"));
3252 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
3253 &call (&label("key_256b"));
3254 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
3255 &call (&label("key_256a"));
# Store the final round key, then the round count (13) 16 bytes past it.
3256 &$movekey (&QWP(0,$key),"xmm0");
3257 &mov (&DWP(16,$key),$rounds);
# eax is cleared here and again at good_key; the duplicate is harmless.
3258 &xor ("eax","eax");
3259
3260 &jmp (&label("good_key"));
3261
# Local helper "a": store xmm2, advance $key by 16, then update xmm0 from
# dword 3 of the aeskeygenassist result.  key_256a_cold skips the
# store/advance.
3262&set_label("key_256a",16);
3263 &$movekey (&QWP(0,$key),"xmm2");
3264 &lea ($key,&DWP(16,$key));
3265&set_label("key_256a_cold");
3266 &shufps ("xmm4","xmm0",0b00010000);
3267 &xorps ("xmm0","xmm4");
3268 &shufps ("xmm4","xmm0",0b10001100);
3269 &xorps ("xmm0","xmm4");
3270 &shufps ("xmm1","xmm1",0b11111111); # critical path
3271 &xorps ("xmm0","xmm1");
3272 &ret();
3273
# Local helper "b": store xmm0, advance $key by 16, then update xmm2 from
# dword 2 of the aeskeygenassist result.
3274&set_label("key_256b",16);
3275 &$movekey (&QWP(0,$key),"xmm0");
3276 &lea ($key,&DWP(16,$key));
3277
3278 &shufps ("xmm4","xmm2",0b00010000);
3279 &xorps ("xmm2","xmm4");
3280 &shufps ("xmm4","xmm2",0b10001100);
3281 &xorps ("xmm2","xmm4");
3282 &shufps ("xmm1","xmm1",0b10101010); # critical path
3283 &xorps ("xmm2","xmm1");
3284 &ret();
3285
# ---- 256-bit key, pshufb/aesenclast path ------------------------------
# xmm0/xmm1 carry the two most recent round keys.  The loop body emits an
# even round key (using the round constant in xmm4) and then, unless the
# counter has reached zero, an odd round key (aesenclast with an all-zero
# round key, no round constant).
3286&set_label("14rounds_alt",16);
3287 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3288 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3289 &mov ($rounds,7);
3290 &movdqu (&QWP(-32,$key),"xmm0");
3291 &movdqa ("xmm1","xmm2");
3292 &movdqu (&QWP(-16,$key),"xmm2");
3293
3294&set_label("loop_key256");
3295 &pshufb ("xmm2","xmm5");
3296 &aesenclast ("xmm2","xmm4");
3297
3298 &movdqa ("xmm3","xmm0");
3299 &pslldq ("xmm0",4);
3300 &pxor ("xmm3","xmm0");
3301 &pslldq ("xmm0",4);
3302 &pxor ("xmm3","xmm0");
3303 &pslldq ("xmm0",4);
3304 &pxor ("xmm0","xmm3");
3305 &pslld ("xmm4",1);
3306
3307 &pxor ("xmm0","xmm2");
3308 &movdqu (&QWP(0,$key),"xmm0");
3309
# The seventh even key is the last round key; leave before the odd half.
3310 &dec ($rounds);
3311 &jz (&label("done_key256"));
3312
3313 &pshufd ("xmm2","xmm0",0xff);
3314 &pxor ("xmm3","xmm3");
3315 &aesenclast ("xmm2","xmm3");
3316
3317 &movdqa ("xmm3","xmm1");
3318 &pslldq ("xmm1",4);
3319 &pxor ("xmm3","xmm1");
3320 &pslldq ("xmm1",4);
3321 &pxor ("xmm3","xmm1");
3322 &pslldq ("xmm1",4);
3323 &pxor ("xmm1","xmm3");
3324
3325 &pxor ("xmm2","xmm1");
3326 &movdqu (&QWP(16,$key),"xmm2");
3327 &lea ($key,&DWP(32,$key));
3328 &movdqa ("xmm1","xmm2");
3329 &jmp (&label("loop_key256"));
3330
3331&set_label("done_key256");
3332 &mov ($rounds,13);
3333 &mov (&DWP(16,$key),$rounds);
3334
# Common success exit: wipe the xmm registers that held key material,
# return 0 and restore the saved registers.
3335&set_label("good_key");
3336 &pxor ("xmm0","xmm0");
3337 &pxor ("xmm1","xmm1");
3338 &pxor ("xmm2","xmm2");
3339 &pxor ("xmm3","xmm3");
3340 &pxor ("xmm4","xmm4");
3341 &pxor ("xmm5","xmm5");
3342 &xor ("eax","eax");
3343 &pop ("ebx");
3344 &pop ("ebp");
3345 &ret ();
3346
# Error exit: NULL userKey or NULL key schedule pointer -> -1.  No key
# bytes have been loaded yet on this path, so nothing needs wiping.
3347&set_label("bad_pointer",4);
3348 &mov ("eax",-1);
3349 &pop ("ebx");
3350 &pop ("ebp");
3351 &ret ();
# Error exit: unsupported key length -> -2.  xmm0 already holds the first
# 16 key bytes at this point, so it is cleared first.
3352&set_label("bad_keybits",4);
3353 &pxor ("xmm0","xmm0");
3354 &mov ("eax",-2);
3355 &pop ("ebx");
3356 &pop ("ebp");
3357 &ret ();
3358&function_end_B("_aesni_set_encrypt_key");
3359
3360# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
3361# AES_KEY *key)
# Public entry point: loads the three arguments (userKey, bits, key) into
# the registers expected by _aesni_set_encrypt_key and calls it.  The
# helper's value in "eax" is left untouched and becomes the return value.
3362&function_begin_B("${PREFIX}_set_encrypt_key");
3363 &mov ("eax",&wparam(0));
3364 &mov ($rounds,&wparam(1));
3365 &mov ($key,&wparam(2));
3366 &call ("_aesni_set_encrypt_key");
3367 &ret ();
3368&function_end_B("${PREFIX}_set_encrypt_key");
3369
3370# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
3371# AES_KEY *key)
# Public entry point: builds the encryption key schedule with
# _aesni_set_encrypt_key, then converts it in place for decryption by
# reversing the order of the round keys and applying aesimc to every round
# key except the first and the last.  Returns the helper's non-zero error
# code unchanged, or 0 on success.
3372&function_begin_B("${PREFIX}_set_decrypt_key");
3373 &mov ("eax",&wparam(0));
3374 &mov ($rounds,&wparam(1));
3375 &mov ($key,&wparam(2));
3376 &call ("_aesni_set_encrypt_key");
# The helper advanced $key while writing, so reload the schedule base.
3377 &mov ($key,&wparam(2));
3378 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
# Propagate a failure from the helper without touching the schedule.
3379 &test ("eax","eax");
3380 &jnz (&label("dec_key_ret"));
# eax = $key + 16 + 16*(rounds-1): address of the last round key.
3381 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
3382
# First and last round keys trade places as-is (no aesimc).
3383 &$movekey ("xmm0",&QWP(0,$key)); # just swap
3384 &$movekey ("xmm1",&QWP(0,"eax"));
3385 &$movekey (&QWP(0,"eax"),"xmm0");
3386 &$movekey (&QWP(0,$key),"xmm1");
3387 &lea ($key,&DWP(16,$key));
3388 &lea ("eax",&DWP(-16,"eax"));
3389
# Walk inward from both ends: each pair is transformed with aesimc and
# written to the opposite slot.  The pointers are moved before the stores,
# hence the 16 / -16 displacements.
3390&set_label("dec_key_inverse");
3391 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
3392 &$movekey ("xmm1",&QWP(0,"eax"));
3393 &aesimc ("xmm0","xmm0");
3394 &aesimc ("xmm1","xmm1");
3395 &lea ($key,&DWP(16,$key));
3396 &lea ("eax",&DWP(-16,"eax"));
3397 &$movekey (&QWP(16,"eax"),"xmm0");
3398 &$movekey (&QWP(-16,$key),"xmm1");
3399 &cmp ("eax",$key);
3400 &ja (&label("dec_key_inverse"));
3401
# The pointers have met on the middle round key; transform it in place.
3402 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
3403 &aesimc ("xmm0","xmm0");
3404 &$movekey (&QWP(0,$key),"xmm0");
3405
# Wipe the registers that held round keys.
3406 &pxor ("xmm0","xmm0");
3407 &pxor ("xmm1","xmm1");
3408 &xor ("eax","eax"); # return success
3409&set_label("dec_key_ret");
3410 &ret ();
3411&function_end_B("${PREFIX}_set_decrypt_key");
3412
# Constant table addressed through ebx by the "_alt" key-expansion paths of
# _aesni_set_encrypt_key.  Each data_word row below has four entries; the
# code reads them at 16-byte steps (0x00, 0x10, 0x20, 0x30), i.e. it
# presumably relies on data_word emitting 32-bit words -- confirm.
3413&set_label("key_const",64);
# +0x00: pshufb mask used by the 128- and 256-bit "_alt" paths; it copies
# source bytes 13,14,15,12 into every dword.
3414&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
# +0x10: pshufb mask used by the 192-bit "_alt" path; it copies source
# bytes 5,6,7,4 into every dword.
3415&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
# +0x20: initial round constant, doubled with pslld on each iteration.
3416&data_word(1,1,1,1);
# +0x30: round constant 0x1b, loaded for round 9 of the 128-bit "_alt" path.
3417&data_word(0x1b,0x1b,0x1b,0x1b);
# Identification string embedded after the constants.
3418&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
3419
# NOTE(review): asm_finish is defined outside this file chunk; presumably
# it emits the accumulated assembly -- confirm.
3420&asm_finish();
3421
# Fail loudly if closing STDOUT reports an error.
3422close STDOUT or die "error closing STDOUT: $!";
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette