VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.3/crypto/chacha/asm/chacha-x86_64.pl@ 96159

Last change on this file since 96159 was 94082, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

  • Property svn:executable set to *
檔案大小 (File size): 95.4 KB
 
1#! /usr/bin/env perl
2# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# November 2014
18#
19# ChaCha20 for x86_64.
20#
21# December 2016
22#
23# Add AVX512F code path.
24#
25# December 2017
26#
27# Add AVX512VL code path.
28#
29# Performance in cycles per byte out of large buffer.
30#
31#                 IALU/gcc 4.8(i)  1x/2xSSSE3(ii)  4xSSSE3    NxAVX(v)
32#
33# P4              9.48/+99%        -               -
34# Core2           7.83/+55%        7.90/5.76       4.35
35# Westmere        7.19/+50%        5.60/4.50       3.00
36# Sandy Bridge    8.31/+42%        5.45/4.00       2.72
37# Ivy Bridge      6.71/+46%        5.40/?          2.41
38# Haswell         5.92/+43%        5.20/3.45       2.42       1.23
39# Skylake[-X]     5.87/+39%        4.70/3.22       2.31       1.19[0.80(vi)]
40# Silvermont      12.0/+33%        7.75/6.90       7.03(iii)
41# Knights L       11.7/-           ?               9.60(iii)  0.80
42# Goldmont        10.6/+17%        5.10/3.52       3.28
43# Sledgehammer    7.28/+52%        -               -
44# Bulldozer       9.66/+28%        9.85/5.35(iv)   3.06(iv)
45# Ryzen           5.96/+50%        5.19/3.00       2.40       2.09
46# VIA Nano        10.5/+46%        6.72/6.88       6.05
47#
48# (i) compared to older gcc 3.x one can observe >2x improvement on
49# most platforms;
50# (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used
51# by chacha20_poly1305_tls_cipher, results are EVP-free;
52# (iii) this is not optimal result for Atom because of MSROM
53# limitations, SSE2 can do better, but gain is considered too
54# low to justify the [maintenance] effort;
55# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
56# and 4.85 for 128-byte inputs;
57# (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
58# (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
59# cpb in single thread, the corresponding capability is suppressed;
60
61# $output is the last argument if it looks like a file (it has an extension)
62# $flavour is the first argument if it doesn't look like a file
63$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
64$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
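# A minimal usage sketch (an assumed, typical perlasm invocation; the
# flavour and output file names here are illustrative, not mandated by
# this script):
#
#   perl chacha-x86_64.pl elf  chacha-x86_64.s    # Linux/ELF, AT&T syntax
#   perl chacha-x86_64.pl nasm chacha-x86_64.asm  # Win64, NASM syntax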
65
66$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
67
68$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
69( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
70( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
71die "can't locate x86_64-xlate.pl";
72
73if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
74 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
75 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
76}
77
78if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
79 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
80 $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
81 $avx += 1 if ($1==2.11 && $2>=8);
82}
83
84if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
85 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
86 $avx = ($1>=10) + ($1>=11);
87}
88
89if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
90 $avx = ($2>=3.0) + ($2>3.0);
91}
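# Note: the probes above only gauge what the assembler/toolchain can encode;
# $avx acts as a capability level (0 = only the scalar and SSSE3 paths are
# emitted, 1 adds the AVX/XOP path, 2 adds AVX2, 3 adds AVX512F/VL), and the
# corresponding code paths below are simply omitted when it is too low.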
92
93open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
94 or die "can't call $xlate: $!";
95*STDOUT=*OUT;
96
97# input parameter block
98($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
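# For reference, these registers carry the arguments of the C prototype this
# module is built against (assumed from OpenSSL's ChaCha glue, roughly):
#
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);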
99
100$code.=<<___;
101.text
102
103.extern OPENSSL_ia32cap_P
104
105.align 64
106.Lzero:
107.long 0,0,0,0
108.Lone:
109.long 1,0,0,0
110.Linc:
111.long 0,1,2,3
112.Lfour:
113.long 4,4,4,4
114.Lincy:
115.long 0,2,4,6,1,3,5,7
116.Leight:
117.long 8,8,8,8,8,8,8,8
118.Lrot16:
119.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
120.Lrot24:
121.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
122.Ltwoy:
123.long 2,0,0,0, 2,0,0,0
124.align 64
125.Lzeroz:
126.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
127.Lfourz:
128.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
129.Lincz:
130.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
131.Lsixteen:
132.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
133.Lsigma:
134.asciz "expand 32-byte k"
135.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
136___
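# Descriptive note on the constants above: .Lsigma is the standard
# "expand 32-byte k" input-block constant; .Lrot16 and .Lrot24 are pshufb
# byte-shuffle masks that rotate every 32-bit lane left by 16 and 8 bits
# respectively (the latter being a right rotation by 24, hence the name);
# the .Lone/.Lfour/.Leight/.Linc*/.Lsixteen vectors initialise and advance
# the per-lane block counters of the 1x/4x/8x/16x code paths.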
137
138sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
139{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
140 my $arg = pop;
141 $arg = "\$$arg" if ($arg*1 eq $arg);
142 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
143}
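# A worked example of the AUTOLOAD thunk above (illustrative only): a call
# such as &add(@x[0],@x[4]) has no Perl sub of that name, so AUTOLOAD picks
# it up, pops the source operand and appends "\tadd\t%r8d,%eax\n" to $code,
# i.e. AT&T "op src,dst" order; a bare numeric argument as in &rol(@x[3],16)
# gets a '$' prefix and becomes "\trol\t\$16,%edx".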
144
145@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
146 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
147@t=("%esi","%edi");
148
149sub ROUND { # critical path is 24 cycles per round
150my ($a0,$b0,$c0,$d0)=@_;
151my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
152my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
153my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
154my ($xc,$xc_)=map("\"$_\"",@t);
155my @x=map("\"$_\"",@x);
156
157 # Consider order in which variables are addressed by their
158 # index:
159 #
160 # a b c d
161 #
162 # 0 4 8 12 < even round
163 # 1 5 9 13
164 # 2 6 10 14
165 # 3 7 11 15
166 # 0 5 10 15 < odd round
167 # 1 6 11 12
168 # 2 7 8 13
169 # 3 4 9 14
170 #
171 # 'a', 'b' and 'd's are permanently allocated in registers,
172 # @x[0..7,12..15], while 'c's are maintained in memory. If
173 # you observe 'c' column, you'll notice that pair of 'c's is
174 # invariant between rounds. This means that we have to reload
175 # them once per round, in the middle. This is why you'll see
176 # bunch of 'c' stores and loads in the middle, but none in
177 # the beginning or end.
178
179 # Normally instructions would be interleaved to favour in-order
180 # execution. Generally out-of-order cores manage it gracefully,
181 # but not this time for some reason. As in-order execution
182 # cores are dying breed, old Atom is the only one around,
183 # instructions are left uninterleaved. Besides, Atom is better
184 # off executing 1xSSSE3 code anyway...
185
186 (
187 "&add (@x[$a0],@x[$b0])", # Q1
188 "&xor (@x[$d0],@x[$a0])",
189 "&rol (@x[$d0],16)",
190 "&add (@x[$a1],@x[$b1])", # Q2
191 "&xor (@x[$d1],@x[$a1])",
192 "&rol (@x[$d1],16)",
193
194 "&add ($xc,@x[$d0])",
195 "&xor (@x[$b0],$xc)",
196 "&rol (@x[$b0],12)",
197 "&add ($xc_,@x[$d1])",
198 "&xor (@x[$b1],$xc_)",
199 "&rol (@x[$b1],12)",
200
201 "&add (@x[$a0],@x[$b0])",
202 "&xor (@x[$d0],@x[$a0])",
203 "&rol (@x[$d0],8)",
204 "&add (@x[$a1],@x[$b1])",
205 "&xor (@x[$d1],@x[$a1])",
206 "&rol (@x[$d1],8)",
207
208 "&add ($xc,@x[$d0])",
209 "&xor (@x[$b0],$xc)",
210 "&rol (@x[$b0],7)",
211 "&add ($xc_,@x[$d1])",
212 "&xor (@x[$b1],$xc_)",
213 "&rol (@x[$b1],7)",
214
215 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
216 "&mov (\"4*$c1(%rsp)\",$xc_)",
217 "&mov ($xc,\"4*$c2(%rsp)\")",
218 "&mov ($xc_,\"4*$c3(%rsp)\")",
219
220 "&add (@x[$a2],@x[$b2])", # Q3
221 "&xor (@x[$d2],@x[$a2])",
222 "&rol (@x[$d2],16)",
223 "&add (@x[$a3],@x[$b3])", # Q4
224 "&xor (@x[$d3],@x[$a3])",
225 "&rol (@x[$d3],16)",
226
227 "&add ($xc,@x[$d2])",
228 "&xor (@x[$b2],$xc)",
229 "&rol (@x[$b2],12)",
230 "&add ($xc_,@x[$d3])",
231 "&xor (@x[$b3],$xc_)",
232 "&rol (@x[$b3],12)",
233
234 "&add (@x[$a2],@x[$b2])",
235 "&xor (@x[$d2],@x[$a2])",
236 "&rol (@x[$d2],8)",
237 "&add (@x[$a3],@x[$b3])",
238 "&xor (@x[$d3],@x[$a3])",
239 "&rol (@x[$d3],8)",
240
241 "&add ($xc,@x[$d2])",
242 "&xor (@x[$b2],$xc)",
243 "&rol (@x[$b2],7)",
244 "&add ($xc_,@x[$d3])",
245 "&xor (@x[$b3],$xc_)",
246 "&rol (@x[$b3],7)"
247 );
248}
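# For reference, each add/xor/rol group emitted above is one instance of the
# standard ChaCha quarter-round (a sketch, not generated code):
#
#   a += b; d ^= a; d <<<= 16;
#   c += d; b ^= c; b <<<= 12;
#   a += b; d ^= a; d <<<=  8;
#   c += d; b ^= c; b <<<=  7;
#
# with the 'c' column handled through the memory-resident pairs described in
# the comment block inside ROUND.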
249
250########################################################################
251# Generic code path that handles all lengths on pre-SSSE3 processors.
252$code.=<<___;
253.globl ChaCha20_ctr32
254.type ChaCha20_ctr32,\@function,5
255.align 64
256ChaCha20_ctr32:
257.cfi_startproc
258 cmp \$0,$len
259 je .Lno_data
260 mov OPENSSL_ia32cap_P+4(%rip),%r10
261___
262$code.=<<___ if ($avx>2);
263 bt \$48,%r10 # check for AVX512F
264 jc .LChaCha20_avx512
265 test %r10,%r10 # check for AVX512VL
266 js .LChaCha20_avx512vl
267___
268$code.=<<___;
269 test \$`1<<(41-32)`,%r10d
270 jnz .LChaCha20_ssse3
271
272 push %rbx
273.cfi_push %rbx
274 push %rbp
275.cfi_push %rbp
276 push %r12
277.cfi_push %r12
278 push %r13
279.cfi_push %r13
280 push %r14
281.cfi_push %r14
282 push %r15
283.cfi_push %r15
284 sub \$64+24,%rsp
285.cfi_adjust_cfa_offset 64+24
286.Lctr32_body:
287
288 #movdqa .Lsigma(%rip),%xmm0
289 movdqu ($key),%xmm1
290 movdqu 16($key),%xmm2
291 movdqu ($counter),%xmm3
292 movdqa .Lone(%rip),%xmm4
293
294 #movdqa %xmm0,4*0(%rsp) # key[0]
295 movdqa %xmm1,4*4(%rsp) # key[1]
296 movdqa %xmm2,4*8(%rsp) # key[2]
297 movdqa %xmm3,4*12(%rsp) # key[3]
298 mov $len,%rbp # reassign $len
299 jmp .Loop_outer
300
301.align 32
302.Loop_outer:
303 mov \$0x61707865,@x[0] # 'expa'
304 mov \$0x3320646e,@x[1] # 'nd 3'
305 mov \$0x79622d32,@x[2] # '2-by'
306 mov \$0x6b206574,@x[3] # 'te k'
307 mov 4*4(%rsp),@x[4]
308 mov 4*5(%rsp),@x[5]
309 mov 4*6(%rsp),@x[6]
310 mov 4*7(%rsp),@x[7]
311 movd %xmm3,@x[12]
312 mov 4*13(%rsp),@x[13]
313 mov 4*14(%rsp),@x[14]
314 mov 4*15(%rsp),@x[15]
315
316 mov %rbp,64+0(%rsp) # save len
317 mov \$10,%ebp
318 mov $inp,64+8(%rsp) # save inp
319 movq %xmm2,%rsi # "@x[8]"
320 mov $out,64+16(%rsp) # save out
321 mov %rsi,%rdi
322 shr \$32,%rdi # "@x[9]"
323 jmp .Loop
324
325.align 32
326.Loop:
327___
328 foreach (&ROUND (0, 4, 8,12)) { eval; }
329 foreach (&ROUND (0, 5,10,15)) { eval; }
330 &dec ("%ebp");
331 &jnz (".Loop");
332
333$code.=<<___;
334 mov @t[1],4*9(%rsp) # modulo-scheduled
335 mov @t[0],4*8(%rsp)
336 mov 64(%rsp),%rbp # load len
337 movdqa %xmm2,%xmm1
338 mov 64+8(%rsp),$inp # load inp
339 paddd %xmm4,%xmm3 # increment counter
340 mov 64+16(%rsp),$out # load out
341
342 add \$0x61707865,@x[0] # 'expa'
343 add \$0x3320646e,@x[1] # 'nd 3'
344 add \$0x79622d32,@x[2] # '2-by'
345 add \$0x6b206574,@x[3] # 'te k'
346 add 4*4(%rsp),@x[4]
347 add 4*5(%rsp),@x[5]
348 add 4*6(%rsp),@x[6]
349 add 4*7(%rsp),@x[7]
350 add 4*12(%rsp),@x[12]
351 add 4*13(%rsp),@x[13]
352 add 4*14(%rsp),@x[14]
353 add 4*15(%rsp),@x[15]
354 paddd 4*8(%rsp),%xmm1
355
356 cmp \$64,%rbp
357 jb .Ltail
358
359 xor 4*0($inp),@x[0] # xor with input
360 xor 4*1($inp),@x[1]
361 xor 4*2($inp),@x[2]
362 xor 4*3($inp),@x[3]
363 xor 4*4($inp),@x[4]
364 xor 4*5($inp),@x[5]
365 xor 4*6($inp),@x[6]
366 xor 4*7($inp),@x[7]
367 movdqu 4*8($inp),%xmm0
368 xor 4*12($inp),@x[12]
369 xor 4*13($inp),@x[13]
370 xor 4*14($inp),@x[14]
371 xor 4*15($inp),@x[15]
372 lea 4*16($inp),$inp # inp+=64
373 pxor %xmm1,%xmm0
374
375 movdqa %xmm2,4*8(%rsp)
376 movd %xmm3,4*12(%rsp)
377
378 mov @x[0],4*0($out) # write output
379 mov @x[1],4*1($out)
380 mov @x[2],4*2($out)
381 mov @x[3],4*3($out)
382 mov @x[4],4*4($out)
383 mov @x[5],4*5($out)
384 mov @x[6],4*6($out)
385 mov @x[7],4*7($out)
386 movdqu %xmm0,4*8($out)
387 mov @x[12],4*12($out)
388 mov @x[13],4*13($out)
389 mov @x[14],4*14($out)
390 mov @x[15],4*15($out)
391 lea 4*16($out),$out # out+=64
392
393 sub \$64,%rbp
394 jnz .Loop_outer
395
396 jmp .Ldone
397
398.align 16
399.Ltail:
400 mov @x[0],4*0(%rsp)
401 mov @x[1],4*1(%rsp)
402 xor %rbx,%rbx
403 mov @x[2],4*2(%rsp)
404 mov @x[3],4*3(%rsp)
405 mov @x[4],4*4(%rsp)
406 mov @x[5],4*5(%rsp)
407 mov @x[6],4*6(%rsp)
408 mov @x[7],4*7(%rsp)
409 movdqa %xmm1,4*8(%rsp)
410 mov @x[12],4*12(%rsp)
411 mov @x[13],4*13(%rsp)
412 mov @x[14],4*14(%rsp)
413 mov @x[15],4*15(%rsp)
414
415.Loop_tail:
416 movzb ($inp,%rbx),%eax
417 movzb (%rsp,%rbx),%edx
418 lea 1(%rbx),%rbx
419 xor %edx,%eax
420 mov %al,-1($out,%rbx)
421 dec %rbp
422 jnz .Loop_tail
423
424.Ldone:
425 lea 64+24+48(%rsp),%rsi
426.cfi_def_cfa %rsi,8
427 mov -48(%rsi),%r15
428.cfi_restore %r15
429 mov -40(%rsi),%r14
430.cfi_restore %r14
431 mov -32(%rsi),%r13
432.cfi_restore %r13
433 mov -24(%rsi),%r12
434.cfi_restore %r12
435 mov -16(%rsi),%rbp
436.cfi_restore %rbp
437 mov -8(%rsi),%rbx
438.cfi_restore %rbx
439 lea (%rsi),%rsp
440.cfi_def_cfa_register %rsp
441.Lno_data:
442 ret
443.cfi_endproc
444.size ChaCha20_ctr32,.-ChaCha20_ctr32
445___
446
447########################################################################
448# SSSE3 code path that handles shorter lengths
449{
450my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
451
452sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
453 &paddd ($a,$b);
454 &pxor ($d,$a);
455 &pshufb ($d,$rot16);
456
457 &paddd ($c,$d);
458 &pxor ($b,$c);
459 &movdqa ($t,$b);
460 &psrld ($b,20);
461 &pslld ($t,12);
462 &por ($b,$t);
463
464 &paddd ($a,$b);
465 &pxor ($d,$a);
466 &pshufb ($d,$rot24);
467
468 &paddd ($c,$d);
469 &pxor ($b,$c);
470 &movdqa ($t,$b);
471 &psrld ($b,25);
472 &pslld ($t,7);
473 &por ($b,$t);
474}
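# SSSE3ROUND is the same quarter-round applied to a whole block at once:
# $a/$b/$c/$d each hold four 32-bit state words per xmm register, pshufb
# against .Lrot16/.Lrot24 performs the 16- and 8-bit rotations, and the 12-
# and 7-bit rotations fall back to psrld/pslld/por because SSSE3 has no
# vector rotate instruction.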
475
476my $xframe = $win64 ? 160+8 : 8;
477
478$code.=<<___;
479.type ChaCha20_ssse3,\@function,5
480.align 32
481ChaCha20_ssse3:
482.cfi_startproc
483.LChaCha20_ssse3:
484 mov %rsp,%r9 # frame pointer
485.cfi_def_cfa_register %r9
486___
487$code.=<<___ if ($avx);
488 test \$`1<<(43-32)`,%r10d
489 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4
490___
491$code.=<<___;
492 cmp \$128,$len # we might throw away some data,
493 je .LChaCha20_128
494 ja .LChaCha20_4x # but overall it won't be slower
495
496.Ldo_sse3_after_all:
497 sub \$64+$xframe,%rsp
498___
499$code.=<<___ if ($win64);
500 movaps %xmm6,-0x28(%r9)
501 movaps %xmm7,-0x18(%r9)
502.Lssse3_body:
503___
504$code.=<<___;
505 movdqa .Lsigma(%rip),$a
506 movdqu ($key),$b
507 movdqu 16($key),$c
508 movdqu ($counter),$d
509 movdqa .Lrot16(%rip),$rot16
510 movdqa .Lrot24(%rip),$rot24
511
512 movdqa $a,0x00(%rsp)
513 movdqa $b,0x10(%rsp)
514 movdqa $c,0x20(%rsp)
515 movdqa $d,0x30(%rsp)
516 mov \$10,$counter # reuse $counter
517 jmp .Loop_ssse3
518
519.align 32
520.Loop_outer_ssse3:
521 movdqa .Lone(%rip),$d
522 movdqa 0x00(%rsp),$a
523 movdqa 0x10(%rsp),$b
524 movdqa 0x20(%rsp),$c
525 paddd 0x30(%rsp),$d
526 mov \$10,$counter
527 movdqa $d,0x30(%rsp)
528 jmp .Loop_ssse3
529
530.align 32
531.Loop_ssse3:
532___
533 &SSSE3ROUND();
534 &pshufd ($c,$c,0b01001110);
535 &pshufd ($b,$b,0b00111001);
536 &pshufd ($d,$d,0b10010011);
537 &nop ();
538
539 &SSSE3ROUND();
540 &pshufd ($c,$c,0b01001110);
541 &pshufd ($b,$b,0b10010011);
542 &pshufd ($d,$d,0b00111001);
543
544 &dec ($counter);
545 &jnz (".Loop_ssse3");
546
547$code.=<<___;
548 paddd 0x00(%rsp),$a
549 paddd 0x10(%rsp),$b
550 paddd 0x20(%rsp),$c
551 paddd 0x30(%rsp),$d
552
553 cmp \$64,$len
554 jb .Ltail_ssse3
555
556 movdqu 0x00($inp),$t
557 movdqu 0x10($inp),$t1
558 pxor $t,$a # xor with input
559 movdqu 0x20($inp),$t
560 pxor $t1,$b
561 movdqu 0x30($inp),$t1
562 lea 0x40($inp),$inp # inp+=64
563 pxor $t,$c
564 pxor $t1,$d
565
566 movdqu $a,0x00($out) # write output
567 movdqu $b,0x10($out)
568 movdqu $c,0x20($out)
569 movdqu $d,0x30($out)
570 lea 0x40($out),$out # out+=64
571
572 sub \$64,$len
573 jnz .Loop_outer_ssse3
574
575 jmp .Ldone_ssse3
576
577.align 16
578.Ltail_ssse3:
579 movdqa $a,0x00(%rsp)
580 movdqa $b,0x10(%rsp)
581 movdqa $c,0x20(%rsp)
582 movdqa $d,0x30(%rsp)
583 xor $counter,$counter
584
585.Loop_tail_ssse3:
586 movzb ($inp,$counter),%eax
587 movzb (%rsp,$counter),%ecx
588 lea 1($counter),$counter
589 xor %ecx,%eax
590 mov %al,-1($out,$counter)
591 dec $len
592 jnz .Loop_tail_ssse3
593
594.Ldone_ssse3:
595___
596$code.=<<___ if ($win64);
597 movaps -0x28(%r9),%xmm6
598 movaps -0x18(%r9),%xmm7
599___
600$code.=<<___;
601 lea (%r9),%rsp
602.cfi_def_cfa_register %rsp
603.Lssse3_epilogue:
604 ret
605.cfi_endproc
606.size ChaCha20_ssse3,.-ChaCha20_ssse3
607___
608}
609
610########################################################################
611# SSSE3 code path that handles 128-byte inputs
612{
613my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
614my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
615
616sub SSSE3ROUND_2x {
617 &paddd ($a,$b);
618 &pxor ($d,$a);
619 &paddd ($a1,$b1);
620 &pxor ($d1,$a1);
621 &pshufb ($d,$rot16);
622 &pshufb($d1,$rot16);
623
624 &paddd ($c,$d);
625 &paddd ($c1,$d1);
626 &pxor ($b,$c);
627 &pxor ($b1,$c1);
628 &movdqa ($t,$b);
629 &psrld ($b,20);
630 &movdqa($t1,$b1);
631 &pslld ($t,12);
632 &psrld ($b1,20);
633 &por ($b,$t);
634 &pslld ($t1,12);
635 &por ($b1,$t1);
636
637 &paddd ($a,$b);
638 &pxor ($d,$a);
639 &paddd ($a1,$b1);
640 &pxor ($d1,$a1);
641 &pshufb ($d,$rot24);
642 &pshufb($d1,$rot24);
643
644 &paddd ($c,$d);
645 &paddd ($c1,$d1);
646 &pxor ($b,$c);
647 &pxor ($b1,$c1);
648 &movdqa ($t,$b);
649 &psrld ($b,25);
650 &movdqa($t1,$b1);
651 &pslld ($t,7);
652 &psrld ($b1,25);
653 &por ($b,$t);
654 &pslld ($t1,7);
655 &por ($b1,$t1);
656}
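# SSSE3ROUND_2x interleaves the quarter-rounds of two independent 64-byte
# blocks ($a..$d and $a1..$d1) to hide instruction latency; this is the
# dedicated path for exactly 128-byte inputs referred to in note (ii) of
# the performance table at the top of the file.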
657
658my $xframe = $win64 ? 0x68 : 8;
659
660$code.=<<___;
661.type ChaCha20_128,\@function,5
662.align 32
663ChaCha20_128:
664.cfi_startproc
665.LChaCha20_128:
666 mov %rsp,%r9 # frame pointer
667.cfi_def_cfa_register %r9
668 sub \$64+$xframe,%rsp
669___
670$code.=<<___ if ($win64);
671 movaps %xmm6,-0x68(%r9)
672 movaps %xmm7,-0x58(%r9)
673 movaps %xmm8,-0x48(%r9)
674 movaps %xmm9,-0x38(%r9)
675 movaps %xmm10,-0x28(%r9)
676 movaps %xmm11,-0x18(%r9)
677.L128_body:
678___
679$code.=<<___;
680 movdqa .Lsigma(%rip),$a
681 movdqu ($key),$b
682 movdqu 16($key),$c
683 movdqu ($counter),$d
684 movdqa .Lone(%rip),$d1
685 movdqa .Lrot16(%rip),$rot16
686 movdqa .Lrot24(%rip),$rot24
687
688 movdqa $a,$a1
689 movdqa $a,0x00(%rsp)
690 movdqa $b,$b1
691 movdqa $b,0x10(%rsp)
692 movdqa $c,$c1
693 movdqa $c,0x20(%rsp)
694 paddd $d,$d1
695 movdqa $d,0x30(%rsp)
696 mov \$10,$counter # reuse $counter
697 jmp .Loop_128
698
699.align 32
700.Loop_128:
701___
702 &SSSE3ROUND_2x();
703 &pshufd ($c,$c,0b01001110);
704 &pshufd ($b,$b,0b00111001);
705 &pshufd ($d,$d,0b10010011);
706 &pshufd ($c1,$c1,0b01001110);
707 &pshufd ($b1,$b1,0b00111001);
708 &pshufd ($d1,$d1,0b10010011);
709
710 &SSSE3ROUND_2x();
711 &pshufd ($c,$c,0b01001110);
712 &pshufd ($b,$b,0b10010011);
713 &pshufd ($d,$d,0b00111001);
714 &pshufd ($c1,$c1,0b01001110);
715 &pshufd ($b1,$b1,0b10010011);
716 &pshufd ($d1,$d1,0b00111001);
717
718 &dec ($counter);
719 &jnz (".Loop_128");
720
721$code.=<<___;
722 paddd 0x00(%rsp),$a
723 paddd 0x10(%rsp),$b
724 paddd 0x20(%rsp),$c
725 paddd 0x30(%rsp),$d
726 paddd .Lone(%rip),$d1
727 paddd 0x00(%rsp),$a1
728 paddd 0x10(%rsp),$b1
729 paddd 0x20(%rsp),$c1
730 paddd 0x30(%rsp),$d1
731
732 movdqu 0x00($inp),$t
733 movdqu 0x10($inp),$t1
734 pxor $t,$a # xor with input
735 movdqu 0x20($inp),$t
736 pxor $t1,$b
737 movdqu 0x30($inp),$t1
738 pxor $t,$c
739 movdqu 0x40($inp),$t
740 pxor $t1,$d
741 movdqu 0x50($inp),$t1
742 pxor $t,$a1
743 movdqu 0x60($inp),$t
744 pxor $t1,$b1
745 movdqu 0x70($inp),$t1
746 pxor $t,$c1
747 pxor $t1,$d1
748
749 movdqu $a,0x00($out) # write output
750 movdqu $b,0x10($out)
751 movdqu $c,0x20($out)
752 movdqu $d,0x30($out)
753 movdqu $a1,0x40($out)
754 movdqu $b1,0x50($out)
755 movdqu $c1,0x60($out)
756 movdqu $d1,0x70($out)
757___
758$code.=<<___ if ($win64);
759 movaps -0x68(%r9),%xmm6
760 movaps -0x58(%r9),%xmm7
761 movaps -0x48(%r9),%xmm8
762 movaps -0x38(%r9),%xmm9
763 movaps -0x28(%r9),%xmm10
764 movaps -0x18(%r9),%xmm11
765___
766$code.=<<___;
767 lea (%r9),%rsp
768.cfi_def_cfa_register %rsp
769.L128_epilogue:
770 ret
771.cfi_endproc
772.size ChaCha20_128,.-ChaCha20_128
773___
774}
775
776########################################################################
777# SSSE3 code path that handles longer messages.
778{
779# assign variables to favor Atom front-end
780my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
781 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
782my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
783 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
784
785sub SSSE3_lane_ROUND {
786my ($a0,$b0,$c0,$d0)=@_;
787my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
788my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
789my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
790my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
791my @x=map("\"$_\"",@xx);
792
793 # Consider order in which variables are addressed by their
794 # index:
795 #
796 # a b c d
797 #
798 # 0 4 8 12 < even round
799 # 1 5 9 13
800 # 2 6 10 14
801 # 3 7 11 15
802 # 0 5 10 15 < odd round
803 # 1 6 11 12
804 # 2 7 8 13
805 # 3 4 9 14
806 #
807 # 'a', 'b' and 'd's are permanently allocated in registers,
808 # @x[0..7,12..15], while 'c's are maintained in memory. If
809 # you observe 'c' column, you'll notice that pair of 'c's is
810 # invariant between rounds. This means that we have to reload
811 # them once per round, in the middle. This is why you'll see
812 # bunch of 'c' stores and loads in the middle, but none in
813 # the beginning or end.
814
815 (
816 "&paddd (@x[$a0],@x[$b0])", # Q1
817 "&paddd (@x[$a1],@x[$b1])", # Q2
818 "&pxor (@x[$d0],@x[$a0])",
819 "&pxor (@x[$d1],@x[$a1])",
820 "&pshufb (@x[$d0],$t1)",
821 "&pshufb (@x[$d1],$t1)",
822
823 "&paddd ($xc,@x[$d0])",
824 "&paddd ($xc_,@x[$d1])",
825 "&pxor (@x[$b0],$xc)",
826 "&pxor (@x[$b1],$xc_)",
827 "&movdqa ($t0,@x[$b0])",
828 "&pslld (@x[$b0],12)",
829 "&psrld ($t0,20)",
830 "&movdqa ($t1,@x[$b1])",
831 "&pslld (@x[$b1],12)",
832 "&por (@x[$b0],$t0)",
833 "&psrld ($t1,20)",
834 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
835 "&por (@x[$b1],$t1)",
836
837 "&paddd (@x[$a0],@x[$b0])",
838 "&paddd (@x[$a1],@x[$b1])",
839 "&pxor (@x[$d0],@x[$a0])",
840 "&pxor (@x[$d1],@x[$a1])",
841 "&pshufb (@x[$d0],$t0)",
842 "&pshufb (@x[$d1],$t0)",
843
844 "&paddd ($xc,@x[$d0])",
845 "&paddd ($xc_,@x[$d1])",
846 "&pxor (@x[$b0],$xc)",
847 "&pxor (@x[$b1],$xc_)",
848 "&movdqa ($t1,@x[$b0])",
849 "&pslld (@x[$b0],7)",
850 "&psrld ($t1,25)",
851 "&movdqa ($t0,@x[$b1])",
852 "&pslld (@x[$b1],7)",
853 "&por (@x[$b0],$t1)",
854 "&psrld ($t0,25)",
855 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
856 "&por (@x[$b1],$t0)",
857
858 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
859 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
860 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
861 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
862
863 "&paddd (@x[$a2],@x[$b2])", # Q3
864 "&paddd (@x[$a3],@x[$b3])", # Q4
865 "&pxor (@x[$d2],@x[$a2])",
866 "&pxor (@x[$d3],@x[$a3])",
867 "&pshufb (@x[$d2],$t1)",
868 "&pshufb (@x[$d3],$t1)",
869
870 "&paddd ($xc,@x[$d2])",
871 "&paddd ($xc_,@x[$d3])",
872 "&pxor (@x[$b2],$xc)",
873 "&pxor (@x[$b3],$xc_)",
874 "&movdqa ($t0,@x[$b2])",
875 "&pslld (@x[$b2],12)",
876 "&psrld ($t0,20)",
877 "&movdqa ($t1,@x[$b3])",
878 "&pslld (@x[$b3],12)",
879 "&por (@x[$b2],$t0)",
880 "&psrld ($t1,20)",
881 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
882 "&por (@x[$b3],$t1)",
883
884 "&paddd (@x[$a2],@x[$b2])",
885 "&paddd (@x[$a3],@x[$b3])",
886 "&pxor (@x[$d2],@x[$a2])",
887 "&pxor (@x[$d3],@x[$a3])",
888 "&pshufb (@x[$d2],$t0)",
889 "&pshufb (@x[$d3],$t0)",
890
891 "&paddd ($xc,@x[$d2])",
892 "&paddd ($xc_,@x[$d3])",
893 "&pxor (@x[$b2],$xc)",
894 "&pxor (@x[$b3],$xc_)",
895 "&movdqa ($t1,@x[$b2])",
896 "&pslld (@x[$b2],7)",
897 "&psrld ($t1,25)",
898 "&movdqa ($t0,@x[$b3])",
899 "&pslld (@x[$b3],7)",
900 "&por (@x[$b2],$t1)",
901 "&psrld ($t0,25)",
902 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
903 "&por (@x[$b3],$t0)"
904 );
905}
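# SSSE3_lane_ROUND processes four independent blocks in parallel: each xmm
# register holds the same state word from four different blocks (hence the
# "smash key by lanes" broadcasts further down), and %r10/%r11 are expected
# to point at .Lrot16/.Lrot24 so the shuffle masks can be reloaded from
# memory instead of occupying two of the sixteen xmm registers.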
906
907my $xframe = $win64 ? 0xa8 : 8;
908
909$code.=<<___;
910.type ChaCha20_4x,\@function,5
911.align 32
912ChaCha20_4x:
913.cfi_startproc
914.LChaCha20_4x:
915 mov %rsp,%r9 # frame pointer
916.cfi_def_cfa_register %r9
917 mov %r10,%r11
918___
919$code.=<<___ if ($avx>1);
920 shr \$32,%r10 # OPENSSL_ia32cap_P+8
921 test \$`1<<5`,%r10 # test AVX2
922 jnz .LChaCha20_8x
923___
924$code.=<<___;
925 cmp \$192,$len
926 ja .Lproceed4x
927
928 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
929 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
930 je .Ldo_sse3_after_all # to detect Atom
931
932.Lproceed4x:
933 sub \$0x140+$xframe,%rsp
934___
935 ################ stack layout
936 # +0x00 SIMD equivalent of @x[8-12]
937 # ...
938 # +0x40 constant copy of key[0-2] smashed by lanes
939 # ...
940 # +0x100 SIMD counters (with nonce smashed by lanes)
941 # ...
942 # +0x140
943$code.=<<___ if ($win64);
944 movaps %xmm6,-0xa8(%r9)
945 movaps %xmm7,-0x98(%r9)
946 movaps %xmm8,-0x88(%r9)
947 movaps %xmm9,-0x78(%r9)
948 movaps %xmm10,-0x68(%r9)
949 movaps %xmm11,-0x58(%r9)
950 movaps %xmm12,-0x48(%r9)
951 movaps %xmm13,-0x38(%r9)
952 movaps %xmm14,-0x28(%r9)
953 movaps %xmm15,-0x18(%r9)
954.L4x_body:
955___
956$code.=<<___;
957 movdqa .Lsigma(%rip),$xa3 # key[0]
958 movdqu ($key),$xb3 # key[1]
959 movdqu 16($key),$xt3 # key[2]
960 movdqu ($counter),$xd3 # key[3]
961 lea 0x100(%rsp),%rcx # size optimization
962 lea .Lrot16(%rip),%r10
963 lea .Lrot24(%rip),%r11
964
965 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
966 pshufd \$0x55,$xa3,$xa1
967 movdqa $xa0,0x40(%rsp) # ... and offload
968 pshufd \$0xaa,$xa3,$xa2
969 movdqa $xa1,0x50(%rsp)
970 pshufd \$0xff,$xa3,$xa3
971 movdqa $xa2,0x60(%rsp)
972 movdqa $xa3,0x70(%rsp)
973
974 pshufd \$0x00,$xb3,$xb0
975 pshufd \$0x55,$xb3,$xb1
976 movdqa $xb0,0x80-0x100(%rcx)
977 pshufd \$0xaa,$xb3,$xb2
978 movdqa $xb1,0x90-0x100(%rcx)
979 pshufd \$0xff,$xb3,$xb3
980 movdqa $xb2,0xa0-0x100(%rcx)
981 movdqa $xb3,0xb0-0x100(%rcx)
982
983 pshufd \$0x00,$xt3,$xt0 # "$xc0"
984 pshufd \$0x55,$xt3,$xt1 # "$xc1"
985 movdqa $xt0,0xc0-0x100(%rcx)
986 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
987 movdqa $xt1,0xd0-0x100(%rcx)
988 pshufd \$0xff,$xt3,$xt3 # "$xc3"
989 movdqa $xt2,0xe0-0x100(%rcx)
990 movdqa $xt3,0xf0-0x100(%rcx)
991
992 pshufd \$0x00,$xd3,$xd0
993 pshufd \$0x55,$xd3,$xd1
994 paddd .Linc(%rip),$xd0 # don't save counters yet
995 pshufd \$0xaa,$xd3,$xd2
996 movdqa $xd1,0x110-0x100(%rcx)
997 pshufd \$0xff,$xd3,$xd3
998 movdqa $xd2,0x120-0x100(%rcx)
999 movdqa $xd3,0x130-0x100(%rcx)
1000
1001 jmp .Loop_enter4x
1002
1003.align 32
1004.Loop_outer4x:
1005 movdqa 0x40(%rsp),$xa0 # re-load smashed key
1006 movdqa 0x50(%rsp),$xa1
1007 movdqa 0x60(%rsp),$xa2
1008 movdqa 0x70(%rsp),$xa3
1009 movdqa 0x80-0x100(%rcx),$xb0
1010 movdqa 0x90-0x100(%rcx),$xb1
1011 movdqa 0xa0-0x100(%rcx),$xb2
1012 movdqa 0xb0-0x100(%rcx),$xb3
1013 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1014 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1015 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1016 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1017 movdqa 0x100-0x100(%rcx),$xd0
1018 movdqa 0x110-0x100(%rcx),$xd1
1019 movdqa 0x120-0x100(%rcx),$xd2
1020 movdqa 0x130-0x100(%rcx),$xd3
1021 paddd .Lfour(%rip),$xd0 # next SIMD counters
1022
1023.Loop_enter4x:
1024 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
1025 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
1026 movdqa (%r10),$xt3 # .Lrot16(%rip)
1027 mov \$10,%eax
1028 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1029 jmp .Loop4x
1030
1031.align 32
1032.Loop4x:
1033___
1034 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
1035 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
1036$code.=<<___;
1037 dec %eax
1038 jnz .Loop4x
1039
1040 paddd 0x40(%rsp),$xa0 # accumulate key material
1041 paddd 0x50(%rsp),$xa1
1042 paddd 0x60(%rsp),$xa2
1043 paddd 0x70(%rsp),$xa3
1044
1045 movdqa $xa0,$xt2 # "de-interlace" data
1046 punpckldq $xa1,$xa0
1047 movdqa $xa2,$xt3
1048 punpckldq $xa3,$xa2
1049 punpckhdq $xa1,$xt2
1050 punpckhdq $xa3,$xt3
1051 movdqa $xa0,$xa1
1052 punpcklqdq $xa2,$xa0 # "a0"
1053 movdqa $xt2,$xa3
1054 punpcklqdq $xt3,$xt2 # "a2"
1055 punpckhqdq $xa2,$xa1 # "a1"
1056 punpckhqdq $xt3,$xa3 # "a3"
1057___
1058 ($xa2,$xt2)=($xt2,$xa2);
1059$code.=<<___;
1060 paddd 0x80-0x100(%rcx),$xb0
1061 paddd 0x90-0x100(%rcx),$xb1
1062 paddd 0xa0-0x100(%rcx),$xb2
1063 paddd 0xb0-0x100(%rcx),$xb3
1064
1065 movdqa $xa0,0x00(%rsp) # offload $xaN
1066 movdqa $xa1,0x10(%rsp)
1067 movdqa 0x20(%rsp),$xa0 # "xc2"
1068 movdqa 0x30(%rsp),$xa1 # "xc3"
1069
1070 movdqa $xb0,$xt2
1071 punpckldq $xb1,$xb0
1072 movdqa $xb2,$xt3
1073 punpckldq $xb3,$xb2
1074 punpckhdq $xb1,$xt2
1075 punpckhdq $xb3,$xt3
1076 movdqa $xb0,$xb1
1077 punpcklqdq $xb2,$xb0 # "b0"
1078 movdqa $xt2,$xb3
1079 punpcklqdq $xt3,$xt2 # "b2"
1080 punpckhqdq $xb2,$xb1 # "b1"
1081 punpckhqdq $xt3,$xb3 # "b3"
1082___
1083 ($xb2,$xt2)=($xt2,$xb2);
1084 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1085$code.=<<___;
1086 paddd 0xc0-0x100(%rcx),$xc0
1087 paddd 0xd0-0x100(%rcx),$xc1
1088 paddd 0xe0-0x100(%rcx),$xc2
1089 paddd 0xf0-0x100(%rcx),$xc3
1090
1091 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
1092 movdqa $xa3,0x30(%rsp)
1093
1094 movdqa $xc0,$xt2
1095 punpckldq $xc1,$xc0
1096 movdqa $xc2,$xt3
1097 punpckldq $xc3,$xc2
1098 punpckhdq $xc1,$xt2
1099 punpckhdq $xc3,$xt3
1100 movdqa $xc0,$xc1
1101 punpcklqdq $xc2,$xc0 # "c0"
1102 movdqa $xt2,$xc3
1103 punpcklqdq $xt3,$xt2 # "c2"
1104 punpckhqdq $xc2,$xc1 # "c1"
1105 punpckhqdq $xt3,$xc3 # "c3"
1106___
1107 ($xc2,$xt2)=($xt2,$xc2);
1108 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
1109$code.=<<___;
1110 paddd 0x100-0x100(%rcx),$xd0
1111 paddd 0x110-0x100(%rcx),$xd1
1112 paddd 0x120-0x100(%rcx),$xd2
1113 paddd 0x130-0x100(%rcx),$xd3
1114
1115 movdqa $xd0,$xt2
1116 punpckldq $xd1,$xd0
1117 movdqa $xd2,$xt3
1118 punpckldq $xd3,$xd2
1119 punpckhdq $xd1,$xt2
1120 punpckhdq $xd3,$xt3
1121 movdqa $xd0,$xd1
1122 punpcklqdq $xd2,$xd0 # "d0"
1123 movdqa $xt2,$xd3
1124 punpcklqdq $xt3,$xt2 # "d2"
1125 punpckhqdq $xd2,$xd1 # "d1"
1126 punpckhqdq $xt3,$xd3 # "d3"
1127___
1128 ($xd2,$xt2)=($xt2,$xd2);
1129$code.=<<___;
1130 cmp \$64*4,$len
1131 jb .Ltail4x
1132
1133 movdqu 0x00($inp),$xt0 # xor with input
1134 movdqu 0x10($inp),$xt1
1135 movdqu 0x20($inp),$xt2
1136 movdqu 0x30($inp),$xt3
1137 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1138 pxor $xb0,$xt1
1139 pxor $xc0,$xt2
1140 pxor $xd0,$xt3
1141
1142 movdqu $xt0,0x00($out)
1143 movdqu 0x40($inp),$xt0
1144 movdqu $xt1,0x10($out)
1145 movdqu 0x50($inp),$xt1
1146 movdqu $xt2,0x20($out)
1147 movdqu 0x60($inp),$xt2
1148 movdqu $xt3,0x30($out)
1149 movdqu 0x70($inp),$xt3
1150 lea 0x80($inp),$inp # size optimization
1151 pxor 0x10(%rsp),$xt0
1152 pxor $xb1,$xt1
1153 pxor $xc1,$xt2
1154 pxor $xd1,$xt3
1155
1156 movdqu $xt0,0x40($out)
1157 movdqu 0x00($inp),$xt0
1158 movdqu $xt1,0x50($out)
1159 movdqu 0x10($inp),$xt1
1160 movdqu $xt2,0x60($out)
1161 movdqu 0x20($inp),$xt2
1162 movdqu $xt3,0x70($out)
1163 lea 0x80($out),$out # size optimization
1164 movdqu 0x30($inp),$xt3
1165 pxor 0x20(%rsp),$xt0
1166 pxor $xb2,$xt1
1167 pxor $xc2,$xt2
1168 pxor $xd2,$xt3
1169
1170 movdqu $xt0,0x00($out)
1171 movdqu 0x40($inp),$xt0
1172 movdqu $xt1,0x10($out)
1173 movdqu 0x50($inp),$xt1
1174 movdqu $xt2,0x20($out)
1175 movdqu 0x60($inp),$xt2
1176 movdqu $xt3,0x30($out)
1177 movdqu 0x70($inp),$xt3
1178 lea 0x80($inp),$inp # inp+=64*4
1179 pxor 0x30(%rsp),$xt0
1180 pxor $xb3,$xt1
1181 pxor $xc3,$xt2
1182 pxor $xd3,$xt3
1183 movdqu $xt0,0x40($out)
1184 movdqu $xt1,0x50($out)
1185 movdqu $xt2,0x60($out)
1186 movdqu $xt3,0x70($out)
1187 lea 0x80($out),$out # out+=64*4
1188
1189 sub \$64*4,$len
1190 jnz .Loop_outer4x
1191
1192 jmp .Ldone4x
1193
1194.Ltail4x:
1195 cmp \$192,$len
1196 jae .L192_or_more4x
1197 cmp \$128,$len
1198 jae .L128_or_more4x
1199 cmp \$64,$len
1200 jae .L64_or_more4x
1201
1202 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1203 xor %r10,%r10
1204 #movdqa $xt0,0x00(%rsp)
1205 movdqa $xb0,0x10(%rsp)
1206 movdqa $xc0,0x20(%rsp)
1207 movdqa $xd0,0x30(%rsp)
1208 jmp .Loop_tail4x
1209
1210.align 32
1211.L64_or_more4x:
1212 movdqu 0x00($inp),$xt0 # xor with input
1213 movdqu 0x10($inp),$xt1
1214 movdqu 0x20($inp),$xt2
1215 movdqu 0x30($inp),$xt3
1216 pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember?
1217 pxor $xb0,$xt1
1218 pxor $xc0,$xt2
1219 pxor $xd0,$xt3
1220 movdqu $xt0,0x00($out)
1221 movdqu $xt1,0x10($out)
1222 movdqu $xt2,0x20($out)
1223 movdqu $xt3,0x30($out)
1224 je .Ldone4x
1225
1226 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1227 lea 0x40($inp),$inp # inp+=64*1
1228 xor %r10,%r10
1229 movdqa $xt0,0x00(%rsp)
1230 movdqa $xb1,0x10(%rsp)
1231 lea 0x40($out),$out # out+=64*1
1232 movdqa $xc1,0x20(%rsp)
1233 sub \$64,$len # len-=64*1
1234 movdqa $xd1,0x30(%rsp)
1235 jmp .Loop_tail4x
1236
1237.align 32
1238.L128_or_more4x:
1239 movdqu 0x00($inp),$xt0 # xor with input
1240 movdqu 0x10($inp),$xt1
1241 movdqu 0x20($inp),$xt2
1242 movdqu 0x30($inp),$xt3
1243 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1244 pxor $xb0,$xt1
1245 pxor $xc0,$xt2
1246 pxor $xd0,$xt3
1247
1248 movdqu $xt0,0x00($out)
1249 movdqu 0x40($inp),$xt0
1250 movdqu $xt1,0x10($out)
1251 movdqu 0x50($inp),$xt1
1252 movdqu $xt2,0x20($out)
1253 movdqu 0x60($inp),$xt2
1254 movdqu $xt3,0x30($out)
1255 movdqu 0x70($inp),$xt3
1256 pxor 0x10(%rsp),$xt0
1257 pxor $xb1,$xt1
1258 pxor $xc1,$xt2
1259 pxor $xd1,$xt3
1260 movdqu $xt0,0x40($out)
1261 movdqu $xt1,0x50($out)
1262 movdqu $xt2,0x60($out)
1263 movdqu $xt3,0x70($out)
1264 je .Ldone4x
1265
1266 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1267 lea 0x80($inp),$inp # inp+=64*2
1268 xor %r10,%r10
1269 movdqa $xt0,0x00(%rsp)
1270 movdqa $xb2,0x10(%rsp)
1271 lea 0x80($out),$out # out+=64*2
1272 movdqa $xc2,0x20(%rsp)
1273 sub \$128,$len # len-=64*2
1274 movdqa $xd2,0x30(%rsp)
1275 jmp .Loop_tail4x
1276
1277.align 32
1278.L192_or_more4x:
1279 movdqu 0x00($inp),$xt0 # xor with input
1280 movdqu 0x10($inp),$xt1
1281 movdqu 0x20($inp),$xt2
1282 movdqu 0x30($inp),$xt3
1283 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1284 pxor $xb0,$xt1
1285 pxor $xc0,$xt2
1286 pxor $xd0,$xt3
1287
1288 movdqu $xt0,0x00($out)
1289 movdqu 0x40($inp),$xt0
1290 movdqu $xt1,0x10($out)
1291 movdqu 0x50($inp),$xt1
1292 movdqu $xt2,0x20($out)
1293 movdqu 0x60($inp),$xt2
1294 movdqu $xt3,0x30($out)
1295 movdqu 0x70($inp),$xt3
1296 lea 0x80($inp),$inp # size optimization
1297 pxor 0x10(%rsp),$xt0
1298 pxor $xb1,$xt1
1299 pxor $xc1,$xt2
1300 pxor $xd1,$xt3
1301
1302 movdqu $xt0,0x40($out)
1303 movdqu 0x00($inp),$xt0
1304 movdqu $xt1,0x50($out)
1305 movdqu 0x10($inp),$xt1
1306 movdqu $xt2,0x60($out)
1307 movdqu 0x20($inp),$xt2
1308 movdqu $xt3,0x70($out)
1309 lea 0x80($out),$out # size optimization
1310 movdqu 0x30($inp),$xt3
1311 pxor 0x20(%rsp),$xt0
1312 pxor $xb2,$xt1
1313 pxor $xc2,$xt2
1314 pxor $xd2,$xt3
1315 movdqu $xt0,0x00($out)
1316 movdqu $xt1,0x10($out)
1317 movdqu $xt2,0x20($out)
1318 movdqu $xt3,0x30($out)
1319 je .Ldone4x
1320
1321 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1322 lea 0x40($inp),$inp # inp+=64*3
1323 xor %r10,%r10
1324 movdqa $xt0,0x00(%rsp)
1325 movdqa $xb3,0x10(%rsp)
1326 lea 0x40($out),$out # out+=64*3
1327 movdqa $xc3,0x20(%rsp)
1328 sub \$192,$len # len-=64*3
1329 movdqa $xd3,0x30(%rsp)
1330
1331.Loop_tail4x:
1332 movzb ($inp,%r10),%eax
1333 movzb (%rsp,%r10),%ecx
1334 lea 1(%r10),%r10
1335 xor %ecx,%eax
1336 mov %al,-1($out,%r10)
1337 dec $len
1338 jnz .Loop_tail4x
1339
1340.Ldone4x:
1341___
1342$code.=<<___ if ($win64);
1343 movaps -0xa8(%r9),%xmm6
1344 movaps -0x98(%r9),%xmm7
1345 movaps -0x88(%r9),%xmm8
1346 movaps -0x78(%r9),%xmm9
1347 movaps -0x68(%r9),%xmm10
1348 movaps -0x58(%r9),%xmm11
1349 movaps -0x48(%r9),%xmm12
1350 movaps -0x38(%r9),%xmm13
1351 movaps -0x28(%r9),%xmm14
1352 movaps -0x18(%r9),%xmm15
1353___
1354$code.=<<___;
1355 lea (%r9),%rsp
1356.cfi_def_cfa_register %rsp
1357.L4x_epilogue:
1358 ret
1359.cfi_endproc
1360.size ChaCha20_4x,.-ChaCha20_4x
1361___
1362}
1363
1364########################################################################
1365# XOP code path that handles all lengths.
1366if ($avx) {
1367# There is some "anomaly" observed depending on instructions' size or
1368# alignment. If you look closely at below code you'll notice that
1369# sometimes argument order varies. The order affects instruction
1370# encoding by making it larger, and such fiddling gives 5% performance
1371# improvement. This is on FX-4100...
1372
1373my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1374 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1375my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1376 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1377
1378sub XOP_lane_ROUND {
1379my ($a0,$b0,$c0,$d0)=@_;
1380my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1381my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1382my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1383my @x=map("\"$_\"",@xx);
1384
1385 (
1386 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1387 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1388 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1389 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1390 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1391 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1392 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1393 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1394 "&vprotd (@x[$d0],@x[$d0],16)",
1395 "&vprotd (@x[$d1],@x[$d1],16)",
1396 "&vprotd (@x[$d2],@x[$d2],16)",
1397 "&vprotd (@x[$d3],@x[$d3],16)",
1398
1399 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1400 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1401 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1402 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1403 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1404 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1405 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1406 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1407 "&vprotd (@x[$b0],@x[$b0],12)",
1408 "&vprotd (@x[$b1],@x[$b1],12)",
1409 "&vprotd (@x[$b2],@x[$b2],12)",
1410 "&vprotd (@x[$b3],@x[$b3],12)",
1411
1412 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1413 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1414 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1415 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1416 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1417 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1418 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1419 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1420 "&vprotd (@x[$d0],@x[$d0],8)",
1421 "&vprotd (@x[$d1],@x[$d1],8)",
1422 "&vprotd (@x[$d2],@x[$d2],8)",
1423 "&vprotd (@x[$d3],@x[$d3],8)",
1424
1425 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1426 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1427 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1428 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1429 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1430 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1431 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1432 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1433 "&vprotd (@x[$b0],@x[$b0],7)",
1434 "&vprotd (@x[$b1],@x[$b1],7)",
1435 "&vprotd (@x[$b2],@x[$b2],7)",
1436 "&vprotd (@x[$b3],@x[$b3],7)"
1437 );
1438}
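# The XOP variant is structurally the same 4-lane round, but AMD's vprotd
# rotates 32-bit lanes directly, so neither the pshufb masks nor the
# shift/or pairs used elsewhere are needed; the occasional swap of operand
# order ("flip") is the instruction-encoding tweak described in the comment
# at the top of this block.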
1439
1440my $xframe = $win64 ? 0xa8 : 8;
1441
1442$code.=<<___;
1443.type ChaCha20_4xop,\@function,5
1444.align 32
1445ChaCha20_4xop:
1446.cfi_startproc
1447.LChaCha20_4xop:
1448 mov %rsp,%r9 # frame pointer
1449.cfi_def_cfa_register %r9
1450 sub \$0x140+$xframe,%rsp
1451___
1452 ################ stack layout
1453 # +0x00 SIMD equivalent of @x[8-12]
1454 # ...
1455 # +0x40 constant copy of key[0-2] smashed by lanes
1456 # ...
1457 # +0x100 SIMD counters (with nonce smashed by lanes)
1458 # ...
1459 # +0x140
1460$code.=<<___ if ($win64);
1461 movaps %xmm6,-0xa8(%r9)
1462 movaps %xmm7,-0x98(%r9)
1463 movaps %xmm8,-0x88(%r9)
1464 movaps %xmm9,-0x78(%r9)
1465 movaps %xmm10,-0x68(%r9)
1466 movaps %xmm11,-0x58(%r9)
1467 movaps %xmm12,-0x48(%r9)
1468 movaps %xmm13,-0x38(%r9)
1469 movaps %xmm14,-0x28(%r9)
1470 movaps %xmm15,-0x18(%r9)
1471.L4xop_body:
1472___
1473$code.=<<___;
1474 vzeroupper
1475
1476 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1477 vmovdqu ($key),$xb3 # key[1]
1478 vmovdqu 16($key),$xt3 # key[2]
1479 vmovdqu ($counter),$xd3 # key[3]
1480 lea 0x100(%rsp),%rcx # size optimization
1481
1482 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1483 vpshufd \$0x55,$xa3,$xa1
1484 vmovdqa $xa0,0x40(%rsp) # ... and offload
1485 vpshufd \$0xaa,$xa3,$xa2
1486 vmovdqa $xa1,0x50(%rsp)
1487 vpshufd \$0xff,$xa3,$xa3
1488 vmovdqa $xa2,0x60(%rsp)
1489 vmovdqa $xa3,0x70(%rsp)
1490
1491 vpshufd \$0x00,$xb3,$xb0
1492 vpshufd \$0x55,$xb3,$xb1
1493 vmovdqa $xb0,0x80-0x100(%rcx)
1494 vpshufd \$0xaa,$xb3,$xb2
1495 vmovdqa $xb1,0x90-0x100(%rcx)
1496 vpshufd \$0xff,$xb3,$xb3
1497 vmovdqa $xb2,0xa0-0x100(%rcx)
1498 vmovdqa $xb3,0xb0-0x100(%rcx)
1499
1500 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1501 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1502 vmovdqa $xt0,0xc0-0x100(%rcx)
1503 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1504 vmovdqa $xt1,0xd0-0x100(%rcx)
1505 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1506 vmovdqa $xt2,0xe0-0x100(%rcx)
1507 vmovdqa $xt3,0xf0-0x100(%rcx)
1508
1509 vpshufd \$0x00,$xd3,$xd0
1510 vpshufd \$0x55,$xd3,$xd1
1511 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1512 vpshufd \$0xaa,$xd3,$xd2
1513 vmovdqa $xd1,0x110-0x100(%rcx)
1514 vpshufd \$0xff,$xd3,$xd3
1515 vmovdqa $xd2,0x120-0x100(%rcx)
1516 vmovdqa $xd3,0x130-0x100(%rcx)
1517
1518 jmp .Loop_enter4xop
1519
1520.align 32
1521.Loop_outer4xop:
1522 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1523 vmovdqa 0x50(%rsp),$xa1
1524 vmovdqa 0x60(%rsp),$xa2
1525 vmovdqa 0x70(%rsp),$xa3
1526 vmovdqa 0x80-0x100(%rcx),$xb0
1527 vmovdqa 0x90-0x100(%rcx),$xb1
1528 vmovdqa 0xa0-0x100(%rcx),$xb2
1529 vmovdqa 0xb0-0x100(%rcx),$xb3
1530 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1531 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1532 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1533 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1534 vmovdqa 0x100-0x100(%rcx),$xd0
1535 vmovdqa 0x110-0x100(%rcx),$xd1
1536 vmovdqa 0x120-0x100(%rcx),$xd2
1537 vmovdqa 0x130-0x100(%rcx),$xd3
1538 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1539
1540.Loop_enter4xop:
1541 mov \$10,%eax
1542 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1543 jmp .Loop4xop
1544
1545.align 32
1546.Loop4xop:
1547___
1548 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1549 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1550$code.=<<___;
1551 dec %eax
1552 jnz .Loop4xop
1553
1554 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1555 vpaddd 0x50(%rsp),$xa1,$xa1
1556 vpaddd 0x60(%rsp),$xa2,$xa2
1557 vpaddd 0x70(%rsp),$xa3,$xa3
1558
1559 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1560 vmovdqa $xt3,0x30(%rsp)
1561
1562 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1563 vpunpckldq $xa3,$xa2,$xt3
1564 vpunpckhdq $xa1,$xa0,$xa0
1565 vpunpckhdq $xa3,$xa2,$xa2
1566 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1567 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1568 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1569 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1570___
1571 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1572$code.=<<___;
1573 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1574 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1575 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1576 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1577
1578 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1579 vmovdqa $xa1,0x10(%rsp)
1580 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1581 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1582
1583 vpunpckldq $xb1,$xb0,$xt2
1584 vpunpckldq $xb3,$xb2,$xt3
1585 vpunpckhdq $xb1,$xb0,$xb0
1586 vpunpckhdq $xb3,$xb2,$xb2
1587 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1588 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1589 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1590 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1591___
1592 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1593 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1594$code.=<<___;
1595 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1596 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1597 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1598 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1599
1600 vpunpckldq $xc1,$xc0,$xt2
1601 vpunpckldq $xc3,$xc2,$xt3
1602 vpunpckhdq $xc1,$xc0,$xc0
1603 vpunpckhdq $xc3,$xc2,$xc2
1604 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1605 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1606 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1607 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1608___
1609 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1610$code.=<<___;
1611 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1612 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1613 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1614 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1615
1616 vpunpckldq $xd1,$xd0,$xt2
1617 vpunpckldq $xd3,$xd2,$xt3
1618 vpunpckhdq $xd1,$xd0,$xd0
1619 vpunpckhdq $xd3,$xd2,$xd2
1620 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1621 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1622 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1623 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1624___
1625 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1626 ($xa0,$xa1)=($xt2,$xt3);
1627$code.=<<___;
1628 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1629 vmovdqa 0x10(%rsp),$xa1
1630
1631 cmp \$64*4,$len
1632 jb .Ltail4xop
1633
1634 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1635 vpxor 0x10($inp),$xb0,$xb0
1636 vpxor 0x20($inp),$xc0,$xc0
1637 vpxor 0x30($inp),$xd0,$xd0
1638 vpxor 0x40($inp),$xa1,$xa1
1639 vpxor 0x50($inp),$xb1,$xb1
1640 vpxor 0x60($inp),$xc1,$xc1
1641 vpxor 0x70($inp),$xd1,$xd1
1642 lea 0x80($inp),$inp # size optimization
1643 vpxor 0x00($inp),$xa2,$xa2
1644 vpxor 0x10($inp),$xb2,$xb2
1645 vpxor 0x20($inp),$xc2,$xc2
1646 vpxor 0x30($inp),$xd2,$xd2
1647 vpxor 0x40($inp),$xa3,$xa3
1648 vpxor 0x50($inp),$xb3,$xb3
1649 vpxor 0x60($inp),$xc3,$xc3
1650 vpxor 0x70($inp),$xd3,$xd3
1651 lea 0x80($inp),$inp # inp+=64*4
1652
1653 vmovdqu $xa0,0x00($out)
1654 vmovdqu $xb0,0x10($out)
1655 vmovdqu $xc0,0x20($out)
1656 vmovdqu $xd0,0x30($out)
1657 vmovdqu $xa1,0x40($out)
1658 vmovdqu $xb1,0x50($out)
1659 vmovdqu $xc1,0x60($out)
1660 vmovdqu $xd1,0x70($out)
1661 lea 0x80($out),$out # size optimization
1662 vmovdqu $xa2,0x00($out)
1663 vmovdqu $xb2,0x10($out)
1664 vmovdqu $xc2,0x20($out)
1665 vmovdqu $xd2,0x30($out)
1666 vmovdqu $xa3,0x40($out)
1667 vmovdqu $xb3,0x50($out)
1668 vmovdqu $xc3,0x60($out)
1669 vmovdqu $xd3,0x70($out)
1670 lea 0x80($out),$out # out+=64*4
1671
1672 sub \$64*4,$len
1673 jnz .Loop_outer4xop
1674
1675 jmp .Ldone4xop
1676
1677.align 32
1678.Ltail4xop:
1679 cmp \$192,$len
1680 jae .L192_or_more4xop
1681 cmp \$128,$len
1682 jae .L128_or_more4xop
1683 cmp \$64,$len
1684 jae .L64_or_more4xop
1685
1686 xor %r10,%r10
1687 vmovdqa $xa0,0x00(%rsp)
1688 vmovdqa $xb0,0x10(%rsp)
1689 vmovdqa $xc0,0x20(%rsp)
1690 vmovdqa $xd0,0x30(%rsp)
1691 jmp .Loop_tail4xop
1692
1693.align 32
1694.L64_or_more4xop:
1695 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1696 vpxor 0x10($inp),$xb0,$xb0
1697 vpxor 0x20($inp),$xc0,$xc0
1698 vpxor 0x30($inp),$xd0,$xd0
1699 vmovdqu $xa0,0x00($out)
1700 vmovdqu $xb0,0x10($out)
1701 vmovdqu $xc0,0x20($out)
1702 vmovdqu $xd0,0x30($out)
1703 je .Ldone4xop
1704
1705 lea 0x40($inp),$inp # inp+=64*1
1706 vmovdqa $xa1,0x00(%rsp)
1707 xor %r10,%r10
1708 vmovdqa $xb1,0x10(%rsp)
1709 lea 0x40($out),$out # out+=64*1
1710 vmovdqa $xc1,0x20(%rsp)
1711 sub \$64,$len # len-=64*1
1712 vmovdqa $xd1,0x30(%rsp)
1713 jmp .Loop_tail4xop
1714
1715.align 32
1716.L128_or_more4xop:
1717 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1718 vpxor 0x10($inp),$xb0,$xb0
1719 vpxor 0x20($inp),$xc0,$xc0
1720 vpxor 0x30($inp),$xd0,$xd0
1721 vpxor 0x40($inp),$xa1,$xa1
1722 vpxor 0x50($inp),$xb1,$xb1
1723 vpxor 0x60($inp),$xc1,$xc1
1724 vpxor 0x70($inp),$xd1,$xd1
1725
1726 vmovdqu $xa0,0x00($out)
1727 vmovdqu $xb0,0x10($out)
1728 vmovdqu $xc0,0x20($out)
1729 vmovdqu $xd0,0x30($out)
1730 vmovdqu $xa1,0x40($out)
1731 vmovdqu $xb1,0x50($out)
1732 vmovdqu $xc1,0x60($out)
1733 vmovdqu $xd1,0x70($out)
1734 je .Ldone4xop
1735
1736 lea 0x80($inp),$inp # inp+=64*2
1737 vmovdqa $xa2,0x00(%rsp)
1738 xor %r10,%r10
1739 vmovdqa $xb2,0x10(%rsp)
1740 lea 0x80($out),$out # out+=64*2
1741 vmovdqa $xc2,0x20(%rsp)
1742 sub \$128,$len # len-=64*2
1743 vmovdqa $xd2,0x30(%rsp)
1744 jmp .Loop_tail4xop
1745
1746.align 32
1747.L192_or_more4xop:
1748 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1749 vpxor 0x10($inp),$xb0,$xb0
1750 vpxor 0x20($inp),$xc0,$xc0
1751 vpxor 0x30($inp),$xd0,$xd0
1752 vpxor 0x40($inp),$xa1,$xa1
1753 vpxor 0x50($inp),$xb1,$xb1
1754 vpxor 0x60($inp),$xc1,$xc1
1755 vpxor 0x70($inp),$xd1,$xd1
1756 lea 0x80($inp),$inp # size optimization
1757 vpxor 0x00($inp),$xa2,$xa2
1758 vpxor 0x10($inp),$xb2,$xb2
1759 vpxor 0x20($inp),$xc2,$xc2
1760 vpxor 0x30($inp),$xd2,$xd2
1761
1762 vmovdqu $xa0,0x00($out)
1763 vmovdqu $xb0,0x10($out)
1764 vmovdqu $xc0,0x20($out)
1765 vmovdqu $xd0,0x30($out)
1766 vmovdqu $xa1,0x40($out)
1767 vmovdqu $xb1,0x50($out)
1768 vmovdqu $xc1,0x60($out)
1769 vmovdqu $xd1,0x70($out)
1770 lea 0x80($out),$out # size optimization
1771 vmovdqu $xa2,0x00($out)
1772 vmovdqu $xb2,0x10($out)
1773 vmovdqu $xc2,0x20($out)
1774 vmovdqu $xd2,0x30($out)
1775 je .Ldone4xop
1776
1777 lea 0x40($inp),$inp # inp+=64*3
1778 vmovdqa $xa3,0x00(%rsp)
1779 xor %r10,%r10
1780 vmovdqa $xb3,0x10(%rsp)
1781 lea 0x40($out),$out # out+=64*3
1782 vmovdqa $xc3,0x20(%rsp)
1783 sub \$192,$len # len-=64*3
1784 vmovdqa $xd3,0x30(%rsp)
1785
1786.Loop_tail4xop:
1787 movzb ($inp,%r10),%eax
1788 movzb (%rsp,%r10),%ecx
1789 lea 1(%r10),%r10
1790 xor %ecx,%eax
1791 mov %al,-1($out,%r10)
1792 dec $len
1793 jnz .Loop_tail4xop
1794
1795.Ldone4xop:
1796 vzeroupper
1797___
1798$code.=<<___ if ($win64);
1799 movaps -0xa8(%r9),%xmm6
1800 movaps -0x98(%r9),%xmm7
1801 movaps -0x88(%r9),%xmm8
1802 movaps -0x78(%r9),%xmm9
1803 movaps -0x68(%r9),%xmm10
1804 movaps -0x58(%r9),%xmm11
1805 movaps -0x48(%r9),%xmm12
1806 movaps -0x38(%r9),%xmm13
1807 movaps -0x28(%r9),%xmm14
1808 movaps -0x18(%r9),%xmm15
1809___
1810$code.=<<___;
1811 lea (%r9),%rsp
1812.cfi_def_cfa_register %rsp
1813.L4xop_epilogue:
1814 ret
1815.cfi_endproc
1816.size ChaCha20_4xop,.-ChaCha20_4xop
1817___
1818}
1819
1820########################################################################
1821# AVX2 code path
1822if ($avx>1) {
1823my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1824 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1825my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1826 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1827
1828sub AVX2_lane_ROUND {
1829my ($a0,$b0,$c0,$d0)=@_;
1830my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1831my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1832my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1833my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1834my @x=map("\"$_\"",@xx);
1835
1836 # Consider order in which variables are addressed by their
1837 # index:
1838 #
1839 # a b c d
1840 #
1841 # 0 4 8 12 < even round
1842 # 1 5 9 13
1843 # 2 6 10 14
1844 # 3 7 11 15
1845 # 0 5 10 15 < odd round
1846 # 1 6 11 12
1847 # 2 7 8 13
1848 # 3 4 9 14
1849 #
1850 # 'a', 'b' and 'd's are permanently allocated in registers,
1851 # @x[0..7,12..15], while 'c's are maintained in memory. If
1852 # you observe 'c' column, you'll notice that pair of 'c's is
1853 # invariant between rounds. This means that we have to reload
1854 # them once per round, in the middle. This is why you'll see
1855 # bunch of 'c' stores and loads in the middle, but none in
1856 # the beginning or end.
1857
1858 (
1859 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1860 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1861 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1862 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1863 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1864 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1865
1866 "&vpaddd ($xc,$xc,@x[$d0])",
1867 "&vpxor (@x[$b0],$xc,@x[$b0])",
1868 "&vpslld ($t0,@x[$b0],12)",
1869 "&vpsrld (@x[$b0],@x[$b0],20)",
1870 "&vpor (@x[$b0],$t0,@x[$b0])",
1871 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1872 "&vpaddd ($xc_,$xc_,@x[$d1])",
1873 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1874 "&vpslld ($t1,@x[$b1],12)",
1875 "&vpsrld (@x[$b1],@x[$b1],20)",
1876 "&vpor (@x[$b1],$t1,@x[$b1])",
1877
1878 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1879 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1880 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1881 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1882 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1883 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1884
1885 "&vpaddd ($xc,$xc,@x[$d0])",
1886 "&vpxor (@x[$b0],$xc,@x[$b0])",
1887 "&vpslld ($t1,@x[$b0],7)",
1888 "&vpsrld (@x[$b0],@x[$b0],25)",
1889 "&vpor (@x[$b0],$t1,@x[$b0])",
1890 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1891 "&vpaddd ($xc_,$xc_,@x[$d1])",
1892 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1893 "&vpslld ($t0,@x[$b1],7)",
1894 "&vpsrld (@x[$b1],@x[$b1],25)",
1895 "&vpor (@x[$b1],$t0,@x[$b1])",
1896
1897 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1898 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1899 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1900 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1901
1902 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1903 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1904 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1905 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1906 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1907 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1908
1909 "&vpaddd ($xc,$xc,@x[$d2])",
1910 "&vpxor (@x[$b2],$xc,@x[$b2])",
1911 "&vpslld ($t0,@x[$b2],12)",
1912 "&vpsrld (@x[$b2],@x[$b2],20)",
1913 "&vpor (@x[$b2],$t0,@x[$b2])",
1914 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1915 "&vpaddd ($xc_,$xc_,@x[$d3])",
1916 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1917 "&vpslld ($t1,@x[$b3],12)",
1918 "&vpsrld (@x[$b3],@x[$b3],20)",
1919 "&vpor (@x[$b3],$t1,@x[$b3])",
1920
1921 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1922 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1923 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1924 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1925 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1926 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1927
1928 "&vpaddd ($xc,$xc,@x[$d2])",
1929 "&vpxor (@x[$b2],$xc,@x[$b2])",
1930 "&vpslld ($t1,@x[$b2],7)",
1931 "&vpsrld (@x[$b2],@x[$b2],25)",
1932 "&vpor (@x[$b2],$t1,@x[$b2])",
1933 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1934 "&vpaddd ($xc_,$xc_,@x[$d3])",
1935 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1936 "&vpslld ($t0,@x[$b3],7)",
1937 "&vpsrld (@x[$b3],@x[$b3],25)",
1938 "&vpor (@x[$b3],$t0,@x[$b3])"
1939 );
1940}
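# AVX2_lane_ROUND is the 8-lane counterpart of SSSE3_lane_ROUND: each ymm
# register carries one state word from eight blocks, and vbroadcasti128
# from (%r10)/(%r11) replicates the .Lrot16/.Lrot24 masks across both
# 128-bit halves when they are reloaded mid-round.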
1941
1942my $xframe = $win64 ? 0xa8 : 8;
1943
1944$code.=<<___;
1945.type ChaCha20_8x,\@function,5
1946.align 32
1947ChaCha20_8x:
1948.cfi_startproc
1949.LChaCha20_8x:
1950 mov %rsp,%r9 # frame register
1951.cfi_def_cfa_register %r9
1952 sub \$0x280+$xframe,%rsp
1953 and \$-32,%rsp
1954___
1955$code.=<<___ if ($win64);
1956 movaps %xmm6,-0xa8(%r9)
1957 movaps %xmm7,-0x98(%r9)
1958 movaps %xmm8,-0x88(%r9)
1959 movaps %xmm9,-0x78(%r9)
1960 movaps %xmm10,-0x68(%r9)
1961 movaps %xmm11,-0x58(%r9)
1962 movaps %xmm12,-0x48(%r9)
1963 movaps %xmm13,-0x38(%r9)
1964 movaps %xmm14,-0x28(%r9)
1965 movaps %xmm15,-0x18(%r9)
1966.L8x_body:
1967___
1968$code.=<<___;
1969 vzeroupper
1970
1971 ################ stack layout
1972 # +0x00 SIMD equivalent of @x[8-12]
1973 # ...
1974 # +0x80 constant copy of key[0-2] smashed by lanes
1975 # ...
1976 # +0x200 SIMD counters (with nonce smashed by lanes)
1977 # ...
1978 # +0x280
1979
1980 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1981 vbroadcasti128 ($key),$xb3 # key[1]
1982 vbroadcasti128 16($key),$xt3 # key[2]
1983 vbroadcasti128 ($counter),$xd3 # key[3]
1984 lea 0x100(%rsp),%rcx # size optimization
1985 lea 0x200(%rsp),%rax # size optimization
1986 lea .Lrot16(%rip),%r10
1987 lea .Lrot24(%rip),%r11
1988
1989 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1990 vpshufd \$0x55,$xa3,$xa1
1991 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1992 vpshufd \$0xaa,$xa3,$xa2
1993 vmovdqa $xa1,0xa0-0x100(%rcx)
1994 vpshufd \$0xff,$xa3,$xa3
1995 vmovdqa $xa2,0xc0-0x100(%rcx)
1996 vmovdqa $xa3,0xe0-0x100(%rcx)
1997
1998 vpshufd \$0x00,$xb3,$xb0
1999 vpshufd \$0x55,$xb3,$xb1
2000 vmovdqa $xb0,0x100-0x100(%rcx)
2001 vpshufd \$0xaa,$xb3,$xb2
2002 vmovdqa $xb1,0x120-0x100(%rcx)
2003 vpshufd \$0xff,$xb3,$xb3
2004 vmovdqa $xb2,0x140-0x100(%rcx)
2005 vmovdqa $xb3,0x160-0x100(%rcx)
2006
2007 vpshufd \$0x00,$xt3,$xt0 # "xc0"
2008 vpshufd \$0x55,$xt3,$xt1 # "xc1"
2009 vmovdqa $xt0,0x180-0x200(%rax)
2010 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
2011 vmovdqa $xt1,0x1a0-0x200(%rax)
2012 vpshufd \$0xff,$xt3,$xt3 # "xc3"
2013 vmovdqa $xt2,0x1c0-0x200(%rax)
2014 vmovdqa $xt3,0x1e0-0x200(%rax)
2015
2016 vpshufd \$0x00,$xd3,$xd0
2017 vpshufd \$0x55,$xd3,$xd1
2018 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
2019 vpshufd \$0xaa,$xd3,$xd2
2020 vmovdqa $xd1,0x220-0x200(%rax)
2021 vpshufd \$0xff,$xd3,$xd3
2022 vmovdqa $xd2,0x240-0x200(%rax)
2023 vmovdqa $xd3,0x260-0x200(%rax)
2024
2025 jmp .Loop_enter8x
2026
2027.align 32
2028.Loop_outer8x:
2029 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
2030 vmovdqa 0xa0-0x100(%rcx),$xa1
2031 vmovdqa 0xc0-0x100(%rcx),$xa2
2032 vmovdqa 0xe0-0x100(%rcx),$xa3
2033 vmovdqa 0x100-0x100(%rcx),$xb0
2034 vmovdqa 0x120-0x100(%rcx),$xb1
2035 vmovdqa 0x140-0x100(%rcx),$xb2
2036 vmovdqa 0x160-0x100(%rcx),$xb3
2037 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
2038 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
2039 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
2040 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
2041 vmovdqa 0x200-0x200(%rax),$xd0
2042 vmovdqa 0x220-0x200(%rax),$xd1
2043 vmovdqa 0x240-0x200(%rax),$xd2
2044 vmovdqa 0x260-0x200(%rax),$xd3
2045 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
2046
2047.Loop_enter8x:
2048 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
2049 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
2050 vbroadcasti128 (%r10),$xt3
2051 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
2052 mov \$10,%eax
2053 jmp .Loop8x
2054
2055.align 32
2056.Loop8x:
2057___
2058 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
2059 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
2060$code.=<<___;
2061 dec %eax
2062 jnz .Loop8x
2063
2064 lea 0x200(%rsp),%rax # size optimization
2065 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
2066 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
2067 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
2068 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
2069
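	# After the rounds each register still holds one state word from 8
	# different blocks; the unpack/permute ladder below transposes that so
	# that, in the order they are stored, the registers carry consecutive
	# 32-byte pieces of the 512-byte keystream chunk.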
2070 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2071 vpunpckldq $xa3,$xa2,$xt3
2072 vpunpckhdq $xa1,$xa0,$xa0
2073 vpunpckhdq $xa3,$xa2,$xa2
2074 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2075 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2076 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2077 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2078___
2079 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2080$code.=<<___;
2081 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
2082 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
2083 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
2084 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
2085
2086 vpunpckldq $xb1,$xb0,$xt2
2087 vpunpckldq $xb3,$xb2,$xt3
2088 vpunpckhdq $xb1,$xb0,$xb0
2089 vpunpckhdq $xb3,$xb2,$xb2
2090 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2091 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2092 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2093 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2094___
2095 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2096$code.=<<___;
2097 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
2098 vperm2i128 \$0x31,$xb0,$xa0,$xb0
2099 vperm2i128 \$0x20,$xb1,$xa1,$xa0
2100 vperm2i128 \$0x31,$xb1,$xa1,$xb1
2101 vperm2i128 \$0x20,$xb2,$xa2,$xa1
2102 vperm2i128 \$0x31,$xb2,$xa2,$xb2
2103 vperm2i128 \$0x20,$xb3,$xa3,$xa2
2104 vperm2i128 \$0x31,$xb3,$xa3,$xb3
2105___
2106 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2107 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
2108$code.=<<___;
2109 vmovdqa $xa0,0x00(%rsp) # offload $xaN
2110 vmovdqa $xa1,0x20(%rsp)
2111 vmovdqa 0x40(%rsp),$xc2 # $xa0
2112 vmovdqa 0x60(%rsp),$xc3 # $xa1
2113
2114 vpaddd 0x180-0x200(%rax),$xc0,$xc0
2115 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
2116 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
2117 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
2118
2119 vpunpckldq $xc1,$xc0,$xt2
2120 vpunpckldq $xc3,$xc2,$xt3
2121 vpunpckhdq $xc1,$xc0,$xc0
2122 vpunpckhdq $xc3,$xc2,$xc2
2123 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
2124 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
2125 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
2126 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
2127___
2128 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2129$code.=<<___;
2130 vpaddd 0x200-0x200(%rax),$xd0,$xd0
2131 vpaddd 0x220-0x200(%rax),$xd1,$xd1
2132 vpaddd 0x240-0x200(%rax),$xd2,$xd2
2133 vpaddd 0x260-0x200(%rax),$xd3,$xd3
2134
2135 vpunpckldq $xd1,$xd0,$xt2
2136 vpunpckldq $xd3,$xd2,$xt3
2137 vpunpckhdq $xd1,$xd0,$xd0
2138 vpunpckhdq $xd3,$xd2,$xd2
2139 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
2140 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
2141 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
2142 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
2143___
2144 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2145$code.=<<___;
2146 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
2147 vperm2i128 \$0x31,$xd0,$xc0,$xd0
2148 vperm2i128 \$0x20,$xd1,$xc1,$xc0
2149 vperm2i128 \$0x31,$xd1,$xc1,$xd1
2150 vperm2i128 \$0x20,$xd2,$xc2,$xc1
2151 vperm2i128 \$0x31,$xd2,$xc2,$xd2
2152 vperm2i128 \$0x20,$xd3,$xc3,$xc2
2153 vperm2i128 \$0x31,$xd3,$xc3,$xd3
2154___
2155 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2156 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
2157 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
2158 ($xa0,$xa1)=($xt2,$xt3);
2159$code.=<<___;
2160 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
2161 vmovdqa 0x20(%rsp),$xa1
2162
2163 cmp \$64*8,$len
2164 jb .Ltail8x
2165
2166 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2167 vpxor 0x20($inp),$xb0,$xb0
2168 vpxor 0x40($inp),$xc0,$xc0
2169 vpxor 0x60($inp),$xd0,$xd0
2170 lea 0x80($inp),$inp # size optimization
2171 vmovdqu $xa0,0x00($out)
2172 vmovdqu $xb0,0x20($out)
2173 vmovdqu $xc0,0x40($out)
2174 vmovdqu $xd0,0x60($out)
2175 lea 0x80($out),$out # size optimization
2176
2177 vpxor 0x00($inp),$xa1,$xa1
2178 vpxor 0x20($inp),$xb1,$xb1
2179 vpxor 0x40($inp),$xc1,$xc1
2180 vpxor 0x60($inp),$xd1,$xd1
2181 lea 0x80($inp),$inp # size optimization
2182 vmovdqu $xa1,0x00($out)
2183 vmovdqu $xb1,0x20($out)
2184 vmovdqu $xc1,0x40($out)
2185 vmovdqu $xd1,0x60($out)
2186 lea 0x80($out),$out # size optimization
2187
2188 vpxor 0x00($inp),$xa2,$xa2
2189 vpxor 0x20($inp),$xb2,$xb2
2190 vpxor 0x40($inp),$xc2,$xc2
2191 vpxor 0x60($inp),$xd2,$xd2
2192 lea 0x80($inp),$inp # size optimization
2193 vmovdqu $xa2,0x00($out)
2194 vmovdqu $xb2,0x20($out)
2195 vmovdqu $xc2,0x40($out)
2196 vmovdqu $xd2,0x60($out)
2197 lea 0x80($out),$out # size optimization
2198
2199 vpxor 0x00($inp),$xa3,$xa3
2200 vpxor 0x20($inp),$xb3,$xb3
2201 vpxor 0x40($inp),$xc3,$xc3
2202 vpxor 0x60($inp),$xd3,$xd3
2203 lea 0x80($inp),$inp # size optimization
2204 vmovdqu $xa3,0x00($out)
2205 vmovdqu $xb3,0x20($out)
2206 vmovdqu $xc3,0x40($out)
2207 vmovdqu $xd3,0x60($out)
2208 lea 0x80($out),$out # size optimization
2209
2210 sub \$64*8,$len
2211 jnz .Loop_outer8x
2212
2213 jmp .Ldone8x
2214
2215.Ltail8x:
2216 cmp \$448,$len
2217 jae .L448_or_more8x
2218 cmp \$384,$len
2219 jae .L384_or_more8x
2220 cmp \$320,$len
2221 jae .L320_or_more8x
2222 cmp \$256,$len
2223 jae .L256_or_more8x
2224 cmp \$192,$len
2225 jae .L192_or_more8x
2226 cmp \$128,$len
2227 jae .L128_or_more8x
2228 cmp \$64,$len
2229 jae .L64_or_more8x
2230
2231 xor %r10,%r10
2232 vmovdqa $xa0,0x00(%rsp)
2233 vmovdqa $xb0,0x20(%rsp)
2234 jmp .Loop_tail8x
2235
2236.align 32
2237.L64_or_more8x:
2238 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2239 vpxor 0x20($inp),$xb0,$xb0
2240 vmovdqu $xa0,0x00($out)
2241 vmovdqu $xb0,0x20($out)
2242 je .Ldone8x
2243
2244 lea 0x40($inp),$inp # inp+=64*1
2245 xor %r10,%r10
2246 vmovdqa $xc0,0x00(%rsp)
2247 lea 0x40($out),$out # out+=64*1
2248 sub \$64,$len # len-=64*1
2249 vmovdqa $xd0,0x20(%rsp)
2250 jmp .Loop_tail8x
2251
2252.align 32
2253.L128_or_more8x:
2254 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2255 vpxor 0x20($inp),$xb0,$xb0
2256 vpxor 0x40($inp),$xc0,$xc0
2257 vpxor 0x60($inp),$xd0,$xd0
2258 vmovdqu $xa0,0x00($out)
2259 vmovdqu $xb0,0x20($out)
2260 vmovdqu $xc0,0x40($out)
2261 vmovdqu $xd0,0x60($out)
2262 je .Ldone8x
2263
2264 lea 0x80($inp),$inp # inp+=64*2
2265 xor %r10,%r10
2266 vmovdqa $xa1,0x00(%rsp)
2267 lea 0x80($out),$out # out+=64*2
2268 sub \$128,$len # len-=64*2
2269 vmovdqa $xb1,0x20(%rsp)
2270 jmp .Loop_tail8x
2271
2272.align 32
2273.L192_or_more8x:
2274 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2275 vpxor 0x20($inp),$xb0,$xb0
2276 vpxor 0x40($inp),$xc0,$xc0
2277 vpxor 0x60($inp),$xd0,$xd0
2278 vpxor 0x80($inp),$xa1,$xa1
2279 vpxor 0xa0($inp),$xb1,$xb1
2280 vmovdqu $xa0,0x00($out)
2281 vmovdqu $xb0,0x20($out)
2282 vmovdqu $xc0,0x40($out)
2283 vmovdqu $xd0,0x60($out)
2284 vmovdqu $xa1,0x80($out)
2285 vmovdqu $xb1,0xa0($out)
2286 je .Ldone8x
2287
2288 lea 0xc0($inp),$inp # inp+=64*3
2289 xor %r10,%r10
2290 vmovdqa $xc1,0x00(%rsp)
2291 lea 0xc0($out),$out # out+=64*3
2292 sub \$192,$len # len-=64*3
2293 vmovdqa $xd1,0x20(%rsp)
2294 jmp .Loop_tail8x
2295
2296.align 32
2297.L256_or_more8x:
2298 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2299 vpxor 0x20($inp),$xb0,$xb0
2300 vpxor 0x40($inp),$xc0,$xc0
2301 vpxor 0x60($inp),$xd0,$xd0
2302 vpxor 0x80($inp),$xa1,$xa1
2303 vpxor 0xa0($inp),$xb1,$xb1
2304 vpxor 0xc0($inp),$xc1,$xc1
2305 vpxor 0xe0($inp),$xd1,$xd1
2306 vmovdqu $xa0,0x00($out)
2307 vmovdqu $xb0,0x20($out)
2308 vmovdqu $xc0,0x40($out)
2309 vmovdqu $xd0,0x60($out)
2310 vmovdqu $xa1,0x80($out)
2311 vmovdqu $xb1,0xa0($out)
2312 vmovdqu $xc1,0xc0($out)
2313 vmovdqu $xd1,0xe0($out)
2314 je .Ldone8x
2315
2316 lea 0x100($inp),$inp # inp+=64*4
2317 xor %r10,%r10
2318 vmovdqa $xa2,0x00(%rsp)
2319 lea 0x100($out),$out # out+=64*4
2320 sub \$256,$len # len-=64*4
2321 vmovdqa $xb2,0x20(%rsp)
2322 jmp .Loop_tail8x
2323
2324.align 32
2325.L320_or_more8x:
2326 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2327 vpxor 0x20($inp),$xb0,$xb0
2328 vpxor 0x40($inp),$xc0,$xc0
2329 vpxor 0x60($inp),$xd0,$xd0
2330 vpxor 0x80($inp),$xa1,$xa1
2331 vpxor 0xa0($inp),$xb1,$xb1
2332 vpxor 0xc0($inp),$xc1,$xc1
2333 vpxor 0xe0($inp),$xd1,$xd1
2334 vpxor 0x100($inp),$xa2,$xa2
2335 vpxor 0x120($inp),$xb2,$xb2
2336 vmovdqu $xa0,0x00($out)
2337 vmovdqu $xb0,0x20($out)
2338 vmovdqu $xc0,0x40($out)
2339 vmovdqu $xd0,0x60($out)
2340 vmovdqu $xa1,0x80($out)
2341 vmovdqu $xb1,0xa0($out)
2342 vmovdqu $xc1,0xc0($out)
2343 vmovdqu $xd1,0xe0($out)
2344 vmovdqu $xa2,0x100($out)
2345 vmovdqu $xb2,0x120($out)
2346 je .Ldone8x
2347
2348 lea 0x140($inp),$inp # inp+=64*5
2349 xor %r10,%r10
2350 vmovdqa $xc2,0x00(%rsp)
2351 lea 0x140($out),$out # out+=64*5
2352 sub \$320,$len # len-=64*5
2353 vmovdqa $xd2,0x20(%rsp)
2354 jmp .Loop_tail8x
2355
2356.align 32
2357.L384_or_more8x:
2358 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2359 vpxor 0x20($inp),$xb0,$xb0
2360 vpxor 0x40($inp),$xc0,$xc0
2361 vpxor 0x60($inp),$xd0,$xd0
2362 vpxor 0x80($inp),$xa1,$xa1
2363 vpxor 0xa0($inp),$xb1,$xb1
2364 vpxor 0xc0($inp),$xc1,$xc1
2365 vpxor 0xe0($inp),$xd1,$xd1
2366 vpxor 0x100($inp),$xa2,$xa2
2367 vpxor 0x120($inp),$xb2,$xb2
2368 vpxor 0x140($inp),$xc2,$xc2
2369 vpxor 0x160($inp),$xd2,$xd2
2370 vmovdqu $xa0,0x00($out)
2371 vmovdqu $xb0,0x20($out)
2372 vmovdqu $xc0,0x40($out)
2373 vmovdqu $xd0,0x60($out)
2374 vmovdqu $xa1,0x80($out)
2375 vmovdqu $xb1,0xa0($out)
2376 vmovdqu $xc1,0xc0($out)
2377 vmovdqu $xd1,0xe0($out)
2378 vmovdqu $xa2,0x100($out)
2379 vmovdqu $xb2,0x120($out)
2380 vmovdqu $xc2,0x140($out)
2381 vmovdqu $xd2,0x160($out)
2382 je .Ldone8x
2383
2384 lea 0x180($inp),$inp # inp+=64*6
2385 xor %r10,%r10
2386 vmovdqa $xa3,0x00(%rsp)
2387 lea 0x180($out),$out # out+=64*6
2388 sub \$384,$len # len-=64*6
2389 vmovdqa $xb3,0x20(%rsp)
2390 jmp .Loop_tail8x
2391
2392.align 32
2393.L448_or_more8x:
2394 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2395 vpxor 0x20($inp),$xb0,$xb0
2396 vpxor 0x40($inp),$xc0,$xc0
2397 vpxor 0x60($inp),$xd0,$xd0
2398 vpxor 0x80($inp),$xa1,$xa1
2399 vpxor 0xa0($inp),$xb1,$xb1
2400 vpxor 0xc0($inp),$xc1,$xc1
2401 vpxor 0xe0($inp),$xd1,$xd1
2402 vpxor 0x100($inp),$xa2,$xa2
2403 vpxor 0x120($inp),$xb2,$xb2
2404 vpxor 0x140($inp),$xc2,$xc2
2405 vpxor 0x160($inp),$xd2,$xd2
2406 vpxor 0x180($inp),$xa3,$xa3
2407 vpxor 0x1a0($inp),$xb3,$xb3
2408 vmovdqu $xa0,0x00($out)
2409 vmovdqu $xb0,0x20($out)
2410 vmovdqu $xc0,0x40($out)
2411 vmovdqu $xd0,0x60($out)
2412 vmovdqu $xa1,0x80($out)
2413 vmovdqu $xb1,0xa0($out)
2414 vmovdqu $xc1,0xc0($out)
2415 vmovdqu $xd1,0xe0($out)
2416 vmovdqu $xa2,0x100($out)
2417 vmovdqu $xb2,0x120($out)
2418 vmovdqu $xc2,0x140($out)
2419 vmovdqu $xd2,0x160($out)
2420 vmovdqu $xa3,0x180($out)
2421 vmovdqu $xb3,0x1a0($out)
2422 je .Ldone8x
2423
2424 lea 0x1c0($inp),$inp # inp+=64*7
2425 xor %r10,%r10
2426 vmovdqa $xc3,0x00(%rsp)
2427 lea 0x1c0($out),$out # out+=64*7
2428 sub \$448,$len # len-=64*7
2429 vmovdqa $xd3,0x20(%rsp)
2430
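	# final partial block: the leftover keystream was parked at (%rsp)
	# above and is now XORed into the output one byte at a time, with
	# %r10 as the running index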
2431.Loop_tail8x:
2432 movzb ($inp,%r10),%eax
2433 movzb (%rsp,%r10),%ecx
2434 lea 1(%r10),%r10
2435 xor %ecx,%eax
2436 mov %al,-1($out,%r10)
2437 dec $len
2438 jnz .Loop_tail8x
2439
2440.Ldone8x:
2441 vzeroall
2442___
2443$code.=<<___ if ($win64);
2444 movaps -0xa8(%r9),%xmm6
2445 movaps -0x98(%r9),%xmm7
2446 movaps -0x88(%r9),%xmm8
2447 movaps -0x78(%r9),%xmm9
2448 movaps -0x68(%r9),%xmm10
2449 movaps -0x58(%r9),%xmm11
2450 movaps -0x48(%r9),%xmm12
2451 movaps -0x38(%r9),%xmm13
2452 movaps -0x28(%r9),%xmm14
2453 movaps -0x18(%r9),%xmm15
2454___
2455$code.=<<___;
2456 lea (%r9),%rsp
2457.cfi_def_cfa_register %rsp
2458.L8x_epilogue:
2459 ret
2460.cfi_endproc
2461.size ChaCha20_8x,.-ChaCha20_8x
2462___
2463}
2464
2465########################################################################
2466# AVX512 code paths
2467if ($avx>2) {
2468# This one handles shorter inputs...
2469
2470my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2471my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2472
2473sub vpxord() # size optimization
2474{ my $opcode = "vpxor"; # adhere to vpxor when possible
2475
2476 foreach (@_) {
2477 if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
2478 $opcode = "vpxord";
2479 last;
2480 }
2481 }
2482
2483 $code .= "\t$opcode\t".join(',',reverse @_)."\n";
2484}
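# The plain VEX-encoded vpxor cannot address %zmm registers or registers
# 16..31, so the EVEX-encoded vpxord is emitted whenever such an operand is
# involved; otherwise the shorter VEX form is kept.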
2485
2486sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
2487 &vpaddd ($a,$a,$b);
2488 &vpxord ($d,$d,$a);
2489 &vprold ($d,$d,16);
2490
2491 &vpaddd ($c,$c,$d);
2492 &vpxord ($b,$b,$c);
2493 &vprold ($b,$b,12);
2494
2495 &vpaddd ($a,$a,$b);
2496 &vpxord ($d,$d,$a);
2497 &vprold ($d,$d,8);
2498
2499 &vpaddd ($c,$c,$d);
2500 &vpxord ($b,$b,$c);
2501 &vprold ($b,$b,7);
2502}
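# For orientation, the sequence above is the standard ChaCha quarter-round
# (RFC 8439) applied to whole vectors of state words:
#
#	a += b;  d ^= a;  d = ROTL32(d,16);
#	c += d;  b ^= c;  b = ROTL32(b,12);
#	a += b;  d ^= a;  d = ROTL32(d, 8);
#	c += d;  b ^= c;  b = ROTL32(b, 7);
#
# AVX512's vprold performs each rotation in a single instruction, which is
# what removes the shift/shift/or triplets used on the AVX2 path.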
2503
2504my $xframe = $win64 ? 160+8 : 8;
2505
2506$code.=<<___;
2507.type ChaCha20_avx512,\@function,5
2508.align 32
2509ChaCha20_avx512:
2510.cfi_startproc
2511.LChaCha20_avx512:
2512 mov %rsp,%r9 # frame pointer
2513.cfi_def_cfa_register %r9
2514 cmp \$512,$len
2515 ja .LChaCha20_16x
2516
2517 sub \$64+$xframe,%rsp
2518___
2519$code.=<<___ if ($win64);
2520 movaps %xmm6,-0xa8(%r9)
2521 movaps %xmm7,-0x98(%r9)
2522 movaps %xmm8,-0x88(%r9)
2523 movaps %xmm9,-0x78(%r9)
2524 movaps %xmm10,-0x68(%r9)
2525 movaps %xmm11,-0x58(%r9)
2526 movaps %xmm12,-0x48(%r9)
2527 movaps %xmm13,-0x38(%r9)
2528 movaps %xmm14,-0x28(%r9)
2529 movaps %xmm15,-0x18(%r9)
2530.Lavx512_body:
2531___
2532$code.=<<___;
2533 vbroadcasti32x4 .Lsigma(%rip),$a
2534 vbroadcasti32x4 ($key),$b
2535 vbroadcasti32x4 16($key),$c
2536 vbroadcasti32x4 ($counter),$d
2537
2538 vmovdqa32 $a,$a_
2539 vmovdqa32 $b,$b_
2540 vmovdqa32 $c,$c_
2541 vpaddd .Lzeroz(%rip),$d,$d
2542 vmovdqa32 .Lfourz(%rip),$fourz
2543 mov \$10,$counter # reuse $counter
2544 vmovdqa32 $d,$d_
2545 jmp .Loop_avx512
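	# $d was staggered with .Lzeroz and is advanced by $fourz each pass
	# (presumably {0,1,2,3} and {4,4,4,4} per 128-bit lane), so the four
	# lanes of every zmm register carry four consecutive block counters
	# and each pass yields up to 256 bytes of keystream.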
2546
2547.align 16
2548.Loop_outer_avx512:
2549 vmovdqa32 $a_,$a
2550 vmovdqa32 $b_,$b
2551 vmovdqa32 $c_,$c
2552 vpaddd $fourz,$d_,$d
2553 mov \$10,$counter
2554 vmovdqa32 $d,$d_
2555 jmp .Loop_avx512
2556
2557.align 32
2558.Loop_avx512:
2559___
2560 &AVX512ROUND();
2561 &vpshufd ($c,$c,0b01001110);
2562 &vpshufd ($b,$b,0b00111001);
2563 &vpshufd ($d,$d,0b10010011);
2564
2565 &AVX512ROUND();
2566 &vpshufd ($c,$c,0b01001110);
2567 &vpshufd ($b,$b,0b10010011);
2568 &vpshufd ($d,$d,0b00111001);
2569
2570 &dec ($counter);
2571 &jnz (".Loop_avx512");
2572
2573$code.=<<___;
2574 vpaddd $a_,$a,$a
2575 vpaddd $b_,$b,$b
2576 vpaddd $c_,$c,$c
2577 vpaddd $d_,$d,$d
2578
2579 sub \$64,$len
2580 jb .Ltail64_avx512
2581
2582 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2583 vpxor 0x10($inp),%x#$b,$t1
2584 vpxor 0x20($inp),%x#$c,$t2
2585 vpxor 0x30($inp),%x#$d,$t3
2586 lea 0x40($inp),$inp # inp+=64
2587
2588 vmovdqu $t0,0x00($out) # write output
2589 vmovdqu $t1,0x10($out)
2590 vmovdqu $t2,0x20($out)
2591 vmovdqu $t3,0x30($out)
2592 lea 0x40($out),$out # out+=64
2593
2594 jz .Ldone_avx512
2595
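	# the remaining three 64-byte blocks sit in 128-bit lanes 1..3 of the
	# zmm registers; extract and consume them one lane at a time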
2596 vextracti32x4 \$1,$a,$t0
2597 vextracti32x4 \$1,$b,$t1
2598 vextracti32x4 \$1,$c,$t2
2599 vextracti32x4 \$1,$d,$t3
2600
2601 sub \$64,$len
2602 jb .Ltail_avx512
2603
2604 vpxor 0x00($inp),$t0,$t0 # xor with input
2605 vpxor 0x10($inp),$t1,$t1
2606 vpxor 0x20($inp),$t2,$t2
2607 vpxor 0x30($inp),$t3,$t3
2608 lea 0x40($inp),$inp # inp+=64
2609
2610 vmovdqu $t0,0x00($out) # write output
2611 vmovdqu $t1,0x10($out)
2612 vmovdqu $t2,0x20($out)
2613 vmovdqu $t3,0x30($out)
2614 lea 0x40($out),$out # out+=64
2615
2616 jz .Ldone_avx512
2617
2618 vextracti32x4 \$2,$a,$t0
2619 vextracti32x4 \$2,$b,$t1
2620 vextracti32x4 \$2,$c,$t2
2621 vextracti32x4 \$2,$d,$t3
2622
2623 sub \$64,$len
2624 jb .Ltail_avx512
2625
2626 vpxor 0x00($inp),$t0,$t0 # xor with input
2627 vpxor 0x10($inp),$t1,$t1
2628 vpxor 0x20($inp),$t2,$t2
2629 vpxor 0x30($inp),$t3,$t3
2630 lea 0x40($inp),$inp # inp+=64
2631
2632 vmovdqu $t0,0x00($out) # write output
2633 vmovdqu $t1,0x10($out)
2634 vmovdqu $t2,0x20($out)
2635 vmovdqu $t3,0x30($out)
2636 lea 0x40($out),$out # out+=64
2637
2638 jz .Ldone_avx512
2639
2640 vextracti32x4 \$3,$a,$t0
2641 vextracti32x4 \$3,$b,$t1
2642 vextracti32x4 \$3,$c,$t2
2643 vextracti32x4 \$3,$d,$t3
2644
2645 sub \$64,$len
2646 jb .Ltail_avx512
2647
2648 vpxor 0x00($inp),$t0,$t0 # xor with input
2649 vpxor 0x10($inp),$t1,$t1
2650 vpxor 0x20($inp),$t2,$t2
2651 vpxor 0x30($inp),$t3,$t3
2652 lea 0x40($inp),$inp # inp+=64
2653
2654 vmovdqu $t0,0x00($out) # write output
2655 vmovdqu $t1,0x10($out)
2656 vmovdqu $t2,0x20($out)
2657 vmovdqu $t3,0x30($out)
2658 lea 0x40($out),$out # out+=64
2659
2660 jnz .Loop_outer_avx512
2661
2662 jmp .Ldone_avx512
2663
2664.align 16
2665.Ltail64_avx512:
2666 vmovdqa %x#$a,0x00(%rsp)
2667 vmovdqa %x#$b,0x10(%rsp)
2668 vmovdqa %x#$c,0x20(%rsp)
2669 vmovdqa %x#$d,0x30(%rsp)
2670 add \$64,$len
2671 jmp .Loop_tail_avx512
2672
2673.align 16
2674.Ltail_avx512:
2675 vmovdqa $t0,0x00(%rsp)
2676 vmovdqa $t1,0x10(%rsp)
2677 vmovdqa $t2,0x20(%rsp)
2678 vmovdqa $t3,0x30(%rsp)
2679 add \$64,$len
2680
2681.Loop_tail_avx512:
2682 movzb ($inp,$counter),%eax
2683 movzb (%rsp,$counter),%ecx
2684 lea 1($counter),$counter
2685 xor %ecx,%eax
2686 mov %al,-1($out,$counter)
2687 dec $len
2688 jnz .Loop_tail_avx512
2689
2690 vmovdqu32 $a_,0x00(%rsp)
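	# ($a_ still holds the public sigma constant; storing it here
	# presumably just scrubs the keystream bytes parked on the stack above)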
2691
2692.Ldone_avx512:
2693 vzeroall
2694___
2695$code.=<<___ if ($win64);
2696 movaps -0xa8(%r9),%xmm6
2697 movaps -0x98(%r9),%xmm7
2698 movaps -0x88(%r9),%xmm8
2699 movaps -0x78(%r9),%xmm9
2700 movaps -0x68(%r9),%xmm10
2701 movaps -0x58(%r9),%xmm11
2702 movaps -0x48(%r9),%xmm12
2703 movaps -0x38(%r9),%xmm13
2704 movaps -0x28(%r9),%xmm14
2705 movaps -0x18(%r9),%xmm15
2706___
2707$code.=<<___;
2708 lea (%r9),%rsp
2709.cfi_def_cfa_register %rsp
2710.Lavx512_epilogue:
2711 ret
2712.cfi_endproc
2713.size ChaCha20_avx512,.-ChaCha20_avx512
2714___
2715
2716map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
2717
2718$code.=<<___;
2719.type ChaCha20_avx512vl,\@function,5
2720.align 32
2721ChaCha20_avx512vl:
2722.cfi_startproc
2723.LChaCha20_avx512vl:
2724 mov %rsp,%r9 # frame pointer
2725.cfi_def_cfa_register %r9
2726 cmp \$128,$len
2727 ja .LChaCha20_8xvl
2728
2729 sub \$64+$xframe,%rsp
2730___
2731$code.=<<___ if ($win64);
2732 movaps %xmm6,-0xa8(%r9)
2733 movaps %xmm7,-0x98(%r9)
2734 movaps %xmm8,-0x88(%r9)
2735 movaps %xmm9,-0x78(%r9)
2736 movaps %xmm10,-0x68(%r9)
2737 movaps %xmm11,-0x58(%r9)
2738 movaps %xmm12,-0x48(%r9)
2739 movaps %xmm13,-0x38(%r9)
2740 movaps %xmm14,-0x28(%r9)
2741 movaps %xmm15,-0x18(%r9)
2742.Lavx512vl_body:
2743___
2744$code.=<<___;
2745 vbroadcasti128 .Lsigma(%rip),$a
2746 vbroadcasti128 ($key),$b
2747 vbroadcasti128 16($key),$c
2748 vbroadcasti128 ($counter),$d
2749
2750 vmovdqa32 $a,$a_
2751 vmovdqa32 $b,$b_
2752 vmovdqa32 $c,$c_
2753 vpaddd .Lzeroz(%rip),$d,$d
2754 vmovdqa32 .Ltwoy(%rip),$fourz
2755 mov \$10,$counter # reuse $counter
2756 vmovdqa32 $d,$d_
2757 jmp .Loop_avx512vl
2758
2759.align 16
2760.Loop_outer_avx512vl:
2761 vmovdqa32 $c_,$c
2762 vpaddd $fourz,$d_,$d
2763 mov \$10,$counter
2764 vmovdqa32 $d,$d_
2765 jmp .Loop_avx512vl
2766
2767.align 32
2768.Loop_avx512vl:
2769___
2770 &AVX512ROUND();
2771 &vpshufd ($c,$c,0b01001110);
2772 &vpshufd ($b,$b,0b00111001);
2773 &vpshufd ($d,$d,0b10010011);
2774
2775 &AVX512ROUND();
2776 &vpshufd ($c,$c,0b01001110);
2777 &vpshufd ($b,$b,0b10010011);
2778 &vpshufd ($d,$d,0b00111001);
2779
2780 &dec ($counter);
2781 &jnz (".Loop_avx512vl");
2782
2783$code.=<<___;
2784 vpaddd $a_,$a,$a
2785 vpaddd $b_,$b,$b
2786 vpaddd $c_,$c,$c
2787 vpaddd $d_,$d,$d
2788
2789 sub \$64,$len
2790 jb .Ltail64_avx512vl
2791
2792 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2793 vpxor 0x10($inp),%x#$b,$t1
2794 vpxor 0x20($inp),%x#$c,$t2
2795 vpxor 0x30($inp),%x#$d,$t3
2796 lea 0x40($inp),$inp # inp+=64
2797
2798 vmovdqu $t0,0x00($out) # write output
2799 vmovdqu $t1,0x10($out)
2800 vmovdqu $t2,0x20($out)
2801 vmovdqu $t3,0x30($out)
2802 lea 0x40($out),$out # out+=64
2803
2804 jz .Ldone_avx512vl
2805
2806 vextracti128 \$1,$a,$t0
2807 vextracti128 \$1,$b,$t1
2808 vextracti128 \$1,$c,$t2
2809 vextracti128 \$1,$d,$t3
2810
2811 sub \$64,$len
2812 jb .Ltail_avx512vl
2813
2814 vpxor 0x00($inp),$t0,$t0 # xor with input
2815 vpxor 0x10($inp),$t1,$t1
2816 vpxor 0x20($inp),$t2,$t2
2817 vpxor 0x30($inp),$t3,$t3
2818 lea 0x40($inp),$inp # inp+=64
2819
2820 vmovdqu $t0,0x00($out) # write output
2821 vmovdqu $t1,0x10($out)
2822 vmovdqu $t2,0x20($out)
2823 vmovdqu $t3,0x30($out)
2824 lea 0x40($out),$out # out+=64
2825
2826 vmovdqa32 $a_,$a
2827 vmovdqa32 $b_,$b
2828 jnz .Loop_outer_avx512vl
2829
2830 jmp .Ldone_avx512vl
2831
2832.align 16
2833.Ltail64_avx512vl:
2834 vmovdqa %x#$a,0x00(%rsp)
2835 vmovdqa %x#$b,0x10(%rsp)
2836 vmovdqa %x#$c,0x20(%rsp)
2837 vmovdqa %x#$d,0x30(%rsp)
2838 add \$64,$len
2839 jmp .Loop_tail_avx512vl
2840
2841.align 16
2842.Ltail_avx512vl:
2843 vmovdqa $t0,0x00(%rsp)
2844 vmovdqa $t1,0x10(%rsp)
2845 vmovdqa $t2,0x20(%rsp)
2846 vmovdqa $t3,0x30(%rsp)
2847 add \$64,$len
2848
2849.Loop_tail_avx512vl:
2850 movzb ($inp,$counter),%eax
2851 movzb (%rsp,$counter),%ecx
2852 lea 1($counter),$counter
2853 xor %ecx,%eax
2854 mov %al,-1($out,$counter)
2855 dec $len
2856 jnz .Loop_tail_avx512vl
2857
2858 vmovdqu32 $a_,0x00(%rsp)
2859 vmovdqu32 $a_,0x20(%rsp)
2860
2861.Ldone_avx512vl:
2862 vzeroall
2863___
2864$code.=<<___ if ($win64);
2865 movaps -0xa8(%r9),%xmm6
2866 movaps -0x98(%r9),%xmm7
2867 movaps -0x88(%r9),%xmm8
2868 movaps -0x78(%r9),%xmm9
2869 movaps -0x68(%r9),%xmm10
2870 movaps -0x58(%r9),%xmm11
2871 movaps -0x48(%r9),%xmm12
2872 movaps -0x38(%r9),%xmm13
2873 movaps -0x28(%r9),%xmm14
2874 movaps -0x18(%r9),%xmm15
2875___
2876$code.=<<___;
2877 lea (%r9),%rsp
2878.cfi_def_cfa_register %rsp
2879.Lavx512vl_epilogue:
2880 ret
2881.cfi_endproc
2882.size ChaCha20_avx512vl,.-ChaCha20_avx512vl
2883___
2884}
2885if ($avx>2) {
2886# This one handles longer inputs...
2887
2888my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2889 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2890my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2891 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2892my @key=map("%zmm$_",(16..31));
2893my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
2894
2895sub AVX512_lane_ROUND {
2896my ($a0,$b0,$c0,$d0)=@_;
2897my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2898my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2899my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
2900my @x=map("\"$_\"",@xx);
2901
2902 (
2903 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2904 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2905 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2906 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2907 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2908 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2909 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2910 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2911 "&vprold (@x[$d0],@x[$d0],16)",
2912 "&vprold (@x[$d1],@x[$d1],16)",
2913 "&vprold (@x[$d2],@x[$d2],16)",
2914 "&vprold (@x[$d3],@x[$d3],16)",
2915
2916 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2917 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2918 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2919 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2920 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2921 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2922 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2923 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2924 "&vprold (@x[$b0],@x[$b0],12)",
2925 "&vprold (@x[$b1],@x[$b1],12)",
2926 "&vprold (@x[$b2],@x[$b2],12)",
2927 "&vprold (@x[$b3],@x[$b3],12)",
2928
2929 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2930 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2931 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2932 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2933 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2934 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2935 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2936 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2937 "&vprold (@x[$d0],@x[$d0],8)",
2938 "&vprold (@x[$d1],@x[$d1],8)",
2939 "&vprold (@x[$d2],@x[$d2],8)",
2940 "&vprold (@x[$d3],@x[$d3],8)",
2941
2942 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2943 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2944 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2945 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2946 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2947 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2948 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2949 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2950 "&vprold (@x[$b0],@x[$b0],7)",
2951 "&vprold (@x[$b1],@x[$b1],7)",
2952 "&vprold (@x[$b2],@x[$b2],7)",
2953 "&vprold (@x[$b3],@x[$b3],7)"
2954 );
2955}
2956
2957my $xframe = $win64 ? 0xa8 : 8;
2958
2959$code.=<<___;
2960.type ChaCha20_16x,\@function,5
2961.align 32
2962ChaCha20_16x:
2963.cfi_startproc
2964.LChaCha20_16x:
2965 mov %rsp,%r9 # frame register
2966.cfi_def_cfa_register %r9
2967 sub \$64+$xframe,%rsp
2968 and \$-64,%rsp
2969___
2970$code.=<<___ if ($win64);
2971 movaps %xmm6,-0xa8(%r9)
2972 movaps %xmm7,-0x98(%r9)
2973 movaps %xmm8,-0x88(%r9)
2974 movaps %xmm9,-0x78(%r9)
2975 movaps %xmm10,-0x68(%r9)
2976 movaps %xmm11,-0x58(%r9)
2977 movaps %xmm12,-0x48(%r9)
2978 movaps %xmm13,-0x38(%r9)
2979 movaps %xmm14,-0x28(%r9)
2980 movaps %xmm15,-0x18(%r9)
2981.L16x_body:
2982___
2983$code.=<<___;
2984 vzeroupper
2985
2986 lea .Lsigma(%rip),%r10
2987 vbroadcasti32x4 (%r10),$xa3 # key[0]
2988 vbroadcasti32x4 ($key),$xb3 # key[1]
2989 vbroadcasti32x4 16($key),$xc3 # key[2]
2990 vbroadcasti32x4 ($counter),$xd3 # key[3]
2991
2992 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
2993 vpshufd \$0x55,$xa3,$xa1
2994 vpshufd \$0xaa,$xa3,$xa2
2995 vpshufd \$0xff,$xa3,$xa3
2996 vmovdqa64 $xa0,@key[0]
2997 vmovdqa64 $xa1,@key[1]
2998 vmovdqa64 $xa2,@key[2]
2999 vmovdqa64 $xa3,@key[3]
3000
3001 vpshufd \$0x00,$xb3,$xb0
3002 vpshufd \$0x55,$xb3,$xb1
3003 vpshufd \$0xaa,$xb3,$xb2
3004 vpshufd \$0xff,$xb3,$xb3
3005 vmovdqa64 $xb0,@key[4]
3006 vmovdqa64 $xb1,@key[5]
3007 vmovdqa64 $xb2,@key[6]
3008 vmovdqa64 $xb3,@key[7]
3009
3010 vpshufd \$0x00,$xc3,$xc0
3011 vpshufd \$0x55,$xc3,$xc1
3012 vpshufd \$0xaa,$xc3,$xc2
3013 vpshufd \$0xff,$xc3,$xc3
3014 vmovdqa64 $xc0,@key[8]
3015 vmovdqa64 $xc1,@key[9]
3016 vmovdqa64 $xc2,@key[10]
3017 vmovdqa64 $xc3,@key[11]
3018
3019 vpshufd \$0x00,$xd3,$xd0
3020 vpshufd \$0x55,$xd3,$xd1
3021 vpshufd \$0xaa,$xd3,$xd2
3022 vpshufd \$0xff,$xd3,$xd3
3023 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
3024 vmovdqa64 $xd0,@key[12]
3025 vmovdqa64 $xd1,@key[13]
3026 vmovdqa64 $xd2,@key[14]
3027 vmovdqa64 $xd3,@key[15]
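	# @key, i.e. %zmm16..%zmm31, now holds the complete lane-smashed input
	# state; with 32 vector registers available it never has to be spilled,
	# and is re-added directly after the rounds ("accumulate key" below)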
3028
3029 mov \$10,%eax
3030 jmp .Loop16x
3031
3032.align 32
3033.Loop_outer16x:
3034 vpbroadcastd 0(%r10),$xa0 # reload key
3035 vpbroadcastd 4(%r10),$xa1
3036 vpbroadcastd 8(%r10),$xa2
3037 vpbroadcastd 12(%r10),$xa3
3038 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
3039 vmovdqa64 @key[4],$xb0
3040 vmovdqa64 @key[5],$xb1
3041 vmovdqa64 @key[6],$xb2
3042 vmovdqa64 @key[7],$xb3
3043 vmovdqa64 @key[8],$xc0
3044 vmovdqa64 @key[9],$xc1
3045 vmovdqa64 @key[10],$xc2
3046 vmovdqa64 @key[11],$xc3
3047 vmovdqa64 @key[12],$xd0
3048 vmovdqa64 @key[13],$xd1
3049 vmovdqa64 @key[14],$xd2
3050 vmovdqa64 @key[15],$xd3
3051
3052 vmovdqa64 $xa0,@key[0]
3053 vmovdqa64 $xa1,@key[1]
3054 vmovdqa64 $xa2,@key[2]
3055 vmovdqa64 $xa3,@key[3]
3056
3057 mov \$10,%eax
3058 jmp .Loop16x
3059
3060.align 32
3061.Loop16x:
3062___
3063 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3064 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
3065$code.=<<___;
3066 dec %eax
3067 jnz .Loop16x
3068
3069 vpaddd @key[0],$xa0,$xa0 # accumulate key
3070 vpaddd @key[1],$xa1,$xa1
3071 vpaddd @key[2],$xa2,$xa2
3072 vpaddd @key[3],$xa3,$xa3
3073
3074 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
3075 vpunpckldq $xa3,$xa2,$xt3
3076 vpunpckhdq $xa1,$xa0,$xa0
3077 vpunpckhdq $xa3,$xa2,$xa2
3078 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
3079 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
3080 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
3081 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
3082___
3083 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3084$code.=<<___;
3085 vpaddd @key[4],$xb0,$xb0
3086 vpaddd @key[5],$xb1,$xb1
3087 vpaddd @key[6],$xb2,$xb2
3088 vpaddd @key[7],$xb3,$xb3
3089
3090 vpunpckldq $xb1,$xb0,$xt2
3091 vpunpckldq $xb3,$xb2,$xt3
3092 vpunpckhdq $xb1,$xb0,$xb0
3093 vpunpckhdq $xb3,$xb2,$xb2
3094 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
3095 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
3096 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
3097 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
3098___
3099 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3100$code.=<<___;
3101 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
3102 vshufi32x4 \$0xee,$xb0,$xa0,$xb0
3103 vshufi32x4 \$0x44,$xb1,$xa1,$xa0
3104 vshufi32x4 \$0xee,$xb1,$xa1,$xb1
3105 vshufi32x4 \$0x44,$xb2,$xa2,$xa1
3106 vshufi32x4 \$0xee,$xb2,$xa2,$xb2
3107 vshufi32x4 \$0x44,$xb3,$xa3,$xa2
3108 vshufi32x4 \$0xee,$xb3,$xa3,$xb3
3109___
3110 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3111$code.=<<___;
3112 vpaddd @key[8],$xc0,$xc0
3113 vpaddd @key[9],$xc1,$xc1
3114 vpaddd @key[10],$xc2,$xc2
3115 vpaddd @key[11],$xc3,$xc3
3116
3117 vpunpckldq $xc1,$xc0,$xt2
3118 vpunpckldq $xc3,$xc2,$xt3
3119 vpunpckhdq $xc1,$xc0,$xc0
3120 vpunpckhdq $xc3,$xc2,$xc2
3121 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
3122 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
3123 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
3124 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
3125___
3126 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3127$code.=<<___;
3128 vpaddd @key[12],$xd0,$xd0
3129 vpaddd @key[13],$xd1,$xd1
3130 vpaddd @key[14],$xd2,$xd2
3131 vpaddd @key[15],$xd3,$xd3
3132
3133 vpunpckldq $xd1,$xd0,$xt2
3134 vpunpckldq $xd3,$xd2,$xt3
3135 vpunpckhdq $xd1,$xd0,$xd0
3136 vpunpckhdq $xd3,$xd2,$xd2
3137 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
3138 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
3139 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
3140 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
3141___
3142 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3143$code.=<<___;
3144 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
3145 vshufi32x4 \$0xee,$xd0,$xc0,$xd0
3146 vshufi32x4 \$0x44,$xd1,$xc1,$xc0
3147 vshufi32x4 \$0xee,$xd1,$xc1,$xd1
3148 vshufi32x4 \$0x44,$xd2,$xc2,$xc1
3149 vshufi32x4 \$0xee,$xd2,$xc2,$xd2
3150 vshufi32x4 \$0x44,$xd3,$xc3,$xc2
3151 vshufi32x4 \$0xee,$xd3,$xc3,$xd3
3152___
3153 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3154$code.=<<___;
3155 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
3156 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
3157 vshufi32x4 \$0x88,$xd0,$xb0,$xc0
3158 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
3159 vshufi32x4 \$0x88,$xc1,$xa1,$xt1
3160 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
3161 vshufi32x4 \$0x88,$xd1,$xb1,$xc1
3162 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
3163 vshufi32x4 \$0x88,$xc2,$xa2,$xt2
3164 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
3165 vshufi32x4 \$0x88,$xd2,$xb2,$xc2
3166 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
3167 vshufi32x4 \$0x88,$xc3,$xa3,$xt3
3168 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
3169 vshufi32x4 \$0x88,$xd3,$xb3,$xc3
3170 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
3171___
3172 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
3173 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
3174
3175 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
3176 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
3177 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3178 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
3179$code.=<<___;
3180 cmp \$64*16,$len
3181 jb .Ltail16x
3182
3183 vpxord 0x00($inp),$xa0,$xa0 # xor with input
3184 vpxord 0x40($inp),$xb0,$xb0
3185 vpxord 0x80($inp),$xc0,$xc0
3186 vpxord 0xc0($inp),$xd0,$xd0
3187 vmovdqu32 $xa0,0x00($out)
3188 vmovdqu32 $xb0,0x40($out)
3189 vmovdqu32 $xc0,0x80($out)
3190 vmovdqu32 $xd0,0xc0($out)
3191
3192 vpxord 0x100($inp),$xa1,$xa1
3193 vpxord 0x140($inp),$xb1,$xb1
3194 vpxord 0x180($inp),$xc1,$xc1
3195 vpxord 0x1c0($inp),$xd1,$xd1
3196 vmovdqu32 $xa1,0x100($out)
3197 vmovdqu32 $xb1,0x140($out)
3198 vmovdqu32 $xc1,0x180($out)
3199 vmovdqu32 $xd1,0x1c0($out)
3200
3201 vpxord 0x200($inp),$xa2,$xa2
3202 vpxord 0x240($inp),$xb2,$xb2
3203 vpxord 0x280($inp),$xc2,$xc2
3204 vpxord 0x2c0($inp),$xd2,$xd2
3205 vmovdqu32 $xa2,0x200($out)
3206 vmovdqu32 $xb2,0x240($out)
3207 vmovdqu32 $xc2,0x280($out)
3208 vmovdqu32 $xd2,0x2c0($out)
3209
3210 vpxord 0x300($inp),$xa3,$xa3
3211 vpxord 0x340($inp),$xb3,$xb3
3212 vpxord 0x380($inp),$xc3,$xc3
3213 vpxord 0x3c0($inp),$xd3,$xd3
3214 lea 0x400($inp),$inp
3215 vmovdqu32 $xa3,0x300($out)
3216 vmovdqu32 $xb3,0x340($out)
3217 vmovdqu32 $xc3,0x380($out)
3218 vmovdqu32 $xd3,0x3c0($out)
3219 lea 0x400($out),$out
3220
3221 sub \$64*16,$len
3222 jnz .Loop_outer16x
3223
3224 jmp .Ldone16x
3225
3226.align 32
3227.Ltail16x:
3228 xor %r10,%r10
3229 sub $inp,$out
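	# $out now holds (out - inp), so the stores below can use ($out,$inp)
	# addressing and only $inp needs to be advanced per 64-byte step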
3230 cmp \$64*1,$len
3231 jb .Less_than_64_16x
3232 vpxord ($inp),$xa0,$xa0 # xor with input
3233 vmovdqu32 $xa0,($out,$inp)
3234 je .Ldone16x
3235 vmovdqa32 $xb0,$xa0
3236 lea 64($inp),$inp
3237
3238 cmp \$64*2,$len
3239 jb .Less_than_64_16x
3240 vpxord ($inp),$xb0,$xb0
3241 vmovdqu32 $xb0,($out,$inp)
3242 je .Ldone16x
3243 vmovdqa32 $xc0,$xa0
3244 lea 64($inp),$inp
3245
3246 cmp \$64*3,$len
3247 jb .Less_than_64_16x
3248 vpxord ($inp),$xc0,$xc0
3249 vmovdqu32 $xc0,($out,$inp)
3250 je .Ldone16x
3251 vmovdqa32 $xd0,$xa0
3252 lea 64($inp),$inp
3253
3254 cmp \$64*4,$len
3255 jb .Less_than_64_16x
3256 vpxord ($inp),$xd0,$xd0
3257 vmovdqu32 $xd0,($out,$inp)
3258 je .Ldone16x
3259 vmovdqa32 $xa1,$xa0
3260 lea 64($inp),$inp
3261
3262 cmp \$64*5,$len
3263 jb .Less_than_64_16x
3264 vpxord ($inp),$xa1,$xa1
3265 vmovdqu32 $xa1,($out,$inp)
3266 je .Ldone16x
3267 vmovdqa32 $xb1,$xa0
3268 lea 64($inp),$inp
3269
3270 cmp \$64*6,$len
3271 jb .Less_than_64_16x
3272 vpxord ($inp),$xb1,$xb1
3273 vmovdqu32 $xb1,($out,$inp)
3274 je .Ldone16x
3275 vmovdqa32 $xc1,$xa0
3276 lea 64($inp),$inp
3277
3278 cmp \$64*7,$len
3279 jb .Less_than_64_16x
3280 vpxord ($inp),$xc1,$xc1
3281 vmovdqu32 $xc1,($out,$inp)
3282 je .Ldone16x
3283 vmovdqa32 $xd1,$xa0
3284 lea 64($inp),$inp
3285
3286 cmp \$64*8,$len
3287 jb .Less_than_64_16x
3288 vpxord ($inp),$xd1,$xd1
3289 vmovdqu32 $xd1,($out,$inp)
3290 je .Ldone16x
3291 vmovdqa32 $xa2,$xa0
3292 lea 64($inp),$inp
3293
3294 cmp \$64*9,$len
3295 jb .Less_than_64_16x
3296 vpxord ($inp),$xa2,$xa2
3297 vmovdqu32 $xa2,($out,$inp)
3298 je .Ldone16x
3299 vmovdqa32 $xb2,$xa0
3300 lea 64($inp),$inp
3301
3302 cmp \$64*10,$len
3303 jb .Less_than_64_16x
3304 vpxord ($inp),$xb2,$xb2
3305 vmovdqu32 $xb2,($out,$inp)
3306 je .Ldone16x
3307 vmovdqa32 $xc2,$xa0
3308 lea 64($inp),$inp
3309
3310 cmp \$64*11,$len
3311 jb .Less_than_64_16x
3312 vpxord ($inp),$xc2,$xc2
3313 vmovdqu32 $xc2,($out,$inp)
3314 je .Ldone16x
3315 vmovdqa32 $xd2,$xa0
3316 lea 64($inp),$inp
3317
3318 cmp \$64*12,$len
3319 jb .Less_than_64_16x
3320 vpxord ($inp),$xd2,$xd2
3321 vmovdqu32 $xd2,($out,$inp)
3322 je .Ldone16x
3323 vmovdqa32 $xa3,$xa0
3324 lea 64($inp),$inp
3325
3326 cmp \$64*13,$len
3327 jb .Less_than_64_16x
3328 vpxord ($inp),$xa3,$xa3
3329 vmovdqu32 $xa3,($out,$inp)
3330 je .Ldone16x
3331 vmovdqa32 $xb3,$xa0
3332 lea 64($inp),$inp
3333
3334 cmp \$64*14,$len
3335 jb .Less_than_64_16x
3336 vpxord ($inp),$xb3,$xb3
3337 vmovdqu32 $xb3,($out,$inp)
3338 je .Ldone16x
3339 vmovdqa32 $xc3,$xa0
3340 lea 64($inp),$inp
3341
3342 cmp \$64*15,$len
3343 jb .Less_than_64_16x
3344 vpxord ($inp),$xc3,$xc3
3345 vmovdqu32 $xc3,($out,$inp)
3346 je .Ldone16x
3347 vmovdqa32 $xd3,$xa0
3348 lea 64($inp),$inp
3349
3350.Less_than_64_16x:
3351 vmovdqa32 $xa0,0x00(%rsp)
3352 lea ($out,$inp),$out
3353 and \$63,$len
3354
3355.Loop_tail16x:
3356 movzb ($inp,%r10),%eax
3357 movzb (%rsp,%r10),%ecx
3358 lea 1(%r10),%r10
3359 xor %ecx,%eax
3360 mov %al,-1($out,%r10)
3361 dec $len
3362 jnz .Loop_tail16x
3363
3364 vpxord $xa0,$xa0,$xa0
3365 vmovdqa32 $xa0,0(%rsp)
3366
3367.Ldone16x:
3368 vzeroall
3369___
3370$code.=<<___ if ($win64);
3371 movaps -0xa8(%r9),%xmm6
3372 movaps -0x98(%r9),%xmm7
3373 movaps -0x88(%r9),%xmm8
3374 movaps -0x78(%r9),%xmm9
3375 movaps -0x68(%r9),%xmm10
3376 movaps -0x58(%r9),%xmm11
3377 movaps -0x48(%r9),%xmm12
3378 movaps -0x38(%r9),%xmm13
3379 movaps -0x28(%r9),%xmm14
3380 movaps -0x18(%r9),%xmm15
3381___
3382$code.=<<___;
3383 lea (%r9),%rsp
3384.cfi_def_cfa_register %rsp
3385.L16x_epilogue:
3386 ret
3387.cfi_endproc
3388.size ChaCha20_16x,.-ChaCha20_16x
3389___
3390
3391# switch to %ymm domain
3392($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3393 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
3394@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3395 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
3396@key=map("%ymm$_",(16..31));
3397($xt0,$xt1,$xt2,$xt3)=@key[0..3];
3398
3399$code.=<<___;
3400.type ChaCha20_8xvl,\@function,5
3401.align 32
3402ChaCha20_8xvl:
3403.cfi_startproc
3404.LChaCha20_8xvl:
3405 mov %rsp,%r9 # frame register
3406.cfi_def_cfa_register %r9
3407 sub \$64+$xframe,%rsp
3408 and \$-64,%rsp
3409___
3410$code.=<<___ if ($win64);
3411 movaps %xmm6,-0xa8(%r9)
3412 movaps %xmm7,-0x98(%r9)
3413 movaps %xmm8,-0x88(%r9)
3414 movaps %xmm9,-0x78(%r9)
3415 movaps %xmm10,-0x68(%r9)
3416 movaps %xmm11,-0x58(%r9)
3417 movaps %xmm12,-0x48(%r9)
3418 movaps %xmm13,-0x38(%r9)
3419 movaps %xmm14,-0x28(%r9)
3420 movaps %xmm15,-0x18(%r9)
3421.L8xvl_body:
3422___
3423$code.=<<___;
3424 vzeroupper
3425
3426 lea .Lsigma(%rip),%r10
3427 vbroadcasti128 (%r10),$xa3 # key[0]
3428 vbroadcasti128 ($key),$xb3 # key[1]
3429 vbroadcasti128 16($key),$xc3 # key[2]
3430 vbroadcasti128 ($counter),$xd3 # key[3]
3431
3432 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
3433 vpshufd \$0x55,$xa3,$xa1
3434 vpshufd \$0xaa,$xa3,$xa2
3435 vpshufd \$0xff,$xa3,$xa3
3436 vmovdqa64 $xa0,@key[0]
3437 vmovdqa64 $xa1,@key[1]
3438 vmovdqa64 $xa2,@key[2]
3439 vmovdqa64 $xa3,@key[3]
3440
3441 vpshufd \$0x00,$xb3,$xb0
3442 vpshufd \$0x55,$xb3,$xb1
3443 vpshufd \$0xaa,$xb3,$xb2
3444 vpshufd \$0xff,$xb3,$xb3
3445 vmovdqa64 $xb0,@key[4]
3446 vmovdqa64 $xb1,@key[5]
3447 vmovdqa64 $xb2,@key[6]
3448 vmovdqa64 $xb3,@key[7]
3449
3450 vpshufd \$0x00,$xc3,$xc0
3451 vpshufd \$0x55,$xc3,$xc1
3452 vpshufd \$0xaa,$xc3,$xc2
3453 vpshufd \$0xff,$xc3,$xc3
3454 vmovdqa64 $xc0,@key[8]
3455 vmovdqa64 $xc1,@key[9]
3456 vmovdqa64 $xc2,@key[10]
3457 vmovdqa64 $xc3,@key[11]
3458
3459 vpshufd \$0x00,$xd3,$xd0
3460 vpshufd \$0x55,$xd3,$xd1
3461 vpshufd \$0xaa,$xd3,$xd2
3462 vpshufd \$0xff,$xd3,$xd3
3463 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
3464 vmovdqa64 $xd0,@key[12]
3465 vmovdqa64 $xd1,@key[13]
3466 vmovdqa64 $xd2,@key[14]
3467 vmovdqa64 $xd3,@key[15]
3468
3469 mov \$10,%eax
3470 jmp .Loop8xvl
3471
3472.align 32
3473.Loop_outer8xvl:
3474 #vpbroadcastd 0(%r10),$xa0 # reload key
3475 #vpbroadcastd 4(%r10),$xa1
3476 vpbroadcastd 8(%r10),$xa2
3477 vpbroadcastd 12(%r10),$xa3
3478 vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters
3479 vmovdqa64 @key[4],$xb0
3480 vmovdqa64 @key[5],$xb1
3481 vmovdqa64 @key[6],$xb2
3482 vmovdqa64 @key[7],$xb3
3483 vmovdqa64 @key[8],$xc0
3484 vmovdqa64 @key[9],$xc1
3485 vmovdqa64 @key[10],$xc2
3486 vmovdqa64 @key[11],$xc3
3487 vmovdqa64 @key[12],$xd0
3488 vmovdqa64 @key[13],$xd1
3489 vmovdqa64 @key[14],$xd2
3490 vmovdqa64 @key[15],$xd3
3491
3492 vmovdqa64 $xa0,@key[0]
3493 vmovdqa64 $xa1,@key[1]
3494 vmovdqa64 $xa2,@key[2]
3495 vmovdqa64 $xa3,@key[3]
3496
3497 mov \$10,%eax
3498 jmp .Loop8xvl
3499
3500.align 32
3501.Loop8xvl:
3502___
3503 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3504 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
3505$code.=<<___;
3506 dec %eax
3507 jnz .Loop8xvl
3508
3509 vpaddd @key[0],$xa0,$xa0 # accumulate key
3510 vpaddd @key[1],$xa1,$xa1
3511 vpaddd @key[2],$xa2,$xa2
3512 vpaddd @key[3],$xa3,$xa3
3513
3514 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
3515 vpunpckldq $xa3,$xa2,$xt3
3516 vpunpckhdq $xa1,$xa0,$xa0
3517 vpunpckhdq $xa3,$xa2,$xa2
3518 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
3519 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
3520 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
3521 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
3522___
3523 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3524$code.=<<___;
3525 vpaddd @key[4],$xb0,$xb0
3526 vpaddd @key[5],$xb1,$xb1
3527 vpaddd @key[6],$xb2,$xb2
3528 vpaddd @key[7],$xb3,$xb3
3529
3530 vpunpckldq $xb1,$xb0,$xt2
3531 vpunpckldq $xb3,$xb2,$xt3
3532 vpunpckhdq $xb1,$xb0,$xb0
3533 vpunpckhdq $xb3,$xb2,$xb2
3534 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
3535 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
3536 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
3537 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
3538___
3539 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3540$code.=<<___;
3541 vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further
3542 vshufi32x4 \$3,$xb0,$xa0,$xb0
3543 vshufi32x4 \$0,$xb1,$xa1,$xa0
3544 vshufi32x4 \$3,$xb1,$xa1,$xb1
3545 vshufi32x4 \$0,$xb2,$xa2,$xa1
3546 vshufi32x4 \$3,$xb2,$xa2,$xb2
3547 vshufi32x4 \$0,$xb3,$xa3,$xa2
3548 vshufi32x4 \$3,$xb3,$xa3,$xb3
3549___
3550 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3551$code.=<<___;
3552 vpaddd @key[8],$xc0,$xc0
3553 vpaddd @key[9],$xc1,$xc1
3554 vpaddd @key[10],$xc2,$xc2
3555 vpaddd @key[11],$xc3,$xc3
3556
3557 vpunpckldq $xc1,$xc0,$xt2
3558 vpunpckldq $xc3,$xc2,$xt3
3559 vpunpckhdq $xc1,$xc0,$xc0
3560 vpunpckhdq $xc3,$xc2,$xc2
3561 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
3562 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
3563 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
3564 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
3565___
3566 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3567$code.=<<___;
3568 vpaddd @key[12],$xd0,$xd0
3569 vpaddd @key[13],$xd1,$xd1
3570 vpaddd @key[14],$xd2,$xd2
3571 vpaddd @key[15],$xd3,$xd3
3572
3573 vpunpckldq $xd1,$xd0,$xt2
3574 vpunpckldq $xd3,$xd2,$xt3
3575 vpunpckhdq $xd1,$xd0,$xd0
3576 vpunpckhdq $xd3,$xd2,$xd2
3577 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
3578 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
3579 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
3580 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
3581___
3582 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3583$code.=<<___;
3584 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
3585 vperm2i128 \$0x31,$xd0,$xc0,$xd0
3586 vperm2i128 \$0x20,$xd1,$xc1,$xc0
3587 vperm2i128 \$0x31,$xd1,$xc1,$xd1
3588 vperm2i128 \$0x20,$xd2,$xc2,$xc1
3589 vperm2i128 \$0x31,$xd2,$xc2,$xd2
3590 vperm2i128 \$0x20,$xd3,$xc3,$xc2
3591 vperm2i128 \$0x31,$xd3,$xc3,$xd3
3592___
3593 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3594 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
3595 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
3596$code.=<<___;
3597 cmp \$64*8,$len
3598 jb .Ltail8xvl
3599
3600 mov \$0x80,%eax # size optimization
3601 vpxord 0x00($inp),$xa0,$xa0 # xor with input
3602 vpxor 0x20($inp),$xb0,$xb0
3603 vpxor 0x40($inp),$xc0,$xc0
3604 vpxor 0x60($inp),$xd0,$xd0
3605 lea ($inp,%rax),$inp # size optimization
3606 vmovdqu32 $xa0,0x00($out)
3607 vmovdqu $xb0,0x20($out)
3608 vmovdqu $xc0,0x40($out)
3609 vmovdqu $xd0,0x60($out)
3610 lea ($out,%rax),$out # size optimization
3611
3612 vpxor 0x00($inp),$xa1,$xa1
3613 vpxor 0x20($inp),$xb1,$xb1
3614 vpxor 0x40($inp),$xc1,$xc1
3615 vpxor 0x60($inp),$xd1,$xd1
3616 lea ($inp,%rax),$inp # size optimization
3617 vmovdqu $xa1,0x00($out)
3618 vmovdqu $xb1,0x20($out)
3619 vmovdqu $xc1,0x40($out)
3620 vmovdqu $xd1,0x60($out)
3621 lea ($out,%rax),$out # size optimization
3622
3623 vpxord 0x00($inp),$xa2,$xa2
3624 vpxor 0x20($inp),$xb2,$xb2
3625 vpxor 0x40($inp),$xc2,$xc2
3626 vpxor 0x60($inp),$xd2,$xd2
3627 lea ($inp,%rax),$inp # size optimization
3628 vmovdqu32 $xa2,0x00($out)
3629 vmovdqu $xb2,0x20($out)
3630 vmovdqu $xc2,0x40($out)
3631 vmovdqu $xd2,0x60($out)
3632 lea ($out,%rax),$out # size optimization
3633
3634 vpxor 0x00($inp),$xa3,$xa3
3635 vpxor 0x20($inp),$xb3,$xb3
3636 vpxor 0x40($inp),$xc3,$xc3
3637 vpxor 0x60($inp),$xd3,$xd3
3638 lea ($inp,%rax),$inp # size optimization
3639 vmovdqu $xa3,0x00($out)
3640 vmovdqu $xb3,0x20($out)
3641 vmovdqu $xc3,0x40($out)
3642 vmovdqu $xd3,0x60($out)
3643 lea ($out,%rax),$out # size optimization
3644
3645 vpbroadcastd 0(%r10),%ymm0 # reload key
3646 vpbroadcastd 4(%r10),%ymm1
3647
3648 sub \$64*8,$len
3649 jnz .Loop_outer8xvl
3650
3651 jmp .Ldone8xvl
3652
3653.align 32
3654.Ltail8xvl:
3655 vmovdqa64 $xa0,%ymm8 # size optimization
3656___
3657$xa0 = "%ymm8";
3658$code.=<<___;
3659 xor %r10,%r10
3660 sub $inp,$out
3661 cmp \$64*1,$len
3662 jb .Less_than_64_8xvl
3663 vpxor 0x00($inp),$xa0,$xa0 # xor with input
3664 vpxor 0x20($inp),$xb0,$xb0
3665 vmovdqu $xa0,0x00($out,$inp)
3666 vmovdqu $xb0,0x20($out,$inp)
3667 je .Ldone8xvl
3668 vmovdqa $xc0,$xa0
3669 vmovdqa $xd0,$xb0
3670 lea 64($inp),$inp
3671
3672 cmp \$64*2,$len
3673 jb .Less_than_64_8xvl
3674 vpxor 0x00($inp),$xc0,$xc0
3675 vpxor 0x20($inp),$xd0,$xd0
3676 vmovdqu $xc0,0x00($out,$inp)
3677 vmovdqu $xd0,0x20($out,$inp)
3678 je .Ldone8xvl
3679 vmovdqa $xa1,$xa0
3680 vmovdqa $xb1,$xb0
3681 lea 64($inp),$inp
3682
3683 cmp \$64*3,$len
3684 jb .Less_than_64_8xvl
3685 vpxor 0x00($inp),$xa1,$xa1
3686 vpxor 0x20($inp),$xb1,$xb1
3687 vmovdqu $xa1,0x00($out,$inp)
3688 vmovdqu $xb1,0x20($out,$inp)
3689 je .Ldone8xvl
3690 vmovdqa $xc1,$xa0
3691 vmovdqa $xd1,$xb0
3692 lea 64($inp),$inp
3693
3694 cmp \$64*4,$len
3695 jb .Less_than_64_8xvl
3696 vpxor 0x00($inp),$xc1,$xc1
3697 vpxor 0x20($inp),$xd1,$xd1
3698 vmovdqu $xc1,0x00($out,$inp)
3699 vmovdqu $xd1,0x20($out,$inp)
3700 je .Ldone8xvl
3701 vmovdqa32 $xa2,$xa0
3702 vmovdqa $xb2,$xb0
3703 lea 64($inp),$inp
3704
3705 cmp \$64*5,$len
3706 jb .Less_than_64_8xvl
3707 vpxord 0x00($inp),$xa2,$xa2
3708 vpxor 0x20($inp),$xb2,$xb2
3709 vmovdqu32 $xa2,0x00($out,$inp)
3710 vmovdqu $xb2,0x20($out,$inp)
3711 je .Ldone8xvl
3712 vmovdqa $xc2,$xa0
3713 vmovdqa $xd2,$xb0
3714 lea 64($inp),$inp
3715
3716 cmp \$64*6,$len
3717 jb .Less_than_64_8xvl
3718 vpxor 0x00($inp),$xc2,$xc2
3719 vpxor 0x20($inp),$xd2,$xd2
3720 vmovdqu $xc2,0x00($out,$inp)
3721 vmovdqu $xd2,0x20($out,$inp)
3722 je .Ldone8xvl
3723 vmovdqa $xa3,$xa0
3724 vmovdqa $xb3,$xb0
3725 lea 64($inp),$inp
3726
3727 cmp \$64*7,$len
3728 jb .Less_than_64_8xvl
3729 vpxor 0x00($inp),$xa3,$xa3
3730 vpxor 0x20($inp),$xb3,$xb3
3731 vmovdqu $xa3,0x00($out,$inp)
3732 vmovdqu $xb3,0x20($out,$inp)
3733 je .Ldone8xvl
3734 vmovdqa $xc3,$xa0
3735 vmovdqa $xd3,$xb0
3736 lea 64($inp),$inp
3737
3738.Less_than_64_8xvl:
3739 vmovdqa $xa0,0x00(%rsp)
3740 vmovdqa $xb0,0x20(%rsp)
3741 lea ($out,$inp),$out
3742 and \$63,$len
3743
3744.Loop_tail8xvl:
3745 movzb ($inp,%r10),%eax
3746 movzb (%rsp,%r10),%ecx
3747 lea 1(%r10),%r10
3748 xor %ecx,%eax
3749 mov %al,-1($out,%r10)
3750 dec $len
3751 jnz .Loop_tail8xvl
3752
3753 vpxor $xa0,$xa0,$xa0
3754 vmovdqa $xa0,0x00(%rsp)
3755 vmovdqa $xa0,0x20(%rsp)
3756
3757.Ldone8xvl:
3758 vzeroall
3759___
3760$code.=<<___ if ($win64);
3761 movaps -0xa8(%r9),%xmm6
3762 movaps -0x98(%r9),%xmm7
3763 movaps -0x88(%r9),%xmm8
3764 movaps -0x78(%r9),%xmm9
3765 movaps -0x68(%r9),%xmm10
3766 movaps -0x58(%r9),%xmm11
3767 movaps -0x48(%r9),%xmm12
3768 movaps -0x38(%r9),%xmm13
3769 movaps -0x28(%r9),%xmm14
3770 movaps -0x18(%r9),%xmm15
3771___
3772$code.=<<___;
3773 lea (%r9),%rsp
3774.cfi_def_cfa_register %rsp
3775.L8xvl_epilogue:
3776 ret
3777.cfi_endproc
3778.size ChaCha20_8xvl,.-ChaCha20_8xvl
3779___
3780}
3781
3782# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3783# CONTEXT *context,DISPATCHER_CONTEXT *disp)
3784if ($win64) {
3785$rec="%rcx";
3786$frame="%rdx";
3787$context="%r8";
3788$disp="%r9";
3789
3790$code.=<<___;
3791.extern __imp_RtlVirtualUnwind
3792.type se_handler,\@abi-omnipotent
3793.align 16
3794se_handler:
3795 push %rsi
3796 push %rdi
3797 push %rbx
3798 push %rbp
3799 push %r12
3800 push %r13
3801 push %r14
3802 push %r15
3803 pushfq
3804 sub \$64,%rsp
3805
3806 mov 120($context),%rax # pull context->Rax
3807 mov 248($context),%rbx # pull context->Rip
3808
3809 mov 8($disp),%rsi # disp->ImageBase
3810 mov 56($disp),%r11 # disp->HandlerData
3811
3812 lea .Lctr32_body(%rip),%r10
3813	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
3814 jb .Lcommon_seh_tail
3815
3816 mov 152($context),%rax # pull context->Rsp
3817
3818 lea .Lno_data(%rip),%r10 # epilogue label
3819 cmp %r10,%rbx # context->Rip>=.Lepilogue
3820 jae .Lcommon_seh_tail
3821
3822 lea 64+24+48(%rax),%rax
3823
3824 mov -8(%rax),%rbx
3825 mov -16(%rax),%rbp
3826 mov -24(%rax),%r12
3827 mov -32(%rax),%r13
3828 mov -40(%rax),%r14
3829 mov -48(%rax),%r15
3830 mov %rbx,144($context) # restore context->Rbx
3831 mov %rbp,160($context) # restore context->Rbp
3832 mov %r12,216($context) # restore context->R12
3833 mov %r13,224($context) # restore context->R13
3834 mov %r14,232($context) # restore context->R14
3835	mov	%r15,240($context)	# restore context->R15
3836
3837.Lcommon_seh_tail:
3838 mov 8(%rax),%rdi
3839 mov 16(%rax),%rsi
3840 mov %rax,152($context) # restore context->Rsp
3841 mov %rsi,168($context) # restore context->Rsi
3842 mov %rdi,176($context) # restore context->Rdi
3843
3844 mov 40($disp),%rdi # disp->ContextRecord
3845 mov $context,%rsi # context
3846 mov \$154,%ecx # sizeof(CONTEXT)
3847 .long 0xa548f3fc # cld; rep movsq
3848
3849 mov $disp,%rsi
3850 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3851 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3852 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3853 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3854 mov 40(%rsi),%r10 # disp->ContextRecord
3855 lea 56(%rsi),%r11 # &disp->HandlerData
3856 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3857 mov %r10,32(%rsp) # arg5
3858 mov %r11,40(%rsp) # arg6
3859 mov %r12,48(%rsp) # arg7
3860 mov %rcx,56(%rsp) # arg8, (NULL)
3861 call *__imp_RtlVirtualUnwind(%rip)
3862
3863 mov \$1,%eax # ExceptionContinueSearch
3864 add \$64,%rsp
3865 popfq
3866 pop %r15
3867 pop %r14
3868 pop %r13
3869 pop %r12
3870 pop %rbp
3871 pop %rbx
3872 pop %rdi
3873 pop %rsi
3874 ret
3875.size se_handler,.-se_handler
3876
3877.type simd_handler,\@abi-omnipotent
3878.align 16
3879simd_handler:
3880 push %rsi
3881 push %rdi
3882 push %rbx
3883 push %rbp
3884 push %r12
3885 push %r13
3886 push %r14
3887 push %r15
3888 pushfq
3889 sub \$64,%rsp
3890
3891 mov 120($context),%rax # pull context->Rax
3892 mov 248($context),%rbx # pull context->Rip
3893
3894 mov 8($disp),%rsi # disp->ImageBase
3895 mov 56($disp),%r11 # disp->HandlerData
3896
3897 mov 0(%r11),%r10d # HandlerData[0]
3898 lea (%rsi,%r10),%r10 # prologue label
3899 cmp %r10,%rbx # context->Rip<prologue label
3900 jb .Lcommon_seh_tail
3901
3902 mov 192($context),%rax # pull context->R9
3903
3904 mov 4(%r11),%r10d # HandlerData[1]
3905 mov 8(%r11),%ecx # HandlerData[2]
3906 lea (%rsi,%r10),%r10 # epilogue label
3907 cmp %r10,%rbx # context->Rip>=epilogue label
3908 jae .Lcommon_seh_tail
3909
3910 neg %rcx
3911 lea -8(%rax,%rcx),%rsi
3912 lea 512($context),%rdi # &context.Xmm6
3913 neg %ecx
3914 shr \$3,%ecx
3915 .long 0xa548f3fc # cld; rep movsq
3916
3917 jmp .Lcommon_seh_tail
3918.size simd_handler,.-simd_handler
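# Each .LSEH_info_* record below provides simd_handler's HandlerData[]:
# HandlerData[0] and [1] are the RVAs of the prologue and epilogue labels,
# and HandlerData[2] is the size of the XMM save area below %r9 (0xa0 for
# ten 16-byte registers, smaller for the lighter frames) that is copied
# back into the CONTEXT record.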
3919
3920.section .pdata
3921.align 4
3922 .rva .LSEH_begin_ChaCha20_ctr32
3923 .rva .LSEH_end_ChaCha20_ctr32
3924 .rva .LSEH_info_ChaCha20_ctr32
3925
3926 .rva .LSEH_begin_ChaCha20_ssse3
3927 .rva .LSEH_end_ChaCha20_ssse3
3928 .rva .LSEH_info_ChaCha20_ssse3
3929
3930 .rva .LSEH_begin_ChaCha20_128
3931 .rva .LSEH_end_ChaCha20_128
3932 .rva .LSEH_info_ChaCha20_128
3933
3934 .rva .LSEH_begin_ChaCha20_4x
3935 .rva .LSEH_end_ChaCha20_4x
3936 .rva .LSEH_info_ChaCha20_4x
3937___
3938$code.=<<___ if ($avx);
3939 .rva .LSEH_begin_ChaCha20_4xop
3940 .rva .LSEH_end_ChaCha20_4xop
3941 .rva .LSEH_info_ChaCha20_4xop
3942___
3943$code.=<<___ if ($avx>1);
3944 .rva .LSEH_begin_ChaCha20_8x
3945 .rva .LSEH_end_ChaCha20_8x
3946 .rva .LSEH_info_ChaCha20_8x
3947___
3948$code.=<<___ if ($avx>2);
3949 .rva .LSEH_begin_ChaCha20_avx512
3950 .rva .LSEH_end_ChaCha20_avx512
3951 .rva .LSEH_info_ChaCha20_avx512
3952
3953 .rva .LSEH_begin_ChaCha20_avx512vl
3954 .rva .LSEH_end_ChaCha20_avx512vl
3955 .rva .LSEH_info_ChaCha20_avx512vl
3956
3957 .rva .LSEH_begin_ChaCha20_16x
3958 .rva .LSEH_end_ChaCha20_16x
3959 .rva .LSEH_info_ChaCha20_16x
3960
3961 .rva .LSEH_begin_ChaCha20_8xvl
3962 .rva .LSEH_end_ChaCha20_8xvl
3963 .rva .LSEH_info_ChaCha20_8xvl
3964___
3965$code.=<<___;
3966.section .xdata
3967.align 8
3968.LSEH_info_ChaCha20_ctr32:
3969 .byte 9,0,0,0
3970 .rva se_handler
3971
3972.LSEH_info_ChaCha20_ssse3:
3973 .byte 9,0,0,0
3974 .rva simd_handler
3975 .rva .Lssse3_body,.Lssse3_epilogue
3976 .long 0x20,0
3977
3978.LSEH_info_ChaCha20_128:
3979 .byte 9,0,0,0
3980 .rva simd_handler
3981 .rva .L128_body,.L128_epilogue
3982 .long 0x60,0
3983
3984.LSEH_info_ChaCha20_4x:
3985 .byte 9,0,0,0
3986 .rva simd_handler
3987 .rva .L4x_body,.L4x_epilogue
3988 .long 0xa0,0
3989___
3990$code.=<<___ if ($avx);
3991.LSEH_info_ChaCha20_4xop:
3992 .byte 9,0,0,0
3993 .rva simd_handler
3994 .rva .L4xop_body,.L4xop_epilogue # HandlerData[]
3995 .long 0xa0,0
3996___
3997$code.=<<___ if ($avx>1);
3998.LSEH_info_ChaCha20_8x:
3999 .byte 9,0,0,0
4000 .rva simd_handler
4001 .rva .L8x_body,.L8x_epilogue # HandlerData[]
4002 .long 0xa0,0
4003___
4004$code.=<<___ if ($avx>2);
4005.LSEH_info_ChaCha20_avx512:
4006 .byte 9,0,0,0
4007 .rva simd_handler
4008 .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
4009 .long 0x20,0
4010
4011.LSEH_info_ChaCha20_avx512vl:
4012 .byte 9,0,0,0
4013 .rva simd_handler
4014 .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[]
4015 .long 0x20,0
4016
4017.LSEH_info_ChaCha20_16x:
4018 .byte 9,0,0,0
4019 .rva simd_handler
4020 .rva .L16x_body,.L16x_epilogue # HandlerData[]
4021 .long 0xa0,0
4022
4023.LSEH_info_ChaCha20_8xvl:
4024 .byte 9,0,0,0
4025 .rva simd_handler
4026 .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[]
4027 .long 0xa0,0
4028___
4029}
4030
4031foreach (split("\n",$code)) {
4032 s/\`([^\`]*)\`/eval $1/ge;
4033
4034 s/%x#%[yz]/%x/g; # "down-shift"
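	# e.g. "%x#%ymm7" or "%x#%zmm7" collapses to "%xmm7", i.e. the 128-bit
	# view of the same register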
4035
4036 print $_,"\n";
4037}
4038
4039close STDOUT or die "error closing STDOUT: $!";