VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.7/crypto/modes/asm/ghash-x86_64.pl @ 108344

Last change on this file since 108344 was 104078, checked in by vboxsync, 12 months ago:

openssl-3.1.5: Applied and adjusted our OpenSSL changes to 3.1.4. bugref:10638

File size: 43.6 KB

1#! /usr/bin/env perl
2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# March, June 2010
18#
19# The module implements "4-bit" GCM GHASH function and underlying
20# single multiplication operation in GF(2^128). "4-bit" means that
21# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
22# function features so called "528B" variant utilizing additional
23# 256+16 bytes of per-key storage [+512 bytes shared table].
24# Performance results are for this streamed GHASH subroutine and are
25# expressed in cycles per processed byte, less is better:
26#
27# gcc 3.4.x(*) assembler
28#
29# P4 28.6 14.0 +100%
30# Opteron 19.3 7.7 +150%
31# Core2 17.8 8.1(**) +120%
32# Atom 31.6 16.8 +88%
33# VIA Nano 21.8 10.1 +115%
34#
35# (*) comparison is not completely fair, because C results are
36# for vanilla "256B" implementation, while assembler results
37# are for "528B";-)
38# (**) it's mystery [to me] why Core2 result is not same as for
39# Opteron;
40
41# May 2010
42#
43# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
44# See ghash-x86.pl for background information and details about coding
45# techniques.
46#
47# Special thanks to David Woodhouse for providing access to a
48# Westmere-based system on behalf of Intel Open Source Technology Centre.
49
50# December 2012
51#
52# Overhaul: aggregate Karatsuba post-processing, improve ILP in
53# reduction_alg9, increase reduction aggregate factor to 4x. As for
54# the latter. ghash-x86.pl discusses that it makes lesser sense to
55# increase aggregate factor. Then why increase here? Critical path
56# consists of 3 independent pclmulqdq instructions, Karatsuba post-
57# processing and reduction. "On top" of this we lay down aggregated
58# multiplication operations, triplets of independent pclmulqdq's. As
59# issue rate for pclmulqdq is limited, it makes lesser sense to
60# aggregate more multiplications than it takes to perform remaining
61# non-multiplication operations. 2x is near-optimal coefficient for
62# contemporary Intel CPUs (therefore modest improvement coefficient),
63# but not for Bulldozer. Latter is because logical SIMD operations
64# are twice as slow in comparison to Intel, so that critical path is
65# longer. A CPU with higher pclmulqdq issue rate would also benefit
66# from higher aggregate factor...
67#
68# Westmere 1.78(+13%)
69# Sandy Bridge 1.80(+8%)
70# Ivy Bridge 1.80(+7%)
71# Haswell 0.55(+93%) (if system doesn't support AVX)
72# Broadwell 0.45(+110%)(if system doesn't support AVX)
73# Skylake 0.44(+110%)(if system doesn't support AVX)
74# Bulldozer 1.49(+27%)
75# Silvermont 2.88(+13%)
76# Knights L 2.12(-) (if system doesn't support AVX)
77# Goldmont 1.08(+24%)
78
79# March 2013
80#
81# ... 8x aggregate factor AVX code path is using reduction algorithm
82# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
83# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
84# sub-optimally in comparison to above mentioned version. But thanks
85# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
86# it performs in 0.41 cycles per byte on Haswell processor, in
87# 0.29 on Broadwell, and in 0.36 on Skylake.
88#
89# Knights Landing achieves 1.09 cpb.
90#
91# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
92
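# Reference only, never invoked by this generator: a minimal bit-at-a-time
# model of the GF(2^128) multiplication that both the table-driven and the
# PCLMULQDQ code paths below implement, following the NIST SP 800-38D
# formulation with a 16-byte block read as a big-endian integer. This is an
# illustrative sketch (plain Math::BigInt, no constant-time guarantees), not
# part of the upstream module logic.

use Math::BigInt;

my $gcm_R = Math::BigInt->from_hex('E1')->blsft(120);	# 11100001 || 0^120

sub gf128_mul_ref {
my ($x,$y)=@_;				# 128-bit Math::BigInt values
my $z = Math::BigInt->bzero();
my $v = $y->copy();
    for my $i (reverse(0..127)) {	# scan X from its most significant bit
	$z->bxor($v) if $x->copy()->brsft($i)->band(1)->is_one();
	my $carry = $v->copy()->band(1)->is_one();
	$v->brsft(1);
	$v->bxor($gcm_R) if $carry;	# conditional reduction by the GHASH polynomial
    }
$z;
}
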
93# $output is the last argument if it looks like a file (it has an extension)
94# $flavour is the first argument if it doesn't look like a file
95$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
96$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
97
98$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
99
100$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
101( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
102( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
103die "can't locate x86_64-xlate.pl";
104
105if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
106 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
107 $avx = ($1>=2.20) + ($1>=2.22);
108}
109
110if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
111 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
112 $avx = ($1>=2.09) + ($1>=2.10);
113}
114
115if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
116 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
117 $avx = ($1>=10) + ($1>=11);
118}
119
120if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
121 $avx = ($2>=3.0) + ($2>3.0);
122}
123
124open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
125 or die "can't call $xlate: $!";
126*STDOUT=*OUT;
127
128$do4xaggr=1;
129
130# common register layout
131$nlo="%rax";
132$nhi="%rbx";
133$Zlo="%r8";
134$Zhi="%r9";
135$tmp="%r10";
136$rem_4bit = "%r11";
137
138$Xi="%rdi";
139$Htbl="%rsi";
140
141# per-function register layout
142$cnt="%rcx";
143$rem="%rdx";
144
145sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
146 $r =~ s/%[er]([sd]i)/%\1l/ or
147 $r =~ s/%[er](bp)/%\1l/ or
148 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
149
150sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
151{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
152 my $arg = pop;
153 $arg = "\$$arg" if ($arg*1 eq $arg);
154 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
155}
156
157
158{ my $N;
159 sub loop() {
160 my $inp = shift;
161
162 $N++;
163$code.=<<___;
164 xor $nlo,$nlo
165 xor $nhi,$nhi
166 mov `&LB("$Zlo")`,`&LB("$nlo")`
167 mov `&LB("$Zlo")`,`&LB("$nhi")`
168 shl \$4,`&LB("$nlo")`
169 mov \$14,$cnt
170 mov 8($Htbl,$nlo),$Zlo
171 mov ($Htbl,$nlo),$Zhi
172 and \$0xf0,`&LB("$nhi")`
173 mov $Zlo,$rem
174 jmp .Loop$N
175
176.align 16
177.Loop$N:
178 shr \$4,$Zlo
179 and \$0xf,$rem
180 mov $Zhi,$tmp
181 mov ($inp,$cnt),`&LB("$nlo")`
182 shr \$4,$Zhi
183 xor 8($Htbl,$nhi),$Zlo
184 shl \$60,$tmp
185 xor ($Htbl,$nhi),$Zhi
186 mov `&LB("$nlo")`,`&LB("$nhi")`
187 xor ($rem_4bit,$rem,8),$Zhi
188 mov $Zlo,$rem
189 shl \$4,`&LB("$nlo")`
190 xor $tmp,$Zlo
191 dec $cnt
192 js .Lbreak$N
193
194 shr \$4,$Zlo
195 and \$0xf,$rem
196 mov $Zhi,$tmp
197 shr \$4,$Zhi
198 xor 8($Htbl,$nlo),$Zlo
199 shl \$60,$tmp
200 xor ($Htbl,$nlo),$Zhi
201 and \$0xf0,`&LB("$nhi")`
202 xor ($rem_4bit,$rem,8),$Zhi
203 mov $Zlo,$rem
204 xor $tmp,$Zlo
205 jmp .Loop$N
206
207.align 16
208.Lbreak$N:
209 shr \$4,$Zlo
210 and \$0xf,$rem
211 mov $Zhi,$tmp
212 shr \$4,$Zhi
213 xor 8($Htbl,$nlo),$Zlo
214 shl \$60,$tmp
215 xor ($Htbl,$nlo),$Zhi
216 and \$0xf0,`&LB("$nhi")`
217 xor ($rem_4bit,$rem,8),$Zhi
218 mov $Zlo,$rem
219 xor $tmp,$Zlo
220
221 shr \$4,$Zlo
222 and \$0xf,$rem
223 mov $Zhi,$tmp
224 shr \$4,$Zhi
225 xor 8($Htbl,$nhi),$Zlo
226 shl \$60,$tmp
227 xor ($Htbl,$nhi),$Zhi
228 xor $tmp,$Zlo
229 xor ($rem_4bit,$rem,8),$Zhi
230
231 bswap $Zlo
232 bswap $Zhi
233___
234}}
235
236$code=<<___;
237.text
238.extern OPENSSL_ia32cap_P
239
240.globl gcm_gmult_4bit
241.type gcm_gmult_4bit,\@function,2
242.align 16
243gcm_gmult_4bit:
244.cfi_startproc
245 endbranch
246 push %rbx
247.cfi_push %rbx
248 push %rbp # %rbp and others are pushed exclusively in
249.cfi_push %rbp
250 push %r12 # order to reuse Win64 exception handler...
251.cfi_push %r12
252 push %r13
253.cfi_push %r13
254 push %r14
255.cfi_push %r14
256 push %r15
257.cfi_push %r15
258 sub \$280,%rsp
259.cfi_adjust_cfa_offset 280
260.Lgmult_prologue:
261
262 movzb 15($Xi),$Zlo
263 lea .Lrem_4bit(%rip),$rem_4bit
264___
265 &loop ($Xi);
266$code.=<<___;
267 mov $Zlo,8($Xi)
268 mov $Zhi,($Xi)
269
270 lea 280+48(%rsp),%rsi
271.cfi_def_cfa %rsi,8
272 mov -8(%rsi),%rbx
273.cfi_restore %rbx
274 lea (%rsi),%rsp
275.cfi_def_cfa_register %rsp
276.Lgmult_epilogue:
277 ret
278.cfi_endproc
279.size gcm_gmult_4bit,.-gcm_gmult_4bit
280___
281
282
283# per-function register layout
284$inp="%rdx";
285$len="%rcx";
286$rem_8bit=$rem_4bit;
287
288$code.=<<___;
289.globl gcm_ghash_4bit
290.type gcm_ghash_4bit,\@function,4
291.align 16
292gcm_ghash_4bit:
293.cfi_startproc
294 endbranch
295 push %rbx
296.cfi_push %rbx
297 push %rbp
298.cfi_push %rbp
299 push %r12
300.cfi_push %r12
301 push %r13
302.cfi_push %r13
303 push %r14
304.cfi_push %r14
305 push %r15
306.cfi_push %r15
307 sub \$280,%rsp
308.cfi_adjust_cfa_offset 280
309.Lghash_prologue:
310 mov $inp,%r14 # reassign couple of args
311 mov $len,%r15
312___
313{ my $inp="%r14";
314 my $dat="%edx";
315 my $len="%r15";
316 my @nhi=("%ebx","%ecx");
317 my @rem=("%r12","%r13");
318 my $Hshr4="%rbp";
319
320 &sub ($Htbl,-128); # size optimization
321 &lea ($Hshr4,"16+128(%rsp)");
322 { my @lo =($nlo,$nhi);
323 my @hi =($Zlo,$Zhi);
324
325 &xor ($dat,$dat);
326 for ($i=0,$j=-2;$i<18;$i++,$j++) {
327 &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
328 &or ($lo[0],$tmp) if ($i>1);
329 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
330 &shr ($lo[1],4) if ($i>0 && $i<17);
331 &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
332 &shr ($hi[1],4) if ($i>0 && $i<17);
333 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
334 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
335 &shl (&LB($dat),4) if ($i>0 && $i<17);
336 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
337 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
338 &shl ($tmp,60) if ($i>0 && $i<17);
339
340 push (@lo,shift(@lo));
341 push (@hi,shift(@hi));
342 }
343 }
344 &add ($Htbl,-128);
345 &mov ($Zlo,"8($Xi)");
346 &mov ($Zhi,"0($Xi)");
347 &add ($len,$inp); # pointer to the end of data
348 &lea ($rem_8bit,".Lrem_8bit(%rip)");
349 &jmp (".Louter_loop");
350
351$code.=".align 16\n.Louter_loop:\n";
352 &xor ($Zhi,"($inp)");
353 &mov ("%rdx","8($inp)");
354 &lea ($inp,"16($inp)");
355 &xor ("%rdx",$Zlo);
356 &mov ("($Xi)",$Zhi);
357 &mov ("8($Xi)","%rdx");
358 &shr ("%rdx",32);
359
360 &xor ($nlo,$nlo);
361 &rol ($dat,8);
362 &mov (&LB($nlo),&LB($dat));
363 &movz ($nhi[0],&LB($dat));
364 &shl (&LB($nlo),4);
365 &shr ($nhi[0],4);
366
367 for ($j=11,$i=0;$i<15;$i++) {
368 &rol ($dat,8);
369 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
370 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
371 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
372 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
373
374 &mov (&LB($nlo),&LB($dat));
375 &xor ($Zlo,$tmp) if ($i>0);
376 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
377
378 &movz ($nhi[1],&LB($dat));
379 &shl (&LB($nlo),4);
380 &movzb ($rem[0],"(%rsp,$nhi[0])");
381
382 &shr ($nhi[1],4) if ($i<14);
383 &and ($nhi[1],0xf0) if ($i==14);
384 &shl ($rem[1],48) if ($i>0);
385 &xor ($rem[0],$Zlo);
386
387 &mov ($tmp,$Zhi);
388 &xor ($Zhi,$rem[1]) if ($i>0);
389 &shr ($Zlo,8);
390
391 &movz ($rem[0],&LB($rem[0]));
392 &mov ($dat,"$j($Xi)") if (--$j%4==0);
393 &shr ($Zhi,8);
394
395 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
396 &shl ($tmp,56);
397 &xor ($Zhi,"($Hshr4,$nhi[0],8)");
398
399 unshift (@nhi,pop(@nhi)); # "rotate" registers
400 unshift (@rem,pop(@rem));
401 }
402 &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
403 &xor ($Zlo,"8($Htbl,$nlo)");
404 &xor ($Zhi,"($Htbl,$nlo)");
405
406 &shl ($rem[1],48);
407 &xor ($Zlo,$tmp);
408
409 &xor ($Zhi,$rem[1]);
410 &movz ($rem[0],&LB($Zlo));
411 &shr ($Zlo,4);
412
413 &mov ($tmp,$Zhi);
414 &shl (&LB($rem[0]),4);
415 &shr ($Zhi,4);
416
417 &xor ($Zlo,"8($Htbl,$nhi[0])");
418 &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
419 &shl ($tmp,60);
420
421 &xor ($Zhi,"($Htbl,$nhi[0])");
422 &xor ($Zlo,$tmp);
423 &shl ($rem[0],48);
424
425 &bswap ($Zlo);
426 &xor ($Zhi,$rem[0]);
427
428 &bswap ($Zhi);
429 &cmp ($inp,$len);
430 &jb (".Louter_loop");
431}
432$code.=<<___;
433 mov $Zlo,8($Xi)
434 mov $Zhi,($Xi)
435
436 lea 280+48(%rsp),%rsi
437.cfi_def_cfa %rsi,8
438 mov -48(%rsi),%r15
439.cfi_restore %r15
440 mov -40(%rsi),%r14
441.cfi_restore %r14
442 mov -32(%rsi),%r13
443.cfi_restore %r13
444 mov -24(%rsi),%r12
445.cfi_restore %r12
446 mov -16(%rsi),%rbp
447.cfi_restore %rbp
448 mov -8(%rsi),%rbx
449.cfi_restore %rbx
450 lea 0(%rsi),%rsp
451.cfi_def_cfa_register %rsp
452.Lghash_epilogue:
453 ret
454.cfi_endproc
455.size gcm_ghash_4bit,.-gcm_ghash_4bit
456___
457
458
459######################################################################
460# PCLMULQDQ version.
461
462@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
463 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
464
465($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
466($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
467
468sub clmul64x64_T2 { # minimal register pressure
469my ($Xhi,$Xi,$Hkey,$HK)=@_;
470
471if (!defined($HK)) { $HK = $T2;
472$code.=<<___;
473 movdqa $Xi,$Xhi #
474 pshufd \$0b01001110,$Xi,$T1
475 pshufd \$0b01001110,$Hkey,$T2
476 pxor $Xi,$T1 #
477 pxor $Hkey,$T2
478___
479} else {
480$code.=<<___;
481 movdqa $Xi,$Xhi #
482 pshufd \$0b01001110,$Xi,$T1
483 pxor $Xi,$T1 #
484___
485}
486$code.=<<___;
487 pclmulqdq \$0x00,$Hkey,$Xi #######
488 pclmulqdq \$0x11,$Hkey,$Xhi #######
489 pclmulqdq \$0x00,$HK,$T1 #######
490 pxor $Xi,$T1 #
491 pxor $Xhi,$T1 #
492
493 movdqa $T1,$T2 #
494 psrldq \$8,$T1
495 pslldq \$8,$T2 #
496 pxor $T1,$Xhi
497 pxor $T2,$Xi #
498___
499}
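
# Reference only, never invoked: a Math::BigInt sketch of what clmul64x64_T2
# above emits, namely a 128x128-bit carry-less multiplication assembled from
# three 64x64-bit halves via Karatsuba. The helper names below are
# illustrative assumptions, not upstream API.

use Math::BigInt;

sub clmul64_ref {			# carry-less 64x64 multiplication
my ($a,$b)=@_;				# Math::BigInt operands
my $r = Math::BigInt->bzero();
    for my $i (0..63) {
	$r->bxor($b->copy()->blsft($i)) if $a->copy()->brsft($i)->band(1)->is_one();
    }
$r;
}

sub clmul128_karatsuba_ref {
my ($x,$h)=@_;				# 128-bit Math::BigInt values
my $m64 = Math::BigInt->from_hex('FFFFFFFFFFFFFFFF');
my ($xl,$xh)=($x->copy()->band($m64),$x->copy()->brsft(64));
my ($hl,$hh)=($h->copy()->band($m64),$h->copy()->brsft(64));
my $lo  = clmul64_ref($xl,$hl);			# pclmulqdq 0x00 (low halves)
my $hi  = clmul64_ref($xh,$hh);			# pclmulqdq 0x11 (high halves)
my $mid = clmul64_ref($xl->copy()->bxor($xh),	# pclmulqdq 0x00 on the xor-ed
		      $hl->copy()->bxor($hh));	# halves ($T1 by $HK)
    $mid->bxor($lo)->bxor($hi);			# Karatsuba post-processing
    $hi->blsft(128)->bxor($mid->blsft(64))->bxor($lo);	# assemble $Xhi:$Xi
}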
500
501sub reduction_alg9 { # 17/11 times faster than Intel version
502my ($Xhi,$Xi) = @_;
503
504$code.=<<___;
505 # 1st phase
506 movdqa $Xi,$T2 #
507 movdqa $Xi,$T1
508 psllq \$5,$Xi
509 pxor $Xi,$T1 #
510 psllq \$1,$Xi
511 pxor $T1,$Xi #
512 psllq \$57,$Xi #
513 movdqa $Xi,$T1 #
514 pslldq \$8,$Xi
515 psrldq \$8,$T1 #
516 pxor $T2,$Xi
517 pxor $T1,$Xhi #
518
519 # 2nd phase
520 movdqa $Xi,$T2
521 psrlq \$1,$Xi
522 pxor $T2,$Xhi #
523 pxor $Xi,$T2
524 psrlq \$5,$Xi
525 pxor $T2,$Xi #
526 psrlq \$1,$Xi #
527 pxor $Xhi,$Xi #
528___
529}
530
531
532{ my ($Htbl,$Xip)=@_4args;
533 my $HK="%xmm6";
534
535$code.=<<___;
536.globl gcm_init_clmul
537.type gcm_init_clmul,\@abi-omnipotent
538.align 16
539gcm_init_clmul:
540.cfi_startproc
541.L_init_clmul:
542___
543$code.=<<___ if ($win64);
544.LSEH_begin_gcm_init_clmul:
545 # I can't trust assembler to use specific encoding:-(
546 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
547 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
548___
549$code.=<<___;
550 movdqu ($Xip),$Hkey
551 pshufd \$0b01001110,$Hkey,$Hkey # dword swap
552
553 # <<1 twist
554 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
555 movdqa $Hkey,$T1
556 psllq \$1,$Hkey
557 pxor $T3,$T3 #
558 psrlq \$63,$T1
559 pcmpgtd $T2,$T3 # broadcast carry bit
560 pslldq \$8,$T1
561 por $T1,$Hkey # H<<=1
562
563 # magic reduction
564 pand .L0x1c2_polynomial(%rip),$T3
565 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
566
567 # calculate H^2
568 pshufd \$0b01001110,$Hkey,$HK
569 movdqa $Hkey,$Xi
570 pxor $Hkey,$HK
571___
572 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
573 &reduction_alg9 ($Xhi,$Xi);
574$code.=<<___;
575 pshufd \$0b01001110,$Hkey,$T1
576 pshufd \$0b01001110,$Xi,$T2
577 pxor $Hkey,$T1 # Karatsuba pre-processing
578 movdqu $Hkey,0x00($Htbl) # save H
579 pxor $Xi,$T2 # Karatsuba pre-processing
580 movdqu $Xi,0x10($Htbl) # save H^2
581 palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
582 movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
583___
584if ($do4xaggr) {
585 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
586 &reduction_alg9 ($Xhi,$Xi);
587$code.=<<___;
588 movdqa $Xi,$T3
589___
590 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
591 &reduction_alg9 ($Xhi,$Xi);
592$code.=<<___;
593 pshufd \$0b01001110,$T3,$T1
594 pshufd \$0b01001110,$Xi,$T2
595 pxor $T3,$T1 # Karatsuba pre-processing
596 movdqu $T3,0x30($Htbl) # save H^3
597 pxor $Xi,$T2 # Karatsuba pre-processing
598 movdqu $Xi,0x40($Htbl) # save H^4
599 palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
600 movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
601___
602}
603$code.=<<___ if ($win64);
604 movaps (%rsp),%xmm6
605 lea 0x18(%rsp),%rsp
606.LSEH_end_gcm_init_clmul:
607___
608$code.=<<___;
609 ret
610.cfi_endproc
611.size gcm_init_clmul,.-gcm_init_clmul
612___
613}
614
615{ my ($Xip,$Htbl)=@_4args;
616
617$code.=<<___;
618.globl gcm_gmult_clmul
619.type gcm_gmult_clmul,\@abi-omnipotent
620.align 16
621gcm_gmult_clmul:
622.cfi_startproc
623 endbranch
624.L_gmult_clmul:
625 movdqu ($Xip),$Xi
626 movdqa .Lbswap_mask(%rip),$T3
627 movdqu ($Htbl),$Hkey
628 movdqu 0x20($Htbl),$T2
629 pshufb $T3,$Xi
630___
631 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
632$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
633 # experimental alternative. special thing about is that there
634 # no dependency between the two multiplications...
635 mov \$`0xE1<<1`,%eax
636 mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
637 mov \$0x07,%r11d
638 movq %rax,$T1
639 movq %r10,$T2
640 movq %r11,$T3 # borrow $T3
641 pand $Xi,$T3
642 pshufb $T3,$T2 # ($Xi&7)·0xE0
643 movq %rax,$T3
644 pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
645 pxor $Xi,$T2
646 pslldq \$15,$T2
647 paddd $T2,$T2 # <<(64+56+1)
648 pxor $T2,$Xi
649 pclmulqdq \$0x01,$T3,$Xi
650 movdqa .Lbswap_mask(%rip),$T3 # reload $T3
651 psrldq \$1,$T1
652 pxor $T1,$Xhi
653 pslldq \$7,$Xi
654 pxor $Xhi,$Xi
655___
656$code.=<<___;
657 pshufb $T3,$Xi
658 movdqu $Xi,($Xip)
659 ret
660.cfi_endproc
661.size gcm_gmult_clmul,.-gcm_gmult_clmul
662___
663}
664
665
666{ my ($Xip,$Htbl,$inp,$len)=@_4args;
667 my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
668 my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
669
670$code.=<<___;
671.globl gcm_ghash_clmul
672.type gcm_ghash_clmul,\@abi-omnipotent
673.align 32
674gcm_ghash_clmul:
675.cfi_startproc
676 endbranch
677.L_ghash_clmul:
678___
679$code.=<<___ if ($win64);
680 lea -0x88(%rsp),%rax
681.LSEH_begin_gcm_ghash_clmul:
682 # I can't trust assembler to use specific encoding:-(
683 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
684 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
685 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
686 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
687 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
688 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
689 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
690 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
691 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
692 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
693 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
694___
695$code.=<<___;
696 movdqa .Lbswap_mask(%rip),$T3
697
698 movdqu ($Xip),$Xi
699 movdqu ($Htbl),$Hkey
700 movdqu 0x20($Htbl),$HK
701 pshufb $T3,$Xi
702
703 sub \$0x10,$len
704 jz .Lodd_tail
705
706 movdqu 0x10($Htbl),$Hkey2
707___
708if ($do4xaggr) {
709my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
710
711$code.=<<___;
712 mov OPENSSL_ia32cap_P+4(%rip),%eax
713 cmp \$0x30,$len
714 jb .Lskip4x
715
716 and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
717 cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
718 je .Lskip4x
719
720 sub \$0x30,$len
721 mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
722 movdqu 0x30($Htbl),$Hkey3
723 movdqu 0x40($Htbl),$Hkey4
724
725 #######
726 # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
727 #
728 movdqu 0x30($inp),$Xln
729 movdqu 0x20($inp),$Xl
730 pshufb $T3,$Xln
731 pshufb $T3,$Xl
732 movdqa $Xln,$Xhn
733 pshufd \$0b01001110,$Xln,$Xmn
734 pxor $Xln,$Xmn
735 pclmulqdq \$0x00,$Hkey,$Xln
736 pclmulqdq \$0x11,$Hkey,$Xhn
737 pclmulqdq \$0x00,$HK,$Xmn
738
739 movdqa $Xl,$Xh
740 pshufd \$0b01001110,$Xl,$Xm
741 pxor $Xl,$Xm
742 pclmulqdq \$0x00,$Hkey2,$Xl
743 pclmulqdq \$0x11,$Hkey2,$Xh
744 pclmulqdq \$0x10,$HK,$Xm
745 xorps $Xl,$Xln
746 xorps $Xh,$Xhn
747 movups 0x50($Htbl),$HK
748 xorps $Xm,$Xmn
749
750 movdqu 0x10($inp),$Xl
751 movdqu 0($inp),$T1
752 pshufb $T3,$Xl
753 pshufb $T3,$T1
754 movdqa $Xl,$Xh
755 pshufd \$0b01001110,$Xl,$Xm
756 pxor $T1,$Xi
757 pxor $Xl,$Xm
758 pclmulqdq \$0x00,$Hkey3,$Xl
759 movdqa $Xi,$Xhi
760 pshufd \$0b01001110,$Xi,$T1
761 pxor $Xi,$T1
762 pclmulqdq \$0x11,$Hkey3,$Xh
763 pclmulqdq \$0x00,$HK,$Xm
764 xorps $Xl,$Xln
765 xorps $Xh,$Xhn
766
767 lea 0x40($inp),$inp
768 sub \$0x40,$len
769 jc .Ltail4x
770
771 jmp .Lmod4_loop
772.align 32
773.Lmod4_loop:
774 pclmulqdq \$0x00,$Hkey4,$Xi
775 xorps $Xm,$Xmn
776 movdqu 0x30($inp),$Xl
777 pshufb $T3,$Xl
778 pclmulqdq \$0x11,$Hkey4,$Xhi
779 xorps $Xln,$Xi
780 movdqu 0x20($inp),$Xln
781 movdqa $Xl,$Xh
782 pclmulqdq \$0x10,$HK,$T1
783 pshufd \$0b01001110,$Xl,$Xm
784 xorps $Xhn,$Xhi
785 pxor $Xl,$Xm
786 pshufb $T3,$Xln
787 movups 0x20($Htbl),$HK
788 xorps $Xmn,$T1
789 pclmulqdq \$0x00,$Hkey,$Xl
790 pshufd \$0b01001110,$Xln,$Xmn
791
792 pxor $Xi,$T1 # aggregated Karatsuba post-processing
793 movdqa $Xln,$Xhn
794 pxor $Xhi,$T1 #
795 pxor $Xln,$Xmn
796 movdqa $T1,$T2 #
797 pclmulqdq \$0x11,$Hkey,$Xh
798 pslldq \$8,$T1
799 psrldq \$8,$T2 #
800 pxor $T1,$Xi
801 movdqa .L7_mask(%rip),$T1
802 pxor $T2,$Xhi #
803 movq %rax,$T2
804
805 pand $Xi,$T1 # 1st phase
806 pshufb $T1,$T2 #
807 pxor $Xi,$T2 #
808 pclmulqdq \$0x00,$HK,$Xm
809 psllq \$57,$T2 #
810 movdqa $T2,$T1 #
811 pslldq \$8,$T2
812 pclmulqdq \$0x00,$Hkey2,$Xln
813 psrldq \$8,$T1 #
814 pxor $T2,$Xi
815 pxor $T1,$Xhi #
816 movdqu 0($inp),$T1
817
818 movdqa $Xi,$T2 # 2nd phase
819 psrlq \$1,$Xi
820 pclmulqdq \$0x11,$Hkey2,$Xhn
821 xorps $Xl,$Xln
822 movdqu 0x10($inp),$Xl
823 pshufb $T3,$Xl
824 pclmulqdq \$0x10,$HK,$Xmn
825 xorps $Xh,$Xhn
826 movups 0x50($Htbl),$HK
827 pshufb $T3,$T1
828 pxor $T2,$Xhi #
829 pxor $Xi,$T2
830 psrlq \$5,$Xi
831
832 movdqa $Xl,$Xh
833 pxor $Xm,$Xmn
834 pshufd \$0b01001110,$Xl,$Xm
835 pxor $T2,$Xi #
836 pxor $T1,$Xhi
837 pxor $Xl,$Xm
838 pclmulqdq \$0x00,$Hkey3,$Xl
839 psrlq \$1,$Xi #
840 pxor $Xhi,$Xi #
841 movdqa $Xi,$Xhi
842 pclmulqdq \$0x11,$Hkey3,$Xh
843 xorps $Xl,$Xln
844 pshufd \$0b01001110,$Xi,$T1
845 pxor $Xi,$T1
846
847 pclmulqdq \$0x00,$HK,$Xm
848 xorps $Xh,$Xhn
849
850 lea 0x40($inp),$inp
851 sub \$0x40,$len
852 jnc .Lmod4_loop
853
854.Ltail4x:
855 pclmulqdq \$0x00,$Hkey4,$Xi
856 pclmulqdq \$0x11,$Hkey4,$Xhi
857 pclmulqdq \$0x10,$HK,$T1
858 xorps $Xm,$Xmn
859 xorps $Xln,$Xi
860 xorps $Xhn,$Xhi
861 pxor $Xi,$Xhi # aggregated Karatsuba post-processing
862 pxor $Xmn,$T1
863
864 pxor $Xhi,$T1 #
865 pxor $Xi,$Xhi
866
867 movdqa $T1,$T2 #
868 psrldq \$8,$T1
869 pslldq \$8,$T2 #
870 pxor $T1,$Xhi
871 pxor $T2,$Xi #
872___
873 &reduction_alg9($Xhi,$Xi);
874$code.=<<___;
875 add \$0x40,$len
876 jz .Ldone
877 movdqu 0x20($Htbl),$HK
878 sub \$0x10,$len
879 jz .Lodd_tail
880.Lskip4x:
881___
882}
883$code.=<<___;
884 #######
885 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
886 # [(H*Ii+1) + (H*Xi+1)] mod P =
887 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
888 #
889 movdqu ($inp),$T1 # Ii
890 movdqu 16($inp),$Xln # Ii+1
891 pshufb $T3,$T1
892 pshufb $T3,$Xln
893 pxor $T1,$Xi # Ii+Xi
894
895 movdqa $Xln,$Xhn
896 pshufd \$0b01001110,$Xln,$Xmn
897 pxor $Xln,$Xmn
898 pclmulqdq \$0x00,$Hkey,$Xln
899 pclmulqdq \$0x11,$Hkey,$Xhn
900 pclmulqdq \$0x00,$HK,$Xmn
901
902 lea 32($inp),$inp # i+=2
903 nop
904 sub \$0x20,$len
905 jbe .Leven_tail
906 nop
907 jmp .Lmod_loop
908
909.align 32
910.Lmod_loop:
911 movdqa $Xi,$Xhi
912 movdqa $Xmn,$T1
913 pshufd \$0b01001110,$Xi,$Xmn #
914 pxor $Xi,$Xmn #
915
916 pclmulqdq \$0x00,$Hkey2,$Xi
917 pclmulqdq \$0x11,$Hkey2,$Xhi
918 pclmulqdq \$0x10,$HK,$Xmn
919
920 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
921 pxor $Xhn,$Xhi
922 movdqu ($inp),$T2 # Ii
923 pxor $Xi,$T1 # aggregated Karatsuba post-processing
924 pshufb $T3,$T2
925 movdqu 16($inp),$Xln # Ii+1
926
927 pxor $Xhi,$T1
928 pxor $T2,$Xhi # "Ii+Xi", consume early
929 pxor $T1,$Xmn
930 pshufb $T3,$Xln
931 movdqa $Xmn,$T1 #
932 psrldq \$8,$T1
933 pslldq \$8,$Xmn #
934 pxor $T1,$Xhi
935 pxor $Xmn,$Xi #
936
937 movdqa $Xln,$Xhn #
938
939 movdqa $Xi,$T2 # 1st phase
940 movdqa $Xi,$T1
941 psllq \$5,$Xi
942 pxor $Xi,$T1 #
943 pclmulqdq \$0x00,$Hkey,$Xln #######
944 psllq \$1,$Xi
945 pxor $T1,$Xi #
946 psllq \$57,$Xi #
947 movdqa $Xi,$T1 #
948 pslldq \$8,$Xi
949 psrldq \$8,$T1 #
950 pxor $T2,$Xi
951 pshufd \$0b01001110,$Xhn,$Xmn
952 pxor $T1,$Xhi #
953 pxor $Xhn,$Xmn #
954
955 movdqa $Xi,$T2 # 2nd phase
956 psrlq \$1,$Xi
957 pclmulqdq \$0x11,$Hkey,$Xhn #######
958 pxor $T2,$Xhi #
959 pxor $Xi,$T2
960 psrlq \$5,$Xi
961 pxor $T2,$Xi #
962 lea 32($inp),$inp
963 psrlq \$1,$Xi #
964 pclmulqdq \$0x00,$HK,$Xmn #######
965 pxor $Xhi,$Xi #
966
967 sub \$0x20,$len
968 ja .Lmod_loop
969
970.Leven_tail:
971 movdqa $Xi,$Xhi
972 movdqa $Xmn,$T1
973 pshufd \$0b01001110,$Xi,$Xmn #
974 pxor $Xi,$Xmn #
975
976 pclmulqdq \$0x00,$Hkey2,$Xi
977 pclmulqdq \$0x11,$Hkey2,$Xhi
978 pclmulqdq \$0x10,$HK,$Xmn
979
980 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
981 pxor $Xhn,$Xhi
982 pxor $Xi,$T1
983 pxor $Xhi,$T1
984 pxor $T1,$Xmn
985 movdqa $Xmn,$T1 #
986 psrldq \$8,$T1
987 pslldq \$8,$Xmn #
988 pxor $T1,$Xhi
989 pxor $Xmn,$Xi #
990___
991 &reduction_alg9 ($Xhi,$Xi);
992$code.=<<___;
993 test $len,$len
994 jnz .Ldone
995
996.Lodd_tail:
997 movdqu ($inp),$T1 # Ii
998 pshufb $T3,$T1
999 pxor $T1,$Xi # Ii+Xi
1000___
1001 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
1002 &reduction_alg9 ($Xhi,$Xi);
1003$code.=<<___;
1004.Ldone:
1005 pshufb $T3,$Xi
1006 movdqu $Xi,($Xip)
1007___
1008$code.=<<___ if ($win64);
1009 movaps (%rsp),%xmm6
1010 movaps 0x10(%rsp),%xmm7
1011 movaps 0x20(%rsp),%xmm8
1012 movaps 0x30(%rsp),%xmm9
1013 movaps 0x40(%rsp),%xmm10
1014 movaps 0x50(%rsp),%xmm11
1015 movaps 0x60(%rsp),%xmm12
1016 movaps 0x70(%rsp),%xmm13
1017 movaps 0x80(%rsp),%xmm14
1018 movaps 0x90(%rsp),%xmm15
1019 lea 0xa8(%rsp),%rsp
1020.LSEH_end_gcm_ghash_clmul:
1021___
1022$code.=<<___;
1023 ret
1024.cfi_endproc
1025.size gcm_ghash_clmul,.-gcm_ghash_clmul
1026___
1027}
1028
1029
1030$code.=<<___;
1031.globl gcm_init_avx
1032.type gcm_init_avx,\@abi-omnipotent
1033.align 32
1034gcm_init_avx:
1035.cfi_startproc
1036___
1037if ($avx) {
1038my ($Htbl,$Xip)=@_4args;
1039my $HK="%xmm6";
1040
1041$code.=<<___ if ($win64);
1042.LSEH_begin_gcm_init_avx:
1043 # I can't trust assembler to use specific encoding:-(
1044 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
1045 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
1046___
1047$code.=<<___;
1048 vzeroupper
1049
1050 vmovdqu ($Xip),$Hkey
1051 vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
1052
1053 # <<1 twist
1054 vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
1055 vpsrlq \$63,$Hkey,$T1
1056 vpsllq \$1,$Hkey,$Hkey
1057 vpxor $T3,$T3,$T3 #
1058 vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
1059 vpslldq \$8,$T1,$T1
1060 vpor $T1,$Hkey,$Hkey # H<<=1
1061
1062 # magic reduction
1063 vpand .L0x1c2_polynomial(%rip),$T3,$T3
1064 vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
1065
1066 vpunpckhqdq $Hkey,$Hkey,$HK
1067 vmovdqa $Hkey,$Xi
1068 vpxor $Hkey,$HK,$HK
1069 mov \$4,%r10 # up to H^8
1070 jmp .Linit_start_avx
1071___
1072
1073sub clmul64x64_avx {
1074my ($Xhi,$Xi,$Hkey,$HK)=@_;
1075
1076if (!defined($HK)) { $HK = $T2;
1077$code.=<<___;
1078 vpunpckhqdq $Xi,$Xi,$T1
1079 vpunpckhqdq $Hkey,$Hkey,$T2
1080 vpxor $Xi,$T1,$T1 #
1081 vpxor $Hkey,$T2,$T2
1082___
1083} else {
1084$code.=<<___;
1085 vpunpckhqdq $Xi,$Xi,$T1
1086 vpxor $Xi,$T1,$T1 #
1087___
1088}
1089$code.=<<___;
1090 vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
1091 vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
1092 vpclmulqdq \$0x00,$HK,$T1,$T1 #######
1093 vpxor $Xi,$Xhi,$T2 #
1094 vpxor $T2,$T1,$T1 #
1095
1096 vpslldq \$8,$T1,$T2 #
1097 vpsrldq \$8,$T1,$T1
1098 vpxor $T2,$Xi,$Xi #
1099 vpxor $T1,$Xhi,$Xhi
1100___
1101}
1102
1103sub reduction_avx {
1104my ($Xhi,$Xi) = @_;
1105
1106$code.=<<___;
1107 vpsllq \$57,$Xi,$T1 # 1st phase
1108 vpsllq \$62,$Xi,$T2
1109 vpxor $T1,$T2,$T2 #
1110 vpsllq \$63,$Xi,$T1
1111 vpxor $T1,$T2,$T2 #
1112 vpslldq \$8,$T2,$T1 #
1113 vpsrldq \$8,$T2,$T2
1114 vpxor $T1,$Xi,$Xi #
1115 vpxor $T2,$Xhi,$Xhi
1116
1117 vpsrlq \$1,$Xi,$T2 # 2nd phase
1118 vpxor $Xi,$Xhi,$Xhi
1119 vpxor $T2,$Xi,$Xi #
1120 vpsrlq \$5,$T2,$T2
1121 vpxor $T2,$Xi,$Xi #
1122 vpsrlq \$1,$Xi,$Xi #
1123 vpxor $Xhi,$Xi,$Xi #
1124___
1125}
1126
1127$code.=<<___;
1128.align 32
1129.Linit_loop_avx:
1130 vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
1131 vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
1132___
1133 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
1134 &reduction_avx ($Xhi,$Xi);
1135$code.=<<___;
1136.Linit_start_avx:
1137 vmovdqa $Xi,$T3
1138___
1139 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
1140 &reduction_avx ($Xhi,$Xi);
1141$code.=<<___;
1142 vpshufd \$0b01001110,$T3,$T1
1143 vpshufd \$0b01001110,$Xi,$T2
1144 vpxor $T3,$T1,$T1 # Karatsuba pre-processing
1145 vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
1146 vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
1147 vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
1148 lea 0x30($Htbl),$Htbl
1149 sub \$1,%r10
1150 jnz .Linit_loop_avx
1151
1152 vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
1153 vmovdqu $T3,-0x10($Htbl)
1154
1155 vzeroupper
1156___
1157$code.=<<___ if ($win64);
1158 movaps (%rsp),%xmm6
1159 lea 0x18(%rsp),%rsp
1160.LSEH_end_gcm_init_avx:
1161___
1162$code.=<<___;
1163 ret
1164.cfi_endproc
1165.size gcm_init_avx,.-gcm_init_avx
1166___
1167} else {
1168$code.=<<___;
1169 jmp .L_init_clmul
1170.cfi_endproc
1171.size gcm_init_avx,.-gcm_init_avx
1172___
1173}
1174
1175$code.=<<___;
1176.globl gcm_gmult_avx
1177.type gcm_gmult_avx,\@abi-omnipotent
1178.align 32
1179gcm_gmult_avx:
1180.cfi_startproc
1181 endbranch
1182 jmp .L_gmult_clmul
1183.cfi_endproc
1184.size gcm_gmult_avx,.-gcm_gmult_avx
1185___
1186
1187
1188$code.=<<___;
1189.globl gcm_ghash_avx
1190.type gcm_ghash_avx,\@abi-omnipotent
1191.align 32
1192gcm_ghash_avx:
1193.cfi_startproc
1194 endbranch
1195___
1196if ($avx) {
1197my ($Xip,$Htbl,$inp,$len)=@_4args;
1198my ($Xlo,$Xhi,$Xmi,
1199 $Zlo,$Zhi,$Zmi,
1200 $Hkey,$HK,$T1,$T2,
1201 $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
1202
1203$code.=<<___ if ($win64);
1204 lea -0x88(%rsp),%rax
1205.LSEH_begin_gcm_ghash_avx:
1206 # I can't trust assembler to use specific encoding:-(
1207 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
1208 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
1209 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
1210 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
1211 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
1212 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
1213 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
1214 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
1215 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
1216 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
1217 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
1218___
1219$code.=<<___;
1220 vzeroupper
1221
1222 vmovdqu ($Xip),$Xi # load $Xi
1223 lea .L0x1c2_polynomial(%rip),%r10
1224 lea 0x40($Htbl),$Htbl # size optimization
1225 vmovdqu .Lbswap_mask(%rip),$bswap
1226 vpshufb $bswap,$Xi,$Xi
1227 cmp \$0x80,$len
1228 jb .Lshort_avx
1229 sub \$0x80,$len
1230
1231 vmovdqu 0x70($inp),$Ii # I[7]
1232 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1233 vpshufb $bswap,$Ii,$Ii
1234 vmovdqu 0x20-0x40($Htbl),$HK
1235
1236 vpunpckhqdq $Ii,$Ii,$T2
1237 vmovdqu 0x60($inp),$Ij # I[6]
1238 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1239 vpxor $Ii,$T2,$T2
1240 vpshufb $bswap,$Ij,$Ij
1241 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1242 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1243 vpunpckhqdq $Ij,$Ij,$T1
1244 vmovdqu 0x50($inp),$Ii # I[5]
1245 vpclmulqdq \$0x00,$HK,$T2,$Xmi
1246 vpxor $Ij,$T1,$T1
1247
1248 vpshufb $bswap,$Ii,$Ii
1249 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1250 vpunpckhqdq $Ii,$Ii,$T2
1251 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1252 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1253 vpxor $Ii,$T2,$T2
1254 vmovdqu 0x40($inp),$Ij # I[4]
1255 vpclmulqdq \$0x10,$HK,$T1,$Zmi
1256 vmovdqu 0x50-0x40($Htbl),$HK
1257
1258 vpshufb $bswap,$Ij,$Ij
1259 vpxor $Xlo,$Zlo,$Zlo
1260 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1261 vpxor $Xhi,$Zhi,$Zhi
1262 vpunpckhqdq $Ij,$Ij,$T1
1263 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1264 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1265 vpxor $Xmi,$Zmi,$Zmi
1266 vpclmulqdq \$0x00,$HK,$T2,$Xmi
1267 vpxor $Ij,$T1,$T1
1268
1269 vmovdqu 0x30($inp),$Ii # I[3]
1270 vpxor $Zlo,$Xlo,$Xlo
1271 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1272 vpxor $Zhi,$Xhi,$Xhi
1273 vpshufb $bswap,$Ii,$Ii
1274 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1275 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1276 vpxor $Zmi,$Xmi,$Xmi
1277 vpunpckhqdq $Ii,$Ii,$T2
1278 vpclmulqdq \$0x10,$HK,$T1,$Zmi
1279 vmovdqu 0x80-0x40($Htbl),$HK
1280 vpxor $Ii,$T2,$T2
1281
1282 vmovdqu 0x20($inp),$Ij # I[2]
1283 vpxor $Xlo,$Zlo,$Zlo
1284 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1285 vpxor $Xhi,$Zhi,$Zhi
1286 vpshufb $bswap,$Ij,$Ij
1287 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1288 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1289 vpxor $Xmi,$Zmi,$Zmi
1290 vpunpckhqdq $Ij,$Ij,$T1
1291 vpclmulqdq \$0x00,$HK,$T2,$Xmi
1292 vpxor $Ij,$T1,$T1
1293
1294 vmovdqu 0x10($inp),$Ii # I[1]
1295 vpxor $Zlo,$Xlo,$Xlo
1296 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1297 vpxor $Zhi,$Xhi,$Xhi
1298 vpshufb $bswap,$Ii,$Ii
1299 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1300 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1301 vpxor $Zmi,$Xmi,$Xmi
1302 vpunpckhqdq $Ii,$Ii,$T2
1303 vpclmulqdq \$0x10,$HK,$T1,$Zmi
1304 vmovdqu 0xb0-0x40($Htbl),$HK
1305 vpxor $Ii,$T2,$T2
1306
1307 vmovdqu ($inp),$Ij # I[0]
1308 vpxor $Xlo,$Zlo,$Zlo
1309 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1310 vpxor $Xhi,$Zhi,$Zhi
1311 vpshufb $bswap,$Ij,$Ij
1312 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1313 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
1314 vpxor $Xmi,$Zmi,$Zmi
1315 vpclmulqdq \$0x10,$HK,$T2,$Xmi
1316
1317 lea 0x80($inp),$inp
1318 cmp \$0x80,$len
1319 jb .Ltail_avx
1320
1321 vpxor $Xi,$Ij,$Ij # accumulate $Xi
1322 sub \$0x80,$len
1323 jmp .Loop8x_avx
1324
1325.align 32
1326.Loop8x_avx:
1327 vpunpckhqdq $Ij,$Ij,$T1
1328 vmovdqu 0x70($inp),$Ii # I[7]
1329 vpxor $Xlo,$Zlo,$Zlo
1330 vpxor $Ij,$T1,$T1
1331 vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
1332 vpshufb $bswap,$Ii,$Ii
1333 vpxor $Xhi,$Zhi,$Zhi
1334 vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
1335 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1336 vpunpckhqdq $Ii,$Ii,$T2
1337 vpxor $Xmi,$Zmi,$Zmi
1338 vpclmulqdq \$0x00,$HK,$T1,$Tred
1339 vmovdqu 0x20-0x40($Htbl),$HK
1340 vpxor $Ii,$T2,$T2
1341
1342 vmovdqu 0x60($inp),$Ij # I[6]
1343 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1344 vpxor $Zlo,$Xi,$Xi # collect result
1345 vpshufb $bswap,$Ij,$Ij
1346 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1347 vxorps $Zhi,$Xo,$Xo
1348 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1349 vpunpckhqdq $Ij,$Ij,$T1
1350 vpclmulqdq \$0x00,$HK, $T2,$Xmi
1351 vpxor $Zmi,$Tred,$Tred
1352 vxorps $Ij,$T1,$T1
1353
1354 vmovdqu 0x50($inp),$Ii # I[5]
1355 vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
1356 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1357 vpxor $Xo,$Tred,$Tred
1358 vpslldq \$8,$Tred,$T2
1359 vpxor $Xlo,$Zlo,$Zlo
1360 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1361 vpsrldq \$8,$Tred,$Tred
1362 vpxor $T2, $Xi, $Xi
1363 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1364 vpshufb $bswap,$Ii,$Ii
1365 vxorps $Tred,$Xo, $Xo
1366 vpxor $Xhi,$Zhi,$Zhi
1367 vpunpckhqdq $Ii,$Ii,$T2
1368 vpclmulqdq \$0x10,$HK, $T1,$Zmi
1369 vmovdqu 0x50-0x40($Htbl),$HK
1370 vpxor $Ii,$T2,$T2
1371 vpxor $Xmi,$Zmi,$Zmi
1372
1373 vmovdqu 0x40($inp),$Ij # I[4]
1374 vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
1375 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1376 vpshufb $bswap,$Ij,$Ij
1377 vpxor $Zlo,$Xlo,$Xlo
1378 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1379 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1380 vpunpckhqdq $Ij,$Ij,$T1
1381 vpxor $Zhi,$Xhi,$Xhi
1382 vpclmulqdq \$0x00,$HK, $T2,$Xmi
1383 vxorps $Ij,$T1,$T1
1384 vpxor $Zmi,$Xmi,$Xmi
1385
1386 vmovdqu 0x30($inp),$Ii # I[3]
1387 vpclmulqdq \$0x10,(%r10),$Xi,$Xi
1388 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1389 vpshufb $bswap,$Ii,$Ii
1390 vpxor $Xlo,$Zlo,$Zlo
1391 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1392 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1393 vpunpckhqdq $Ii,$Ii,$T2
1394 vpxor $Xhi,$Zhi,$Zhi
1395 vpclmulqdq \$0x10,$HK, $T1,$Zmi
1396 vmovdqu 0x80-0x40($Htbl),$HK
1397 vpxor $Ii,$T2,$T2
1398 vpxor $Xmi,$Zmi,$Zmi
1399
1400 vmovdqu 0x20($inp),$Ij # I[2]
1401 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1402 vpshufb $bswap,$Ij,$Ij
1403 vpxor $Zlo,$Xlo,$Xlo
1404 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1405 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1406 vpunpckhqdq $Ij,$Ij,$T1
1407 vpxor $Zhi,$Xhi,$Xhi
1408 vpclmulqdq \$0x00,$HK, $T2,$Xmi
1409 vpxor $Ij,$T1,$T1
1410 vpxor $Zmi,$Xmi,$Xmi
1411 vxorps $Tred,$Xi,$Xi
1412
1413 vmovdqu 0x10($inp),$Ii # I[1]
1414 vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
1415 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1416 vpshufb $bswap,$Ii,$Ii
1417 vpxor $Xlo,$Zlo,$Zlo
1418 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1419 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1420 vpclmulqdq \$0x10,(%r10),$Xi,$Xi
1421 vxorps $Xo,$Tred,$Tred
1422 vpunpckhqdq $Ii,$Ii,$T2
1423 vpxor $Xhi,$Zhi,$Zhi
1424 vpclmulqdq \$0x10,$HK, $T1,$Zmi
1425 vmovdqu 0xb0-0x40($Htbl),$HK
1426 vpxor $Ii,$T2,$T2
1427 vpxor $Xmi,$Zmi,$Zmi
1428
1429 vmovdqu ($inp),$Ij # I[0]
1430 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1431 vpshufb $bswap,$Ij,$Ij
1432 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1433 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
1434 vpxor $Tred,$Ij,$Ij
1435 vpclmulqdq \$0x10,$HK, $T2,$Xmi
1436 vpxor $Xi,$Ij,$Ij # accumulate $Xi
1437
1438 lea 0x80($inp),$inp
1439 sub \$0x80,$len
1440 jnc .Loop8x_avx
1441
1442 add \$0x80,$len
1443 jmp .Ltail_no_xor_avx
1444
1445.align 32
1446.Lshort_avx:
1447 vmovdqu -0x10($inp,$len),$Ii # very last word
1448 lea ($inp,$len),$inp
1449 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1450 vmovdqu 0x20-0x40($Htbl),$HK
1451 vpshufb $bswap,$Ii,$Ij
1452
1453 vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
1454 vmovdqa $Xhi,$Zhi # $Zhi and
1455 vmovdqa $Xmi,$Zmi # $Zmi
1456 sub \$0x10,$len
1457 jz .Ltail_avx
1458
1459 vpunpckhqdq $Ij,$Ij,$T1
1460 vpxor $Xlo,$Zlo,$Zlo
1461 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1462 vpxor $Ij,$T1,$T1
1463 vmovdqu -0x20($inp),$Ii
1464 vpxor $Xhi,$Zhi,$Zhi
1465 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1466 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1467 vpshufb $bswap,$Ii,$Ij
1468 vpxor $Xmi,$Zmi,$Zmi
1469 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1470 vpsrldq \$8,$HK,$HK
1471 sub \$0x10,$len
1472 jz .Ltail_avx
1473
1474 vpunpckhqdq $Ij,$Ij,$T1
1475 vpxor $Xlo,$Zlo,$Zlo
1476 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1477 vpxor $Ij,$T1,$T1
1478 vmovdqu -0x30($inp),$Ii
1479 vpxor $Xhi,$Zhi,$Zhi
1480 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1481 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1482 vpshufb $bswap,$Ii,$Ij
1483 vpxor $Xmi,$Zmi,$Zmi
1484 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1485 vmovdqu 0x50-0x40($Htbl),$HK
1486 sub \$0x10,$len
1487 jz .Ltail_avx
1488
1489 vpunpckhqdq $Ij,$Ij,$T1
1490 vpxor $Xlo,$Zlo,$Zlo
1491 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1492 vpxor $Ij,$T1,$T1
1493 vmovdqu -0x40($inp),$Ii
1494 vpxor $Xhi,$Zhi,$Zhi
1495 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1496 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1497 vpshufb $bswap,$Ii,$Ij
1498 vpxor $Xmi,$Zmi,$Zmi
1499 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1500 vpsrldq \$8,$HK,$HK
1501 sub \$0x10,$len
1502 jz .Ltail_avx
1503
1504 vpunpckhqdq $Ij,$Ij,$T1
1505 vpxor $Xlo,$Zlo,$Zlo
1506 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1507 vpxor $Ij,$T1,$T1
1508 vmovdqu -0x50($inp),$Ii
1509 vpxor $Xhi,$Zhi,$Zhi
1510 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1511 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1512 vpshufb $bswap,$Ii,$Ij
1513 vpxor $Xmi,$Zmi,$Zmi
1514 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1515 vmovdqu 0x80-0x40($Htbl),$HK
1516 sub \$0x10,$len
1517 jz .Ltail_avx
1518
1519 vpunpckhqdq $Ij,$Ij,$T1
1520 vpxor $Xlo,$Zlo,$Zlo
1521 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1522 vpxor $Ij,$T1,$T1
1523 vmovdqu -0x60($inp),$Ii
1524 vpxor $Xhi,$Zhi,$Zhi
1525 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1526 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1527 vpshufb $bswap,$Ii,$Ij
1528 vpxor $Xmi,$Zmi,$Zmi
1529 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1530 vpsrldq \$8,$HK,$HK
1531 sub \$0x10,$len
1532 jz .Ltail_avx
1533
1534 vpunpckhqdq $Ij,$Ij,$T1
1535 vpxor $Xlo,$Zlo,$Zlo
1536 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1537 vpxor $Ij,$T1,$T1
1538 vmovdqu -0x70($inp),$Ii
1539 vpxor $Xhi,$Zhi,$Zhi
1540 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1541 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1542 vpshufb $bswap,$Ii,$Ij
1543 vpxor $Xmi,$Zmi,$Zmi
1544 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1545 vmovq 0xb8-0x40($Htbl),$HK
1546 sub \$0x10,$len
1547 jmp .Ltail_avx
1548
1549.align 32
1550.Ltail_avx:
1551 vpxor $Xi,$Ij,$Ij # accumulate $Xi
1552.Ltail_no_xor_avx:
1553 vpunpckhqdq $Ij,$Ij,$T1
1554 vpxor $Xlo,$Zlo,$Zlo
1555 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1556 vpxor $Ij,$T1,$T1
1557 vpxor $Xhi,$Zhi,$Zhi
1558 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1559 vpxor $Xmi,$Zmi,$Zmi
1560 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1561
1562 vmovdqu (%r10),$Tred
1563
1564 vpxor $Xlo,$Zlo,$Xi
1565 vpxor $Xhi,$Zhi,$Xo
1566 vpxor $Xmi,$Zmi,$Zmi
1567
1568 vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
1569 vpxor $Xo, $Zmi,$Zmi
1570 vpslldq \$8, $Zmi,$T2
1571 vpsrldq \$8, $Zmi,$Zmi
1572 vpxor $T2, $Xi, $Xi
1573 vpxor $Zmi,$Xo, $Xo
1574
1575 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
1576 vpalignr \$8,$Xi,$Xi,$Xi
1577 vpxor $T2,$Xi,$Xi
1578
1579 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
1580 vpalignr \$8,$Xi,$Xi,$Xi
1581 vpxor $Xo,$Xi,$Xi
1582 vpxor $T2,$Xi,$Xi
1583
1584 cmp \$0,$len
1585 jne .Lshort_avx
1586
1587 vpshufb $bswap,$Xi,$Xi
1588 vmovdqu $Xi,($Xip)
1589 vzeroupper
1590___
1591$code.=<<___ if ($win64);
1592 movaps (%rsp),%xmm6
1593 movaps 0x10(%rsp),%xmm7
1594 movaps 0x20(%rsp),%xmm8
1595 movaps 0x30(%rsp),%xmm9
1596 movaps 0x40(%rsp),%xmm10
1597 movaps 0x50(%rsp),%xmm11
1598 movaps 0x60(%rsp),%xmm12
1599 movaps 0x70(%rsp),%xmm13
1600 movaps 0x80(%rsp),%xmm14
1601 movaps 0x90(%rsp),%xmm15
1602 lea 0xa8(%rsp),%rsp
1603.LSEH_end_gcm_ghash_avx:
1604___
1605$code.=<<___;
1606 ret
1607.cfi_endproc
1608.size gcm_ghash_avx,.-gcm_ghash_avx
1609___
1610} else {
1611$code.=<<___;
1612 jmp .L_ghash_clmul
1613.cfi_endproc
1614.size gcm_ghash_avx,.-gcm_ghash_avx
1615___
1616}
1617
1618
1619$code.=<<___;
1620.align 64
1621.Lbswap_mask:
1622 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1623.L0x1c2_polynomial:
1624 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
1625.L7_mask:
1626 .long 7,0,7,0
1627.L7_mask_poly:
1628 .long 7,0,`0xE1<<1`,0
1629.align 64
1630.type .Lrem_4bit,\@object
1631.Lrem_4bit:
1632 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
1633 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
1634 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
1635 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
1636.type .Lrem_8bit,\@object
1637.Lrem_8bit:
1638 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1639 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1640 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1641 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1642 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1643 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1644 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1645 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1646 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1647 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1648 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1649 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1650 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1651 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1652 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1653 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1654 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1655 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1656 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1657 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1658 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1659 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1660 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1661 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1662 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1663 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1664 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1665 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1666 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1667 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1668 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1669 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1670
1671.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1672.align 64
1673___
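
# Reference check only, a sketch that the generator never uses: each 16-bit
# constant in .Lrem_4bit above is the carry-less product of its index i
# (0..15) with 0x1C2 = 0xE1<<1, shifted left by a further 4 bits; it is the
# reduction term folded into $Zhi after each 4-bit shift of Z in the
# gcm_gmult_4bit/gcm_ghash_4bit loops.
{
my @rem_4bit_ref;
    for my $i (0..15) {
	my $v = 0;
	for my $b (0..3) {
	    $v ^= (0xE1<<1)<<(4+$b) if ($i>>$b)&1;	# carry-less i*(0x1C2<<4)
	}
	push @rem_4bit_ref,$v;		# 0x0000,0x1C20,0x3840,0x2460,...
    }
}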
1674
1675
1676# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1677# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1678if ($win64) {
1679$rec="%rcx";
1680$frame="%rdx";
1681$context="%r8";
1682$disp="%r9";
1683
1684$code.=<<___;
1685.extern __imp_RtlVirtualUnwind
1686.type se_handler,\@abi-omnipotent
1687.align 16
1688se_handler:
1689 push %rsi
1690 push %rdi
1691 push %rbx
1692 push %rbp
1693 push %r12
1694 push %r13
1695 push %r14
1696 push %r15
1697 pushfq
1698 sub \$64,%rsp
1699
1700 mov 120($context),%rax # pull context->Rax
1701 mov 248($context),%rbx # pull context->Rip
1702
1703 mov 8($disp),%rsi # disp->ImageBase
1704 mov 56($disp),%r11 # disp->HandlerData
1705
1706 mov 0(%r11),%r10d # HandlerData[0]
1707 lea (%rsi,%r10),%r10 # prologue label
1708 cmp %r10,%rbx # context->Rip<prologue label
1709 jb .Lin_prologue
1710
1711 mov 152($context),%rax # pull context->Rsp
1712
1713 mov 4(%r11),%r10d # HandlerData[1]
1714 lea (%rsi,%r10),%r10 # epilogue label
1715 cmp %r10,%rbx # context->Rip>=epilogue label
1716 jae .Lin_prologue
1717
1718 lea 48+280(%rax),%rax # adjust "rsp"
1719
1720 mov -8(%rax),%rbx
1721 mov -16(%rax),%rbp
1722 mov -24(%rax),%r12
1723 mov -32(%rax),%r13
1724 mov -40(%rax),%r14
1725 mov -48(%rax),%r15
1726 mov %rbx,144($context) # restore context->Rbx
1727 mov %rbp,160($context) # restore context->Rbp
1728 mov %r12,216($context) # restore context->R12
1729 mov %r13,224($context) # restore context->R13
1730 mov %r14,232($context) # restore context->R14
1731 mov %r15,240($context) # restore context->R15
1732
1733.Lin_prologue:
1734 mov 8(%rax),%rdi
1735 mov 16(%rax),%rsi
1736 mov %rax,152($context) # restore context->Rsp
1737 mov %rsi,168($context) # restore context->Rsi
1738 mov %rdi,176($context) # restore context->Rdi
1739
1740 mov 40($disp),%rdi # disp->ContextRecord
1741 mov $context,%rsi # context
1742 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1743 .long 0xa548f3fc # cld; rep movsq
1744
1745 mov $disp,%rsi
1746 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1747 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1748 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1749 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1750 mov 40(%rsi),%r10 # disp->ContextRecord
1751 lea 56(%rsi),%r11 # &disp->HandlerData
1752 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1753 mov %r10,32(%rsp) # arg5
1754 mov %r11,40(%rsp) # arg6
1755 mov %r12,48(%rsp) # arg7
1756 mov %rcx,56(%rsp) # arg8, (NULL)
1757 call *__imp_RtlVirtualUnwind(%rip)
1758
1759 mov \$1,%eax # ExceptionContinueSearch
1760 add \$64,%rsp
1761 popfq
1762 pop %r15
1763 pop %r14
1764 pop %r13
1765 pop %r12
1766 pop %rbp
1767 pop %rbx
1768 pop %rdi
1769 pop %rsi
1770 ret
1771.size se_handler,.-se_handler
1772
1773.section .pdata
1774.align 4
1775 .rva .LSEH_begin_gcm_gmult_4bit
1776 .rva .LSEH_end_gcm_gmult_4bit
1777 .rva .LSEH_info_gcm_gmult_4bit
1778
1779 .rva .LSEH_begin_gcm_ghash_4bit
1780 .rva .LSEH_end_gcm_ghash_4bit
1781 .rva .LSEH_info_gcm_ghash_4bit
1782
1783 .rva .LSEH_begin_gcm_init_clmul
1784 .rva .LSEH_end_gcm_init_clmul
1785 .rva .LSEH_info_gcm_init_clmul
1786
1787 .rva .LSEH_begin_gcm_ghash_clmul
1788 .rva .LSEH_end_gcm_ghash_clmul
1789 .rva .LSEH_info_gcm_ghash_clmul
1790___
1791$code.=<<___ if ($avx);
1792 .rva .LSEH_begin_gcm_init_avx
1793 .rva .LSEH_end_gcm_init_avx
1794 .rva .LSEH_info_gcm_init_clmul
1795
1796 .rva .LSEH_begin_gcm_ghash_avx
1797 .rva .LSEH_end_gcm_ghash_avx
1798 .rva .LSEH_info_gcm_ghash_clmul
1799___
1800$code.=<<___;
1801.section .xdata
1802.align 8
1803.LSEH_info_gcm_gmult_4bit:
1804 .byte 9,0,0,0
1805 .rva se_handler
1806 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
1807.LSEH_info_gcm_ghash_4bit:
1808 .byte 9,0,0,0
1809 .rva se_handler
1810 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
1811.LSEH_info_gcm_init_clmul:
1812 .byte 0x01,0x08,0x03,0x00
1813 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
1814 .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
1815.LSEH_info_gcm_ghash_clmul:
1816 .byte 0x01,0x33,0x16,0x00
1817 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
1818 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
1819 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
1820 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
1821 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
1822 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
1823 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
1824 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
1825 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
1826 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
1827 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
1828___
1829}
1830
1831
1832$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1833
1834print $code;
1835
1836close STDOUT or die "error closing STDOUT: $!";