1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # March, June 2010
|
---|
18 | #
|
---|
19 | # The module implements "4-bit" GCM GHASH function and underlying
|
---|
20 | # single multiplication operation in GF(2^128). "4-bit" means that
|
---|
21 | # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
|
---|
22 | # function features so called "528B" variant utilizing additional
|
---|
23 | # 256+16 bytes of per-key storage [+512 bytes shared table].
|
---|
24 | # Performance results are for this streamed GHASH subroutine and are
|
---|
25 | # expressed in cycles per processed byte, less is better:
|
---|
26 | #
|
---|
27 | # gcc 3.4.x(*) assembler
|
---|
28 | #
|
---|
29 | # P4 28.6 14.0 +100%
|
---|
30 | # Opteron 19.3 7.7 +150%
|
---|
31 | # Core2 17.8 8.1(**) +120%
|
---|
32 | # Atom 31.6 16.8 +88%
|
---|
33 | # VIA Nano 21.8 10.1 +115%
|
---|
34 | #
|
---|
35 | # (*) comparison is not completely fair, because C results are
|
---|
36 | # for vanilla "256B" implementation, while assembler results
|
---|
37 | # are for "528B";-)
|
---|
38 | # (**) it's mystery [to me] why Core2 result is not same as for
|
---|
39 | # Opteron;
|
---|
40 |
|
---|
41 | # May 2010
|
---|
42 | #
|
---|
43 | # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
|
---|
44 | # See ghash-x86.pl for background information and details about coding
|
---|
45 | # techniques.
|
---|
46 | #
|
---|
47 | # Special thanks to David Woodhouse for providing access to a
|
---|
48 | # Westmere-based system on behalf of Intel Open Source Technology Centre.
|
---|
49 |
|
---|
50 | # December 2012
|
---|
51 | #
|
---|
52 | # Overhaul: aggregate Karatsuba post-processing, improve ILP in
|
---|
53 | # reduction_alg9, increase reduction aggregate factor to 4x. As for
|
---|
54 | # the latter. ghash-x86.pl discusses that it makes lesser sense to
|
---|
55 | # increase aggregate factor. Then why increase here? Critical path
|
---|
56 | # consists of 3 independent pclmulqdq instructions, Karatsuba post-
|
---|
57 | # processing and reduction. "On top" of this we lay down aggregated
|
---|
58 | # multiplication operations, triplets of independent pclmulqdq's. As
|
---|
59 | # issue rate for pclmulqdq is limited, it makes lesser sense to
|
---|
60 | # aggregate more multiplications than it takes to perform remaining
|
---|
61 | # non-multiplication operations. 2x is near-optimal coefficient for
|
---|
62 | # contemporary Intel CPUs (therefore modest improvement coefficient),
|
---|
63 | # but not for Bulldozer. Latter is because logical SIMD operations
|
---|
64 | # are twice as slow in comparison to Intel, so that critical path is
|
---|
65 | # longer. A CPU with higher pclmulqdq issue rate would also benefit
|
---|
66 | # from higher aggregate factor...
|
---|
67 | #
|
---|
68 | # Westmere 1.78(+13%)
|
---|
69 | # Sandy Bridge 1.80(+8%)
|
---|
70 | # Ivy Bridge 1.80(+7%)
|
---|
71 | # Haswell 0.55(+93%) (if system doesn't support AVX)
|
---|
72 | # Broadwell 0.45(+110%)(if system doesn't support AVX)
|
---|
73 | # Skylake 0.44(+110%)(if system doesn't support AVX)
|
---|
74 | # Bulldozer 1.49(+27%)
|
---|
75 | # Silvermont 2.88(+13%)
|
---|
76 | # Knights L 2.12(-) (if system doesn't support AVX)
|
---|
77 | # Goldmont 1.08(+24%)
|
---|
78 |
|
---|
79 | # March 2013
|
---|
80 | #
|
---|
81 | # ... 8x aggregate factor AVX code path is using reduction algorithm
|
---|
82 | # suggested by Shay Gueron[1]. Even though contemporary AVX-capable
|
---|
83 | # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
|
---|
84 | # sub-optimally in comparison to above mentioned version. But thanks
|
---|
85 | # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
|
---|
86 | # it performs in 0.41 cycles per byte on Haswell processor, in
|
---|
87 | # 0.29 on Broadwell, and in 0.36 on Skylake.
|
---|
88 | #
|
---|
89 | # Knights Landing achieves 1.09 cpb.
|
---|
90 | #
|
---|
91 | # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
|
---|
92 |
|
---|
93 | $flavour = shift;
|
---|
94 | $output = shift;
|
---|
95 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
---|
96 |
|
---|
97 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
---|
98 |
|
---|
99 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
---|
100 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
---|
101 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
---|
102 | die "can't locate x86_64-xlate.pl";
|
---|
103 |
|
---|
104 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
|
---|
105 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
|
---|
106 | $avx = ($1>=2.20) + ($1>=2.22);
|
---|
107 | }
|
---|
108 |
|
---|
109 | if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
|
---|
110 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
|
---|
111 | $avx = ($1>=2.09) + ($1>=2.10);
|
---|
112 | }
|
---|
113 |
|
---|
114 | if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
|
---|
115 | `ml64 2>&1` =~ /Version ([0-9]+)\./) {
|
---|
116 | $avx = ($1>=10) + ($1>=11);
|
---|
117 | }
|
---|
118 |
|
---|
119 | if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
|
---|
120 | $avx = ($2>=3.0) + ($2>3.0);
|
---|
121 | }
|
---|
122 |
|
---|
123 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
|
---|
124 | *STDOUT=*OUT;
|
---|
125 |
|
---|
126 | $do4xaggr=1;
|
---|
127 |
|
---|
128 | # common register layout
|
---|
129 | $nlo="%rax";
|
---|
130 | $nhi="%rbx";
|
---|
131 | $Zlo="%r8";
|
---|
132 | $Zhi="%r9";
|
---|
133 | $tmp="%r10";
|
---|
134 | $rem_4bit = "%r11";
|
---|
135 |
|
---|
136 | $Xi="%rdi";
|
---|
137 | $Htbl="%rsi";
|
---|
138 |
|
---|
139 | # per-function register layout
|
---|
140 | $cnt="%rcx";
|
---|
141 | $rem="%rdx";
|
---|
142 |
|
---|
143 | sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
|
---|
144 | $r =~ s/%[er]([sd]i)/%\1l/ or
|
---|
145 | $r =~ s/%[er](bp)/%\1l/ or
|
---|
146 | $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
|
---|
147 |
|
---|
148 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
|
---|
149 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
|
---|
150 | my $arg = pop;
|
---|
151 | $arg = "\$$arg" if ($arg*1 eq $arg);
|
---|
152 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
|
---|
153 | }
|
---|
154 | |
---|
155 |
|
---|
156 | { my $N;
|
---|
157 | sub loop() {
|
---|
158 | my $inp = shift;
|
---|
159 |
|
---|
160 | $N++;
|
---|
161 | $code.=<<___;
|
---|
162 | xor $nlo,$nlo
|
---|
163 | xor $nhi,$nhi
|
---|
164 | mov `&LB("$Zlo")`,`&LB("$nlo")`
|
---|
165 | mov `&LB("$Zlo")`,`&LB("$nhi")`
|
---|
166 | shl \$4,`&LB("$nlo")`
|
---|
167 | mov \$14,$cnt
|
---|
168 | mov 8($Htbl,$nlo),$Zlo
|
---|
169 | mov ($Htbl,$nlo),$Zhi
|
---|
170 | and \$0xf0,`&LB("$nhi")`
|
---|
171 | mov $Zlo,$rem
|
---|
172 | jmp .Loop$N
|
---|
173 |
|
---|
174 | .align 16
|
---|
175 | .Loop$N:
|
---|
176 | shr \$4,$Zlo
|
---|
177 | and \$0xf,$rem
|
---|
178 | mov $Zhi,$tmp
|
---|
179 | mov ($inp,$cnt),`&LB("$nlo")`
|
---|
180 | shr \$4,$Zhi
|
---|
181 | xor 8($Htbl,$nhi),$Zlo
|
---|
182 | shl \$60,$tmp
|
---|
183 | xor ($Htbl,$nhi),$Zhi
|
---|
184 | mov `&LB("$nlo")`,`&LB("$nhi")`
|
---|
185 | xor ($rem_4bit,$rem,8),$Zhi
|
---|
186 | mov $Zlo,$rem
|
---|
187 | shl \$4,`&LB("$nlo")`
|
---|
188 | xor $tmp,$Zlo
|
---|
189 | dec $cnt
|
---|
190 | js .Lbreak$N
|
---|
191 |
|
---|
192 | shr \$4,$Zlo
|
---|
193 | and \$0xf,$rem
|
---|
194 | mov $Zhi,$tmp
|
---|
195 | shr \$4,$Zhi
|
---|
196 | xor 8($Htbl,$nlo),$Zlo
|
---|
197 | shl \$60,$tmp
|
---|
198 | xor ($Htbl,$nlo),$Zhi
|
---|
199 | and \$0xf0,`&LB("$nhi")`
|
---|
200 | xor ($rem_4bit,$rem,8),$Zhi
|
---|
201 | mov $Zlo,$rem
|
---|
202 | xor $tmp,$Zlo
|
---|
203 | jmp .Loop$N
|
---|
204 |
|
---|
205 | .align 16
|
---|
206 | .Lbreak$N:
|
---|
207 | shr \$4,$Zlo
|
---|
208 | and \$0xf,$rem
|
---|
209 | mov $Zhi,$tmp
|
---|
210 | shr \$4,$Zhi
|
---|
211 | xor 8($Htbl,$nlo),$Zlo
|
---|
212 | shl \$60,$tmp
|
---|
213 | xor ($Htbl,$nlo),$Zhi
|
---|
214 | and \$0xf0,`&LB("$nhi")`
|
---|
215 | xor ($rem_4bit,$rem,8),$Zhi
|
---|
216 | mov $Zlo,$rem
|
---|
217 | xor $tmp,$Zlo
|
---|
218 |
|
---|
219 | shr \$4,$Zlo
|
---|
220 | and \$0xf,$rem
|
---|
221 | mov $Zhi,$tmp
|
---|
222 | shr \$4,$Zhi
|
---|
223 | xor 8($Htbl,$nhi),$Zlo
|
---|
224 | shl \$60,$tmp
|
---|
225 | xor ($Htbl,$nhi),$Zhi
|
---|
226 | xor $tmp,$Zlo
|
---|
227 | xor ($rem_4bit,$rem,8),$Zhi
|
---|
228 |
|
---|
229 | bswap $Zlo
|
---|
230 | bswap $Zhi
|
---|
231 | ___
|
---|
232 | }}
|
---|
233 |
|
---|
234 | $code=<<___;
|
---|
235 | .text
|
---|
236 | .extern OPENSSL_ia32cap_P
|
---|
237 |
|
---|
238 | .globl gcm_gmult_4bit
|
---|
239 | .type gcm_gmult_4bit,\@function,2
|
---|
240 | .align 16
|
---|
241 | gcm_gmult_4bit:
|
---|
242 | .cfi_startproc
|
---|
243 | push %rbx
|
---|
244 | .cfi_push %rbx
|
---|
245 | push %rbp # %rbp and others are pushed exclusively in
|
---|
246 | .cfi_push %rbp
|
---|
247 | push %r12 # order to reuse Win64 exception handler...
|
---|
248 | .cfi_push %r12
|
---|
249 | push %r13
|
---|
250 | .cfi_push %r13
|
---|
251 | push %r14
|
---|
252 | .cfi_push %r14
|
---|
253 | push %r15
|
---|
254 | .cfi_push %r15
|
---|
255 | sub \$280,%rsp
|
---|
256 | .cfi_adjust_cfa_offset 280
|
---|
257 | .Lgmult_prologue:
|
---|
258 |
|
---|
259 | movzb 15($Xi),$Zlo
|
---|
260 | lea .Lrem_4bit(%rip),$rem_4bit
|
---|
261 | ___
|
---|
262 | &loop ($Xi);
|
---|
263 | $code.=<<___;
|
---|
264 | mov $Zlo,8($Xi)
|
---|
265 | mov $Zhi,($Xi)
|
---|
266 |
|
---|
267 | lea 280+48(%rsp),%rsi
|
---|
268 | .cfi_def_cfa %rsi,8
|
---|
269 | mov -8(%rsi),%rbx
|
---|
270 | .cfi_restore %rbx
|
---|
271 | lea (%rsi),%rsp
|
---|
272 | .cfi_def_cfa_register %rsp
|
---|
273 | .Lgmult_epilogue:
|
---|
274 | ret
|
---|
275 | .cfi_endproc
|
---|
276 | .size gcm_gmult_4bit,.-gcm_gmult_4bit
|
---|
277 | ___
|
---|
278 | |
---|
279 |
|
---|
280 | # per-function register layout
|
---|
281 | $inp="%rdx";
|
---|
282 | $len="%rcx";
|
---|
283 | $rem_8bit=$rem_4bit;
|
---|
284 |
|
---|
285 | $code.=<<___;
|
---|
286 | .globl gcm_ghash_4bit
|
---|
287 | .type gcm_ghash_4bit,\@function,4
|
---|
288 | .align 16
|
---|
289 | gcm_ghash_4bit:
|
---|
290 | .cfi_startproc
|
---|
291 | push %rbx
|
---|
292 | .cfi_push %rbx
|
---|
293 | push %rbp
|
---|
294 | .cfi_push %rbp
|
---|
295 | push %r12
|
---|
296 | .cfi_push %r12
|
---|
297 | push %r13
|
---|
298 | .cfi_push %r13
|
---|
299 | push %r14
|
---|
300 | .cfi_push %r14
|
---|
301 | push %r15
|
---|
302 | .cfi_push %r15
|
---|
303 | sub \$280,%rsp
|
---|
304 | .cfi_adjust_cfa_offset 280
|
---|
305 | .Lghash_prologue:
|
---|
306 | mov $inp,%r14 # reassign couple of args
|
---|
307 | mov $len,%r15
|
---|
308 | ___
|
---|
309 | { my $inp="%r14";
|
---|
310 | my $dat="%edx";
|
---|
311 | my $len="%r15";
|
---|
312 | my @nhi=("%ebx","%ecx");
|
---|
313 | my @rem=("%r12","%r13");
|
---|
314 | my $Hshr4="%rbp";
|
---|
315 |
|
---|
316 | &sub ($Htbl,-128); # size optimization
|
---|
317 | &lea ($Hshr4,"16+128(%rsp)");
|
---|
318 | { my @lo =($nlo,$nhi);
|
---|
319 | my @hi =($Zlo,$Zhi);
|
---|
320 |
|
---|
321 | &xor ($dat,$dat);
|
---|
322 | for ($i=0,$j=-2;$i<18;$i++,$j++) {
|
---|
323 | &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
|
---|
324 | &or ($lo[0],$tmp) if ($i>1);
|
---|
325 | &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
|
---|
326 | &shr ($lo[1],4) if ($i>0 && $i<17);
|
---|
327 | &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
|
---|
328 | &shr ($hi[1],4) if ($i>0 && $i<17);
|
---|
329 | &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
|
---|
330 | &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
|
---|
331 | &shl (&LB($dat),4) if ($i>0 && $i<17);
|
---|
332 | &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
|
---|
333 | &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
|
---|
334 | &shl ($tmp,60) if ($i>0 && $i<17);
|
---|
335 |
|
---|
336 | push (@lo,shift(@lo));
|
---|
337 | push (@hi,shift(@hi));
|
---|
338 | }
|
---|
339 | }
|
---|
340 | &add ($Htbl,-128);
|
---|
341 | &mov ($Zlo,"8($Xi)");
|
---|
342 | &mov ($Zhi,"0($Xi)");
|
---|
343 | &add ($len,$inp); # pointer to the end of data
|
---|
344 | &lea ($rem_8bit,".Lrem_8bit(%rip)");
|
---|
345 | &jmp (".Louter_loop");
|
---|
346 |
|
---|
347 | $code.=".align 16\n.Louter_loop:\n";
|
---|
348 | &xor ($Zhi,"($inp)");
|
---|
349 | &mov ("%rdx","8($inp)");
|
---|
350 | &lea ($inp,"16($inp)");
|
---|
351 | &xor ("%rdx",$Zlo);
|
---|
352 | &mov ("($Xi)",$Zhi);
|
---|
353 | &mov ("8($Xi)","%rdx");
|
---|
354 | &shr ("%rdx",32);
|
---|
355 |
|
---|
356 | &xor ($nlo,$nlo);
|
---|
357 | &rol ($dat,8);
|
---|
358 | &mov (&LB($nlo),&LB($dat));
|
---|
359 | &movz ($nhi[0],&LB($dat));
|
---|
360 | &shl (&LB($nlo),4);
|
---|
361 | &shr ($nhi[0],4);
|
---|
362 |
|
---|
363 | for ($j=11,$i=0;$i<15;$i++) {
|
---|
364 | &rol ($dat,8);
|
---|
365 | &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
|
---|
366 | &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
|
---|
367 | &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
|
---|
368 | &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
|
---|
369 |
|
---|
370 | &mov (&LB($nlo),&LB($dat));
|
---|
371 | &xor ($Zlo,$tmp) if ($i>0);
|
---|
372 | &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
|
---|
373 |
|
---|
374 | &movz ($nhi[1],&LB($dat));
|
---|
375 | &shl (&LB($nlo),4);
|
---|
376 | &movzb ($rem[0],"(%rsp,$nhi[0])");
|
---|
377 |
|
---|
378 | &shr ($nhi[1],4) if ($i<14);
|
---|
379 | &and ($nhi[1],0xf0) if ($i==14);
|
---|
380 | &shl ($rem[1],48) if ($i>0);
|
---|
381 | &xor ($rem[0],$Zlo);
|
---|
382 |
|
---|
383 | &mov ($tmp,$Zhi);
|
---|
384 | &xor ($Zhi,$rem[1]) if ($i>0);
|
---|
385 | &shr ($Zlo,8);
|
---|
386 |
|
---|
387 | &movz ($rem[0],&LB($rem[0]));
|
---|
388 | &mov ($dat,"$j($Xi)") if (--$j%4==0);
|
---|
389 | &shr ($Zhi,8);
|
---|
390 |
|
---|
391 | &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
|
---|
392 | &shl ($tmp,56);
|
---|
393 | &xor ($Zhi,"($Hshr4,$nhi[0],8)");
|
---|
394 |
|
---|
395 | unshift (@nhi,pop(@nhi)); # "rotate" registers
|
---|
396 | unshift (@rem,pop(@rem));
|
---|
397 | }
|
---|
398 | &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
|
---|
399 | &xor ($Zlo,"8($Htbl,$nlo)");
|
---|
400 | &xor ($Zhi,"($Htbl,$nlo)");
|
---|
401 |
|
---|
402 | &shl ($rem[1],48);
|
---|
403 | &xor ($Zlo,$tmp);
|
---|
404 |
|
---|
405 | &xor ($Zhi,$rem[1]);
|
---|
406 | &movz ($rem[0],&LB($Zlo));
|
---|
407 | &shr ($Zlo,4);
|
---|
408 |
|
---|
409 | &mov ($tmp,$Zhi);
|
---|
410 | &shl (&LB($rem[0]),4);
|
---|
411 | &shr ($Zhi,4);
|
---|
412 |
|
---|
413 | &xor ($Zlo,"8($Htbl,$nhi[0])");
|
---|
414 | &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
|
---|
415 | &shl ($tmp,60);
|
---|
416 |
|
---|
417 | &xor ($Zhi,"($Htbl,$nhi[0])");
|
---|
418 | &xor ($Zlo,$tmp);
|
---|
419 | &shl ($rem[0],48);
|
---|
420 |
|
---|
421 | &bswap ($Zlo);
|
---|
422 | &xor ($Zhi,$rem[0]);
|
---|
423 |
|
---|
424 | &bswap ($Zhi);
|
---|
425 | &cmp ($inp,$len);
|
---|
426 | &jb (".Louter_loop");
|
---|
427 | }
|
---|
428 | $code.=<<___;
|
---|
429 | mov $Zlo,8($Xi)
|
---|
430 | mov $Zhi,($Xi)
|
---|
431 |
|
---|
432 | lea 280+48(%rsp),%rsi
|
---|
433 | .cfi_def_cfa %rsi,8
|
---|
434 | mov -48(%rsi),%r15
|
---|
435 | .cfi_restore %r15
|
---|
436 | mov -40(%rsi),%r14
|
---|
437 | .cfi_restore %r14
|
---|
438 | mov -32(%rsi),%r13
|
---|
439 | .cfi_restore %r13
|
---|
440 | mov -24(%rsi),%r12
|
---|
441 | .cfi_restore %r12
|
---|
442 | mov -16(%rsi),%rbp
|
---|
443 | .cfi_restore %rbp
|
---|
444 | mov -8(%rsi),%rbx
|
---|
445 | .cfi_restore %rbx
|
---|
446 | lea 0(%rsi),%rsp
|
---|
447 | .cfi_def_cfa_register %rsp
|
---|
448 | .Lghash_epilogue:
|
---|
449 | ret
|
---|
450 | .cfi_endproc
|
---|
451 | .size gcm_ghash_4bit,.-gcm_ghash_4bit
|
---|
452 | ___
|
---|
453 | |
---|
454 |
|
---|
455 | ######################################################################
|
---|
456 | # PCLMULQDQ version.
|
---|
457 |
|
---|
458 | @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
|
---|
459 | ("%rdi","%rsi","%rdx","%rcx"); # Unix order
|
---|
460 |
|
---|
461 | ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
|
---|
462 | ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
|
---|
463 |
|
---|
464 | sub clmul64x64_T2 { # minimal register pressure
|
---|
465 | my ($Xhi,$Xi,$Hkey,$HK)=@_;
|
---|
466 |
|
---|
467 | if (!defined($HK)) { $HK = $T2;
|
---|
468 | $code.=<<___;
|
---|
469 | movdqa $Xi,$Xhi #
|
---|
470 | pshufd \$0b01001110,$Xi,$T1
|
---|
471 | pshufd \$0b01001110,$Hkey,$T2
|
---|
472 | pxor $Xi,$T1 #
|
---|
473 | pxor $Hkey,$T2
|
---|
474 | ___
|
---|
475 | } else {
|
---|
476 | $code.=<<___;
|
---|
477 | movdqa $Xi,$Xhi #
|
---|
478 | pshufd \$0b01001110,$Xi,$T1
|
---|
479 | pxor $Xi,$T1 #
|
---|
480 | ___
|
---|
481 | }
|
---|
482 | $code.=<<___;
|
---|
483 | pclmulqdq \$0x00,$Hkey,$Xi #######
|
---|
484 | pclmulqdq \$0x11,$Hkey,$Xhi #######
|
---|
485 | pclmulqdq \$0x00,$HK,$T1 #######
|
---|
486 | pxor $Xi,$T1 #
|
---|
487 | pxor $Xhi,$T1 #
|
---|
488 |
|
---|
489 | movdqa $T1,$T2 #
|
---|
490 | psrldq \$8,$T1
|
---|
491 | pslldq \$8,$T2 #
|
---|
492 | pxor $T1,$Xhi
|
---|
493 | pxor $T2,$Xi #
|
---|
494 | ___
|
---|
495 | }
|
---|
496 |
|
---|
497 | sub reduction_alg9 { # 17/11 times faster than Intel version
|
---|
498 | my ($Xhi,$Xi) = @_;
|
---|
499 |
|
---|
500 | $code.=<<___;
|
---|
501 | # 1st phase
|
---|
502 | movdqa $Xi,$T2 #
|
---|
503 | movdqa $Xi,$T1
|
---|
504 | psllq \$5,$Xi
|
---|
505 | pxor $Xi,$T1 #
|
---|
506 | psllq \$1,$Xi
|
---|
507 | pxor $T1,$Xi #
|
---|
508 | psllq \$57,$Xi #
|
---|
509 | movdqa $Xi,$T1 #
|
---|
510 | pslldq \$8,$Xi
|
---|
511 | psrldq \$8,$T1 #
|
---|
512 | pxor $T2,$Xi
|
---|
513 | pxor $T1,$Xhi #
|
---|
514 |
|
---|
515 | # 2nd phase
|
---|
516 | movdqa $Xi,$T2
|
---|
517 | psrlq \$1,$Xi
|
---|
518 | pxor $T2,$Xhi #
|
---|
519 | pxor $Xi,$T2
|
---|
520 | psrlq \$5,$Xi
|
---|
521 | pxor $T2,$Xi #
|
---|
522 | psrlq \$1,$Xi #
|
---|
523 | pxor $Xhi,$Xi #
|
---|
524 | ___
|
---|
525 | }
|
---|
526 | |
---|
527 |
|
---|
528 | { my ($Htbl,$Xip)=@_4args;
|
---|
529 | my $HK="%xmm6";
|
---|
530 |
|
---|
531 | $code.=<<___;
|
---|
532 | .globl gcm_init_clmul
|
---|
533 | .type gcm_init_clmul,\@abi-omnipotent
|
---|
534 | .align 16
|
---|
535 | gcm_init_clmul:
|
---|
536 | .cfi_startproc
|
---|
537 | .L_init_clmul:
|
---|
538 | ___
|
---|
539 | $code.=<<___ if ($win64);
|
---|
540 | .LSEH_begin_gcm_init_clmul:
|
---|
541 | # I can't trust assembler to use specific encoding:-(
|
---|
542 | .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
|
---|
543 | .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
|
---|
544 | ___
|
---|
545 | $code.=<<___;
|
---|
546 | movdqu ($Xip),$Hkey
|
---|
547 | pshufd \$0b01001110,$Hkey,$Hkey # dword swap
|
---|
548 |
|
---|
549 | # <<1 twist
|
---|
550 | pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
|
---|
551 | movdqa $Hkey,$T1
|
---|
552 | psllq \$1,$Hkey
|
---|
553 | pxor $T3,$T3 #
|
---|
554 | psrlq \$63,$T1
|
---|
555 | pcmpgtd $T2,$T3 # broadcast carry bit
|
---|
556 | pslldq \$8,$T1
|
---|
557 | por $T1,$Hkey # H<<=1
|
---|
558 |
|
---|
559 | # magic reduction
|
---|
560 | pand .L0x1c2_polynomial(%rip),$T3
|
---|
561 | pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
|
---|
562 |
|
---|
563 | # calculate H^2
|
---|
564 | pshufd \$0b01001110,$Hkey,$HK
|
---|
565 | movdqa $Hkey,$Xi
|
---|
566 | pxor $Hkey,$HK
|
---|
567 | ___
|
---|
568 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
|
---|
569 | &reduction_alg9 ($Xhi,$Xi);
|
---|
570 | $code.=<<___;
|
---|
571 | pshufd \$0b01001110,$Hkey,$T1
|
---|
572 | pshufd \$0b01001110,$Xi,$T2
|
---|
573 | pxor $Hkey,$T1 # Karatsuba pre-processing
|
---|
574 | movdqu $Hkey,0x00($Htbl) # save H
|
---|
575 | pxor $Xi,$T2 # Karatsuba pre-processing
|
---|
576 | movdqu $Xi,0x10($Htbl) # save H^2
|
---|
577 | palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
|
---|
578 | movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
|
---|
579 | ___
|
---|
580 | if ($do4xaggr) {
|
---|
581 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
|
---|
582 | &reduction_alg9 ($Xhi,$Xi);
|
---|
583 | $code.=<<___;
|
---|
584 | movdqa $Xi,$T3
|
---|
585 | ___
|
---|
586 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
|
---|
587 | &reduction_alg9 ($Xhi,$Xi);
|
---|
588 | $code.=<<___;
|
---|
589 | pshufd \$0b01001110,$T3,$T1
|
---|
590 | pshufd \$0b01001110,$Xi,$T2
|
---|
591 | pxor $T3,$T1 # Karatsuba pre-processing
|
---|
592 | movdqu $T3,0x30($Htbl) # save H^3
|
---|
593 | pxor $Xi,$T2 # Karatsuba pre-processing
|
---|
594 | movdqu $Xi,0x40($Htbl) # save H^4
|
---|
595 | palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
|
---|
596 | movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
|
---|
597 | ___
|
---|
598 | }
|
---|
599 | $code.=<<___ if ($win64);
|
---|
600 | movaps (%rsp),%xmm6
|
---|
601 | lea 0x18(%rsp),%rsp
|
---|
602 | .LSEH_end_gcm_init_clmul:
|
---|
603 | ___
|
---|
604 | $code.=<<___;
|
---|
605 | ret
|
---|
606 | .cfi_endproc
|
---|
607 | .size gcm_init_clmul,.-gcm_init_clmul
|
---|
608 | ___
|
---|
609 | }
|
---|
610 |
|
---|
611 | { my ($Xip,$Htbl)=@_4args;
|
---|
612 |
|
---|
613 | $code.=<<___;
|
---|
614 | .globl gcm_gmult_clmul
|
---|
615 | .type gcm_gmult_clmul,\@abi-omnipotent
|
---|
616 | .align 16
|
---|
617 | gcm_gmult_clmul:
|
---|
618 | .cfi_startproc
|
---|
619 | .L_gmult_clmul:
|
---|
620 | movdqu ($Xip),$Xi
|
---|
621 | movdqa .Lbswap_mask(%rip),$T3
|
---|
622 | movdqu ($Htbl),$Hkey
|
---|
623 | movdqu 0x20($Htbl),$T2
|
---|
624 | pshufb $T3,$Xi
|
---|
625 | ___
|
---|
626 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
|
---|
627 | $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
|
---|
628 | # experimental alternative. special thing about is that there
|
---|
629 | # no dependency between the two multiplications...
|
---|
630 | mov \$`0xE1<<1`,%eax
|
---|
631 | mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
|
---|
632 | mov \$0x07,%r11d
|
---|
633 | movq %rax,$T1
|
---|
634 | movq %r10,$T2
|
---|
635 | movq %r11,$T3 # borrow $T3
|
---|
636 | pand $Xi,$T3
|
---|
637 | pshufb $T3,$T2 # ($Xi&7)·0xE0
|
---|
638 | movq %rax,$T3
|
---|
639 | pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
|
---|
640 | pxor $Xi,$T2
|
---|
641 | pslldq \$15,$T2
|
---|
642 | paddd $T2,$T2 # <<(64+56+1)
|
---|
643 | pxor $T2,$Xi
|
---|
644 | pclmulqdq \$0x01,$T3,$Xi
|
---|
645 | movdqa .Lbswap_mask(%rip),$T3 # reload $T3
|
---|
646 | psrldq \$1,$T1
|
---|
647 | pxor $T1,$Xhi
|
---|
648 | pslldq \$7,$Xi
|
---|
649 | pxor $Xhi,$Xi
|
---|
650 | ___
|
---|
651 | $code.=<<___;
|
---|
652 | pshufb $T3,$Xi
|
---|
653 | movdqu $Xi,($Xip)
|
---|
654 | ret
|
---|
655 | .cfi_endproc
|
---|
656 | .size gcm_gmult_clmul,.-gcm_gmult_clmul
|
---|
657 | ___
|
---|
658 | }
|
---|
659 | |
---|
660 |
|
---|
661 | { my ($Xip,$Htbl,$inp,$len)=@_4args;
|
---|
662 | my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
|
---|
663 | my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
|
---|
664 |
|
---|
665 | $code.=<<___;
|
---|
666 | .globl gcm_ghash_clmul
|
---|
667 | .type gcm_ghash_clmul,\@abi-omnipotent
|
---|
668 | .align 32
|
---|
669 | gcm_ghash_clmul:
|
---|
670 | .cfi_startproc
|
---|
671 | .L_ghash_clmul:
|
---|
672 | ___
|
---|
673 | $code.=<<___ if ($win64);
|
---|
674 | lea -0x88(%rsp),%rax
|
---|
675 | .LSEH_begin_gcm_ghash_clmul:
|
---|
676 | # I can't trust assembler to use specific encoding:-(
|
---|
677 | .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
|
---|
678 | .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
|
---|
679 | .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
|
---|
680 | .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
|
---|
681 | .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
|
---|
682 | .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
|
---|
683 | .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
|
---|
684 | .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
|
---|
685 | .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
|
---|
686 | .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
|
---|
687 | .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
|
---|
688 | ___
|
---|
689 | $code.=<<___;
|
---|
690 | movdqa .Lbswap_mask(%rip),$T3
|
---|
691 |
|
---|
692 | movdqu ($Xip),$Xi
|
---|
693 | movdqu ($Htbl),$Hkey
|
---|
694 | movdqu 0x20($Htbl),$HK
|
---|
695 | pshufb $T3,$Xi
|
---|
696 |
|
---|
697 | sub \$0x10,$len
|
---|
698 | jz .Lodd_tail
|
---|
699 |
|
---|
700 | movdqu 0x10($Htbl),$Hkey2
|
---|
701 | ___
|
---|
702 | if ($do4xaggr) {
|
---|
703 | my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
|
---|
704 |
|
---|
705 | $code.=<<___;
|
---|
706 | mov OPENSSL_ia32cap_P+4(%rip),%eax
|
---|
707 | cmp \$0x30,$len
|
---|
708 | jb .Lskip4x
|
---|
709 |
|
---|
710 | and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
|
---|
711 | cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
|
---|
712 | je .Lskip4x
|
---|
713 |
|
---|
714 | sub \$0x30,$len
|
---|
715 | mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
|
---|
716 | movdqu 0x30($Htbl),$Hkey3
|
---|
717 | movdqu 0x40($Htbl),$Hkey4
|
---|
718 |
|
---|
719 | #######
|
---|
720 | # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
|
---|
721 | #
|
---|
722 | movdqu 0x30($inp),$Xln
|
---|
723 | movdqu 0x20($inp),$Xl
|
---|
724 | pshufb $T3,$Xln
|
---|
725 | pshufb $T3,$Xl
|
---|
726 | movdqa $Xln,$Xhn
|
---|
727 | pshufd \$0b01001110,$Xln,$Xmn
|
---|
728 | pxor $Xln,$Xmn
|
---|
729 | pclmulqdq \$0x00,$Hkey,$Xln
|
---|
730 | pclmulqdq \$0x11,$Hkey,$Xhn
|
---|
731 | pclmulqdq \$0x00,$HK,$Xmn
|
---|
732 |
|
---|
733 | movdqa $Xl,$Xh
|
---|
734 | pshufd \$0b01001110,$Xl,$Xm
|
---|
735 | pxor $Xl,$Xm
|
---|
736 | pclmulqdq \$0x00,$Hkey2,$Xl
|
---|
737 | pclmulqdq \$0x11,$Hkey2,$Xh
|
---|
738 | pclmulqdq \$0x10,$HK,$Xm
|
---|
739 | xorps $Xl,$Xln
|
---|
740 | xorps $Xh,$Xhn
|
---|
741 | movups 0x50($Htbl),$HK
|
---|
742 | xorps $Xm,$Xmn
|
---|
743 |
|
---|
744 | movdqu 0x10($inp),$Xl
|
---|
745 | movdqu 0($inp),$T1
|
---|
746 | pshufb $T3,$Xl
|
---|
747 | pshufb $T3,$T1
|
---|
748 | movdqa $Xl,$Xh
|
---|
749 | pshufd \$0b01001110,$Xl,$Xm
|
---|
750 | pxor $T1,$Xi
|
---|
751 | pxor $Xl,$Xm
|
---|
752 | pclmulqdq \$0x00,$Hkey3,$Xl
|
---|
753 | movdqa $Xi,$Xhi
|
---|
754 | pshufd \$0b01001110,$Xi,$T1
|
---|
755 | pxor $Xi,$T1
|
---|
756 | pclmulqdq \$0x11,$Hkey3,$Xh
|
---|
757 | pclmulqdq \$0x00,$HK,$Xm
|
---|
758 | xorps $Xl,$Xln
|
---|
759 | xorps $Xh,$Xhn
|
---|
760 |
|
---|
761 | lea 0x40($inp),$inp
|
---|
762 | sub \$0x40,$len
|
---|
763 | jc .Ltail4x
|
---|
764 |
|
---|
765 | jmp .Lmod4_loop
|
---|
766 | .align 32
|
---|
767 | .Lmod4_loop:
|
---|
768 | pclmulqdq \$0x00,$Hkey4,$Xi
|
---|
769 | xorps $Xm,$Xmn
|
---|
770 | movdqu 0x30($inp),$Xl
|
---|
771 | pshufb $T3,$Xl
|
---|
772 | pclmulqdq \$0x11,$Hkey4,$Xhi
|
---|
773 | xorps $Xln,$Xi
|
---|
774 | movdqu 0x20($inp),$Xln
|
---|
775 | movdqa $Xl,$Xh
|
---|
776 | pclmulqdq \$0x10,$HK,$T1
|
---|
777 | pshufd \$0b01001110,$Xl,$Xm
|
---|
778 | xorps $Xhn,$Xhi
|
---|
779 | pxor $Xl,$Xm
|
---|
780 | pshufb $T3,$Xln
|
---|
781 | movups 0x20($Htbl),$HK
|
---|
782 | xorps $Xmn,$T1
|
---|
783 | pclmulqdq \$0x00,$Hkey,$Xl
|
---|
784 | pshufd \$0b01001110,$Xln,$Xmn
|
---|
785 |
|
---|
786 | pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
---|
787 | movdqa $Xln,$Xhn
|
---|
788 | pxor $Xhi,$T1 #
|
---|
789 | pxor $Xln,$Xmn
|
---|
790 | movdqa $T1,$T2 #
|
---|
791 | pclmulqdq \$0x11,$Hkey,$Xh
|
---|
792 | pslldq \$8,$T1
|
---|
793 | psrldq \$8,$T2 #
|
---|
794 | pxor $T1,$Xi
|
---|
795 | movdqa .L7_mask(%rip),$T1
|
---|
796 | pxor $T2,$Xhi #
|
---|
797 | movq %rax,$T2
|
---|
798 |
|
---|
799 | pand $Xi,$T1 # 1st phase
|
---|
800 | pshufb $T1,$T2 #
|
---|
801 | pxor $Xi,$T2 #
|
---|
802 | pclmulqdq \$0x00,$HK,$Xm
|
---|
803 | psllq \$57,$T2 #
|
---|
804 | movdqa $T2,$T1 #
|
---|
805 | pslldq \$8,$T2
|
---|
806 | pclmulqdq \$0x00,$Hkey2,$Xln
|
---|
807 | psrldq \$8,$T1 #
|
---|
808 | pxor $T2,$Xi
|
---|
809 | pxor $T1,$Xhi #
|
---|
810 | movdqu 0($inp),$T1
|
---|
811 |
|
---|
812 | movdqa $Xi,$T2 # 2nd phase
|
---|
813 | psrlq \$1,$Xi
|
---|
814 | pclmulqdq \$0x11,$Hkey2,$Xhn
|
---|
815 | xorps $Xl,$Xln
|
---|
816 | movdqu 0x10($inp),$Xl
|
---|
817 | pshufb $T3,$Xl
|
---|
818 | pclmulqdq \$0x10,$HK,$Xmn
|
---|
819 | xorps $Xh,$Xhn
|
---|
820 | movups 0x50($Htbl),$HK
|
---|
821 | pshufb $T3,$T1
|
---|
822 | pxor $T2,$Xhi #
|
---|
823 | pxor $Xi,$T2
|
---|
824 | psrlq \$5,$Xi
|
---|
825 |
|
---|
826 | movdqa $Xl,$Xh
|
---|
827 | pxor $Xm,$Xmn
|
---|
828 | pshufd \$0b01001110,$Xl,$Xm
|
---|
829 | pxor $T2,$Xi #
|
---|
830 | pxor $T1,$Xhi
|
---|
831 | pxor $Xl,$Xm
|
---|
832 | pclmulqdq \$0x00,$Hkey3,$Xl
|
---|
833 | psrlq \$1,$Xi #
|
---|
834 | pxor $Xhi,$Xi #
|
---|
835 | movdqa $Xi,$Xhi
|
---|
836 | pclmulqdq \$0x11,$Hkey3,$Xh
|
---|
837 | xorps $Xl,$Xln
|
---|
838 | pshufd \$0b01001110,$Xi,$T1
|
---|
839 | pxor $Xi,$T1
|
---|
840 |
|
---|
841 | pclmulqdq \$0x00,$HK,$Xm
|
---|
842 | xorps $Xh,$Xhn
|
---|
843 |
|
---|
844 | lea 0x40($inp),$inp
|
---|
845 | sub \$0x40,$len
|
---|
846 | jnc .Lmod4_loop
|
---|
847 |
|
---|
848 | .Ltail4x:
|
---|
849 | pclmulqdq \$0x00,$Hkey4,$Xi
|
---|
850 | pclmulqdq \$0x11,$Hkey4,$Xhi
|
---|
851 | pclmulqdq \$0x10,$HK,$T1
|
---|
852 | xorps $Xm,$Xmn
|
---|
853 | xorps $Xln,$Xi
|
---|
854 | xorps $Xhn,$Xhi
|
---|
855 | pxor $Xi,$Xhi # aggregated Karatsuba post-processing
|
---|
856 | pxor $Xmn,$T1
|
---|
857 |
|
---|
858 | pxor $Xhi,$T1 #
|
---|
859 | pxor $Xi,$Xhi
|
---|
860 |
|
---|
861 | movdqa $T1,$T2 #
|
---|
862 | psrldq \$8,$T1
|
---|
863 | pslldq \$8,$T2 #
|
---|
864 | pxor $T1,$Xhi
|
---|
865 | pxor $T2,$Xi #
|
---|
866 | ___
|
---|
867 | &reduction_alg9($Xhi,$Xi);
|
---|
868 | $code.=<<___;
|
---|
869 | add \$0x40,$len
|
---|
870 | jz .Ldone
|
---|
871 | movdqu 0x20($Htbl),$HK
|
---|
872 | sub \$0x10,$len
|
---|
873 | jz .Lodd_tail
|
---|
874 | .Lskip4x:
|
---|
875 | ___
|
---|
876 | }
|
---|
877 | $code.=<<___;
|
---|
878 | #######
|
---|
879 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
|
---|
880 | # [(H*Ii+1) + (H*Xi+1)] mod P =
|
---|
881 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
|
---|
882 | #
|
---|
883 | movdqu ($inp),$T1 # Ii
|
---|
884 | movdqu 16($inp),$Xln # Ii+1
|
---|
885 | pshufb $T3,$T1
|
---|
886 | pshufb $T3,$Xln
|
---|
887 | pxor $T1,$Xi # Ii+Xi
|
---|
888 |
|
---|
889 | movdqa $Xln,$Xhn
|
---|
890 | pshufd \$0b01001110,$Xln,$Xmn
|
---|
891 | pxor $Xln,$Xmn
|
---|
892 | pclmulqdq \$0x00,$Hkey,$Xln
|
---|
893 | pclmulqdq \$0x11,$Hkey,$Xhn
|
---|
894 | pclmulqdq \$0x00,$HK,$Xmn
|
---|
895 |
|
---|
896 | lea 32($inp),$inp # i+=2
|
---|
897 | nop
|
---|
898 | sub \$0x20,$len
|
---|
899 | jbe .Leven_tail
|
---|
900 | nop
|
---|
901 | jmp .Lmod_loop
|
---|
902 |
|
---|
903 | .align 32
|
---|
904 | .Lmod_loop:
|
---|
905 | movdqa $Xi,$Xhi
|
---|
906 | movdqa $Xmn,$T1
|
---|
907 | pshufd \$0b01001110,$Xi,$Xmn #
|
---|
908 | pxor $Xi,$Xmn #
|
---|
909 |
|
---|
910 | pclmulqdq \$0x00,$Hkey2,$Xi
|
---|
911 | pclmulqdq \$0x11,$Hkey2,$Xhi
|
---|
912 | pclmulqdq \$0x10,$HK,$Xmn
|
---|
913 |
|
---|
914 | pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
---|
915 | pxor $Xhn,$Xhi
|
---|
916 | movdqu ($inp),$T2 # Ii
|
---|
917 | pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
---|
918 | pshufb $T3,$T2
|
---|
919 | movdqu 16($inp),$Xln # Ii+1
|
---|
920 |
|
---|
921 | pxor $Xhi,$T1
|
---|
922 | pxor $T2,$Xhi # "Ii+Xi", consume early
|
---|
923 | pxor $T1,$Xmn
|
---|
924 | pshufb $T3,$Xln
|
---|
925 | movdqa $Xmn,$T1 #
|
---|
926 | psrldq \$8,$T1
|
---|
927 | pslldq \$8,$Xmn #
|
---|
928 | pxor $T1,$Xhi
|
---|
929 | pxor $Xmn,$Xi #
|
---|
930 |
|
---|
931 | movdqa $Xln,$Xhn #
|
---|
932 |
|
---|
933 | movdqa $Xi,$T2 # 1st phase
|
---|
934 | movdqa $Xi,$T1
|
---|
935 | psllq \$5,$Xi
|
---|
936 | pxor $Xi,$T1 #
|
---|
937 | pclmulqdq \$0x00,$Hkey,$Xln #######
|
---|
938 | psllq \$1,$Xi
|
---|
939 | pxor $T1,$Xi #
|
---|
940 | psllq \$57,$Xi #
|
---|
941 | movdqa $Xi,$T1 #
|
---|
942 | pslldq \$8,$Xi
|
---|
943 | psrldq \$8,$T1 #
|
---|
944 | pxor $T2,$Xi
|
---|
945 | pshufd \$0b01001110,$Xhn,$Xmn
|
---|
946 | pxor $T1,$Xhi #
|
---|
947 | pxor $Xhn,$Xmn #
|
---|
948 |
|
---|
949 | movdqa $Xi,$T2 # 2nd phase
|
---|
950 | psrlq \$1,$Xi
|
---|
951 | pclmulqdq \$0x11,$Hkey,$Xhn #######
|
---|
952 | pxor $T2,$Xhi #
|
---|
953 | pxor $Xi,$T2
|
---|
954 | psrlq \$5,$Xi
|
---|
955 | pxor $T2,$Xi #
|
---|
956 | lea 32($inp),$inp
|
---|
957 | psrlq \$1,$Xi #
|
---|
958 | pclmulqdq \$0x00,$HK,$Xmn #######
|
---|
959 | pxor $Xhi,$Xi #
|
---|
960 |
|
---|
961 | sub \$0x20,$len
|
---|
962 | ja .Lmod_loop
|
---|
963 |
|
---|
964 | .Leven_tail:
|
---|
965 | movdqa $Xi,$Xhi
|
---|
966 | movdqa $Xmn,$T1
|
---|
967 | pshufd \$0b01001110,$Xi,$Xmn #
|
---|
968 | pxor $Xi,$Xmn #
|
---|
969 |
|
---|
970 | pclmulqdq \$0x00,$Hkey2,$Xi
|
---|
971 | pclmulqdq \$0x11,$Hkey2,$Xhi
|
---|
972 | pclmulqdq \$0x10,$HK,$Xmn
|
---|
973 |
|
---|
974 | pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
---|
975 | pxor $Xhn,$Xhi
|
---|
976 | pxor $Xi,$T1
|
---|
977 | pxor $Xhi,$T1
|
---|
978 | pxor $T1,$Xmn
|
---|
979 | movdqa $Xmn,$T1 #
|
---|
980 | psrldq \$8,$T1
|
---|
981 | pslldq \$8,$Xmn #
|
---|
982 | pxor $T1,$Xhi
|
---|
983 | pxor $Xmn,$Xi #
|
---|
984 | ___
|
---|
985 | &reduction_alg9 ($Xhi,$Xi);
|
---|
986 | $code.=<<___;
|
---|
987 | test $len,$len
|
---|
988 | jnz .Ldone
|
---|
989 |
|
---|
990 | .Lodd_tail:
|
---|
991 | movdqu ($inp),$T1 # Ii
|
---|
992 | pshufb $T3,$T1
|
---|
993 | pxor $T1,$Xi # Ii+Xi
|
---|
994 | ___
|
---|
995 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
|
---|
996 | &reduction_alg9 ($Xhi,$Xi);
|
---|
997 | $code.=<<___;
|
---|
998 | .Ldone:
|
---|
999 | pshufb $T3,$Xi
|
---|
1000 | movdqu $Xi,($Xip)
|
---|
1001 | ___
|
---|
1002 | $code.=<<___ if ($win64);
|
---|
1003 | movaps (%rsp),%xmm6
|
---|
1004 | movaps 0x10(%rsp),%xmm7
|
---|
1005 | movaps 0x20(%rsp),%xmm8
|
---|
1006 | movaps 0x30(%rsp),%xmm9
|
---|
1007 | movaps 0x40(%rsp),%xmm10
|
---|
1008 | movaps 0x50(%rsp),%xmm11
|
---|
1009 | movaps 0x60(%rsp),%xmm12
|
---|
1010 | movaps 0x70(%rsp),%xmm13
|
---|
1011 | movaps 0x80(%rsp),%xmm14
|
---|
1012 | movaps 0x90(%rsp),%xmm15
|
---|
1013 | lea 0xa8(%rsp),%rsp
|
---|
1014 | .LSEH_end_gcm_ghash_clmul:
|
---|
1015 | ___
|
---|
1016 | $code.=<<___;
|
---|
1017 | ret
|
---|
1018 | .cfi_endproc
|
---|
1019 | .size gcm_ghash_clmul,.-gcm_ghash_clmul
|
---|
1020 | ___
|
---|
1021 | }
|
---|
1022 | |
---|
1023 |
|
---|
1024 | $code.=<<___;
|
---|
1025 | .globl gcm_init_avx
|
---|
1026 | .type gcm_init_avx,\@abi-omnipotent
|
---|
1027 | .align 32
|
---|
1028 | gcm_init_avx:
|
---|
1029 | .cfi_startproc
|
---|
1030 | ___
|
---|
1031 | if ($avx) {
|
---|
1032 | my ($Htbl,$Xip)=@_4args;
|
---|
1033 | my $HK="%xmm6";
|
---|
1034 |
|
---|
1035 | $code.=<<___ if ($win64);
|
---|
1036 | .LSEH_begin_gcm_init_avx:
|
---|
1037 | # I can't trust assembler to use specific encoding:-(
|
---|
1038 | .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
|
---|
1039 | .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
|
---|
1040 | ___
|
---|
1041 | $code.=<<___;
|
---|
1042 | vzeroupper
|
---|
1043 |
|
---|
1044 | vmovdqu ($Xip),$Hkey
|
---|
1045 | vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
|
---|
1046 |
|
---|
1047 | # <<1 twist
|
---|
1048 | vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
|
---|
1049 | vpsrlq \$63,$Hkey,$T1
|
---|
1050 | vpsllq \$1,$Hkey,$Hkey
|
---|
1051 | vpxor $T3,$T3,$T3 #
|
---|
1052 | vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
|
---|
1053 | vpslldq \$8,$T1,$T1
|
---|
1054 | vpor $T1,$Hkey,$Hkey # H<<=1
|
---|
1055 |
|
---|
1056 | # magic reduction
|
---|
1057 | vpand .L0x1c2_polynomial(%rip),$T3,$T3
|
---|
1058 | vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
|
---|
1059 |
|
---|
1060 | vpunpckhqdq $Hkey,$Hkey,$HK
|
---|
1061 | vmovdqa $Hkey,$Xi
|
---|
1062 | vpxor $Hkey,$HK,$HK
|
---|
1063 | mov \$4,%r10 # up to H^8
|
---|
1064 | jmp .Linit_start_avx
|
---|
1065 | ___
|
---|
1066 |
|
---|
1067 | sub clmul64x64_avx {
|
---|
1068 | my ($Xhi,$Xi,$Hkey,$HK)=@_;
|
---|
1069 |
|
---|
1070 | if (!defined($HK)) { $HK = $T2;
|
---|
1071 | $code.=<<___;
|
---|
1072 | vpunpckhqdq $Xi,$Xi,$T1
|
---|
1073 | vpunpckhqdq $Hkey,$Hkey,$T2
|
---|
1074 | vpxor $Xi,$T1,$T1 #
|
---|
1075 | vpxor $Hkey,$T2,$T2
|
---|
1076 | ___
|
---|
1077 | } else {
|
---|
1078 | $code.=<<___;
|
---|
1079 | vpunpckhqdq $Xi,$Xi,$T1
|
---|
1080 | vpxor $Xi,$T1,$T1 #
|
---|
1081 | ___
|
---|
1082 | }
|
---|
1083 | $code.=<<___;
|
---|
1084 | vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
|
---|
1085 | vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
|
---|
1086 | vpclmulqdq \$0x00,$HK,$T1,$T1 #######
|
---|
1087 | vpxor $Xi,$Xhi,$T2 #
|
---|
1088 | vpxor $T2,$T1,$T1 #
|
---|
1089 |
|
---|
1090 | vpslldq \$8,$T1,$T2 #
|
---|
1091 | vpsrldq \$8,$T1,$T1
|
---|
1092 | vpxor $T2,$Xi,$Xi #
|
---|
1093 | vpxor $T1,$Xhi,$Xhi
|
---|
1094 | ___
|
---|
1095 | }
|
---|
1096 |
|
---|
1097 | sub reduction_avx {
|
---|
1098 | my ($Xhi,$Xi) = @_;
|
---|
1099 |
|
---|
1100 | $code.=<<___;
|
---|
1101 | vpsllq \$57,$Xi,$T1 # 1st phase
|
---|
1102 | vpsllq \$62,$Xi,$T2
|
---|
1103 | vpxor $T1,$T2,$T2 #
|
---|
1104 | vpsllq \$63,$Xi,$T1
|
---|
1105 | vpxor $T1,$T2,$T2 #
|
---|
1106 | vpslldq \$8,$T2,$T1 #
|
---|
1107 | vpsrldq \$8,$T2,$T2
|
---|
1108 | vpxor $T1,$Xi,$Xi #
|
---|
1109 | vpxor $T2,$Xhi,$Xhi
|
---|
1110 |
|
---|
1111 | vpsrlq \$1,$Xi,$T2 # 2nd phase
|
---|
1112 | vpxor $Xi,$Xhi,$Xhi
|
---|
1113 | vpxor $T2,$Xi,$Xi #
|
---|
1114 | vpsrlq \$5,$T2,$T2
|
---|
1115 | vpxor $T2,$Xi,$Xi #
|
---|
1116 | vpsrlq \$1,$Xi,$Xi #
|
---|
1117 | vpxor $Xhi,$Xi,$Xi #
|
---|
1118 | ___
|
---|
1119 | }
|
---|
1120 |
|
---|
1121 | $code.=<<___;
|
---|
1122 | .align 32
|
---|
1123 | .Linit_loop_avx:
|
---|
1124 | vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
|
---|
1125 | vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
|
---|
1126 | ___
|
---|
1127 | &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
|
---|
1128 | &reduction_avx ($Xhi,$Xi);
|
---|
1129 | $code.=<<___;
|
---|
1130 | .Linit_start_avx:
|
---|
1131 | vmovdqa $Xi,$T3
|
---|
1132 | ___
|
---|
1133 | &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
|
---|
1134 | &reduction_avx ($Xhi,$Xi);
|
---|
1135 | $code.=<<___;
|
---|
1136 | vpshufd \$0b01001110,$T3,$T1
|
---|
1137 | vpshufd \$0b01001110,$Xi,$T2
|
---|
1138 | vpxor $T3,$T1,$T1 # Karatsuba pre-processing
|
---|
1139 | vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
|
---|
1140 | vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
|
---|
1141 | vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
|
---|
1142 | lea 0x30($Htbl),$Htbl
|
---|
1143 | sub \$1,%r10
|
---|
1144 | jnz .Linit_loop_avx
|
---|
1145 |
|
---|
1146 | vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
|
---|
1147 | vmovdqu $T3,-0x10($Htbl)
|
---|
1148 |
|
---|
1149 | vzeroupper
|
---|
1150 | ___
|
---|
1151 | $code.=<<___ if ($win64);
|
---|
1152 | movaps (%rsp),%xmm6
|
---|
1153 | lea 0x18(%rsp),%rsp
|
---|
1154 | .LSEH_end_gcm_init_avx:
|
---|
1155 | ___
|
---|
1156 | $code.=<<___;
|
---|
1157 | ret
|
---|
1158 | .cfi_endproc
|
---|
1159 | .size gcm_init_avx,.-gcm_init_avx
|
---|
1160 | ___
|
---|
1161 | } else {
|
---|
1162 | $code.=<<___;
|
---|
1163 | jmp .L_init_clmul
|
---|
1164 | .cfi_endproc
|
---|
1165 | .size gcm_init_avx,.-gcm_init_avx
|
---|
1166 | ___
|
---|
1167 | }
|
---|
1168 |
|
---|
1169 | $code.=<<___;
|
---|
1170 | .globl gcm_gmult_avx
|
---|
1171 | .type gcm_gmult_avx,\@abi-omnipotent
|
---|
1172 | .align 32
|
---|
1173 | gcm_gmult_avx:
|
---|
1174 | .cfi_startproc
|
---|
1175 | jmp .L_gmult_clmul
|
---|
1176 | .cfi_endproc
|
---|
1177 | .size gcm_gmult_avx,.-gcm_gmult_avx
|
---|
1178 | ___
|
---|
1179 | |
---|
1180 |
|
---|
1181 | $code.=<<___;
|
---|
1182 | .globl gcm_ghash_avx
|
---|
1183 | .type gcm_ghash_avx,\@abi-omnipotent
|
---|
1184 | .align 32
|
---|
1185 | gcm_ghash_avx:
|
---|
1186 | .cfi_startproc
|
---|
1187 | ___
|
---|
1188 | if ($avx) {
|
---|
1189 | my ($Xip,$Htbl,$inp,$len)=@_4args;
|
---|
1190 | my ($Xlo,$Xhi,$Xmi,
|
---|
1191 | $Zlo,$Zhi,$Zmi,
|
---|
1192 | $Hkey,$HK,$T1,$T2,
|
---|
1193 | $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
|
---|
1194 |
|
---|
1195 | $code.=<<___ if ($win64);
|
---|
1196 | lea -0x88(%rsp),%rax
|
---|
1197 | .LSEH_begin_gcm_ghash_avx:
|
---|
1198 | # I can't trust assembler to use specific encoding:-(
|
---|
1199 | .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
|
---|
1200 | .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
|
---|
1201 | .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
|
---|
1202 | .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
|
---|
1203 | .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
|
---|
1204 | .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
|
---|
1205 | .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
|
---|
1206 | .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
|
---|
1207 | .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
|
---|
1208 | .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
|
---|
1209 | .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
|
---|
1210 | ___
|
---|
1211 | $code.=<<___;
|
---|
1212 | vzeroupper
|
---|
1213 |
|
---|
1214 | vmovdqu ($Xip),$Xi # load $Xi
|
---|
1215 | lea .L0x1c2_polynomial(%rip),%r10
|
---|
1216 | lea 0x40($Htbl),$Htbl # size optimization
|
---|
1217 | vmovdqu .Lbswap_mask(%rip),$bswap
|
---|
1218 | vpshufb $bswap,$Xi,$Xi
|
---|
1219 | cmp \$0x80,$len
|
---|
1220 | jb .Lshort_avx
|
---|
1221 | sub \$0x80,$len
|
---|
1222 |
|
---|
1223 | vmovdqu 0x70($inp),$Ii # I[7]
|
---|
1224 | vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
|
---|
1225 | vpshufb $bswap,$Ii,$Ii
|
---|
1226 | vmovdqu 0x20-0x40($Htbl),$HK
|
---|
1227 |
|
---|
1228 | vpunpckhqdq $Ii,$Ii,$T2
|
---|
1229 | vmovdqu 0x60($inp),$Ij # I[6]
|
---|
1230 | vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
---|
1231 | vpxor $Ii,$T2,$T2
|
---|
1232 | vpshufb $bswap,$Ij,$Ij
|
---|
1233 | vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
---|
1234 | vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
|
---|
1235 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1236 | vmovdqu 0x50($inp),$Ii # I[5]
|
---|
1237 | vpclmulqdq \$0x00,$HK,$T2,$Xmi
|
---|
1238 | vpxor $Ij,$T1,$T1
|
---|
1239 |
|
---|
1240 | vpshufb $bswap,$Ii,$Ii
|
---|
1241 | vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
---|
1242 | vpunpckhqdq $Ii,$Ii,$T2
|
---|
1243 | vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
---|
1244 | vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
|
---|
1245 | vpxor $Ii,$T2,$T2
|
---|
1246 | vmovdqu 0x40($inp),$Ij # I[4]
|
---|
1247 | vpclmulqdq \$0x10,$HK,$T1,$Zmi
|
---|
1248 | vmovdqu 0x50-0x40($Htbl),$HK
|
---|
1249 |
|
---|
1250 | vpshufb $bswap,$Ij,$Ij
|
---|
1251 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1252 | vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
---|
1253 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1254 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1255 | vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
---|
1256 | vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
|
---|
1257 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1258 | vpclmulqdq \$0x00,$HK,$T2,$Xmi
|
---|
1259 | vpxor $Ij,$T1,$T1
|
---|
1260 |
|
---|
1261 | vmovdqu 0x30($inp),$Ii # I[3]
|
---|
1262 | vpxor $Zlo,$Xlo,$Xlo
|
---|
1263 | vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
---|
1264 | vpxor $Zhi,$Xhi,$Xhi
|
---|
1265 | vpshufb $bswap,$Ii,$Ii
|
---|
1266 | vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
---|
1267 | vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
|
---|
1268 | vpxor $Zmi,$Xmi,$Xmi
|
---|
1269 | vpunpckhqdq $Ii,$Ii,$T2
|
---|
1270 | vpclmulqdq \$0x10,$HK,$T1,$Zmi
|
---|
1271 | vmovdqu 0x80-0x40($Htbl),$HK
|
---|
1272 | vpxor $Ii,$T2,$T2
|
---|
1273 |
|
---|
1274 | vmovdqu 0x20($inp),$Ij # I[2]
|
---|
1275 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1276 | vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
---|
1277 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1278 | vpshufb $bswap,$Ij,$Ij
|
---|
1279 | vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
---|
1280 | vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
|
---|
1281 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1282 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1283 | vpclmulqdq \$0x00,$HK,$T2,$Xmi
|
---|
1284 | vpxor $Ij,$T1,$T1
|
---|
1285 |
|
---|
1286 | vmovdqu 0x10($inp),$Ii # I[1]
|
---|
1287 | vpxor $Zlo,$Xlo,$Xlo
|
---|
1288 | vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
---|
1289 | vpxor $Zhi,$Xhi,$Xhi
|
---|
1290 | vpshufb $bswap,$Ii,$Ii
|
---|
1291 | vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
---|
1292 | vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
|
---|
1293 | vpxor $Zmi,$Xmi,$Xmi
|
---|
1294 | vpunpckhqdq $Ii,$Ii,$T2
|
---|
1295 | vpclmulqdq \$0x10,$HK,$T1,$Zmi
|
---|
1296 | vmovdqu 0xb0-0x40($Htbl),$HK
|
---|
1297 | vpxor $Ii,$T2,$T2
|
---|
1298 |
|
---|
1299 | vmovdqu ($inp),$Ij # I[0]
|
---|
1300 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1301 | vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
---|
1302 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1303 | vpshufb $bswap,$Ij,$Ij
|
---|
1304 | vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
---|
1305 | vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
|
---|
1306 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1307 | vpclmulqdq \$0x10,$HK,$T2,$Xmi
|
---|
1308 |
|
---|
1309 | lea 0x80($inp),$inp
|
---|
1310 | cmp \$0x80,$len
|
---|
1311 | jb .Ltail_avx
|
---|
1312 |
|
---|
1313 | vpxor $Xi,$Ij,$Ij # accumulate $Xi
|
---|
1314 | sub \$0x80,$len
|
---|
1315 | jmp .Loop8x_avx
|
---|
1316 |
|
---|
1317 | .align 32
|
---|
1318 | .Loop8x_avx:
|
---|
1319 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1320 | vmovdqu 0x70($inp),$Ii # I[7]
|
---|
1321 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1322 | vpxor $Ij,$T1,$T1
|
---|
1323 | vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
|
---|
1324 | vpshufb $bswap,$Ii,$Ii
|
---|
1325 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1326 | vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
|
---|
1327 | vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
|
---|
1328 | vpunpckhqdq $Ii,$Ii,$T2
|
---|
1329 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1330 | vpclmulqdq \$0x00,$HK,$T1,$Tred
|
---|
1331 | vmovdqu 0x20-0x40($Htbl),$HK
|
---|
1332 | vpxor $Ii,$T2,$T2
|
---|
1333 |
|
---|
1334 | vmovdqu 0x60($inp),$Ij # I[6]
|
---|
1335 | vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
---|
1336 | vpxor $Zlo,$Xi,$Xi # collect result
|
---|
1337 | vpshufb $bswap,$Ij,$Ij
|
---|
1338 | vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
---|
1339 | vxorps $Zhi,$Xo,$Xo
|
---|
1340 | vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
|
---|
1341 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1342 | vpclmulqdq \$0x00,$HK, $T2,$Xmi
|
---|
1343 | vpxor $Zmi,$Tred,$Tred
|
---|
1344 | vxorps $Ij,$T1,$T1
|
---|
1345 |
|
---|
1346 | vmovdqu 0x50($inp),$Ii # I[5]
|
---|
1347 | vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
|
---|
1348 | vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
---|
1349 | vpxor $Xo,$Tred,$Tred
|
---|
1350 | vpslldq \$8,$Tred,$T2
|
---|
1351 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1352 | vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
---|
1353 | vpsrldq \$8,$Tred,$Tred
|
---|
1354 | vpxor $T2, $Xi, $Xi
|
---|
1355 | vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
|
---|
1356 | vpshufb $bswap,$Ii,$Ii
|
---|
1357 | vxorps $Tred,$Xo, $Xo
|
---|
1358 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1359 | vpunpckhqdq $Ii,$Ii,$T2
|
---|
1360 | vpclmulqdq \$0x10,$HK, $T1,$Zmi
|
---|
1361 | vmovdqu 0x50-0x40($Htbl),$HK
|
---|
1362 | vpxor $Ii,$T2,$T2
|
---|
1363 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1364 |
|
---|
1365 | vmovdqu 0x40($inp),$Ij # I[4]
|
---|
1366 | vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
|
---|
1367 | vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
---|
1368 | vpshufb $bswap,$Ij,$Ij
|
---|
1369 | vpxor $Zlo,$Xlo,$Xlo
|
---|
1370 | vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
---|
1371 | vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
|
---|
1372 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1373 | vpxor $Zhi,$Xhi,$Xhi
|
---|
1374 | vpclmulqdq \$0x00,$HK, $T2,$Xmi
|
---|
1375 | vxorps $Ij,$T1,$T1
|
---|
1376 | vpxor $Zmi,$Xmi,$Xmi
|
---|
1377 |
|
---|
1378 | vmovdqu 0x30($inp),$Ii # I[3]
|
---|
1379 | vpclmulqdq \$0x10,(%r10),$Xi,$Xi
|
---|
1380 | vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
---|
1381 | vpshufb $bswap,$Ii,$Ii
|
---|
1382 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1383 | vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
---|
1384 | vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
|
---|
1385 | vpunpckhqdq $Ii,$Ii,$T2
|
---|
1386 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1387 | vpclmulqdq \$0x10,$HK, $T1,$Zmi
|
---|
1388 | vmovdqu 0x80-0x40($Htbl),$HK
|
---|
1389 | vpxor $Ii,$T2,$T2
|
---|
1390 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1391 |
|
---|
1392 | vmovdqu 0x20($inp),$Ij # I[2]
|
---|
1393 | vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
---|
1394 | vpshufb $bswap,$Ij,$Ij
|
---|
1395 | vpxor $Zlo,$Xlo,$Xlo
|
---|
1396 | vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
---|
1397 | vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
|
---|
1398 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1399 | vpxor $Zhi,$Xhi,$Xhi
|
---|
1400 | vpclmulqdq \$0x00,$HK, $T2,$Xmi
|
---|
1401 | vpxor $Ij,$T1,$T1
|
---|
1402 | vpxor $Zmi,$Xmi,$Xmi
|
---|
1403 | vxorps $Tred,$Xi,$Xi
|
---|
1404 |
|
---|
1405 | vmovdqu 0x10($inp),$Ii # I[1]
|
---|
1406 | vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
|
---|
1407 | vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
---|
1408 | vpshufb $bswap,$Ii,$Ii
|
---|
1409 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1410 | vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
---|
1411 | vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
|
---|
1412 | vpclmulqdq \$0x10,(%r10),$Xi,$Xi
|
---|
1413 | vxorps $Xo,$Tred,$Tred
|
---|
1414 | vpunpckhqdq $Ii,$Ii,$T2
|
---|
1415 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1416 | vpclmulqdq \$0x10,$HK, $T1,$Zmi
|
---|
1417 | vmovdqu 0xb0-0x40($Htbl),$HK
|
---|
1418 | vpxor $Ii,$T2,$T2
|
---|
1419 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1420 |
|
---|
1421 | vmovdqu ($inp),$Ij # I[0]
|
---|
1422 | vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
---|
1423 | vpshufb $bswap,$Ij,$Ij
|
---|
1424 | vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
---|
1425 | vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
|
---|
1426 | vpxor $Tred,$Ij,$Ij
|
---|
1427 | vpclmulqdq \$0x10,$HK, $T2,$Xmi
|
---|
1428 | vpxor $Xi,$Ij,$Ij # accumulate $Xi
|
---|
1429 |
|
---|
1430 | lea 0x80($inp),$inp
|
---|
1431 | sub \$0x80,$len
|
---|
1432 | jnc .Loop8x_avx
|
---|
1433 |
|
---|
1434 | add \$0x80,$len
|
---|
1435 | jmp .Ltail_no_xor_avx
|
---|
1436 |
|
---|
1437 | .align 32
|
---|
1438 | .Lshort_avx:
|
---|
1439 | vmovdqu -0x10($inp,$len),$Ii # very last word
|
---|
1440 | lea ($inp,$len),$inp
|
---|
1441 | vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
|
---|
1442 | vmovdqu 0x20-0x40($Htbl),$HK
|
---|
1443 | vpshufb $bswap,$Ii,$Ij
|
---|
1444 |
|
---|
1445 | vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
|
---|
1446 | vmovdqa $Xhi,$Zhi # $Zhi and
|
---|
1447 | vmovdqa $Xmi,$Zmi # $Zmi
|
---|
1448 | sub \$0x10,$len
|
---|
1449 | jz .Ltail_avx
|
---|
1450 |
|
---|
1451 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1452 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1453 | vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
---|
1454 | vpxor $Ij,$T1,$T1
|
---|
1455 | vmovdqu -0x20($inp),$Ii
|
---|
1456 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1457 | vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
---|
1458 | vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
|
---|
1459 | vpshufb $bswap,$Ii,$Ij
|
---|
1460 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1461 | vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
---|
1462 | vpsrldq \$8,$HK,$HK
|
---|
1463 | sub \$0x10,$len
|
---|
1464 | jz .Ltail_avx
|
---|
1465 |
|
---|
1466 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1467 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1468 | vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
---|
1469 | vpxor $Ij,$T1,$T1
|
---|
1470 | vmovdqu -0x30($inp),$Ii
|
---|
1471 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1472 | vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
---|
1473 | vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
|
---|
1474 | vpshufb $bswap,$Ii,$Ij
|
---|
1475 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1476 | vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
---|
1477 | vmovdqu 0x50-0x40($Htbl),$HK
|
---|
1478 | sub \$0x10,$len
|
---|
1479 | jz .Ltail_avx
|
---|
1480 |
|
---|
1481 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1482 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1483 | vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
---|
1484 | vpxor $Ij,$T1,$T1
|
---|
1485 | vmovdqu -0x40($inp),$Ii
|
---|
1486 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1487 | vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
---|
1488 | vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
|
---|
1489 | vpshufb $bswap,$Ii,$Ij
|
---|
1490 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1491 | vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
---|
1492 | vpsrldq \$8,$HK,$HK
|
---|
1493 | sub \$0x10,$len
|
---|
1494 | jz .Ltail_avx
|
---|
1495 |
|
---|
1496 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1497 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1498 | vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
---|
1499 | vpxor $Ij,$T1,$T1
|
---|
1500 | vmovdqu -0x50($inp),$Ii
|
---|
1501 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1502 | vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
---|
1503 | vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
|
---|
1504 | vpshufb $bswap,$Ii,$Ij
|
---|
1505 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1506 | vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
---|
1507 | vmovdqu 0x80-0x40($Htbl),$HK
|
---|
1508 | sub \$0x10,$len
|
---|
1509 | jz .Ltail_avx
|
---|
1510 |
|
---|
1511 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1512 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1513 | vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
---|
1514 | vpxor $Ij,$T1,$T1
|
---|
1515 | vmovdqu -0x60($inp),$Ii
|
---|
1516 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1517 | vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
---|
1518 | vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
|
---|
1519 | vpshufb $bswap,$Ii,$Ij
|
---|
1520 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1521 | vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
---|
1522 | vpsrldq \$8,$HK,$HK
|
---|
1523 | sub \$0x10,$len
|
---|
1524 | jz .Ltail_avx
|
---|
1525 |
|
---|
1526 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1527 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1528 | vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
---|
1529 | vpxor $Ij,$T1,$T1
|
---|
1530 | vmovdqu -0x70($inp),$Ii
|
---|
1531 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1532 | vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
---|
1533 | vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
|
---|
1534 | vpshufb $bswap,$Ii,$Ij
|
---|
1535 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1536 | vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
---|
1537 | vmovq 0xb8-0x40($Htbl),$HK
|
---|
1538 | sub \$0x10,$len
|
---|
1539 | jmp .Ltail_avx
|
---|
1540 |
|
---|
1541 | .align 32
|
---|
1542 | .Ltail_avx:
|
---|
1543 | vpxor $Xi,$Ij,$Ij # accumulate $Xi
|
---|
1544 | .Ltail_no_xor_avx:
|
---|
1545 | vpunpckhqdq $Ij,$Ij,$T1
|
---|
1546 | vpxor $Xlo,$Zlo,$Zlo
|
---|
1547 | vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
---|
1548 | vpxor $Ij,$T1,$T1
|
---|
1549 | vpxor $Xhi,$Zhi,$Zhi
|
---|
1550 | vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
---|
1551 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1552 | vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
---|
1553 |
|
---|
1554 | vmovdqu (%r10),$Tred
|
---|
1555 |
|
---|
1556 | vpxor $Xlo,$Zlo,$Xi
|
---|
1557 | vpxor $Xhi,$Zhi,$Xo
|
---|
1558 | vpxor $Xmi,$Zmi,$Zmi
|
---|
1559 |
|
---|
1560 | vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
|
---|
1561 | vpxor $Xo, $Zmi,$Zmi
|
---|
1562 | vpslldq \$8, $Zmi,$T2
|
---|
1563 | vpsrldq \$8, $Zmi,$Zmi
|
---|
1564 | vpxor $T2, $Xi, $Xi
|
---|
1565 | vpxor $Zmi,$Xo, $Xo
|
---|
1566 |
|
---|
1567 | vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
|
---|
1568 | vpalignr \$8,$Xi,$Xi,$Xi
|
---|
1569 | vpxor $T2,$Xi,$Xi
|
---|
1570 |
|
---|
1571 | vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
|
---|
1572 | vpalignr \$8,$Xi,$Xi,$Xi
|
---|
1573 | vpxor $Xo,$Xi,$Xi
|
---|
1574 | vpxor $T2,$Xi,$Xi
|
---|
1575 |
|
---|
1576 | cmp \$0,$len
|
---|
1577 | jne .Lshort_avx
|
---|
1578 |
|
---|
1579 | vpshufb $bswap,$Xi,$Xi
|
---|
1580 | vmovdqu $Xi,($Xip)
|
---|
1581 | vzeroupper
|
---|
1582 | ___
|
---|
1583 | $code.=<<___ if ($win64);
|
---|
1584 | movaps (%rsp),%xmm6
|
---|
1585 | movaps 0x10(%rsp),%xmm7
|
---|
1586 | movaps 0x20(%rsp),%xmm8
|
---|
1587 | movaps 0x30(%rsp),%xmm9
|
---|
1588 | movaps 0x40(%rsp),%xmm10
|
---|
1589 | movaps 0x50(%rsp),%xmm11
|
---|
1590 | movaps 0x60(%rsp),%xmm12
|
---|
1591 | movaps 0x70(%rsp),%xmm13
|
---|
1592 | movaps 0x80(%rsp),%xmm14
|
---|
1593 | movaps 0x90(%rsp),%xmm15
|
---|
1594 | lea 0xa8(%rsp),%rsp
|
---|
1595 | .LSEH_end_gcm_ghash_avx:
|
---|
1596 | ___
|
---|
1597 | $code.=<<___;
|
---|
1598 | ret
|
---|
1599 | .cfi_endproc
|
---|
1600 | .size gcm_ghash_avx,.-gcm_ghash_avx
|
---|
1601 | ___
|
---|
1602 | } else {
|
---|
1603 | $code.=<<___;
|
---|
1604 | jmp .L_ghash_clmul
|
---|
1605 | .cfi_endproc
|
---|
1606 | .size gcm_ghash_avx,.-gcm_ghash_avx
|
---|
1607 | ___
|
---|
1608 | }
|
---|
1609 | |
---|
1610 |
|
---|
1611 | $code.=<<___;
|
---|
1612 | .align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long 7,0,7,0
.L7_mask_poly:
.long 7,0,`0xE1<<1`,0
.align 64
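# .Lrem_4bit and .Lrem_8bit are reduction look-up tables for the
# table-driven (non-CLMUL) code paths: each entry is the correction
# folded back into the accumulator when 4 (respectively 8) low bits
# are shifted out during a GF(2^128) multiplication.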
.type .Lrem_4bit,\@object
.Lrem_4bit:
.long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
.long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
.long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
.long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
.type .Lrem_8bit,\@object
.Lrem_8bit:
.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
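# se_handler restores the registers saved by the 4-bit routines when an
# exception unwinds through them: it checks whether context->Rip lies
# between the prologue and epilogue labels recorded in HandlerData,
# recovers the saved non-volatile registers from the stack frame, and
# then hands off to RtlVirtualUnwind.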
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData

mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue

mov 152($context),%rax # pull context->Rsp

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue

lea 48+280(%rax),%rax # adjust "rsp"

mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15

.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi

mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$`1232/8`,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq

mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)

mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler

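# .pdata holds one RUNTIME_FUNCTION entry (begin, end and unwind-info
# RVAs) per routine, so the Win64 unwinder can locate the matching
# .xdata record.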
.section .pdata
.align 4
.rva .LSEH_begin_gcm_gmult_4bit
.rva .LSEH_end_gcm_gmult_4bit
.rva .LSEH_info_gcm_gmult_4bit

.rva .LSEH_begin_gcm_ghash_4bit
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit

.rva .LSEH_begin_gcm_init_clmul
.rva .LSEH_end_gcm_init_clmul
.rva .LSEH_info_gcm_init_clmul

.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_gcm_init_avx
.rva .LSEH_end_gcm_init_avx
.rva .LSEH_info_gcm_init_clmul

.rva .LSEH_begin_gcm_ghash_avx
.rva .LSEH_end_gcm_ghash_avx
.rva .LSEH_info_gcm_ghash_clmul
___
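# The .xdata records below are UNWIND_INFO structures: the 4-bit
# routines register se_handler as their exception handler with the
# prologue/epilogue labels as HandlerData, while the clmul entries are
# hand-encoded unwind codes describing the xmm6-xmm15 saves and the
# stack allocation; the AVX functions registered in .pdata above reuse
# the clmul records.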
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_gcm_gmult_4bit:
.byte 9,0,0,0
.rva se_handler
.rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
.LSEH_info_gcm_ghash_4bit:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
.LSEH_info_gcm_init_clmul:
.byte 0x01,0x08,0x03,0x00
.byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
.byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
.LSEH_info_gcm_ghash_clmul:
.byte 0x01,0x33,0x16,0x00
.byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
.byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
.byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
.byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
.byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
.byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
.byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
.byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
.byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
.byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
.byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";