1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # March, June 2010
|
---|
18 | #
|
---|
19 | # The module implements "4-bit" GCM GHASH function and underlying
|
---|
20 | # single multiplication operation in GF(2^128). "4-bit" means that
|
---|
21 | # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
|
---|
22 | # function features so called "528B" variant utilizing additional
|
---|
23 | # 256+16 bytes of per-key storage [+512 bytes shared table].
|
---|
24 | # Performance results are for this streamed GHASH subroutine and are
|
---|
25 | # expressed in cycles per processed byte, less is better:
|
---|
26 | #
|
---|
27 | # gcc 3.4.x(*) assembler
|
---|
28 | #
|
---|
29 | # P4 28.6 14.0 +100%
|
---|
30 | # Opteron 19.3 7.7 +150%
|
---|
31 | # Core2 17.8 8.1(**) +120%
|
---|
32 | # Atom 31.6 16.8 +88%
|
---|
33 | # VIA Nano 21.8 10.1 +115%
|
---|
34 | #
|
---|
35 | # (*) comparison is not completely fair, because C results are
|
---|
36 | # for vanilla "256B" implementation, while assembler results
|
---|
37 | # are for "528B";-)
|
---|
38 | # (**) it's mystery [to me] why Core2 result is not same as for
|
---|
39 | # Opteron;
|
---|
40 |
|
---|
41 | # May 2010
|
---|
42 | #
|
---|
43 | # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
|
---|
44 | # See ghash-x86.pl for background information and details about coding
|
---|
45 | # techniques.
|
---|
46 | #
|
---|
47 | # Special thanks to David Woodhouse for providing access to a
|
---|
48 | # Westmere-based system on behalf of Intel Open Source Technology Centre.
|
---|
49 |
|
---|
50 | # December 2012
|
---|
51 | #
|
---|
52 | # Overhaul: aggregate Karatsuba post-processing, improve ILP in
|
---|
53 | # reduction_alg9, increase reduction aggregate factor to 4x. As for
|
---|
54 | # the latter. ghash-x86.pl discusses that it makes lesser sense to
|
---|
55 | # increase aggregate factor. Then why increase here? Critical path
|
---|
56 | # consists of 3 independent pclmulqdq instructions, Karatsuba post-
|
---|
57 | # processing and reduction. "On top" of this we lay down aggregated
|
---|
58 | # multiplication operations, triplets of independent pclmulqdq's. As
|
---|
59 | # issue rate for pclmulqdq is limited, it makes lesser sense to
|
---|
60 | # aggregate more multiplications than it takes to perform remaining
|
---|
61 | # non-multiplication operations. 2x is near-optimal coefficient for
|
---|
62 | # contemporary Intel CPUs (therefore modest improvement coefficient),
|
---|
63 | # but not for Bulldozer. Latter is because logical SIMD operations
|
---|
64 | # are twice as slow in comparison to Intel, so that critical path is
|
---|
65 | # longer. A CPU with higher pclmulqdq issue rate would also benefit
|
---|
66 | # from higher aggregate factor...
|
---|
67 | #
|
---|
68 | # Westmere 1.78(+13%)
|
---|
69 | # Sandy Bridge 1.80(+8%)
|
---|
70 | # Ivy Bridge 1.80(+7%)
|
---|
71 | # Haswell 0.55(+93%) (if system doesn't support AVX)
|
---|
72 | # Broadwell 0.45(+110%)(if system doesn't support AVX)
|
---|
73 | # Skylake 0.44(+110%)(if system doesn't support AVX)
|
---|
74 | # Bulldozer 1.49(+27%)
|
---|
75 | # Silvermont 2.88(+13%)
|
---|
76 | # Knights L 2.12(-) (if system doesn't support AVX)
|
---|
77 | # Goldmont 1.08(+24%)
|
---|
78 |
|
---|
79 | # March 2013
|
---|
80 | #
|
---|
81 | # ... 8x aggregate factor AVX code path is using reduction algorithm
|
---|
82 | # suggested by Shay Gueron[1]. Even though contemporary AVX-capable
|
---|
83 | # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
|
---|
84 | # sub-optimally in comparison to above mentioned version. But thanks
|
---|
85 | # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
|
---|
86 | # it performs in 0.41 cycles per byte on Haswell processor, in
|
---|
87 | # 0.29 on Broadwell, and in 0.36 on Skylake.
|
---|
88 | #
|
---|
89 | # Knights Landing achieves 1.09 cpb.
|
---|
90 | #
|
---|
91 | # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
|
---|
92 |
|
---|
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64/COFF output is selected either by an explicit flavour or by an
# .asm output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first next to this script, then
# in the sibling perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the toolchain so that $avx ends up 0, 1 or 2 depending on how
# capable the assembler is; presumably this gates AVX code paths emitted
# later in the file (not visible in this chunk) -- TODO confirm.
#
# GNU as: 2.20 is the first cut-off, 2.22 the second.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.20) + ($1>=2.22);
}

# NASM on Windows: 2.09/2.10 cut-offs.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# MASM (ml64) on Windows: version 10/11 cut-offs.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang/LLVM-based compilers assemble themselves; 3.0 is the cut-off.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Pipe everything printed to STDOUT through the translator, which writes
# the flavour-specific assembly to $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
|
---|
127 |
|
---|
$do4xaggr=1;		# enable the 4x aggregated PCLMULQDQ path; when set,
			# gcm_init_clmul also precomputes H^3 and H^4

# common register layout
$nlo="%rax";		# table index derived from low nibble
$nhi="%rbx";		# table index derived from high nibble
$Zlo="%r8";		# low 64 bits of accumulator Z
$Zhi="%r9";		# high 64 bits of accumulator Z
$tmp="%r10";		# scratch
$rem_4bit = "%r11";	# -> .Lrem_4bit reduction table

$Xi="%rdi";		# 1st argument: Xi block
$Htbl="%rsi";		# 2nd argument: precomputed key table

# per-function register layout
$cnt="%rcx";		# loop counter
$rem="%rdx";		# reduction-table index
|
---|
144 |
|
---|
# LB - map a 64/32-bit general-purpose register name to its 8-bit
# ("low byte") form: %rax/%eax -> %al, %rsi/%esi -> %sil,
# %rbp/%ebp -> %bpl, %r10/%r10d -> %r10b.  A name matching none of the
# patterns is returned unchanged.
#
# Fixes vs. the original: the replacement parts used the deprecated \1
# backreference (warns under -w); $1/${1} is the correct spelling in a
# replacement.  The empty prototype is dropped -- every call site uses
# &LB(...), which bypasses prototypes anyway, and without it LB() can
# also be called directly.
sub LB { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/	or
			$r =~ s/%[er]([sd]i)/%${1}l/	or
			$r =~ s/%[er](bp)/%${1}l/	or
			$r =~ s/%(r[0-9]+)[d]?/%${1}b/;	$r; }
|
---|
149 |
|
---|
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
# Any call to an undefined sub (e.g. &mov, &xor, &lea) lands here; the
# sub's name becomes the assembler mnemonic appended to the global $code.
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;	# strip package qualifier
  my $arg = pop;	# last Perl argument becomes the FIRST asm operand
    # a purely numeric operand is an immediate and needs a '$' prefix
    $arg = "\$$arg" if ($arg*1 eq $arg);
    # operands are emitted in reverse order of the Perl call, i.e. callers
    # write (dst,src,...) and AT&T-style "src,dst" text is generated
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
|
---|
156 | |
---|
157 |
|
---|
{ my $N;	# static counter making the emitted labels (.Loop$N,
		# .Lbreak$N) unique across multiple expansions of this sub
  sub loop() {
  my $inp = shift;	# register holding the base address of the 16 input bytes

	$N++;
# Emit the 4-bit GHASH inner loop: byte 15 was already loaded by the
# caller into $Zlo; this walks bytes 14..0 of ($inp), doing two table
# lookups per byte (low and high nibble of each byte) with
# .Lrem_4bit-based reduction, then byte-swaps the final Z value.
# The assembly text below is appended to $code verbatim.
$code.=<<___;
	xor	$nlo,$nlo
	xor	$nhi,$nhi
	mov	`&LB("$Zlo")`,`&LB("$nlo")`
	mov	`&LB("$Zlo")`,`&LB("$nhi")`
	shl	\$4,`&LB("$nlo")`
	mov	\$14,$cnt
	mov	8($Htbl,$nlo),$Zlo
	mov	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	mov	$Zlo,$rem
	jmp	.Loop$N

.align	16
.Loop$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	mov	($inp,$cnt),`&LB("$nlo")`
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	mov	`&LB("$nlo")`,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	shl	\$4,`&LB("$nlo")`
	xor	$tmp,$Zlo
	dec	$cnt
	js	.Lbreak$N

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo
	jmp	.Loop$N

.align	16
.Lbreak$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	xor	$tmp,$Zlo
	xor	($rem_4bit,$rem,8),$Zhi

	bswap	$Zlo
	bswap	$Zhi
___
}}
|
---|
235 |
|
---|
# gcm_gmult_4bit: single GF(2^128) multiplication of the 16-byte block at
# ($Xi) by the hash key, using the 256-byte per-key table at ($Htbl) and
# the shared .Lrem_4bit reduction table.  Two arguments per the .type
# directive: (Xi, Htable).
$code=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,\@function,2
.align	16
gcm_gmult_4bit:
.cfi_startproc
	endbranch
	push	%rbx
.cfi_push	%rbx
	push	%rbp		# %rbp and others are pushed exclusively in
.cfi_push	%rbp
	push	%r12		# order to reuse Win64 exception handler...
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$280,%rsp
.cfi_adjust_cfa_offset	280
.Lgmult_prologue:

	movzb	15($Xi),$Zlo
	lea	.Lrem_4bit(%rip),$rem_4bit
___
	# the inner loop consumes the remaining bytes of Xi itself
	&loop	($Xi);
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	lea	280+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lgmult_epilogue:
	ret
.cfi_endproc
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
|
---|
281 | |
---|
282 |
|
---|
# per-function register layout
$inp="%rdx";		# 3rd arg: input data
$len="%rcx";		# 4th arg: byte count
$rem_8bit=$rem_4bit;	# %r11 doubles as the .Lrem_8bit pointer here

# gcm_ghash_4bit: streamed GHASH over $len bytes at ($inp), the "528B"
# variant described at the top of the file (stack-resident helper tables
# plus .Lrem_8bit-based reduction).
$code.=<<___;
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,\@function,4
.align	16
gcm_ghash_4bit:
.cfi_startproc
	endbranch
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$280,%rsp
.cfi_adjust_cfa_offset	280
.Lghash_prologue:
	mov	$inp,%r14		# reassign couple of args
	mov	$len,%r15
___
{ my $inp="%r14";
  my $dat="%edx";
  my $len="%r15";
  my @nhi=("%ebx","%ecx");
  my @rem=("%r12","%r13");
  my $Hshr4="%rbp";

	# Set-up: build the per-call working set on the stack -- a byte
	# table at (%rsp) and a 4-bit-shifted copy of Htbl at ($Hshr4).
	&sub	($Htbl,-128);		# size optimization
	&lea	($Hshr4,"16+128(%rsp)");
	{ my @lo =($nlo,$nhi);
	  my @hi =($Zlo,$Zhi);

	  # Software-pipelined over 18 iterations: loads ($i<16), shifts
	  # (0<$i<17) and stores ($i>1) overlap -- hence the staggered
	  # if($i...) guards.  @lo/@hi rotate each iteration.
	  &xor	($dat,$dat);
	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
	    &or		($lo[0],$tmp)			if ($i>1);
	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
	    &shr	($lo[1],4)			if ($i>0 && $i<17);
	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
	    &shr	($hi[1],4)			if ($i>0 && $i<17);
	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
	    &shl	($tmp,60)			if ($i>0 && $i<17);

	    push	(@lo,shift(@lo));
	    push	(@hi,shift(@hi));
	  }
	}
	&add	($Htbl,-128);
	&mov	($Zlo,"8($Xi)");
	&mov	($Zhi,"0($Xi)");
	&add	($len,$inp);		# pointer to the end of data
	&lea	($rem_8bit,".Lrem_8bit(%rip)");
	&jmp	(".Louter_loop");

# one 16-byte input block is hashed per .Louter_loop iteration
$code.=".align	16\n.Louter_loop:\n";
	&xor	($Zhi,"($inp)");
	&mov	("%rdx","8($inp)");
	&lea	($inp,"16($inp)");
	&xor	("%rdx",$Zlo);
	&mov	("($Xi)",$Zhi);
	&mov	("8($Xi)","%rdx");
	&shr	("%rdx",32);

	&xor	($nlo,$nlo);
	&rol	($dat,8);
	&mov	(&LB($nlo),&LB($dat));
	&movz	($nhi[0],&LB($dat));
	&shl	(&LB($nlo),4);
	&shr	($nhi[0],4);

	# 15 software-pipelined rounds; @nhi/@rem are rotated at the end of
	# each round so adjacent rounds use alternating registers
	for ($j=11,$i=0;$i<15;$i++) {
	    &rol	($dat,8);
	    &xor	($Zlo,"8($Htbl,$nlo)")			if ($i>0);
	    &xor	($Zhi,"($Htbl,$nlo)")			if ($i>0);
	    &mov	($Zlo,"8($Htbl,$nlo)")			if ($i==0);
	    &mov	($Zhi,"($Htbl,$nlo)")			if ($i==0);

	    &mov	(&LB($nlo),&LB($dat));
	    &xor	($Zlo,$tmp)				if ($i>0);
	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);

	    &movz	($nhi[1],&LB($dat));
	    &shl	(&LB($nlo),4);
	    &movzb	($rem[0],"(%rsp,$nhi[0])");

	    &shr	($nhi[1],4)				if ($i<14);
	    &and	($nhi[1],0xf0)				if ($i==14);
	    &shl	($rem[1],48)				if ($i>0);
	    &xor	($rem[0],$Zlo);

	    &mov	($tmp,$Zhi);
	    &xor	($Zhi,$rem[1])				if ($i>0);
	    &shr	($Zlo,8);

	    &movz	($rem[0],&LB($rem[0]));
	    &mov	($dat,"$j($Xi)")			if (--$j%4==0);
	    &shr	($Zhi,8);

	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
	    &shl	($tmp,56);
	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");

	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
	    unshift	(@rem,pop(@rem));
	}
	# final round and reduction tail
	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
	&xor	($Zlo,"8($Htbl,$nlo)");
	&xor	($Zhi,"($Htbl,$nlo)");

	&shl	($rem[1],48);
	&xor	($Zlo,$tmp);

	&xor	($Zhi,$rem[1]);
	&movz	($rem[0],&LB($Zlo));
	&shr	($Zlo,4);

	&mov	($tmp,$Zhi);
	&shl	(&LB($rem[0]),4);
	&shr	($Zhi,4);

	&xor	($Zlo,"8($Htbl,$nhi[0])");
	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
	&shl	($tmp,60);

	&xor	($Zhi,"($Htbl,$nhi[0])");
	&xor	($Zlo,$tmp);
	&shl	($rem[0],48);

	&bswap	($Zlo);
	&xor	($Zhi,$rem[0]);

	&bswap	($Zhi);
	&cmp	($inp,$len);
	&jb	(".Louter_loop");
}
$code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

	lea	280+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	0(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lghash_epilogue:
	ret
.cfi_endproc
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
___
|
---|
457 | |
---|
458 |
|
---|
######################################################################
# PCLMULQDQ version.

# first four integer-argument registers for the target ABI
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

# xmm register layout shared by the clmul code: $Xhi:$Xi is the 256-bit
# product/accumulator, $Hkey holds H, $T1-$T3 are scratch
($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
|
---|
467 |
|
---|
sub clmul64x64_T2 {	# minimal register pressure
# Emit a 128x128->256 carry-less multiply $Xhi:$Xi = $Xi * $Hkey using the
# Karatsuba trick (three pclmulqdq instead of four).  $HK, when supplied,
# must already hold $Hkey.lo^$Hkey.hi; otherwise it is computed into $T2.
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;	# compute Karatsuba term on the fly
$code.=<<___;
	movdqa	$Xi,$Xhi		#
	pshufd	\$0b01001110,$Xi,$T1
	pshufd	\$0b01001110,$Hkey,$T2
	pxor	$Xi,$T1			#
	pxor	$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa	$Xi,$Xhi		#
	pshufd	\$0b01001110,$Xi,$T1
	pxor	$Xi,$T1			#
___
}
# three multiplications plus the Karatsuba recombination into $Xhi:$Xi
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor	$Xi,$T1			#
	pxor	$Xhi,$T1		#

	movdqa	$T1,$T2			#
	psrldq	\$8,$T1
	pslldq	\$8,$T2			#
	pxor	$T1,$Xhi
	pxor	$T2,$Xi			#
___
}
|
---|
500 |
|
---|
sub reduction_alg9 {	# 17/11 times faster than Intel version
# Emit the two-phase reduction of the 256-bit product $Xhi:$Xi modulo the
# GCM polynomial, leaving the 128-bit result in $Xi.  $T1/$T2 are
# clobbered as scratch.
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa	$Xi,$T2			#
	movdqa	$Xi,$T1
	psllq	\$5,$Xi
	pxor	$Xi,$T1			#
	psllq	\$1,$Xi
	pxor	$T1,$Xi			#
	psllq	\$57,$Xi		#
	movdqa	$Xi,$T1			#
	pslldq	\$8,$Xi
	psrldq	\$8,$T1			#
	pxor	$T2,$Xi
	pxor	$T1,$Xhi		#

	# 2nd phase
	movdqa	$Xi,$T2
	psrlq	\$1,$Xi
	pxor	$T2,$Xhi		#
	pxor	$Xi,$T2
	psrlq	\$5,$Xi
	pxor	$T2,$Xi			#
	psrlq	\$1,$Xi			#
	pxor	$Xhi,$Xi		#
___
}
|
---|
530 | |
---|
531 |
|
---|
{ my ($Htbl,$Xip)=@_4args;	# (Htable, H) in the ABI's first two arg regs
  my $HK="%xmm6";

# gcm_init_clmul: build the PCLMULQDQ key schedule.  Loads H from ($Xip),
# applies the <<1 "twist" with conditional 0x1c2 reduction, then stores H,
# H^2 (and, with $do4xaggr, H^3 and H^4) plus their Karatsuba "salt"
# values (lo^hi halves) at 16-byte offsets in ($Htbl).
$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
.L_init_clmul:
___
# Win64: %xmm6 is non-volatile, so spill it; encodings are hand-coded
# (see the in-line remark below about not trusting the assembler)
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	movdqu	($Xip),$Hkey
	pshufd	\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd	\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa	$Hkey,$T1
	psllq	\$1,$Hkey
	pxor	$T3,$T3			#
	psrlq	\$63,$T1
	pcmpgtd	$T2,$T3			# broadcast carry bit
	pslldq	\$8,$T1
	por	$T1,$Hkey		# H<<=1

	# magic reduction
	pand	.L0x1c2_polynomial(%rip),$T3
	pxor	$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd	\$0b01001110,$Hkey,$HK
	movdqa	$Hkey,$Xi
	pxor	$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd	\$0b01001110,$Hkey,$T1
	pshufd	\$0b01001110,$Xi,$T2
	pxor	$Hkey,$T1		# Karatsuba pre-processing
	movdqu	$Hkey,0x00($Htbl)	# save H
	pxor	$Xi,$T2			# Karatsuba pre-processing
	movdqu	$Xi,0x10($Htbl)		# save H^2
	palignr	\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu	$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	# extend the table with H^3 and H^4 for the 4x aggregated path
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa	$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd	\$0b01001110,$T3,$T1
	pshufd	\$0b01001110,$Xi,$T2
	pxor	$T3,$T1			# Karatsuba pre-processing
	movdqu	$T3,0x30($Htbl)		# save H^3
	pxor	$Xi,$T2			# Karatsuba pre-processing
	movdqu	$Xi,0x40($Htbl)		# save H^4
	palignr	\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu	$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
}
|
---|
614 |
|
---|
{ my ($Xip,$Htbl)=@_4args;	# (Xi, Htable)

# gcm_gmult_clmul: single multiplication Xi = Xi*H via PCLMULQDQ, using H
# at ($Htbl) and the precomputed H.lo^H.hi "salt" at 0x20($Htbl) that
# gcm_init_clmul stored there.
$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.cfi_startproc
	endbranch
.L_gmult_clmul:
	movdqu	($Xip),$Xi
	movdqa	.Lbswap_mask(%rip),$T3
	movdqu	($Htbl),$Hkey
	movdqu	0x20($Htbl),$T2
	pshufb	$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
# NB: the condition below is deliberately always false -- the
# "experimental alternative" heredoc is never emitted.  Its only effect
# is the side effect of calling &reduction_alg9, which appends the
# standard reduction code at this point.
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. special thing about is that there
	# no dependency between the two multiplications...
	mov	\$`0xE1<<1`,%eax
	mov	\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov	\$0x07,%r11d
	movq	%rax,$T1
	movq	%r10,$T2
	movq	%r11,$T3		# borrow $T3
	pand	$Xi,$T3
	pshufb	$T3,$T2			# ($Xi&7)·0xE0
	movq	%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1	# ·(0xE1<<1)
	pxor	$Xi,$T2
	pslldq	\$15,$T2
	paddd	$T2,$T2			# <<(64+56+1)
	pxor	$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa	.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq	\$1,$T1
	pxor	$T1,$Xhi
	pslldq	\$7,$Xi
	pxor	$Xhi,$Xi
___
$code.=<<___;
	pshufb	$T3,$Xi
	movdqu	$Xi,($Xip)
	ret
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}
|
---|
664 | |
---|
665 |
|
---|
666 | { my ($Xip,$Htbl,$inp,$len)=@_4args;
|
---|
667 | my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
|
---|
668 | my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
|
---|
669 |
|
---|
670 | $code.=<<___;
|
---|
671 | .globl gcm_ghash_clmul
|
---|
672 | .type gcm_ghash_clmul,\@abi-omnipotent
|
---|
673 | .align 32
|
---|
674 | gcm_ghash_clmul:
|
---|
675 | .cfi_startproc
|
---|
676 | endbranch
|
---|
677 | .L_ghash_clmul:
|
---|
678 | ___
|
---|
679 | $code.=<<___ if ($win64);
|
---|
680 | lea -0x88(%rsp),%rax
|
---|
681 | .LSEH_begin_gcm_ghash_clmul:
|
---|
682 | # I can't trust assembler to use specific encoding:-(
|
---|
683 | .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
|
---|
684 | .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
|
---|
685 | .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
|
---|
686 | .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
|
---|
687 | .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
|
---|
688 | .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
|
---|
689 | .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
|
---|
690 | .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
|
---|
691 | .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
|
---|
692 | .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
|
---|
693 | .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
|
---|
694 | ___
|
---|
695 | $code.=<<___;
|
---|
696 | movdqa .Lbswap_mask(%rip),$T3
|
---|
697 |
|
---|
698 | movdqu ($Xip),$Xi
|
---|
699 | movdqu ($Htbl),$Hkey
|
---|
700 | movdqu 0x20($Htbl),$HK
|
---|
701 | pshufb $T3,$Xi
|
---|
702 |
|
---|
703 | sub \$0x10,$len
|
---|
704 | jz .Lodd_tail
|
---|
705 |
|
---|
706 | movdqu 0x10($Htbl),$Hkey2
|
---|
707 | ___
|
---|
708 | if ($do4xaggr) {
|
---|
709 | my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
|
---|
710 |
|
---|
711 | $code.=<<___;
|
---|
712 | mov OPENSSL_ia32cap_P+4(%rip),%eax
|
---|
713 | cmp \$0x30,$len
|
---|
714 | jb .Lskip4x
|
---|
715 |
|
---|
716 | and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
|
---|
717 | cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
|
---|
718 | je .Lskip4x
|
---|
719 |
|
---|
720 | sub \$0x30,$len
|
---|
721 | mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
|
---|
722 | movdqu 0x30($Htbl),$Hkey3
|
---|
723 | movdqu 0x40($Htbl),$Hkey4
|
---|
724 |
|
---|
725 | #######
|
---|
726 | # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
|
---|
727 | #
|
---|
728 | movdqu 0x30($inp),$Xln
|
---|
729 | movdqu 0x20($inp),$Xl
|
---|
730 | pshufb $T3,$Xln
|
---|
731 | pshufb $T3,$Xl
|
---|
732 | movdqa $Xln,$Xhn
|
---|
733 | pshufd \$0b01001110,$Xln,$Xmn
|
---|
734 | pxor $Xln,$Xmn
|
---|
735 | pclmulqdq \$0x00,$Hkey,$Xln
|
---|
736 | pclmulqdq \$0x11,$Hkey,$Xhn
|
---|
737 | pclmulqdq \$0x00,$HK,$Xmn
|
---|
738 |
|
---|
739 | movdqa $Xl,$Xh
|
---|
740 | pshufd \$0b01001110,$Xl,$Xm
|
---|
741 | pxor $Xl,$Xm
|
---|
742 | pclmulqdq \$0x00,$Hkey2,$Xl
|
---|
743 | pclmulqdq \$0x11,$Hkey2,$Xh
|
---|
744 | pclmulqdq \$0x10,$HK,$Xm
|
---|
745 | xorps $Xl,$Xln
|
---|
746 | xorps $Xh,$Xhn
|
---|
747 | movups 0x50($Htbl),$HK
|
---|
748 | xorps $Xm,$Xmn
|
---|
749 |
|
---|
750 | movdqu 0x10($inp),$Xl
|
---|
751 | movdqu 0($inp),$T1
|
---|
752 | pshufb $T3,$Xl
|
---|
753 | pshufb $T3,$T1
|
---|
754 | movdqa $Xl,$Xh
|
---|
755 | pshufd \$0b01001110,$Xl,$Xm
|
---|
756 | pxor $T1,$Xi
|
---|
757 | pxor $Xl,$Xm
|
---|
758 | pclmulqdq \$0x00,$Hkey3,$Xl
|
---|
759 | movdqa $Xi,$Xhi
|
---|
760 | pshufd \$0b01001110,$Xi,$T1
|
---|
761 | pxor $Xi,$T1
|
---|
762 | pclmulqdq \$0x11,$Hkey3,$Xh
|
---|
763 | pclmulqdq \$0x00,$HK,$Xm
|
---|
764 | xorps $Xl,$Xln
|
---|
765 | xorps $Xh,$Xhn
|
---|
766 |
|
---|
767 | lea 0x40($inp),$inp
|
---|
768 | sub \$0x40,$len
|
---|
769 | jc .Ltail4x
|
---|
770 |
|
---|
771 | jmp .Lmod4_loop
|
---|
772 | .align 32
|
---|
773 | .Lmod4_loop:
|
---|
774 | pclmulqdq \$0x00,$Hkey4,$Xi
|
---|
775 | xorps $Xm,$Xmn
|
---|
776 | movdqu 0x30($inp),$Xl
|
---|
777 | pshufb $T3,$Xl
|
---|
778 | pclmulqdq \$0x11,$Hkey4,$Xhi
|
---|
779 | xorps $Xln,$Xi
|
---|
780 | movdqu 0x20($inp),$Xln
|
---|
781 | movdqa $Xl,$Xh
|
---|
782 | pclmulqdq \$0x10,$HK,$T1
|
---|
783 | pshufd \$0b01001110,$Xl,$Xm
|
---|
784 | xorps $Xhn,$Xhi
|
---|
785 | pxor $Xl,$Xm
|
---|
786 | pshufb $T3,$Xln
|
---|
787 | movups 0x20($Htbl),$HK
|
---|
788 | xorps $Xmn,$T1
|
---|
789 | pclmulqdq \$0x00,$Hkey,$Xl
|
---|
790 | pshufd \$0b01001110,$Xln,$Xmn
|
---|
791 |
|
---|
792 | pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
---|
793 | movdqa $Xln,$Xhn
|
---|
794 | pxor $Xhi,$T1 #
|
---|
795 | pxor $Xln,$Xmn
|
---|
796 | movdqa $T1,$T2 #
|
---|
797 | pclmulqdq \$0x11,$Hkey,$Xh
|
---|
798 | pslldq \$8,$T1
|
---|
799 | psrldq \$8,$T2 #
|
---|
800 | pxor $T1,$Xi
|
---|
801 | movdqa .L7_mask(%rip),$T1
|
---|
802 | pxor $T2,$Xhi #
|
---|
803 | movq %rax,$T2
|
---|
804 |
|
---|
805 | pand $Xi,$T1 # 1st phase
|
---|
806 | pshufb $T1,$T2 #
|
---|
807 | pxor $Xi,$T2 #
|
---|
808 | pclmulqdq \$0x00,$HK,$Xm
|
---|
809 | psllq \$57,$T2 #
|
---|
810 | movdqa $T2,$T1 #
|
---|
811 | pslldq \$8,$T2
|
---|
812 | pclmulqdq \$0x00,$Hkey2,$Xln
|
---|
813 | psrldq \$8,$T1 #
|
---|
814 | pxor $T2,$Xi
|
---|
815 | pxor $T1,$Xhi #
|
---|
816 | movdqu 0($inp),$T1
|
---|
817 |
|
---|
818 | movdqa $Xi,$T2 # 2nd phase
|
---|
819 | psrlq \$1,$Xi
|
---|
820 | pclmulqdq \$0x11,$Hkey2,$Xhn
|
---|
821 | xorps $Xl,$Xln
|
---|
822 | movdqu 0x10($inp),$Xl
|
---|
823 | pshufb $T3,$Xl
|
---|
824 | pclmulqdq \$0x10,$HK,$Xmn
|
---|
825 | xorps $Xh,$Xhn
|
---|
826 | movups 0x50($Htbl),$HK
|
---|
827 | pshufb $T3,$T1
|
---|
828 | pxor $T2,$Xhi #
|
---|
829 | pxor $Xi,$T2
|
---|
830 | psrlq \$5,$Xi
|
---|
831 |
|
---|
832 | movdqa $Xl,$Xh
|
---|
833 | pxor $Xm,$Xmn
|
---|
834 | pshufd \$0b01001110,$Xl,$Xm
|
---|
835 | pxor $T2,$Xi #
|
---|
836 | pxor $T1,$Xhi
|
---|
837 | pxor $Xl,$Xm
|
---|
838 | pclmulqdq \$0x00,$Hkey3,$Xl
|
---|
839 | psrlq \$1,$Xi #
|
---|
840 | pxor $Xhi,$Xi #
|
---|
841 | movdqa $Xi,$Xhi
|
---|
842 | pclmulqdq \$0x11,$Hkey3,$Xh
|
---|
843 | xorps $Xl,$Xln
|
---|
844 | pshufd \$0b01001110,$Xi,$T1
|
---|
845 | pxor $Xi,$T1
|
---|
846 |
|
---|
847 | pclmulqdq \$0x00,$HK,$Xm
|
---|
848 | xorps $Xh,$Xhn
|
---|
849 |
|
---|
850 | lea 0x40($inp),$inp
|
---|
851 | sub \$0x40,$len
|
---|
852 | jnc .Lmod4_loop
|
---|
853 |
|
---|
854 | .Ltail4x:
|
---|
855 | pclmulqdq \$0x00,$Hkey4,$Xi
|
---|
856 | pclmulqdq \$0x11,$Hkey4,$Xhi
|
---|
857 | pclmulqdq \$0x10,$HK,$T1
|
---|
858 | xorps $Xm,$Xmn
|
---|
859 | xorps $Xln,$Xi
|
---|
860 | xorps $Xhn,$Xhi
|
---|
861 | pxor $Xi,$Xhi # aggregated Karatsuba post-processing
|
---|
862 | pxor $Xmn,$T1
|
---|
863 |
|
---|
864 | pxor $Xhi,$T1 #
|
---|
865 | pxor $Xi,$Xhi
|
---|
866 |
|
---|
867 | movdqa $T1,$T2 #
|
---|
868 | psrldq \$8,$T1
|
---|
869 | pslldq \$8,$T2 #
|
---|
870 | pxor $T1,$Xhi
|
---|
871 | pxor $T2,$Xi #
|
---|
872 | ___
|
---|
873 | &reduction_alg9($Xhi,$Xi);
|
---|
874 | $code.=<<___;
|
---|
875 | add \$0x40,$len
|
---|
876 | jz .Ldone
|
---|
877 | movdqu 0x20($Htbl),$HK
|
---|
878 | sub \$0x10,$len
|
---|
879 | jz .Lodd_tail
|
---|
880 | .Lskip4x:
|
---|
881 | ___
|
---|
882 | }
|
---|
883 | $code.=<<___;
|
---|
884 | #######
|
---|
885 | # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
|
---|
886 | # [(H*Ii+1) + (H*Xi+1)] mod P =
|
---|
887 | # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
|
---|
888 | #
|
---|
889 | movdqu ($inp),$T1 # Ii
|
---|
890 | movdqu 16($inp),$Xln # Ii+1
|
---|
891 | pshufb $T3,$T1
|
---|
892 | pshufb $T3,$Xln
|
---|
893 | pxor $T1,$Xi # Ii+Xi
|
---|
894 |
|
---|
895 | movdqa $Xln,$Xhn
|
---|
896 | pshufd \$0b01001110,$Xln,$Xmn
|
---|
897 | pxor $Xln,$Xmn
|
---|
898 | pclmulqdq \$0x00,$Hkey,$Xln
|
---|
899 | pclmulqdq \$0x11,$Hkey,$Xhn
|
---|
900 | pclmulqdq \$0x00,$HK,$Xmn
|
---|
901 |
|
---|
902 | lea 32($inp),$inp # i+=2
|
---|
903 | nop
|
---|
904 | sub \$0x20,$len
|
---|
905 | jbe .Leven_tail
|
---|
906 | nop
|
---|
907 | jmp .Lmod_loop
|
---|
908 |
|
---|
909 | .align 32
|
---|
910 | .Lmod_loop:
|
---|
911 | movdqa $Xi,$Xhi
|
---|
912 | movdqa $Xmn,$T1
|
---|
913 | pshufd \$0b01001110,$Xi,$Xmn #
|
---|
914 | pxor $Xi,$Xmn #
|
---|
915 |
|
---|
916 | pclmulqdq \$0x00,$Hkey2,$Xi
|
---|
917 | pclmulqdq \$0x11,$Hkey2,$Xhi
|
---|
918 | pclmulqdq \$0x10,$HK,$Xmn
|
---|
919 |
|
---|
920 | pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
---|
921 | pxor $Xhn,$Xhi
|
---|
922 | movdqu ($inp),$T2 # Ii
|
---|
923 | pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
---|
924 | pshufb $T3,$T2
|
---|
925 | movdqu 16($inp),$Xln # Ii+1
|
---|
926 |
|
---|
927 | pxor $Xhi,$T1
|
---|
928 | pxor $T2,$Xhi # "Ii+Xi", consume early
|
---|
929 | pxor $T1,$Xmn
|
---|
930 | pshufb $T3,$Xln
|
---|
931 | movdqa $Xmn,$T1 #
|
---|
932 | psrldq \$8,$T1
|
---|
933 | pslldq \$8,$Xmn #
|
---|
934 | pxor $T1,$Xhi
|
---|
935 | pxor $Xmn,$Xi #
|
---|
936 |
|
---|
937 | movdqa $Xln,$Xhn #
|
---|
938 |
|
---|
939 | movdqa $Xi,$T2 # 1st phase
|
---|
940 | movdqa $Xi,$T1
|
---|
941 | psllq \$5,$Xi
|
---|
942 | pxor $Xi,$T1 #
|
---|
943 | pclmulqdq \$0x00,$Hkey,$Xln #######
|
---|
944 | psllq \$1,$Xi
|
---|
945 | pxor $T1,$Xi #
|
---|
946 | psllq \$57,$Xi #
|
---|
947 | movdqa $Xi,$T1 #
|
---|
948 | pslldq \$8,$Xi
|
---|
949 | psrldq \$8,$T1 #
|
---|
950 | pxor $T2,$Xi
|
---|
951 | pshufd \$0b01001110,$Xhn,$Xmn
|
---|
952 | pxor $T1,$Xhi #
|
---|
953 | pxor $Xhn,$Xmn #
|
---|
954 |
|
---|
955 | movdqa $Xi,$T2 # 2nd phase
|
---|
956 | psrlq \$1,$Xi
|
---|
957 | pclmulqdq \$0x11,$Hkey,$Xhn #######
|
---|
958 | pxor $T2,$Xhi #
|
---|
959 | pxor $Xi,$T2
|
---|
960 | psrlq \$5,$Xi
|
---|
961 | pxor $T2,$Xi #
|
---|
962 | lea 32($inp),$inp
|
---|
963 | psrlq \$1,$Xi #
|
---|
964 | pclmulqdq \$0x00,$HK,$Xmn #######
|
---|
965 | pxor $Xhi,$Xi #
|
---|
966 |
|
---|
967 | sub \$0x20,$len
|
---|
968 | ja .Lmod_loop
|
---|
969 |
|
---|
970 | .Leven_tail:
|
---|
971 | movdqa $Xi,$Xhi
|
---|
972 | movdqa $Xmn,$T1
|
---|
973 | pshufd \$0b01001110,$Xi,$Xmn #
|
---|
974 | pxor $Xi,$Xmn #
|
---|
975 |
|
---|
976 | pclmulqdq \$0x00,$Hkey2,$Xi
|
---|
977 | pclmulqdq \$0x11,$Hkey2,$Xhi
|
---|
978 | pclmulqdq \$0x10,$HK,$Xmn
|
---|
979 |
|
---|
980 | pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
---|
981 | pxor $Xhn,$Xhi
|
---|
982 | pxor $Xi,$T1
|
---|
983 | pxor $Xhi,$T1
|
---|
984 | pxor $T1,$Xmn
|
---|
985 | movdqa $Xmn,$T1 #
|
---|
986 | psrldq \$8,$T1
|
---|
987 | pslldq \$8,$Xmn #
|
---|
988 | pxor $T1,$Xhi
|
---|
989 | pxor $Xmn,$Xi #
|
---|
990 | ___
|
---|
991 | &reduction_alg9 ($Xhi,$Xi);
|
---|
992 | $code.=<<___;
|
---|
993 | test $len,$len
|
---|
994 | jnz .Ldone
|
---|
995 |
|
---|
996 | .Lodd_tail:
|
---|
997 | movdqu ($inp),$T1 # Ii
|
---|
998 | pshufb $T3,$T1
|
---|
999 | pxor $T1,$Xi # Ii+Xi
|
---|
1000 | ___
|
---|
1001 | &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
|
---|
1002 | &reduction_alg9 ($Xhi,$Xi);
|
---|
1003 | $code.=<<___;
|
---|
1004 | .Ldone:
|
---|
1005 | pshufb $T3,$Xi
|
---|
1006 | movdqu $Xi,($Xip)
|
---|
1007 | ___
|
---|
1008 | $code.=<<___ if ($win64);
|
---|
1009 | movaps (%rsp),%xmm6
|
---|
1010 | movaps 0x10(%rsp),%xmm7
|
---|
1011 | movaps 0x20(%rsp),%xmm8
|
---|
1012 | movaps 0x30(%rsp),%xmm9
|
---|
1013 | movaps 0x40(%rsp),%xmm10
|
---|
1014 | movaps 0x50(%rsp),%xmm11
|
---|
1015 | movaps 0x60(%rsp),%xmm12
|
---|
1016 | movaps 0x70(%rsp),%xmm13
|
---|
1017 | movaps 0x80(%rsp),%xmm14
|
---|
1018 | movaps 0x90(%rsp),%xmm15
|
---|
1019 | lea 0xa8(%rsp),%rsp
|
---|
1020 | .LSEH_end_gcm_ghash_clmul:
|
---|
1021 | ___
|
---|
1022 | $code.=<<___;
|
---|
1023 | ret
|
---|
1024 | .cfi_endproc
|
---|
1025 | .size gcm_ghash_clmul,.-gcm_ghash_clmul
|
---|
1026 | ___
|
---|
1027 | }
|
---|
1028 | |
---|
1029 |
|
---|
# gcm_init_avx: precompute the key-dependent table used by the AVX
# GHASH path.  The entry point is emitted unconditionally; the AVX body
# below is generated only when $avx is set (the non-AVX fallback branch
# appears after the helper subs, where this "if" is closed).
$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;	# NOTE(review): @_4args is defined earlier in the file;
				# presumably the four ABI argument registers — confirm there.
my $HK="%xmm6";			# will hold Hkey.lo^Hkey.hi (Karatsuba helper)

# Win64: xmm6 is non-volatile, so spill it.  The bytes are hand-encoded
# so the instruction encodings match the SEH unwind descriptor exactly.
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
# Load H, apply the <<1 "twist" (conditionally folding in the 0x1c2
# polynomial when the shift carries out), precompute H.lo^H.hi in $HK,
# and enter the power-computation loop: %r10 counts 4 iterations,
# enough to produce H^1..H^8 (see "mov \$4,%r10" below).
$code.=<<___;
	vzeroupper

	vmovdqu	($Xip),$Hkey
	vpshufd	\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd	\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq	\$63,$Hkey,$T1
	vpsllq	\$1,$Hkey,$Hkey
	vpxor	$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq	\$8,$T1,$T1
	vpor	$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand	.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor	$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa	$Hkey,$Xi
	vpxor	$Hkey,$HK,$HK
	mov	\$4,%r10		# up to H^8
	jmp	.Linit_start_avx
___
|
---|
1072 |
|
---|
# clmul64x64_avx($Xhi, $Xi, $Hkey [, $HK])
#
# Emit AVX code for one 128x128->256-bit carry-less multiplication,
# $Xhi:$Xi = $Xi * $Hkey, using the Karatsuba trick (three vpclmulqdq
# plus xors).  If $HK is supplied it already holds Hkey.lo^Hkey.hi;
# when it is omitted the combination is computed on the fly into the
# file-level scratch register $T2.  $T1/$T2 are file-level scratch xmm
# names; the 128-bit middle term is folded into the high/low result
# halves at the end.
sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
# No precomputed Hkey.lo^Hkey.hi available: derive both Xi.lo^Xi.hi
# and Hkey.lo^Hkey.hi here.
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
# $HK already holds Hkey.lo^Hkey.hi; only Xi.lo^Xi.hi is needed.
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
# Three partial products, Karatsuba middle-term fixup, then split the
# middle term across the two result halves.
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq	\$8,$T1,$T2		#
	vpsrldq	\$8,$T1,$T1
	vpxor	$T2,$Xi,$Xi		#
	vpxor	$T1,$Xhi,$Xhi
___
}
|
---|
1102 |
|
---|
# reduction_avx($Xhi, $Xi)
#
# Emit AVX code reducing the 256-bit product $Xhi:$Xi modulo the GHASH
# polynomial, leaving the 128-bit result in $Xi.  This is the same
# two-phase shift-and-xor scheme as the SSE reduction_alg9 used by the
# CLMUL code above (note the matching 57/62/63 and 1/5/1 shift counts),
# expressed with three-operand AVX instructions.  $T1/$T2 are file-level
# scratch xmm names.
sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq	\$57,$Xi,$T1		# 1st phase
	vpsllq	\$62,$Xi,$T2
	vpxor	$T1,$T2,$T2		#
	vpsllq	\$63,$Xi,$T1
	vpxor	$T1,$T2,$T2		#
	vpslldq	\$8,$T2,$T1		#
	vpsrldq	\$8,$T2,$T2
	vpxor	$T1,$Xi,$Xi		#
	vpxor	$T2,$Xhi,$Xhi

	vpsrlq	\$1,$Xi,$T2		# 2nd phase
	vpxor	$Xi,$Xhi,$Xhi
	vpxor	$T2,$Xi,$Xi		#
	vpsrlq	\$5,$T2,$T2
	vpxor	$T2,$Xi,$Xi		#
	vpsrlq	\$1,$Xi,$Xi		#
	vpxor	$Xhi,$Xi,$Xi		#
___
}
|
---|
1126 |
|
---|
# Power-computation loop for gcm_init_avx: each .Linit_loop_avx pass
# produces one odd power (H^3,5,7 via the first clmul+reduction) and
# one even power (H^2,4,6,8 via the second), storing both plus the
# xor-combined halves ("salt") consumed by the ghash inner loop.  The
# table pointer advances 0x30 per iteration, %r10 (=4) times.
$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu	$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa	$Xi,$T3
___
&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
&reduction_avx	($Xhi,$Xi);
# Karatsuba pre-processing for the freshly computed pair, then store
# both powers; after the loop the final "salt" is written flipped.
$code.=<<___;
	vpshufd	\$0b01001110,$T3,$T1
	vpshufd	\$0b01001110,$Xi,$T2
	vpxor	$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu	$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor	$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu	$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea	0x30($Htbl),$Htbl
	sub	\$1,%r10
	jnz	.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu	$T3,-0x10($Htbl)

	vzeroupper
___
# Win64 epilogue: restore xmm6 and release the frame allocated by the
# hand-encoded SEH prologue at the top of the function.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
# No AVX at build time: gcm_init_avx simply aliases the CLMUL version.
$code.=<<___;
	jmp	.L_init_clmul
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
}
|
---|
1174 |
|
---|
# gcm_gmult_avx: single-block multiplication is not specialized for
# AVX — this entry point unconditionally tail-jumps to the CLMUL
# implementation (note: no "if ($avx)" guard here, unlike init/ghash).
$code.=<<___;
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,\@abi-omnipotent
.align	32
gcm_gmult_avx:
.cfi_startproc
	endbranch
	jmp	.L_gmult_clmul
.cfi_endproc
.size	gcm_gmult_avx,.-gcm_gmult_avx
___
|
---|
1186 | |
---|
1187 |
|
---|
# gcm_ghash_avx(Xi, Htbl, inp, len): streamed GHASH over $len bytes at
# $inp, updating the hash value at $Xip.  When $avx is set the body
# below implements 8-block aggregation using the H^1..H^8 table built
# by gcm_init_avx; otherwise the function aliases the CLMUL code.
$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
.cfi_startproc
	endbranch
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
# All sixteen xmm registers are committed: per-block partial products
# ($Xlo/$Xhi/$Xmi accumulating into $Zlo/$Zhi/$Zmi), key material
# ($Hkey,$HK), scratch ($T1,$T2), the running hash and reduction temps
# ($Xi,$Xo,$Tred), the byte-swap mask ($bswap) and input blocks
# ($Ii,$Ij).
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

# Win64: xmm6-xmm15 are non-volatile; save them with hand-picked byte
# encodings so the SEH unwind descriptor matches exactly.
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
# Main body.  Inputs of >=0x80 bytes prime eight products and then run
# .Loop8x_avx, which interleaves the two-phase reduction of the
# previous 8-block result (via the polynomial at (%r10)) with the
# multiplications for the next eight blocks.  Shorter inputs and tails
# go through .Lshort_avx, which peels one block per step (up to seven,
# using H^1..H^7) before converging on .Ltail_avx for the final
# Karatsuba post-processing and reduction.
$code.=<<___;
	vzeroupper

	vmovdqu	($Xip),$Xi		# load $Xi
	lea	.L0x1c2_polynomial(%rip),%r10
	lea	0x40($Htbl),$Htbl	# size optimization
	vmovdqu	.Lbswap_mask(%rip),$bswap
	vpshufb	$bswap,$Xi,$Xi
	cmp	\$0x80,$len
	jb	.Lshort_avx
	sub	\$0x80,$len

	vmovdqu	0x70($inp),$Ii		# I[7]
	vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb	$bswap,$Ii,$Ii
	vmovdqu	0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	vmovdqu	0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Ii,$T2,$T2
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu	0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor	$Ij,$T1,$T1

	vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpxor	$Ii,$T2,$T2
	vmovdqu	0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu	0x50-0x40($Htbl),$HK

	vpshufb	$bswap,$Ij,$Ij
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor	$Ij,$T1,$T1

	vmovdqu	0x30($inp),$Ii		# I[3]
	vpxor	$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor	$Zhi,$Xhi,$Xhi
	vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor	$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu	0x80-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2

	vmovdqu	0x20($inp),$Ij		# I[2]
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Xhi,$Zhi,$Zhi
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor	$Xmi,$Zmi,$Zmi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor	$Ij,$T1,$T1

	vmovdqu	0x10($inp),$Ii		# I[1]
	vpxor	$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor	$Zhi,$Xhi,$Xhi
	vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor	$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu	0xb0-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2

	vmovdqu	($inp),$Ij		# I[0]
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Xhi,$Zhi,$Zhi
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea	0x80($inp),$inp
	cmp	\$0x80,$len
	jb	.Ltail_avx

	vpxor	$Xi,$Ij,$Ij		# accumulate $Xi
	sub	\$0x80,$len
	jmp	.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu	0x70($inp),$Ii		# I[7]
	vpxor	$Xlo,$Zlo,$Zlo
	vpxor	$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	vpshufb	$bswap,$Ii,$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	vmovdqu	0x20-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2

	vmovdqu	0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor	$Zlo,$Xi,$Xi		# collect result
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps	$Zhi,$Xo,$Xo
	vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor	$Zmi,$Tred,$Tred
	vxorps	$Ij,$T1,$T1

	vmovdqu	0x50($inp),$Ii		# I[5]
	vpxor	$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor	$Xo,$Tred,$Tred
	vpslldq	\$8,$Tred,$T2
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq	\$8,$Tred,$Tred
	vpxor	$T2, $Xi, $Xi
	vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb	$bswap,$Ii,$Ii
	vxorps	$Tred,$Xo, $Xo
	vpxor	$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu	0x50-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2
	vpxor	$Xmi,$Zmi,$Zmi

	vmovdqu	0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb	$bswap,$Ij,$Ij
	vpxor	$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vxorps	$Ij,$T1,$T1
	vpxor	$Zmi,$Xmi,$Xmi

	vmovdqu	0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb	$bswap,$Ii,$Ii
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu	0x80-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2
	vpxor	$Xmi,$Zmi,$Zmi

	vmovdqu	0x20($inp),$Ij		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb	$bswap,$Ij,$Ij
	vpxor	$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor	$Ij,$T1,$T1
	vpxor	$Zmi,$Xmi,$Xmi
	vxorps	$Tred,$Xi,$Xi

	vmovdqu	0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb	$bswap,$Ii,$Ii
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps	$Xo,$Tred,$Tred
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu	0xb0-0x40($Htbl),$HK
	vpxor	$Ii,$T2,$T2
	vpxor	$Xmi,$Zmi,$Zmi

	vmovdqu	($inp),$Ij		# I[0]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor	$Tred,$Ij,$Ij
	vpclmulqdq	\$0x10,$HK, $T2,$Xmi
	vpxor	$Xi,$Ij,$Ij		# accumulate $Xi

	lea	0x80($inp),$inp
	sub	\$0x80,$len
	jnc	.Loop8x_avx

	add	\$0x80,$len
	jmp	.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu	-0x10($inp,$len),$Ii	# very last word
	lea	($inp,$len),$inp
	vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu	0x20-0x40($Htbl),$HK
	vpshufb	$bswap,$Ii,$Ij

	vmovdqa	$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa	$Xhi,$Zhi		# $Zhi and
	vmovdqa	$Xmi,$Zmi		# $Zmi
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x20($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq	\$8,$HK,$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x30($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu	0x50-0x40($Htbl),$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x40($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq	\$8,$HK,$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x50($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu	0x80-0x40($Htbl),$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x60($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq	\$8,$HK,$HK
	sub	\$0x10,$len
	jz	.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vmovdqu	-0x70($inp),$Ii
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpshufb	$bswap,$Ii,$Ij
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq	0xb8-0x40($Htbl),$HK
	sub	\$0x10,$len
	jmp	.Ltail_avx

.align	32
.Ltail_avx:
	vpxor	$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor	$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor	$Ij,$T1,$T1
	vpxor	$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor	$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu	(%r10),$Tred

	vpxor	$Xlo,$Zlo,$Xi
	vpxor	$Xhi,$Zhi,$Xo
	vpxor	$Xmi,$Zmi,$Zmi

	vpxor	$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor	$Xo, $Zmi,$Zmi
	vpslldq	\$8, $Zmi,$T2
	vpsrldq	\$8, $Zmi,$Zmi
	vpxor	$T2, $Xi, $Xi
	vpxor	$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor	$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor	$Xo,$Xi,$Xi
	vpxor	$T2,$Xi,$Xi

	cmp	\$0,$len
	jne	.Lshort_avx

	vpshufb	$bswap,$Xi,$Xi
	vmovdqu	$Xi,($Xip)
	vzeroupper
___
# Win64 epilogue: restore xmm6-xmm15 saved by the prologue and release
# the frame.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_avx:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
# No AVX at build time: alias the CLMUL implementation.
$code.=<<___;
	jmp	.L_ghash_clmul
.cfi_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}
|
---|
1617 | |
---|
1618 |
|
---|
# Shared constants: the byte-swap mask and the GHASH reduction
# polynomial (used by the CLMUL/AVX paths above), plus masks and the
# rem_4bit/rem_8bit reduction tables referenced by the 4-bit
# table-driven implementations earlier in the file.
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.L7_mask_poly:
	.long	7,0,`0xE1<<1`,0
.align	64
.type	.Lrem_4bit,\@object
.Lrem_4bit:
	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
.type	.Lrem_8bit,\@object
.Lrem_8bit:
	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
|
---|
1674 | |
---|
1675 |
|
---|
1676 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
---|
1677 | # CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
---|
1678 | if ($win64) {
|
---|
1679 | $rec="%rcx";
|
---|
1680 | $frame="%rdx";
|
---|
1681 | $context="%r8";
|
---|
1682 | $disp="%r9";
|
---|
1683 |
|
---|
1684 | $code.=<<___;
|
---|
1685 | .extern __imp_RtlVirtualUnwind
|
---|
1686 | .type se_handler,\@abi-omnipotent
|
---|
1687 | .align 16
|
---|
1688 | se_handler:
|
---|
1689 | push %rsi
|
---|
1690 | push %rdi
|
---|
1691 | push %rbx
|
---|
1692 | push %rbp
|
---|
1693 | push %r12
|
---|
1694 | push %r13
|
---|
1695 | push %r14
|
---|
1696 | push %r15
|
---|
1697 | pushfq
|
---|
1698 | sub \$64,%rsp
|
---|
1699 |
|
---|
1700 | mov 120($context),%rax # pull context->Rax
|
---|
1701 | mov 248($context),%rbx # pull context->Rip
|
---|
1702 |
|
---|
1703 | mov 8($disp),%rsi # disp->ImageBase
|
---|
1704 | mov 56($disp),%r11 # disp->HandlerData
|
---|
1705 |
|
---|
1706 | mov 0(%r11),%r10d # HandlerData[0]
|
---|
1707 | lea (%rsi,%r10),%r10 # prologue label
|
---|
1708 | cmp %r10,%rbx # context->Rip<prologue label
|
---|
1709 | jb .Lin_prologue
|
---|
1710 |
|
---|
1711 | mov 152($context),%rax # pull context->Rsp
|
---|
1712 |
|
---|
1713 | mov 4(%r11),%r10d # HandlerData[1]
|
---|
1714 | lea (%rsi,%r10),%r10 # epilogue label
|
---|
1715 | cmp %r10,%rbx # context->Rip>=epilogue label
|
---|
1716 | jae .Lin_prologue
|
---|
1717 |
|
---|
1718 | lea 48+280(%rax),%rax # adjust "rsp"
|
---|
1719 |
|
---|
1720 | mov -8(%rax),%rbx
|
---|
1721 | mov -16(%rax),%rbp
|
---|
1722 | mov -24(%rax),%r12
|
---|
1723 | mov -32(%rax),%r13
|
---|
1724 | mov -40(%rax),%r14
|
---|
1725 | mov -48(%rax),%r15
|
---|
1726 | mov %rbx,144($context) # restore context->Rbx
|
---|
1727 | mov %rbp,160($context) # restore context->Rbp
|
---|
1728 | mov %r12,216($context) # restore context->R12
|
---|
1729 | mov %r13,224($context) # restore context->R13
|
---|
1730 | mov %r14,232($context) # restore context->R14
|
---|
1731 | mov %r15,240($context) # restore context->R15
|
---|
1732 |
|
---|
1733 | .Lin_prologue:
|
---|
1734 | mov 8(%rax),%rdi
|
---|
1735 | mov 16(%rax),%rsi
|
---|
1736 | mov %rax,152($context) # restore context->Rsp
|
---|
1737 | mov %rsi,168($context) # restore context->Rsi
|
---|
1738 | mov %rdi,176($context) # restore context->Rdi
|
---|
1739 |
|
---|
1740 | mov 40($disp),%rdi # disp->ContextRecord
|
---|
1741 | mov $context,%rsi # context
|
---|
1742 | mov \$`1232/8`,%ecx # sizeof(CONTEXT)
|
---|
1743 | .long 0xa548f3fc # cld; rep movsq
|
---|
1744 |
|
---|
1745 | mov $disp,%rsi
|
---|
1746 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
---|
1747 | mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
---|
1748 | mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
---|
1749 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
---|
1750 | mov 40(%rsi),%r10 # disp->ContextRecord
|
---|
1751 | lea 56(%rsi),%r11 # &disp->HandlerData
|
---|
1752 | lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
---|
1753 | mov %r10,32(%rsp) # arg5
|
---|
1754 | mov %r11,40(%rsp) # arg6
|
---|
1755 | mov %r12,48(%rsp) # arg7
|
---|
1756 | mov %rcx,56(%rsp) # arg8, (NULL)
|
---|
1757 | call *__imp_RtlVirtualUnwind(%rip)
|
---|
1758 |
|
---|
1759 | mov \$1,%eax # ExceptionContinueSearch
|
---|
1760 | add \$64,%rsp
|
---|
1761 | popfq
|
---|
1762 | pop %r15
|
---|
1763 | pop %r14
|
---|
1764 | pop %r13
|
---|
1765 | pop %r12
|
---|
1766 | pop %rbp
|
---|
1767 | pop %rbx
|
---|
1768 | pop %rdi
|
---|
1769 | pop %rsi
|
---|
1770 | ret
|
---|
1771 | .size se_handler,.-se_handler
|
---|
1772 |
|
---|
1773 | .section .pdata
|
---|
1774 | .align 4
|
---|
1775 | .rva .LSEH_begin_gcm_gmult_4bit
|
---|
1776 | .rva .LSEH_end_gcm_gmult_4bit
|
---|
1777 | .rva .LSEH_info_gcm_gmult_4bit
|
---|
1778 |
|
---|
1779 | .rva .LSEH_begin_gcm_ghash_4bit
|
---|
1780 | .rva .LSEH_end_gcm_ghash_4bit
|
---|
1781 | .rva .LSEH_info_gcm_ghash_4bit
|
---|
1782 |
|
---|
1783 | .rva .LSEH_begin_gcm_init_clmul
|
---|
1784 | .rva .LSEH_end_gcm_init_clmul
|
---|
1785 | .rva .LSEH_info_gcm_init_clmul
|
---|
1786 |
|
---|
1787 | .rva .LSEH_begin_gcm_ghash_clmul
|
---|
1788 | .rva .LSEH_end_gcm_ghash_clmul
|
---|
1789 | .rva .LSEH_info_gcm_ghash_clmul
|
---|
1790 | ___
|
---|
# Win64 exception tables (.pdata) for the AVX entry points.  Each triple
# is (function-start RVA, function-end RVA, unwind-info RVA).
# NOTE(review): both AVX routines reference the *_clmul unwind-info
# records instead of defining their own — presumably their stack-frame
# layouts match the CLMUL versions exactly; confirm against the
# respective prologues before touching either frame.
$code.=<<___ if ($avx);
.rva	.LSEH_begin_gcm_init_avx
.rva	.LSEH_end_gcm_init_avx
.rva	.LSEH_info_gcm_init_clmul

.rva	.LSEH_begin_gcm_ghash_avx
.rva	.LSEH_end_gcm_ghash_avx
.rva	.LSEH_info_gcm_ghash_clmul
___
|
---|
# Unwind-information records (.xdata) referenced by the .pdata entries
# above.  The 4bit records (leading byte 9) route exceptions through the
# custom se_handler between the prologue/epilogue label pair given as
# HandlerData; the clmul records are plain UNWIND_INFO whose unwind
# codes (annotated inline) undo the xmm6-xmm15 saves and the stack
# allocation.  These bytes are emitted verbatim into the .s file — do
# not reformat or "fix" them without consulting the Win64 unwind-code
# encoding.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
.LSEH_info_gcm_ghash_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_init_clmul:
	.byte	0x01,0x08,0x03,0x00
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
___
|
---|
1829 | }
|
---|
1830 | |
---|
1831 |
|
---|
# Resolve the backtick-delimited compile-time expressions embedded in
# the generated text (e.g. \$`1232/8` above): each `...` chunk is
# evaluated as Perl and the result spliced into the assembly.
$code =~ s{\`([^\`]*)\`}{eval($1)}gem;

# Emit the finished assembly on stdout.
print $code;

# Flush and verify the write actually succeeded; buffered I/O errors
# only surface at close time.
close STDOUT or die "error closing STDOUT: $!";
|
---|