#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice (>2x) as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen if loop unrolling
# and a dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add a dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in the squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
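
# A typical invocation passes the perlasm flavour followed by the output
# file, e.g. "perl x86_64-mont.pl elf x86_64-mont.s" ("nasm"/"masm"
# flavours on Windows); per the parsing above, both arguments are optional.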

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

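# For reference, the arithmetic performed by the code below, as a minimal
# Perl model (illustration only; it is not called anywhere in this
# generator).  Like the .Louter/.Linner loops, for every b[i] it
# accumulates a[]*b[i], derives m1 = tp[0]*n0 mod 2^64, adds np[]*m1
# (which zeroes the low limb) and shifts right by one limb; the final
# conditional subtraction mirrors the .Lsub/.Lcopy tail.
sub bn_mul_mont_ref {
	use Math::BigInt;
	my ($ap,$bp,$np,$n0,$num) = @_;	# little-endian limb array refs;
					# $n0 = -np[0]^-1 mod 2^64; limbs may
					# be plain ints or Math::BigInt objects
	my $base = Math::BigInt->bone->blsft(64);
	my $to_int = sub {
		my $v = Math::BigInt->bzero;
		$v = $v->bmul($base)->badd($_) for (reverse @{$_[0]});
		$v;
	};
	my ($a,$n) = ($to_int->($ap),$to_int->($np));
	my $t = Math::BigInt->bzero;			# tp[]
	for my $i (0 .. $num-1) {			# one .Louter pass per bp[i]
		$t += $a * $bp->[$i];			# ap[j]*bp[i] half of inner loop
		my $m1 = (($t % $base) * $n0) % $base;	# imulq $lo0,$m1
		$t = ($t + $n * $m1)->bdiv($base);	# np[j]*m1 half; tp[0] becomes zero
	}
	$t -= $n if $t >= $n;				# conditional subtraction
	return $t;					# == a*b*2^(-64*num) mod n
}
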
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a
	# stack allocation spans two pages, then a reference to the
	# farthest one can be punished with SEGV. But page walking does
	# good even on other OSes, because it guarantees that a villain
	# thread hits the guard page before it can do damage to an
	# innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	mov	$num,$j			# j=num

.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8(%rsp,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	mov	\$-1,%rbx
	xor	%rax,%rbx		# not %rax
	xor	$i,$i
	mov	$num,$j			# j=num

.Lcopy:					# conditional copy
	mov	($rp,$i,8),%rcx
	mov	(%rsp,$i,8),%rdx
	and	%rbx,%rcx
	and	%rax,%rdx
	mov	$num,(%rsp,$i,8)	# zap temporary vector
	or	%rcx,%rdx
	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
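
# A note on the .Lsub/.Lcopy tail above: the difference tp[]-np[] is always
# computed, and the final borrow is stretched into an all-ones/all-zeroes
# mask so that the choice between tp[] and the difference is made with
# AND/OR instead of a branch, and therefore cannot leak through the branch
# predictor.  A scalar sketch of the per-limb selection (illustration only,
# not used by the generator):
sub ct_select {
	my ($mask,$tp,$diff) = @_;	# $mask is -1 (keep tp) or 0 (take diff)
	return ($tp & $mask) | ($diff & ~$mask);
}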
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	lea	-4($num),$j
	mov	0(%rsp),@ri[0]		# tp[0]
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$j			# j=num/4-1
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]

.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	pxor	%xmm0,%xmm0
	movq	@ri[0],%xmm4
	pcmpeqd	%xmm5,%xmm5
	pshufd	\$0,%xmm4,%xmm4
	mov	$num,$j
	pxor	%xmm4,%xmm5
	shr	\$2,$j			# j=num/4
	xor	%eax,%eax		# i=0

	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# conditional copy
	movdqa	(%rsp,%rax),%xmm1
	movdqu	($rp,%rax),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax),%xmm3
	movdqa	%xmm0,(%rsp,%rax)
	por	%xmm2,%xmm1
	movdqu	16($rp,%rax),%xmm2
	movdqu	%xmm1,($rp,%rax)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16($rp,%rax)
	lea	32(%rax),%rax
	dec	$j
	jnz	.Lcopy4x
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi, 8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}

{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

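# Per the dispatch logic in bn_mul_mont above, this routine is entered only
# when ap==bp (a squaring) and num is a multiple of 8; the heavy lifting is
# done by bn_sqr(x)8x_internal from the x86_64-mont5 module, with only the
# final subtraction and conditional copy kept local.
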
$code.=<<___ if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# Ensure that the stack frame doesn't alias with $aptr modulo
	# 4096. This is done to allow the memory disambiguation logic
	# to do its job.
	#
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}

if ($addx) {{{
my $bp="%rdx";	# original value

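# The MULX/ADCX/ADOX path below keeps two carry chains in flight at once:
# adcx adds with and into CF only, adox with and into OF only, and mulx
# multiplies without touching flags at all, so the a[]*b[i] and np[]*m1
# accumulations can be interleaved without saving and restoring carries.
# A scalar model of one such double step (illustration only, not used by
# the generator; 32-bit limbs keep plain Perl integers from overflowing,
# whereas the real code works on 64-bit limbs):
sub adx_step {
	my ($acc,$cf_addend,$of_addend,$cf,$of) = @_;
	my $v = $acc + $cf_addend + $cf;	# adcx: consumes and produces CF
	($acc,$cf) = ($v & 0xffffffff,$v >> 32);
	$v = $acc + $of_addend + $of;		# adox: consumes and produces OF
	($acc,$of) = ($v & 0xffffffff,$v >> 32);
	return ($acc,$cf,$of);
}
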
$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	.byte	0x67,0x67
	mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue	# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT or die "error closing STDOUT: $!";