VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.7/crypto/bn/asm/x86_64-mont.pl@98103

Last change on this file since 98103 was 94082, checked in by vboxsync, 3 years ago

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

  • Property svn:executable set to *
File size: 32.6 KB
 
1#! /usr/bin/env perl
2# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# October 2005.
18#
19# Montgomery multiplication routine for x86_64. While it gives only a
20# modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
21# more than twice (>2x) as fast. The most common case, rsa1024 sign, is
22# improved by a respectable 50%. It remains to be seen whether loop
23# unrolling and a dedicated squaring routine can provide further improvement...
24
25# July 2011.
26#
27# Add a dedicated squaring procedure. The performance improvement varies
28# from platform to platform, but on average it's ~5%/15%/25%/33%
29# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
30
31# August 2011.
32#
33# Unroll and modulo-schedule the inner loops in such a manner that they
34# "fall through" for input lengths of 8, which is critical for
35# 1024-bit RSA *sign*. The average performance improvement in comparison
36# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
37# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
38
39# June 2013.
40#
41# Optimize reduction in the squaring procedure and improve 1024+-bit RSA
42# sign performance by 10-16% on Intel Sandy Bridge and later
43# (virtually the same on non-Intel processors).
44
45# August 2013.
46#
47# Add MULX/ADOX/ADCX code path.
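#
# Background: MULX does not modify the flags, and ADCX/ADOX update only CF
# and OF respectively, so the mulx code paths below can keep two
# independent carry chains in flight, e.g.
#
#	adcx	%rax,%r11		# chain 1, carried through CF
#	adox	%r12,%r11		# chain 2, carried through OF
#
# which is what lets the a[j]*b[i] and n[j]*m accumulations interleave.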
48
49# $output is the last argument if it looks like a file (it has an extension)
50# $flavour is the first argument if it doesn't look like a file
51$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
52$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
53
54$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
55
56$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
57( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
58( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
59die "can't locate x86_64-xlate.pl";
60
61open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
62 or die "can't call $xlate: $!";
63*STDOUT=*OUT;
64
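# A typical invocation (for illustration; the exact command comes from the
# build system) would be "perl x86_64-mont.pl elf x86_64-mont.S": the
# flavour selects the assembler dialect and the output is written through
# the x86_64-xlate.pl filter opened above.
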
65if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
66 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
67 $addx = ($1>=2.23);
68}
69
70if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
71 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
72 $addx = ($1>=2.10);
73}
74
75if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
76 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
77 $addx = ($1>=12);
78}
79
80if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
81 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
82 $addx = ($ver>=3.03);
83}
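# If set, $addx enables assembling the MULX/ADCX/ADOX code paths below; at
# run time they are additionally gated on OPENSSL_ia32cap_P (the 0x80100
# mask used below is understood to test the CPUID leaf-7 BMI2 and ADX bits).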
84
85# int bn_mul_mont(
86$rp="%rdi"; # BN_ULONG *rp,
87$ap="%rsi"; # const BN_ULONG *ap,
88$bp="%rdx"; # const BN_ULONG *bp,
89$np="%rcx"; # const BN_ULONG *np,
90$n0="%r8"; # const BN_ULONG *n0,
91$num="%r9"; # int num);
92$lo0="%r10";
93$hi0="%r11";
94$hi1="%r13";
95$i="%r14";
96$j="%r15";
97$m0="%rbx";
98$m1="%rbp";
99
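# As a minimal reference model of the arithmetic (a sketch for clarity only,
# not used by the generated code): every entry point below computes
# rp[] = ap[]*bp[]*2^(-64*num) mod np[], i.e. a word-serial Montgomery
# product (with bp==ap on the squaring path), where the single word n0 is
# -np[0]^-1 mod 2^64. Words are least-significant first, given as
# Math::BigInt objects or decimal strings.
use Math::BigInt;
sub _mont_words_to_int {
	my $x = Math::BigInt->bzero();
	$x = ($x<<64) + Math::BigInt->new($_) for reverse @{$_[0]};
	return $x;
}
sub bn_mul_mont_ref {
	my ($ap,$bp,$np,$n0,$num) = @_;
	my $R = Math::BigInt->bone()<<64;		# one-word radix, 2^64
	my ($a,$b,$n) = map { _mont_words_to_int($_) } ($ap,$bp,$np);
	my $t = $a*$b;
	for (1..$num) {					# one reduction step per word
		my $m = (($t%$R)*Math::BigInt->new($n0))%$R;	# m = t[0]*n0 mod 2^64
		$t = ($t + $m*$n)/$R;			# low word is now zero, exact shift
	}
	$t -= $n if $t >= $n;				# final conditional subtraction
	return [ map { my $w = $t%$R; $t = $t>>64; $w } 1..$num ];
}
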
100$code=<<___;
101.text
102
103.extern OPENSSL_ia32cap_P
104
105.globl bn_mul_mont
106.type bn_mul_mont,\@function,6
107.align 16
108bn_mul_mont:
109.cfi_startproc
110 mov ${num}d,${num}d
111 mov %rsp,%rax
112.cfi_def_cfa_register %rax
113 test \$3,${num}d
114 jnz .Lmul_enter
115 cmp \$8,${num}d
116 jb .Lmul_enter
117___
118$code.=<<___ if ($addx);
119 mov OPENSSL_ia32cap_P+8(%rip),%r11d
120___
121$code.=<<___;
122 cmp $ap,$bp
123 jne .Lmul4x_enter
124 test \$7,${num}d
125 jz .Lsqr8x_enter
126 jmp .Lmul4x_enter
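	# To summarize the dispatch above: sizes that are not a multiple of
	# four words, or smaller than eight words, take the generic loop at
	# .Lmul_enter; squarings (bp == ap) whose size is a multiple of eight
	# go to .Lsqr8x_enter; everything else goes to the 4-way unrolled
	# .Lmul4x_enter (which may in turn select the MULX variant).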
127
128.align 16
129.Lmul_enter:
130 push %rbx
131.cfi_push %rbx
132 push %rbp
133.cfi_push %rbp
134 push %r12
135.cfi_push %r12
136 push %r13
137.cfi_push %r13
138 push %r14
139.cfi_push %r14
140 push %r15
141.cfi_push %r15
142
143 neg $num
144 mov %rsp,%r11
145 lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))
146 neg $num # restore $num
147 and \$-1024,%r10 # minimize TLB usage
148
149	# An OS-agnostic version of __chkstk.
150	#
151	# Some OSes (Windows) insist on the stack being "wired" to
152	# physical memory in a strictly sequential manner, i.e. if a stack
153	# allocation spans two pages, then a reference to the farther one
154	# can be punished with a SEGV. But page walking does good even on
155	# other OSes, because it guarantees that a villain thread hits
156	# the guard page before it can do damage to an innocent one...
157 sub %r10,%r11
158 and \$-4096,%r11
159 lea (%r10,%r11),%rsp
160 mov (%rsp),%r11
161 cmp %r10,%rsp
162 ja .Lmul_page_walk
163 jmp .Lmul_page_walk_done
164
165.align 16
166.Lmul_page_walk:
167 lea -4096(%rsp),%rsp
168 mov (%rsp),%r11
169 cmp %r10,%rsp
170 ja .Lmul_page_walk
171.Lmul_page_walk_done:
172
173 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
174.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
175.Lmul_body:
176 mov $bp,%r12 # reassign $bp
177___
178 $bp="%r12";
179$code.=<<___;
180 mov ($n0),$n0 # pull n0[0] value
181 mov ($bp),$m0 # m0=bp[0]
182 mov ($ap),%rax
183
184 xor $i,$i # i=0
185 xor $j,$j # j=0
186
187 mov $n0,$m1
188 mulq $m0 # ap[0]*bp[0]
189 mov %rax,$lo0
190 mov ($np),%rax
191
192 imulq $lo0,$m1 # "tp[0]"*n0
193 mov %rdx,$hi0
194
195 mulq $m1 # np[0]*m1
196 add %rax,$lo0 # discarded
197 mov 8($ap),%rax
198 adc \$0,%rdx
199 mov %rdx,$hi1
200
201 lea 1($j),$j # j++
202 jmp .L1st_enter
203
204.align 16
205.L1st:
206 add %rax,$hi1
207 mov ($ap,$j,8),%rax
208 adc \$0,%rdx
209 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
210 mov $lo0,$hi0
211 adc \$0,%rdx
212 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
213 mov %rdx,$hi1
214
215.L1st_enter:
216 mulq $m0 # ap[j]*bp[0]
217 add %rax,$hi0
218 mov ($np,$j,8),%rax
219 adc \$0,%rdx
220 lea 1($j),$j # j++
221 mov %rdx,$lo0
222
223 mulq $m1 # np[j]*m1
224 cmp $num,$j
225 jne .L1st
226
227 add %rax,$hi1
228 mov ($ap),%rax # ap[0]
229 adc \$0,%rdx
230 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
231 adc \$0,%rdx
232 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
233 mov %rdx,$hi1
234 mov $lo0,$hi0
235
236 xor %rdx,%rdx
237 add $hi0,$hi1
238 adc \$0,%rdx
239 mov $hi1,-8(%rsp,$num,8)
240 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
241
242 lea 1($i),$i # i++
243 jmp .Louter
244.align 16
245.Louter:
246 mov ($bp,$i,8),$m0 # m0=bp[i]
247 xor $j,$j # j=0
248 mov $n0,$m1
249 mov (%rsp),$lo0
250 mulq $m0 # ap[0]*bp[i]
251 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
252 mov ($np),%rax
253 adc \$0,%rdx
254
255 imulq $lo0,$m1 # tp[0]*n0
256 mov %rdx,$hi0
257
258 mulq $m1 # np[0]*m1
259 add %rax,$lo0 # discarded
260 mov 8($ap),%rax
261 adc \$0,%rdx
262 mov 8(%rsp),$lo0 # tp[1]
263 mov %rdx,$hi1
264
265 lea 1($j),$j # j++
266 jmp .Linner_enter
267
268.align 16
269.Linner:
270 add %rax,$hi1
271 mov ($ap,$j,8),%rax
272 adc \$0,%rdx
273 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
274 mov (%rsp,$j,8),$lo0
275 adc \$0,%rdx
276 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
277 mov %rdx,$hi1
278
279.Linner_enter:
280 mulq $m0 # ap[j]*bp[i]
281 add %rax,$hi0
282 mov ($np,$j,8),%rax
283 adc \$0,%rdx
284 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
285 mov %rdx,$hi0
286 adc \$0,$hi0
287 lea 1($j),$j # j++
288
289 mulq $m1 # np[j]*m1
290 cmp $num,$j
291 jne .Linner
292
293 add %rax,$hi1
294 mov ($ap),%rax # ap[0]
295 adc \$0,%rdx
296 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
297 mov (%rsp,$j,8),$lo0
298 adc \$0,%rdx
299 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
300 mov %rdx,$hi1
301
302 xor %rdx,%rdx
303 add $hi0,$hi1
304 adc \$0,%rdx
305 add $lo0,$hi1 # pull upmost overflow bit
306 adc \$0,%rdx
307 mov $hi1,-8(%rsp,$num,8)
308 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
309
310 lea 1($i),$i # i++
311 cmp $num,$i
312 jb .Louter
313
314 xor $i,$i # i=0 and clear CF!
315 mov (%rsp),%rax # tp[0]
316 mov $num,$j # j=num
317
318.align 16
319.Lsub: sbb ($np,$i,8),%rax
320 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
321 mov 8(%rsp,$i,8),%rax # tp[i+1]
322 lea 1($i),$i # i++
323 dec $j # doesn't affect CF!
324 jnz .Lsub
325
326 sbb \$0,%rax # handle upmost overflow bit
327 mov \$-1,%rbx
328 xor %rax,%rbx # not %rax
329 xor $i,$i
330 mov $num,$j # j=num
331
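	# Constant-time selection: rp[] now holds tp[]-np[], %rax is 0 if that
	# subtraction was valid (tp >= np) or all ones if it borrowed, and
	# %rbx is its complement. The loop below keeps either the subtracted
	# value or the original tp[] word by masking with %rbx/%rax and
	# OR-ing, with no data-dependent branch, and zaps the temporary
	# vector as it goes.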
332.Lcopy: # conditional copy
333 mov ($rp,$i,8),%rcx
334 mov (%rsp,$i,8),%rdx
335 and %rbx,%rcx
336 and %rax,%rdx
337 mov $num,(%rsp,$i,8) # zap temporary vector
338 or %rcx,%rdx
339 mov %rdx,($rp,$i,8) # rp[i]=tp[i]
340 lea 1($i),$i
341 sub \$1,$j
342 jnz .Lcopy
343
344 mov 8(%rsp,$num,8),%rsi # restore %rsp
345.cfi_def_cfa %rsi,8
346 mov \$1,%rax
347 mov -48(%rsi),%r15
348.cfi_restore %r15
349 mov -40(%rsi),%r14
350.cfi_restore %r14
351 mov -32(%rsi),%r13
352.cfi_restore %r13
353 mov -24(%rsi),%r12
354.cfi_restore %r12
355 mov -16(%rsi),%rbp
356.cfi_restore %rbp
357 mov -8(%rsi),%rbx
358.cfi_restore %rbx
359 lea (%rsi),%rsp
360.cfi_def_cfa_register %rsp
361.Lmul_epilogue:
362 ret
363.cfi_endproc
364.size bn_mul_mont,.-bn_mul_mont
365___
366{{{
367my @A=("%r10","%r11");
368my @N=("%r13","%rdi");
369$code.=<<___;
370.type bn_mul4x_mont,\@function,6
371.align 16
372bn_mul4x_mont:
373.cfi_startproc
374 mov ${num}d,${num}d
375 mov %rsp,%rax
376.cfi_def_cfa_register %rax
377.Lmul4x_enter:
378___
379$code.=<<___ if ($addx);
380 and \$0x80100,%r11d
381 cmp \$0x80100,%r11d
382 je .Lmulx4x_enter
383___
384$code.=<<___;
385 push %rbx
386.cfi_push %rbx
387 push %rbp
388.cfi_push %rbp
389 push %r12
390.cfi_push %r12
391 push %r13
392.cfi_push %r13
393 push %r14
394.cfi_push %r14
395 push %r15
396.cfi_push %r15
397
398 neg $num
399 mov %rsp,%r11
400 lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
401 neg $num # restore
402 and \$-1024,%r10 # minimize TLB usage
403
404 sub %r10,%r11
405 and \$-4096,%r11
406 lea (%r10,%r11),%rsp
407 mov (%rsp),%r11
408 cmp %r10,%rsp
409 ja .Lmul4x_page_walk
410 jmp .Lmul4x_page_walk_done
411
412.Lmul4x_page_walk:
413 lea -4096(%rsp),%rsp
414 mov (%rsp),%r11
415 cmp %r10,%rsp
416 ja .Lmul4x_page_walk
417.Lmul4x_page_walk_done:
418
419 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
420.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
421.Lmul4x_body:
422 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
423 mov %rdx,%r12 # reassign $bp
424___
425 $bp="%r12";
426$code.=<<___;
427 mov ($n0),$n0 # pull n0[0] value
428 mov ($bp),$m0 # m0=bp[0]
429 mov ($ap),%rax
430
431 xor $i,$i # i=0
432 xor $j,$j # j=0
433
434 mov $n0,$m1
435 mulq $m0 # ap[0]*bp[0]
436 mov %rax,$A[0]
437 mov ($np),%rax
438
439 imulq $A[0],$m1 # "tp[0]"*n0
440 mov %rdx,$A[1]
441
442 mulq $m1 # np[0]*m1
443 add %rax,$A[0] # discarded
444 mov 8($ap),%rax
445 adc \$0,%rdx
446 mov %rdx,$N[1]
447
448 mulq $m0
449 add %rax,$A[1]
450 mov 8($np),%rax
451 adc \$0,%rdx
452 mov %rdx,$A[0]
453
454 mulq $m1
455 add %rax,$N[1]
456 mov 16($ap),%rax
457 adc \$0,%rdx
458 add $A[1],$N[1]
459 lea 4($j),$j # j++
460 adc \$0,%rdx
461 mov $N[1],(%rsp)
462 mov %rdx,$N[0]
463 jmp .L1st4x
464.align 16
465.L1st4x:
466 mulq $m0 # ap[j]*bp[0]
467 add %rax,$A[0]
468 mov -16($np,$j,8),%rax
469 adc \$0,%rdx
470 mov %rdx,$A[1]
471
472 mulq $m1 # np[j]*m1
473 add %rax,$N[0]
474 mov -8($ap,$j,8),%rax
475 adc \$0,%rdx
476 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
477 adc \$0,%rdx
478 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
479 mov %rdx,$N[1]
480
481 mulq $m0 # ap[j]*bp[0]
482 add %rax,$A[1]
483 mov -8($np,$j,8),%rax
484 adc \$0,%rdx
485 mov %rdx,$A[0]
486
487 mulq $m1 # np[j]*m1
488 add %rax,$N[1]
489 mov ($ap,$j,8),%rax
490 adc \$0,%rdx
491 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
492 adc \$0,%rdx
493 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
494 mov %rdx,$N[0]
495
496 mulq $m0 # ap[j]*bp[0]
497 add %rax,$A[0]
498 mov ($np,$j,8),%rax
499 adc \$0,%rdx
500 mov %rdx,$A[1]
501
502 mulq $m1 # np[j]*m1
503 add %rax,$N[0]
504 mov 8($ap,$j,8),%rax
505 adc \$0,%rdx
506 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
507 adc \$0,%rdx
508 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
509 mov %rdx,$N[1]
510
511 mulq $m0 # ap[j]*bp[0]
512 add %rax,$A[1]
513 mov 8($np,$j,8),%rax
514 adc \$0,%rdx
515 lea 4($j),$j # j++
516 mov %rdx,$A[0]
517
518 mulq $m1 # np[j]*m1
519 add %rax,$N[1]
520 mov -16($ap,$j,8),%rax
521 adc \$0,%rdx
522 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
523 adc \$0,%rdx
524 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
525 mov %rdx,$N[0]
526 cmp $num,$j
527 jb .L1st4x
528
529 mulq $m0 # ap[j]*bp[0]
530 add %rax,$A[0]
531 mov -16($np,$j,8),%rax
532 adc \$0,%rdx
533 mov %rdx,$A[1]
534
535 mulq $m1 # np[j]*m1
536 add %rax,$N[0]
537 mov -8($ap,$j,8),%rax
538 adc \$0,%rdx
539 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
540 adc \$0,%rdx
541 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
542 mov %rdx,$N[1]
543
544 mulq $m0 # ap[j]*bp[0]
545 add %rax,$A[1]
546 mov -8($np,$j,8),%rax
547 adc \$0,%rdx
548 mov %rdx,$A[0]
549
550 mulq $m1 # np[j]*m1
551 add %rax,$N[1]
552 mov ($ap),%rax # ap[0]
553 adc \$0,%rdx
554 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
555 adc \$0,%rdx
556 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
557 mov %rdx,$N[0]
558
559 xor $N[1],$N[1]
560 add $A[0],$N[0]
561 adc \$0,$N[1]
562 mov $N[0],-8(%rsp,$j,8)
563 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
564
565 lea 1($i),$i # i++
566.align 4
567.Louter4x:
568 mov ($bp,$i,8),$m0 # m0=bp[i]
569 xor $j,$j # j=0
570 mov (%rsp),$A[0]
571 mov $n0,$m1
572 mulq $m0 # ap[0]*bp[i]
573 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
574 mov ($np),%rax
575 adc \$0,%rdx
576
577 imulq $A[0],$m1 # tp[0]*n0
578 mov %rdx,$A[1]
579
580 mulq $m1 # np[0]*m1
581 add %rax,$A[0] # "$N[0]", discarded
582 mov 8($ap),%rax
583 adc \$0,%rdx
584 mov %rdx,$N[1]
585
586 mulq $m0 # ap[j]*bp[i]
587 add %rax,$A[1]
588 mov 8($np),%rax
589 adc \$0,%rdx
590 add 8(%rsp),$A[1] # +tp[1]
591 adc \$0,%rdx
592 mov %rdx,$A[0]
593
594 mulq $m1 # np[j]*m1
595 add %rax,$N[1]
596 mov 16($ap),%rax
597 adc \$0,%rdx
598 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
599 lea 4($j),$j # j+=2
600 adc \$0,%rdx
601 mov $N[1],(%rsp) # tp[j-1]
602 mov %rdx,$N[0]
603 jmp .Linner4x
604.align 16
605.Linner4x:
606 mulq $m0 # ap[j]*bp[i]
607 add %rax,$A[0]
608 mov -16($np,$j,8),%rax
609 adc \$0,%rdx
610 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
611 adc \$0,%rdx
612 mov %rdx,$A[1]
613
614 mulq $m1 # np[j]*m1
615 add %rax,$N[0]
616 mov -8($ap,$j,8),%rax
617 adc \$0,%rdx
618 add $A[0],$N[0]
619 adc \$0,%rdx
620 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
621 mov %rdx,$N[1]
622
623 mulq $m0 # ap[j]*bp[i]
624 add %rax,$A[1]
625 mov -8($np,$j,8),%rax
626 adc \$0,%rdx
627 add -8(%rsp,$j,8),$A[1]
628 adc \$0,%rdx
629 mov %rdx,$A[0]
630
631 mulq $m1 # np[j]*m1
632 add %rax,$N[1]
633 mov ($ap,$j,8),%rax
634 adc \$0,%rdx
635 add $A[1],$N[1]
636 adc \$0,%rdx
637 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
638 mov %rdx,$N[0]
639
640 mulq $m0 # ap[j]*bp[i]
641 add %rax,$A[0]
642 mov ($np,$j,8),%rax
643 adc \$0,%rdx
644 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
645 adc \$0,%rdx
646 mov %rdx,$A[1]
647
648 mulq $m1 # np[j]*m1
649 add %rax,$N[0]
650 mov 8($ap,$j,8),%rax
651 adc \$0,%rdx
652 add $A[0],$N[0]
653 adc \$0,%rdx
654 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
655 mov %rdx,$N[1]
656
657 mulq $m0 # ap[j]*bp[i]
658 add %rax,$A[1]
659 mov 8($np,$j,8),%rax
660 adc \$0,%rdx
661 add 8(%rsp,$j,8),$A[1]
662 adc \$0,%rdx
663 lea 4($j),$j # j++
664 mov %rdx,$A[0]
665
666 mulq $m1 # np[j]*m1
667 add %rax,$N[1]
668 mov -16($ap,$j,8),%rax
669 adc \$0,%rdx
670 add $A[1],$N[1]
671 adc \$0,%rdx
672 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
673 mov %rdx,$N[0]
674 cmp $num,$j
675 jb .Linner4x
676
677 mulq $m0 # ap[j]*bp[i]
678 add %rax,$A[0]
679 mov -16($np,$j,8),%rax
680 adc \$0,%rdx
681 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
682 adc \$0,%rdx
683 mov %rdx,$A[1]
684
685 mulq $m1 # np[j]*m1
686 add %rax,$N[0]
687 mov -8($ap,$j,8),%rax
688 adc \$0,%rdx
689 add $A[0],$N[0]
690 adc \$0,%rdx
691 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
692 mov %rdx,$N[1]
693
694 mulq $m0 # ap[j]*bp[i]
695 add %rax,$A[1]
696 mov -8($np,$j,8),%rax
697 adc \$0,%rdx
698 add -8(%rsp,$j,8),$A[1]
699 adc \$0,%rdx
700 lea 1($i),$i # i++
701 mov %rdx,$A[0]
702
703 mulq $m1 # np[j]*m1
704 add %rax,$N[1]
705 mov ($ap),%rax # ap[0]
706 adc \$0,%rdx
707 add $A[1],$N[1]
708 adc \$0,%rdx
709 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
710 mov %rdx,$N[0]
711
712 xor $N[1],$N[1]
713 add $A[0],$N[0]
714 adc \$0,$N[1]
715 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
716 adc \$0,$N[1]
717 mov $N[0],-8(%rsp,$j,8)
718 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
719
720 cmp $num,$i
721 jb .Louter4x
722___
723{
724my @ri=("%rax","%rdx",$m0,$m1);
725$code.=<<___;
726 mov 16(%rsp,$num,8),$rp # restore $rp
727 lea -4($num),$j
728 mov 0(%rsp),@ri[0] # tp[0]
729 mov 8(%rsp),@ri[1] # tp[1]
730 shr \$2,$j # j=num/4-1
731 lea (%rsp),$ap # borrow ap for tp
732 xor $i,$i # i=0 and clear CF!
733
734 sub 0($np),@ri[0]
735 mov 16($ap),@ri[2] # tp[2]
736 mov 24($ap),@ri[3] # tp[3]
737 sbb 8($np),@ri[1]
738
739.Lsub4x:
740 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
741 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
742 sbb 16($np,$i,8),@ri[2]
743 mov 32($ap,$i,8),@ri[0] # tp[i+1]
744 mov 40($ap,$i,8),@ri[1]
745 sbb 24($np,$i,8),@ri[3]
746 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
747 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
748 sbb 32($np,$i,8),@ri[0]
749 mov 48($ap,$i,8),@ri[2]
750 mov 56($ap,$i,8),@ri[3]
751 sbb 40($np,$i,8),@ri[1]
752 lea 4($i),$i # i++
753 dec $j # doesn't affect CF!
754 jnz .Lsub4x
755
756 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
757 mov 32($ap,$i,8),@ri[0] # load overflow bit
758 sbb 16($np,$i,8),@ri[2]
759 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
760 sbb 24($np,$i,8),@ri[3]
761 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
762
763 sbb \$0,@ri[0] # handle upmost overflow bit
764 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
765 pxor %xmm0,%xmm0
766 movq @ri[0],%xmm4
767 pcmpeqd %xmm5,%xmm5
768 pshufd \$0,%xmm4,%xmm4
769 mov $num,$j
770 pxor %xmm4,%xmm5
771 shr \$2,$j # j=num/4
772 xor %eax,%eax # i=0
773
774 jmp .Lcopy4x
775.align 16
776.Lcopy4x: # conditional copy
777 movdqa (%rsp,%rax),%xmm1
778 movdqu ($rp,%rax),%xmm2
779 pand %xmm4,%xmm1
780 pand %xmm5,%xmm2
781 movdqa 16(%rsp,%rax),%xmm3
782 movdqa %xmm0,(%rsp,%rax)
783 por %xmm2,%xmm1
784 movdqu 16($rp,%rax),%xmm2
785 movdqu %xmm1,($rp,%rax)
786 pand %xmm4,%xmm3
787 pand %xmm5,%xmm2
788 movdqa %xmm0,16(%rsp,%rax)
789 por %xmm2,%xmm3
790 movdqu %xmm3,16($rp,%rax)
791 lea 32(%rax),%rax
792 dec $j
793 jnz .Lcopy4x
794___
795}
796$code.=<<___;
797 mov 8(%rsp,$num,8),%rsi # restore %rsp
798.cfi_def_cfa %rsi, 8
799 mov \$1,%rax
800 mov -48(%rsi),%r15
801.cfi_restore %r15
802 mov -40(%rsi),%r14
803.cfi_restore %r14
804 mov -32(%rsi),%r13
805.cfi_restore %r13
806 mov -24(%rsi),%r12
807.cfi_restore %r12
808 mov -16(%rsi),%rbp
809.cfi_restore %rbp
810 mov -8(%rsi),%rbx
811.cfi_restore %rbx
812 lea (%rsi),%rsp
813.cfi_def_cfa_register %rsp
814.Lmul4x_epilogue:
815 ret
816.cfi_endproc
817.size bn_mul4x_mont,.-bn_mul4x_mont
818___
819}}}
820
821{{{
822######################################################################
823# void bn_sqr8x_mont(
824my $rptr="%rdi"; # const BN_ULONG *rptr,
825my $aptr="%rsi"; # const BN_ULONG *aptr,
826my $bptr="%rdx"; # not used
827my $nptr="%rcx"; # const BN_ULONG *nptr,
828my $n0 ="%r8"; # const BN_ULONG *n0);
829my $num ="%r9"; # int num, has to be divisible by 8
830
831my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
832my @A0=("%r10","%r11");
833my @A1=("%r12","%r13");
834my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
835
836$code.=<<___ if ($addx);
837.extern bn_sqrx8x_internal # see x86_64-mont5 module
838___
839$code.=<<___;
840.extern bn_sqr8x_internal # see x86_64-mont5 module
841
842.type bn_sqr8x_mont,\@function,6
843.align 32
844bn_sqr8x_mont:
845.cfi_startproc
846 mov %rsp,%rax
847.cfi_def_cfa_register %rax
848.Lsqr8x_enter:
849 push %rbx
850.cfi_push %rbx
851 push %rbp
852.cfi_push %rbp
853 push %r12
854.cfi_push %r12
855 push %r13
856.cfi_push %r13
857 push %r14
858.cfi_push %r14
859 push %r15
860.cfi_push %r15
861.Lsqr8x_prologue:
862
863 mov ${num}d,%r10d
864 shl \$3,${num}d # convert $num to bytes
865 shl \$3+2,%r10 # 4*$num
866 neg $num
867
868	##############################################################
869	# Ensure that the stack frame doesn't alias with $aptr modulo
870	# 4096. This is done to allow the memory disambiguation logic
871	# to do its job.
872	#
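	# (Background: loads and stores whose addresses match modulo 4096 can
	# be falsely flagged as dependent by the memory disambiguation
	# hardware, a.k.a. "4K aliasing", which would stall the tp[] stores
	# against the a[] loads; offsetting the frame avoids that.)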
873 lea -64(%rsp,$num,2),%r11
874 mov %rsp,%rbp
875 mov ($n0),$n0 # *n0
876 sub $aptr,%r11
877 and \$4095,%r11
878 cmp %r11,%r10
879 jb .Lsqr8x_sp_alt
880 sub %r11,%rbp # align with $aptr
881 lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
882 jmp .Lsqr8x_sp_done
883
884.align 32
885.Lsqr8x_sp_alt:
886 lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
887 lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
888 sub %r10,%r11
889 mov \$0,%r10
890 cmovc %r10,%r11
891 sub %r11,%rbp
892.Lsqr8x_sp_done:
893 and \$-64,%rbp
894 mov %rsp,%r11
895 sub %rbp,%r11
896 and \$-4096,%r11
897 lea (%rbp,%r11),%rsp
898 mov (%rsp),%r10
899 cmp %rbp,%rsp
900 ja .Lsqr8x_page_walk
901 jmp .Lsqr8x_page_walk_done
902
903.align 16
904.Lsqr8x_page_walk:
905 lea -4096(%rsp),%rsp
906 mov (%rsp),%r10
907 cmp %rbp,%rsp
908 ja .Lsqr8x_page_walk
909.Lsqr8x_page_walk_done:
910
911 mov $num,%r10
912 neg $num
913
914 mov $n0, 32(%rsp)
915 mov %rax, 40(%rsp) # save original %rsp
916.cfi_cfa_expression %rsp+40,deref,+8
917.Lsqr8x_body:
918
919 movq $nptr, %xmm2 # save pointer to modulus
920 pxor %xmm0,%xmm0
921 movq $rptr,%xmm1 # save $rptr
922 movq %r10, %xmm3 # -$num
923___
924$code.=<<___ if ($addx);
925 mov OPENSSL_ia32cap_P+8(%rip),%eax
926 and \$0x80100,%eax
927 cmp \$0x80100,%eax
928 jne .Lsqr8x_nox
929
930 call bn_sqrx8x_internal # see x86_64-mont5 module
931 # %rax top-most carry
932 # %rbp nptr
933 # %rcx -8*num
934 # %r8 end of tp[2*num]
935 lea (%r8,%rcx),%rbx
936 mov %rcx,$num
937 mov %rcx,%rdx
938 movq %xmm1,$rptr
939 sar \$3+2,%rcx # %cf=0
940 jmp .Lsqr8x_sub
941
942.align 32
943.Lsqr8x_nox:
944___
945$code.=<<___;
946 call bn_sqr8x_internal # see x86_64-mont5 module
947 # %rax top-most carry
948 # %rbp nptr
949 # %r8 -8*num
950 # %rdi end of tp[2*num]
951 lea (%rdi,$num),%rbx
952 mov $num,%rcx
953 mov $num,%rdx
954 movq %xmm1,$rptr
955 sar \$3+2,%rcx # %cf=0
956 jmp .Lsqr8x_sub
957
958.align 32
959.Lsqr8x_sub:
960 mov 8*0(%rbx),%r12
961 mov 8*1(%rbx),%r13
962 mov 8*2(%rbx),%r14
963 mov 8*3(%rbx),%r15
964 lea 8*4(%rbx),%rbx
965 sbb 8*0(%rbp),%r12
966 sbb 8*1(%rbp),%r13
967 sbb 8*2(%rbp),%r14
968 sbb 8*3(%rbp),%r15
969 lea 8*4(%rbp),%rbp
970 mov %r12,8*0($rptr)
971 mov %r13,8*1($rptr)
972 mov %r14,8*2($rptr)
973 mov %r15,8*3($rptr)
974 lea 8*4($rptr),$rptr
975 inc %rcx # preserves %cf
976 jnz .Lsqr8x_sub
977
978 sbb \$0,%rax # top-most carry
979 lea (%rbx,$num),%rbx # rewind
980 lea ($rptr,$num),$rptr # rewind
981
982 movq %rax,%xmm1
983 pxor %xmm0,%xmm0
984 pshufd \$0,%xmm1,%xmm1
985 mov 40(%rsp),%rsi # restore %rsp
986.cfi_def_cfa %rsi,8
987 jmp .Lsqr8x_cond_copy
988
989.align 32
990.Lsqr8x_cond_copy:
991 movdqa 16*0(%rbx),%xmm2
992 movdqa 16*1(%rbx),%xmm3
993 lea 16*2(%rbx),%rbx
994 movdqu 16*0($rptr),%xmm4
995 movdqu 16*1($rptr),%xmm5
996 lea 16*2($rptr),$rptr
997 movdqa %xmm0,-16*2(%rbx) # zero tp
998 movdqa %xmm0,-16*1(%rbx)
999 movdqa %xmm0,-16*2(%rbx,%rdx)
1000 movdqa %xmm0,-16*1(%rbx,%rdx)
1001 pcmpeqd %xmm1,%xmm0
1002 pand %xmm1,%xmm2
1003 pand %xmm1,%xmm3
1004 pand %xmm0,%xmm4
1005 pand %xmm0,%xmm5
1006 pxor %xmm0,%xmm0
1007 por %xmm2,%xmm4
1008 por %xmm3,%xmm5
1009 movdqu %xmm4,-16*2($rptr)
1010 movdqu %xmm5,-16*1($rptr)
1011 add \$32,$num
1012 jnz .Lsqr8x_cond_copy
1013
1014 mov \$1,%rax
1015 mov -48(%rsi),%r15
1016.cfi_restore %r15
1017 mov -40(%rsi),%r14
1018.cfi_restore %r14
1019 mov -32(%rsi),%r13
1020.cfi_restore %r13
1021 mov -24(%rsi),%r12
1022.cfi_restore %r12
1023 mov -16(%rsi),%rbp
1024.cfi_restore %rbp
1025 mov -8(%rsi),%rbx
1026.cfi_restore %rbx
1027 lea (%rsi),%rsp
1028.cfi_def_cfa_register %rsp
1029.Lsqr8x_epilogue:
1030 ret
1031.cfi_endproc
1032.size bn_sqr8x_mont,.-bn_sqr8x_mont
1033___
1034}}}
1035
1036
1037if ($addx) {{{
1038my $bp="%rdx"; # original value
1039
1040$code.=<<___;
1041.type bn_mulx4x_mont,\@function,6
1042.align 32
1043bn_mulx4x_mont:
1044.cfi_startproc
1045 mov %rsp,%rax
1046.cfi_def_cfa_register %rax
1047.Lmulx4x_enter:
1048 push %rbx
1049.cfi_push %rbx
1050 push %rbp
1051.cfi_push %rbp
1052 push %r12
1053.cfi_push %r12
1054 push %r13
1055.cfi_push %r13
1056 push %r14
1057.cfi_push %r14
1058 push %r15
1059.cfi_push %r15
1060.Lmulx4x_prologue:
1061
1062 shl \$3,${num}d # convert $num to bytes
1063 xor %r10,%r10
1064 sub $num,%r10 # -$num
1065 mov ($n0),$n0 # *n0
1066 lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)
1067 and \$-128,%rbp
1068 mov %rsp,%r11
1069 sub %rbp,%r11
1070 and \$-4096,%r11
1071 lea (%rbp,%r11),%rsp
1072 mov (%rsp),%r10
1073 cmp %rbp,%rsp
1074 ja .Lmulx4x_page_walk
1075 jmp .Lmulx4x_page_walk_done
1076
1077.align 16
1078.Lmulx4x_page_walk:
1079 lea -4096(%rsp),%rsp
1080 mov (%rsp),%r10
1081 cmp %rbp,%rsp
1082 ja .Lmulx4x_page_walk
1083.Lmulx4x_page_walk_done:
1084
1085 lea ($bp,$num),%r10
1086 ##############################################################
1087 # Stack layout
1088 # +0 num
1089 # +8 off-loaded &b[i]
1090 # +16 end of b[num]
1091 # +24 saved n0
1092 # +32 saved rp
1093 # +40 saved %rsp
1094 # +48 inner counter
1095 # +56
1096 # +64 tmp[num+1]
1097 #
1098 mov $num,0(%rsp) # save $num
1099 shr \$5,$num
1100 mov %r10,16(%rsp) # end of b[num]
1101 sub \$1,$num
1102 mov $n0, 24(%rsp) # save *n0
1103 mov $rp, 32(%rsp) # save $rp
1104 mov %rax,40(%rsp) # save original %rsp
1105.cfi_cfa_expression %rsp+40,deref,+8
1106 mov $num,48(%rsp) # inner counter
1107 jmp .Lmulx4x_body
1108
1109.align 32
1110.Lmulx4x_body:
1111___
1112my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
1113 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
1114my $rptr=$bptr;
1115$code.=<<___;
1116 lea 8($bp),$bptr
1117 mov ($bp),%rdx # b[0], $bp==%rdx actually
1118 lea 64+32(%rsp),$tptr
1119 mov %rdx,$bi
1120
1121 mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
1122 mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
1123 add %rax,%r11
1124 mov $bptr,8(%rsp) # off-load &b[i]
1125 mulx 2*8($aptr),%r12,%r13 # ...
1126 adc %r14,%r12
1127 adc \$0,%r13
1128
1129 mov $mi,$bptr # borrow $bptr
1130 imulq 24(%rsp),$mi # "t[0]"*n0
1131 xor $zero,$zero # cf=0, of=0
1132
1133 mulx 3*8($aptr),%rax,%r14
1134 mov $mi,%rdx
1135 lea 4*8($aptr),$aptr
1136 adcx %rax,%r13
1137 adcx $zero,%r14 # cf=0
1138
1139 mulx 0*8($nptr),%rax,%r10
1140 adcx %rax,$bptr # discarded
1141 adox %r11,%r10
1142 mulx 1*8($nptr),%rax,%r11
1143 adcx %rax,%r10
1144 adox %r12,%r11
1145 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
1146 mov 48(%rsp),$bptr # counter value
1147 mov %r10,-4*8($tptr)
1148 adcx %rax,%r11
1149 adox %r13,%r12
1150 mulx 3*8($nptr),%rax,%r15
1151 mov $bi,%rdx
1152 mov %r11,-3*8($tptr)
1153 adcx %rax,%r12
1154 adox $zero,%r15 # of=0
1155 lea 4*8($nptr),$nptr
1156 mov %r12,-2*8($tptr)
1157
1158 jmp .Lmulx4x_1st
1159
1160.align 32
1161.Lmulx4x_1st:
1162 adcx $zero,%r15 # cf=0, modulo-scheduled
1163 mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
1164 adcx %r14,%r10
1165 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
1166 adcx %rax,%r11
1167 mulx 2*8($aptr),%r12,%rax # ...
1168 adcx %r14,%r12
1169 mulx 3*8($aptr),%r13,%r14
1170 .byte 0x67,0x67
1171 mov $mi,%rdx
1172 adcx %rax,%r13
1173 adcx $zero,%r14 # cf=0
1174 lea 4*8($aptr),$aptr
1175 lea 4*8($tptr),$tptr
1176
1177 adox %r15,%r10
1178 mulx 0*8($nptr),%rax,%r15
1179 adcx %rax,%r10
1180 adox %r15,%r11
1181 mulx 1*8($nptr),%rax,%r15
1182 adcx %rax,%r11
1183 adox %r15,%r12
1184 mulx 2*8($nptr),%rax,%r15
1185 mov %r10,-5*8($tptr)
1186 adcx %rax,%r12
1187 mov %r11,-4*8($tptr)
1188 adox %r15,%r13
1189 mulx 3*8($nptr),%rax,%r15
1190 mov $bi,%rdx
1191 mov %r12,-3*8($tptr)
1192 adcx %rax,%r13
1193 adox $zero,%r15
1194 lea 4*8($nptr),$nptr
1195 mov %r13,-2*8($tptr)
1196
1197 dec $bptr # of=0, pass cf
1198 jnz .Lmulx4x_1st
1199
1200 mov 0(%rsp),$num # load num
1201 mov 8(%rsp),$bptr # re-load &b[i]
1202 adc $zero,%r15 # modulo-scheduled
1203 add %r15,%r14
1204 sbb %r15,%r15 # top-most carry
1205 mov %r14,-1*8($tptr)
1206 jmp .Lmulx4x_outer
1207
1208.align 32
1209.Lmulx4x_outer:
1210 mov ($bptr),%rdx # b[i]
1211 lea 8($bptr),$bptr # b++
1212 sub $num,$aptr # rewind $aptr
1213 mov %r15,($tptr) # save top-most carry
1214 lea 64+4*8(%rsp),$tptr
1215 sub $num,$nptr # rewind $nptr
1216
1217 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
1218 xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
1219 mov %rdx,$bi
1220 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
1221 adox -4*8($tptr),$mi
1222 adcx %r14,%r11
1223 mulx 2*8($aptr),%r15,%r13 # ...
1224 adox -3*8($tptr),%r11
1225 adcx %r15,%r12
1226 adox -2*8($tptr),%r12
1227 adcx $zero,%r13
1228 adox $zero,%r13
1229
1230 mov $bptr,8(%rsp) # off-load &b[i]
1231 mov $mi,%r15
1232 imulq 24(%rsp),$mi # "t[0]"*n0
1233 xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
1234
1235 mulx 3*8($aptr),%rax,%r14
1236 mov $mi,%rdx
1237 adcx %rax,%r13
1238 adox -1*8($tptr),%r13
1239 adcx $zero,%r14
1240 lea 4*8($aptr),$aptr
1241 adox $zero,%r14
1242
1243 mulx 0*8($nptr),%rax,%r10
1244 adcx %rax,%r15 # discarded
1245 adox %r11,%r10
1246 mulx 1*8($nptr),%rax,%r11
1247 adcx %rax,%r10
1248 adox %r12,%r11
1249 mulx 2*8($nptr),%rax,%r12
1250 mov %r10,-4*8($tptr)
1251 adcx %rax,%r11
1252 adox %r13,%r12
1253 mulx 3*8($nptr),%rax,%r15
1254 mov $bi,%rdx
1255 mov %r11,-3*8($tptr)
1256 lea 4*8($nptr),$nptr
1257 adcx %rax,%r12
1258 adox $zero,%r15 # of=0
1259 mov 48(%rsp),$bptr # counter value
1260 mov %r12,-2*8($tptr)
1261
1262 jmp .Lmulx4x_inner
1263
1264.align 32
1265.Lmulx4x_inner:
1266 mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
1267 adcx $zero,%r15 # cf=0, modulo-scheduled
1268 adox %r14,%r10
1269 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
1270 adcx 0*8($tptr),%r10
1271 adox %rax,%r11
1272 mulx 2*8($aptr),%r12,%rax # ...
1273 adcx 1*8($tptr),%r11
1274 adox %r14,%r12
1275 mulx 3*8($aptr),%r13,%r14
1276 mov $mi,%rdx
1277 adcx 2*8($tptr),%r12
1278 adox %rax,%r13
1279 adcx 3*8($tptr),%r13
1280 adox $zero,%r14 # of=0
1281 lea 4*8($aptr),$aptr
1282 lea 4*8($tptr),$tptr
1283 adcx $zero,%r14 # cf=0
1284
1285 adox %r15,%r10
1286 mulx 0*8($nptr),%rax,%r15
1287 adcx %rax,%r10
1288 adox %r15,%r11
1289 mulx 1*8($nptr),%rax,%r15
1290 adcx %rax,%r11
1291 adox %r15,%r12
1292 mulx 2*8($nptr),%rax,%r15
1293 mov %r10,-5*8($tptr)
1294 adcx %rax,%r12
1295 adox %r15,%r13
1296 mulx 3*8($nptr),%rax,%r15
1297 mov $bi,%rdx
1298 mov %r11,-4*8($tptr)
1299 mov %r12,-3*8($tptr)
1300 adcx %rax,%r13
1301 adox $zero,%r15
1302 lea 4*8($nptr),$nptr
1303 mov %r13,-2*8($tptr)
1304
1305 dec $bptr # of=0, pass cf
1306 jnz .Lmulx4x_inner
1307
1308 mov 0(%rsp),$num # load num
1309 mov 8(%rsp),$bptr # re-load &b[i]
1310 adc $zero,%r15 # modulo-scheduled
1311 sub 0*8($tptr),$zero # pull top-most carry
1312 adc %r15,%r14
1313 sbb %r15,%r15 # top-most carry
1314 mov %r14,-1*8($tptr)
1315
1316 cmp 16(%rsp),$bptr
1317 jne .Lmulx4x_outer
1318
1319 lea 64(%rsp),$tptr
1320 sub $num,$nptr # rewind $nptr
1321 neg %r15
1322 mov $num,%rdx
1323 shr \$3+2,$num # %cf=0
1324 mov 32(%rsp),$rptr # restore rp
1325 jmp .Lmulx4x_sub
1326
1327.align 32
1328.Lmulx4x_sub:
1329 mov 8*0($tptr),%r11
1330 mov 8*1($tptr),%r12
1331 mov 8*2($tptr),%r13
1332 mov 8*3($tptr),%r14
1333 lea 8*4($tptr),$tptr
1334 sbb 8*0($nptr),%r11
1335 sbb 8*1($nptr),%r12
1336 sbb 8*2($nptr),%r13
1337 sbb 8*3($nptr),%r14
1338 lea 8*4($nptr),$nptr
1339 mov %r11,8*0($rptr)
1340 mov %r12,8*1($rptr)
1341 mov %r13,8*2($rptr)
1342 mov %r14,8*3($rptr)
1343 lea 8*4($rptr),$rptr
1344 dec $num # preserves %cf
1345 jnz .Lmulx4x_sub
1346
1347 sbb \$0,%r15 # top-most carry
1348 lea 64(%rsp),$tptr
1349 sub %rdx,$rptr # rewind
1350
1351 movq %r15,%xmm1
1352 pxor %xmm0,%xmm0
1353 pshufd \$0,%xmm1,%xmm1
1354 mov 40(%rsp),%rsi # restore %rsp
1355.cfi_def_cfa %rsi,8
1356 jmp .Lmulx4x_cond_copy
1357
1358.align 32
1359.Lmulx4x_cond_copy:
1360 movdqa 16*0($tptr),%xmm2
1361 movdqa 16*1($tptr),%xmm3
1362 lea 16*2($tptr),$tptr
1363 movdqu 16*0($rptr),%xmm4
1364 movdqu 16*1($rptr),%xmm5
1365 lea 16*2($rptr),$rptr
1366 movdqa %xmm0,-16*2($tptr) # zero tp
1367 movdqa %xmm0,-16*1($tptr)
1368 pcmpeqd %xmm1,%xmm0
1369 pand %xmm1,%xmm2
1370 pand %xmm1,%xmm3
1371 pand %xmm0,%xmm4
1372 pand %xmm0,%xmm5
1373 pxor %xmm0,%xmm0
1374 por %xmm2,%xmm4
1375 por %xmm3,%xmm5
1376 movdqu %xmm4,-16*2($rptr)
1377 movdqu %xmm5,-16*1($rptr)
1378 sub \$32,%rdx
1379 jnz .Lmulx4x_cond_copy
1380
1381 mov %rdx,($tptr)
1382
1383 mov \$1,%rax
1384 mov -48(%rsi),%r15
1385.cfi_restore %r15
1386 mov -40(%rsi),%r14
1387.cfi_restore %r14
1388 mov -32(%rsi),%r13
1389.cfi_restore %r13
1390 mov -24(%rsi),%r12
1391.cfi_restore %r12
1392 mov -16(%rsi),%rbp
1393.cfi_restore %rbp
1394 mov -8(%rsi),%rbx
1395.cfi_restore %rbx
1396 lea (%rsi),%rsp
1397.cfi_def_cfa_register %rsp
1398.Lmulx4x_epilogue:
1399 ret
1400.cfi_endproc
1401.size bn_mulx4x_mont,.-bn_mulx4x_mont
1402___
1403}}}
1404$code.=<<___;
1405.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1406.align 16
1407___
1408
1409# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1410# CONTEXT *context,DISPATCHER_CONTEXT *disp)
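# These handlers make Win64 stack unwinding work through the functions
# above: depending on where the faulting RIP falls (prologue, body or
# epilogue) they recover the saved %rsp, reload the non-volatile registers
# into the CONTEXT record, and then continue the unwind via
# RtlVirtualUnwind.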
1411if ($win64) {
1412$rec="%rcx";
1413$frame="%rdx";
1414$context="%r8";
1415$disp="%r9";
1416
1417$code.=<<___;
1418.extern __imp_RtlVirtualUnwind
1419.type mul_handler,\@abi-omnipotent
1420.align 16
1421mul_handler:
1422 push %rsi
1423 push %rdi
1424 push %rbx
1425 push %rbp
1426 push %r12
1427 push %r13
1428 push %r14
1429 push %r15
1430 pushfq
1431 sub \$64,%rsp
1432
1433 mov 120($context),%rax # pull context->Rax
1434 mov 248($context),%rbx # pull context->Rip
1435
1436 mov 8($disp),%rsi # disp->ImageBase
1437 mov 56($disp),%r11 # disp->HandlerData
1438
1439 mov 0(%r11),%r10d # HandlerData[0]
1440 lea (%rsi,%r10),%r10 # end of prologue label
1441 cmp %r10,%rbx # context->Rip<end of prologue label
1442 jb .Lcommon_seh_tail
1443
1444 mov 152($context),%rax # pull context->Rsp
1445
1446 mov 4(%r11),%r10d # HandlerData[1]
1447 lea (%rsi,%r10),%r10 # epilogue label
1448 cmp %r10,%rbx # context->Rip>=epilogue label
1449 jae .Lcommon_seh_tail
1450
1451 mov 192($context),%r10 # pull $num
1452 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
1453
1454 jmp .Lcommon_pop_regs
1455.size mul_handler,.-mul_handler
1456
1457.type sqr_handler,\@abi-omnipotent
1458.align 16
1459sqr_handler:
1460 push %rsi
1461 push %rdi
1462 push %rbx
1463 push %rbp
1464 push %r12
1465 push %r13
1466 push %r14
1467 push %r15
1468 pushfq
1469 sub \$64,%rsp
1470
1471 mov 120($context),%rax # pull context->Rax
1472 mov 248($context),%rbx # pull context->Rip
1473
1474 mov 8($disp),%rsi # disp->ImageBase
1475 mov 56($disp),%r11 # disp->HandlerData
1476
1477 mov 0(%r11),%r10d # HandlerData[0]
1478 lea (%rsi,%r10),%r10 # end of prologue label
1479 cmp %r10,%rbx # context->Rip<.Lsqr_prologue
1480 jb .Lcommon_seh_tail
1481
1482 mov 4(%r11),%r10d # HandlerData[1]
1483 lea (%rsi,%r10),%r10 # body label
1484 cmp %r10,%rbx # context->Rip<.Lsqr_body
1485 jb .Lcommon_pop_regs
1486
1487 mov 152($context),%rax # pull context->Rsp
1488
1489 mov 8(%r11),%r10d # HandlerData[2]
1490 lea (%rsi,%r10),%r10 # epilogue label
1491 cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
1492 jae .Lcommon_seh_tail
1493
1494 mov 40(%rax),%rax # pull saved stack pointer
1495
1496.Lcommon_pop_regs:
1497 mov -8(%rax),%rbx
1498 mov -16(%rax),%rbp
1499 mov -24(%rax),%r12
1500 mov -32(%rax),%r13
1501 mov -40(%rax),%r14
1502 mov -48(%rax),%r15
1503 mov %rbx,144($context) # restore context->Rbx
1504 mov %rbp,160($context) # restore context->Rbp
1505 mov %r12,216($context) # restore context->R12
1506 mov %r13,224($context) # restore context->R13
1507 mov %r14,232($context) # restore context->R14
1508 mov %r15,240($context) # restore context->R15
1509
1510.Lcommon_seh_tail:
1511 mov 8(%rax),%rdi
1512 mov 16(%rax),%rsi
1513 mov %rax,152($context) # restore context->Rsp
1514 mov %rsi,168($context) # restore context->Rsi
1515 mov %rdi,176($context) # restore context->Rdi
1516
1517 mov 40($disp),%rdi # disp->ContextRecord
1518 mov $context,%rsi # context
1519 mov \$154,%ecx # sizeof(CONTEXT)
1520 .long 0xa548f3fc # cld; rep movsq
1521
1522 mov $disp,%rsi
1523 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1524 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1525 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1526 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1527 mov 40(%rsi),%r10 # disp->ContextRecord
1528 lea 56(%rsi),%r11 # &disp->HandlerData
1529 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1530 mov %r10,32(%rsp) # arg5
1531 mov %r11,40(%rsp) # arg6
1532 mov %r12,48(%rsp) # arg7
1533 mov %rcx,56(%rsp) # arg8, (NULL)
1534 call *__imp_RtlVirtualUnwind(%rip)
1535
1536 mov \$1,%eax # ExceptionContinueSearch
1537 add \$64,%rsp
1538 popfq
1539 pop %r15
1540 pop %r14
1541 pop %r13
1542 pop %r12
1543 pop %rbp
1544 pop %rbx
1545 pop %rdi
1546 pop %rsi
1547 ret
1548.size sqr_handler,.-sqr_handler
1549
1550.section .pdata
1551.align 4
1552 .rva .LSEH_begin_bn_mul_mont
1553 .rva .LSEH_end_bn_mul_mont
1554 .rva .LSEH_info_bn_mul_mont
1555
1556 .rva .LSEH_begin_bn_mul4x_mont
1557 .rva .LSEH_end_bn_mul4x_mont
1558 .rva .LSEH_info_bn_mul4x_mont
1559
1560 .rva .LSEH_begin_bn_sqr8x_mont
1561 .rva .LSEH_end_bn_sqr8x_mont
1562 .rva .LSEH_info_bn_sqr8x_mont
1563___
1564$code.=<<___ if ($addx);
1565 .rva .LSEH_begin_bn_mulx4x_mont
1566 .rva .LSEH_end_bn_mulx4x_mont
1567 .rva .LSEH_info_bn_mulx4x_mont
1568___
1569$code.=<<___;
1570.section .xdata
1571.align 8
1572.LSEH_info_bn_mul_mont:
1573 .byte 9,0,0,0
1574 .rva mul_handler
1575 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
1576.LSEH_info_bn_mul4x_mont:
1577 .byte 9,0,0,0
1578 .rva mul_handler
1579 .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
1580.LSEH_info_bn_sqr8x_mont:
1581 .byte 9,0,0,0
1582 .rva sqr_handler
1583 .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
1584.align 8
1585___
1586$code.=<<___ if ($addx);
1587.LSEH_info_bn_mulx4x_mont:
1588 .byte 9,0,0,0
1589 .rva sqr_handler
1590 .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
1591.align 8
1592___
1593}
1594
1595print $code;
1596close STDOUT or die "error closing STDOUT: $!";