VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.0/crypto/bn/asm/armv8-mont.pl@ 99507

Last change on this file since 99507 was 99366, checked in by vboxsync, 2 years ago

openssl-3.1.0: Applied and adjusted our OpenSSL changes to 3.0.7. bugref:10418

  • Property svn:executable set to *
File size: 46.7 KB
 
1#! /usr/bin/env perl
2# Copyright 2015-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# March 2015
18#
19# "Teaser" Montgomery multiplication module for ARMv8. Needs more
20# work. While it does improve RSA sign performance by 20-30% (less for
21# longer keys) on most processors, for some reason RSA2048 is not
22# faster and RSA4096 goes 15-20% slower on Cortex-A57. The multiplication
23# instruction issue rate is limited on the processor in question, meaning
24# that a dedicated squaring procedure is a must. Well, actually all
25# contemporary AArch64 processors seem to have a limited multiplication
26# issue rate, i.e. they can't issue a multiplication every cycle, which
27# explains the moderate improvement coefficients in comparison to
28# compiler-generated code. Recall that the compiler is instructed to use
29# umulh and therefore uses the same number of multiplication instructions
30# to do the job. Assembly's edge is minimizing the number of "collateral"
31# instructions and, of course, instruction scheduling.
32#
33# April 2015
34#
35# A squaring procedure that handles lengths divisible by 8 improves
36# RSA/DSA performance by 25-40-60% depending on processor and key
37# length. Overall improvement coefficients are always positive in
38# comparison to compiler-generated code. On Cortex-A57 the improvement
39# is still modest for the longest key lengths, while others exhibit e.g.
40# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41# on Cortex-A57 and ~60-100% faster on others.
42
43# $output is the last argument if it looks like a file (it has an extension)
44# $flavour is the first argument if it doesn't look like a file
45my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
46my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
51die "can't locate arm-xlate.pl";
52
53open OUT,"| \"$^X\" $xlate $flavour \"$output\""
54 or die "can't call $xlate: $!";
55*STDOUT=*OUT;
56
57($lo0,$hi0,$aj,$m0,$alo,$ahi,
58 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
59 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
60
61# int bn_mul_mont(
62$rp="x0"; # BN_ULONG *rp,
63$ap="x1"; # const BN_ULONG *ap,
64$bp="x2"; # const BN_ULONG *bp,
65$np="x3"; # const BN_ULONG *np,
66$n0="x4"; # const BN_ULONG *n0,
67$num="x5"; # int num);
68
69$code.=<<___;
70#include "arm_arch.h"
71#ifndef __KERNEL__
72.extern OPENSSL_armv8_rsa_neonized
73.hidden OPENSSL_armv8_rsa_neonized
74#endif
75.text
76
77.globl bn_mul_mont
78.type bn_mul_mont,%function
79.align 5
80bn_mul_mont:
81 AARCH64_SIGN_LINK_REGISTER
82.Lbn_mul_mont:
83 tst $num,#3
84 b.ne .Lmul_mont
85 cmp $num,#32
86 b.le .Lscalar_impl
87#ifndef __KERNEL__
88 adrp x17,OPENSSL_armv8_rsa_neonized
89 ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
90 cbnz w17, bn_mul8x_mont_neon
91#endif
92
93.Lscalar_impl:
94 tst $num,#7
95 b.eq __bn_sqr8x_mont
96 tst $num,#3
97 b.eq __bn_mul4x_mont
98
99.Lmul_mont:
100 stp x29,x30,[sp,#-64]!
101 add x29,sp,#0
102 stp x19,x20,[sp,#16]
103 stp x21,x22,[sp,#32]
104 stp x23,x24,[sp,#48]
105
106 ldr $m0,[$bp],#8 // bp[0]
107 sub $tp,sp,$num,lsl#3
108 ldp $hi0,$aj,[$ap],#16 // ap[0..1]
109 lsl $num,$num,#3
110 ldr $n0,[$n0] // *n0
111 and $tp,$tp,#-16 // ABI says so
112 ldp $hi1,$nj,[$np],#16 // np[0..1]
113
114 mul $lo0,$hi0,$m0 // ap[0]*bp[0]
115 sub $j,$num,#16 // j=num-2
116 umulh $hi0,$hi0,$m0
117 mul $alo,$aj,$m0 // ap[1]*bp[0]
118 umulh $ahi,$aj,$m0
119
120 mul $m1,$lo0,$n0 // "tp[0]"*n0
121 mov sp,$tp // alloca
122
123 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
124 umulh $hi1,$hi1,$m1
125 mul $nlo,$nj,$m1 // np[1]*m1
126 // (*) adds $lo1,$lo1,$lo0 // discarded
127 // (*) On the removal of the first multiplication and addition
128 // instructions: the outcome of the first addition is
129 // guaranteed to be zero, which leaves two computationally
130 // significant outcomes: it either carries or it doesn't. The
131 // question is when does it carry? Is there an alternative
132 // way to deduce it? If you follow the operations, you can
133 // observe that the condition for a carry is quite simple:
134 // $lo0 being non-zero. So the carry can be calculated
135 // by adding -1 to $lo0. That's what the next instruction does.
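 // (Illustrative check: $lo1 equals -$lo0 mod 2^64 here, so the removed
 // "adds $lo1,$lo1,$lo0" would yield 0 with carry set iff $lo0 is non-zero;
 // "subs xzr,$lo0,#1" sets the carry flag under exactly the same condition,
 // namely no borrow iff $lo0 is non-zero.)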
136 subs xzr,$lo0,#1 // (*)
137 umulh $nhi,$nj,$m1
138 adc $hi1,$hi1,xzr
139 cbz $j,.L1st_skip
140
141.L1st:
142 ldr $aj,[$ap],#8
143 adds $lo0,$alo,$hi0
144 sub $j,$j,#8 // j--
145 adc $hi0,$ahi,xzr
146
147 ldr $nj,[$np],#8
148 adds $lo1,$nlo,$hi1
149 mul $alo,$aj,$m0 // ap[j]*bp[0]
150 adc $hi1,$nhi,xzr
151 umulh $ahi,$aj,$m0
152
153 adds $lo1,$lo1,$lo0
154 mul $nlo,$nj,$m1 // np[j]*m1
155 adc $hi1,$hi1,xzr
156 umulh $nhi,$nj,$m1
157 str $lo1,[$tp],#8 // tp[j-1]
158 cbnz $j,.L1st
159
160.L1st_skip:
161 adds $lo0,$alo,$hi0
162 sub $ap,$ap,$num // rewind $ap
163 adc $hi0,$ahi,xzr
164
165 adds $lo1,$nlo,$hi1
166 sub $np,$np,$num // rewind $np
167 adc $hi1,$nhi,xzr
168
169 adds $lo1,$lo1,$lo0
170 sub $i,$num,#8 // i=num-1
171 adcs $hi1,$hi1,$hi0
172
173 adc $ovf,xzr,xzr // upmost overflow bit
174 stp $lo1,$hi1,[$tp]
175
176.Louter:
177 ldr $m0,[$bp],#8 // bp[i]
178 ldp $hi0,$aj,[$ap],#16
179 ldr $tj,[sp] // tp[0]
180 add $tp,sp,#8
181
182 mul $lo0,$hi0,$m0 // ap[0]*bp[i]
183 sub $j,$num,#16 // j=num-2
184 umulh $hi0,$hi0,$m0
185 ldp $hi1,$nj,[$np],#16
186 mul $alo,$aj,$m0 // ap[1]*bp[i]
187 adds $lo0,$lo0,$tj
188 umulh $ahi,$aj,$m0
189 adc $hi0,$hi0,xzr
190
191 mul $m1,$lo0,$n0
192 sub $i,$i,#8 // i--
193
194 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
195 umulh $hi1,$hi1,$m1
196 mul $nlo,$nj,$m1 // np[1]*m1
197 // (*) adds $lo1,$lo1,$lo0
198 subs xzr,$lo0,#1 // (*)
199 umulh $nhi,$nj,$m1
200 cbz $j,.Linner_skip
201
202.Linner:
203 ldr $aj,[$ap],#8
204 adc $hi1,$hi1,xzr
205 ldr $tj,[$tp],#8 // tp[j]
206 adds $lo0,$alo,$hi0
207 sub $j,$j,#8 // j--
208 adc $hi0,$ahi,xzr
209
210 adds $lo1,$nlo,$hi1
211 ldr $nj,[$np],#8
212 adc $hi1,$nhi,xzr
213
214 mul $alo,$aj,$m0 // ap[j]*bp[i]
215 adds $lo0,$lo0,$tj
216 umulh $ahi,$aj,$m0
217 adc $hi0,$hi0,xzr
218
219 mul $nlo,$nj,$m1 // np[j]*m1
220 adds $lo1,$lo1,$lo0
221 umulh $nhi,$nj,$m1
222 stur $lo1,[$tp,#-16] // tp[j-1]
223 cbnz $j,.Linner
224
225.Linner_skip:
226 ldr $tj,[$tp],#8 // tp[j]
227 adc $hi1,$hi1,xzr
228 adds $lo0,$alo,$hi0
229 sub $ap,$ap,$num // rewind $ap
230 adc $hi0,$ahi,xzr
231
232 adds $lo1,$nlo,$hi1
233 sub $np,$np,$num // rewind $np
234 adcs $hi1,$nhi,$ovf
235 adc $ovf,xzr,xzr
236
237 adds $lo0,$lo0,$tj
238 adc $hi0,$hi0,xzr
239
240 adds $lo1,$lo1,$lo0
241 adcs $hi1,$hi1,$hi0
242 adc $ovf,$ovf,xzr // upmost overflow bit
243 stp $lo1,$hi1,[$tp,#-16]
244
245 cbnz $i,.Louter
246
247 // Final step. We see if result is larger than modulus, and
248 // if it is, subtract the modulus. But comparison implies
249 // subtraction. So we subtract modulus, see if it borrowed,
250 // and conditionally copy original value.
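 // A rough C-like sketch of the two loops below (names illustrative only):
 // borrow = ((ovf:tp) < np); // found by computing rp[] = tp[] - np[]
 // for each word j: rp[j] = borrow ? tp[j] : tp[j]-np[j]; tp[j] = 0;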
251 ldr $tj,[sp] // tp[0]
252 add $tp,sp,#8
253 ldr $nj,[$np],#8 // np[0]
254 subs $j,$num,#8 // j=num-1 and clear borrow
255 mov $ap,$rp
256.Lsub:
257 sbcs $aj,$tj,$nj // tp[j]-np[j]
258 ldr $tj,[$tp],#8
259 sub $j,$j,#8 // j--
260 ldr $nj,[$np],#8
261 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
262 cbnz $j,.Lsub
263
264 sbcs $aj,$tj,$nj
265 sbcs $ovf,$ovf,xzr // did it borrow?
266 str $aj,[$ap],#8 // rp[num-1]
267
268 ldr $tj,[sp] // tp[0]
269 add $tp,sp,#8
270 ldr $aj,[$rp],#8 // rp[0]
271 sub $num,$num,#8 // num--
272 nop
273.Lcond_copy:
274 sub $num,$num,#8 // num--
275 csel $nj,$tj,$aj,lo // did it borrow?
276 ldr $tj,[$tp],#8
277 ldr $aj,[$rp],#8
278 stur xzr,[$tp,#-16] // wipe tp
279 stur $nj,[$rp,#-16]
280 cbnz $num,.Lcond_copy
281
282 csel $nj,$tj,$aj,lo
283 stur xzr,[$tp,#-8] // wipe tp
284 stur $nj,[$rp,#-8]
285
286 ldp x19,x20,[x29,#16]
287 mov sp,x29
288 ldp x21,x22,[x29,#32]
289 mov x0,#1
290 ldp x23,x24,[x29,#48]
291 ldr x29,[sp],#64
292 AARCH64_VALIDATE_LINK_REGISTER
293 ret
294.size bn_mul_mont,.-bn_mul_mont
295___
296{
297my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
298my ($Z,$Temp)=("v4.16b","v5");
299my @ACC=map("v$_",(6..13));
300my ($Bi,$Ni,$M0)=map("v$_",(28..30));
301my $sBi="s28";
302my $sM0="s30";
303my $zero="v14";
304my $temp="v15";
305my $ACCTemp="v16";
306
307my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
308my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
309
310$code.=<<___;
311.type bn_mul8x_mont_neon,%function
312.align 5
313bn_mul8x_mont_neon:
314 // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
315 // only from bn_mul_mont which has already signed the return address.
316 stp x29,x30,[sp,#-80]!
317 mov x16,sp
318 stp d8,d9,[sp,#16]
319 stp d10,d11,[sp,#32]
320 stp d12,d13,[sp,#48]
321 stp d14,d15,[sp,#64]
322 lsl $num,$num,#1
323 eor $zero.16b,$zero.16b,$zero.16b
324
325.align 4
326.LNEON_8n:
327 eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
328 sub $toutptr,sp,#128
329 eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
330 sub $toutptr,$toutptr,$num,lsl#4
331 eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
332 and $toutptr,$toutptr,#-64
333 eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
334 mov sp,$toutptr // alloca
335 eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
336 add $toutptr,$toutptr,#256
337 eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
338 sub $inner,$num,#8
339 eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
340 eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b
341
342.LNEON_8n_init:
343 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
344 subs $inner,$inner,#8
345 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
346 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
347 st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
348 bne .LNEON_8n_init
349
350 add $tinptr,sp,#256
351 ld1 {$A0.4s,$A1.4s},[$aptr],#32
352 add $bnptr,sp,#8
353 ldr $sM0,[$n0],#4
354 mov $outer,$num
355 b .LNEON_8n_outer
356
357.align 4
358.LNEON_8n_outer:
359 ldr $sBi,[$bptr],#4 // *b++
360 uxtl $Bi.4s,$Bi.4h
361 add $toutptr,sp,#128
362 ld1 {$N0.4s,$N1.4s},[$nptr],#32
363
364 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
365 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
366 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
367 shl $Ni.2d,@ACC[0].2d,#16
368 ext $Ni.16b,$Ni.16b,$Ni.16b,#8
369 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
370 add $Ni.2d,$Ni.2d,@ACC[0].2d
371 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
372 mul $Ni.2s,$Ni.2s,$M0.2s
373 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
374 st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
375 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
376 uxtl $Ni.4s,$Ni.4h
377 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
378___
379for ($i=0; $i<7;) {
380$code.=<<___;
381 ldr $sBi,[$bptr],#4 // *b++
382 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
383 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
384 uxtl $Bi.4s,$Bi.4h
385 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
386 ushr $temp.2d,@ACC[0].2d,#16
387 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
388 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
389 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
390 add @ACC[0].2d,@ACC[0].2d,$temp.2d
391 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
392 ushr @ACC[0].2d,@ACC[0].2d,#16
393 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
394 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
395 add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
396 ins @ACC[1].d[0],$ACCTemp.d[0]
397 st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
398___
399 push(@ACC,shift(@ACC)); $i++;
400$code.=<<___;
401 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
402 ld1 {@ACC[7].2d},[$tinptr],#16
403 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
404 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
405 shl $Ni.2d,@ACC[0].2d,#16
406 ext $Ni.16b,$Ni.16b,$Ni.16b,#8
407 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
408 add $Ni.2d,$Ni.2d,@ACC[0].2d
409 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
410 mul $Ni.2s,$Ni.2s,$M0.2s
411 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
412 st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
413 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
414 uxtl $Ni.4s,$Ni.4h
415 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
416___
417}
418$code.=<<___;
419 ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
420 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
421 ld1 {$A0.4s,$A1.4s},[$aptr],#32
422 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
423 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
424 mov $Temp.16b,@ACC[0].16b
425 ushr $Temp.2d,$Temp.2d,#16
426 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
427 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
428 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
429 add @ACC[0].2d,@ACC[0].2d,$Temp.2d
430 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
431 ushr @ACC[0].2d,@ACC[0].2d,#16
432 eor $temp.16b,$temp.16b,$temp.16b
433 ins @ACC[0].d[1],$temp.d[0]
434 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
435 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
436 add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
437 st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
438 add $bnptr,sp,#8 // rewind
439___
440 push(@ACC,shift(@ACC));
441$code.=<<___;
442 sub $inner,$num,#8
443 b .LNEON_8n_inner
444
445.align 4
446.LNEON_8n_inner:
447 subs $inner,$inner,#8
448 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
449 ld1 {@ACC[7].2d},[$tinptr]
450 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
451 ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
452 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
453 ld1 {$N0.4s,$N1.4s},[$nptr],#32
454 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
455 b.eq .LInner_jump
456 add $tinptr,$tinptr,#16 // don't advance in last iteration
457.LInner_jump:
458 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
459 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
460 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
461 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
462___
463for ($i=1; $i<8; $i++) {
464$code.=<<___;
465 ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
466 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
467 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
468 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
469 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
470 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
471 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
472 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
473 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
474 st1 {@ACC[0].2d},[$toutptr],#16
475___
476 push(@ACC,shift(@ACC));
477$code.=<<___;
478 umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
479 ld1 {@ACC[7].2d},[$tinptr]
480 umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
481 ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
482 umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
483 b.eq .LInner_jump$i
484 add $tinptr,$tinptr,#16 // don't advance in last iteration
485.LInner_jump$i:
486 umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
487 umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
488 umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
489 umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
490 umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
491___
492}
493$code.=<<___;
494 b.ne .LInner_after_rewind$i
495 sub $aptr,$aptr,$num,lsl#2 // rewind
496.LInner_after_rewind$i:
497 umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
498 ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
499 umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
500 ld1 {$A0.4s,$A1.4s},[$aptr],#32
501 umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
502 add $bnptr,sp,#8 // rewind
503 umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
504 umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
505 umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
506 umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
507 st1 {@ACC[0].2d},[$toutptr],#16
508 umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
509
510 bne .LNEON_8n_inner
511___
512 push(@ACC,shift(@ACC));
513$code.=<<___;
514 add $tinptr,sp,#128
515 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
516 eor $N0.16b,$N0.16b,$N0.16b // $N0
517 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
518 eor $N1.16b,$N1.16b,$N1.16b // $N1
519 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
520 st1 {@ACC[6].2d},[$toutptr]
521
522 subs $outer,$outer,#8
523 ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
524 ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
525 ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
526 ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
527
528 b.eq .LInner_8n_jump_2steps
529 sub $nptr,$nptr,$num,lsl#2 // rewind
530 b .LNEON_8n_outer
531
532.LInner_8n_jump_2steps:
533 add $toutptr,sp,#128
534 st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
535 mov $Temp.16b,@ACC[0].16b
536 ushr $temp.2d,@ACC[0].2d,#16
537 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
538 st1 {$N0.2d,$N1.2d}, [sp],#32
539 add @ACC[0].2d,@ACC[0].2d,$temp.2d
540 st1 {$N0.2d,$N1.2d}, [sp],#32
541 ushr $temp.2d,@ACC[0].2d,#16
542 st1 {$N0.2d,$N1.2d}, [sp],#32
543 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
544 ins $temp.d[1],$zero.d[0]
545
546 mov $inner,$num
547 b .LNEON_tail_entry
548
549.align 4
550.LNEON_tail:
551 add @ACC[0].2d,@ACC[0].2d,$temp.2d
552 mov $Temp.16b,@ACC[0].16b
553 ushr $temp.2d,@ACC[0].2d,#16
554 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
555 ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
556 add @ACC[0].2d,@ACC[0].2d,$temp.2d
557 ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
558 ushr $temp.2d,@ACC[0].2d,#16
559 ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
560 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
561 ins $temp.d[1],$zero.d[0]
562
563.LNEON_tail_entry:
564___
565for ($i=1; $i<8; $i++) {
566$code.=<<___;
567 add @ACC[1].2d,@ACC[1].2d,$temp.2d
568 st1 {@ACC[0].s}[0], [$toutptr],#4
569 ushr $temp.2d,@ACC[1].2d,#16
570 mov $Temp.16b,@ACC[1].16b
571 ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
572 add @ACC[1].2d,@ACC[1].2d,$temp.2d
573 ushr $temp.2d,@ACC[1].2d,#16
574 zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
575 ins $temp.d[1],$zero.d[0]
576___
577 push(@ACC,shift(@ACC));
578}
579 push(@ACC,shift(@ACC));
580$code.=<<___;
581 ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
582 subs $inner,$inner,#8
583 st1 {@ACC[7].s}[0], [$toutptr],#4
584 bne .LNEON_tail
585
586 st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
587 sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
588 subs $aptr,sp,#0 // clear carry flag
589 add $bptr,sp,$num,lsl#2
590
591.LNEON_sub:
592 ldp w4,w5,[$aptr],#8
593 ldp w6,w7,[$aptr],#8
594 ldp w8,w9,[$nptr],#8
595 ldp w10,w11,[$nptr],#8
596 sbcs w8,w4,w8
597 sbcs w9,w5,w9
598 sbcs w10,w6,w10
599 sbcs w11,w7,w11
600 sub x17,$bptr,$aptr
601 stp w8,w9,[$rptr],#8
602 stp w10,w11,[$rptr],#8
603 cbnz x17,.LNEON_sub
604
605 ldr w10, [$aptr] // load top-most bit
606 mov x11,sp
607 eor v0.16b,v0.16b,v0.16b
608 sub x11,$bptr,x11 // this is num*4
609 eor v1.16b,v1.16b,v1.16b
610 mov $aptr,sp
611 sub $rptr,$rptr,x11 // rewind $rptr
612 mov $nptr,$bptr // second 3/4th of frame
613 sbcs w10,w10,wzr // result is carry flag
614
615.LNEON_copy_n_zap:
616 ldp w4,w5,[$aptr],#8
617 ldp w6,w7,[$aptr],#8
618 ldp w8,w9,[$rptr],#8
619 ldp w10,w11,[$rptr]
620 sub $rptr,$rptr,#8
621 b.cs .LCopy_1
622 mov w8,w4
623 mov w9,w5
624 mov w10,w6
625 mov w11,w7
626.LCopy_1:
627 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
628 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
629 ldp w4,w5,[$aptr],#8
630 ldp w6,w7,[$aptr],#8
631 stp w8,w9,[$rptr],#8
632 stp w10,w11,[$rptr],#8
633 sub $aptr,$aptr,#32
634 ldp w8,w9,[$rptr],#8
635 ldp w10,w11,[$rptr]
636 sub $rptr,$rptr,#8
637 b.cs .LCopy_2
638 mov w8, w4
639 mov w9, w5
640 mov w10, w6
641 mov w11, w7
642.LCopy_2:
643 st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
644 st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
645 sub x17,$bptr,$aptr // preserves carry
646 stp w8,w9,[$rptr],#8
647 stp w10,w11,[$rptr],#8
648 cbnz x17,.LNEON_copy_n_zap
649
650 mov sp,x16
651 ldp d14,d15,[sp,#64]
652 ldp d12,d13,[sp,#48]
653 ldp d10,d11,[sp,#32]
654 ldp d8,d9,[sp,#16]
655 ldr x29,[sp],#80
656 AARCH64_VALIDATE_LINK_REGISTER
657 ret // bx lr
658
659.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
660___
661}
662{
663########################################################################
664# The following is an ARMv8 adaptation of sqrx8x_mont from the x86_64-mont5 module.
665
666my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
667my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
668my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
669my ($cnt,$carry,$topmost)=("x27","x28","x30");
670my ($tp,$ap_end,$na0)=($bp,$np,$carry);
671
672$code.=<<___;
673.type __bn_sqr8x_mont,%function
674.align 5
675__bn_sqr8x_mont:
676 cmp $ap,$bp
677 b.ne __bn_mul4x_mont
678.Lsqr8x_mont:
679 // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
680 // only from bn_mul_mont which has already signed the return address.
681 stp x29,x30,[sp,#-128]!
682 add x29,sp,#0
683 stp x19,x20,[sp,#16]
684 stp x21,x22,[sp,#32]
685 stp x23,x24,[sp,#48]
686 stp x25,x26,[sp,#64]
687 stp x27,x28,[sp,#80]
688 stp $rp,$np,[sp,#96] // offload rp and np
689
690 ldp $a0,$a1,[$ap,#8*0]
691 ldp $a2,$a3,[$ap,#8*2]
692 ldp $a4,$a5,[$ap,#8*4]
693 ldp $a6,$a7,[$ap,#8*6]
694
695 sub $tp,sp,$num,lsl#4
696 lsl $num,$num,#3
697 ldr $n0,[$n0] // *n0
698 mov sp,$tp // alloca
699 sub $cnt,$num,#8*8
700 b .Lsqr8x_zero_start
701
702.Lsqr8x_zero:
703 sub $cnt,$cnt,#8*8
704 stp xzr,xzr,[$tp,#8*0]
705 stp xzr,xzr,[$tp,#8*2]
706 stp xzr,xzr,[$tp,#8*4]
707 stp xzr,xzr,[$tp,#8*6]
708.Lsqr8x_zero_start:
709 stp xzr,xzr,[$tp,#8*8]
710 stp xzr,xzr,[$tp,#8*10]
711 stp xzr,xzr,[$tp,#8*12]
712 stp xzr,xzr,[$tp,#8*14]
713 add $tp,$tp,#8*16
714 cbnz $cnt,.Lsqr8x_zero
715
716 add $ap_end,$ap,$num
717 add $ap,$ap,#8*8
718 mov $acc0,xzr
719 mov $acc1,xzr
720 mov $acc2,xzr
721 mov $acc3,xzr
722 mov $acc4,xzr
723 mov $acc5,xzr
724 mov $acc6,xzr
725 mov $acc7,xzr
726 mov $tp,sp
727 str $n0,[x29,#112] // offload n0
728
729 // Multiply everything but a[i]*a[i]
730.align 4
731.Lsqr8x_outer_loop:
732 // a[1]a[0] (i)
733 // a[2]a[0]
734 // a[3]a[0]
735 // a[4]a[0]
736 // a[5]a[0]
737 // a[6]a[0]
738 // a[7]a[0]
739 // a[2]a[1] (ii)
740 // a[3]a[1]
741 // a[4]a[1]
742 // a[5]a[1]
743 // a[6]a[1]
744 // a[7]a[1]
745 // a[3]a[2] (iii)
746 // a[4]a[2]
747 // a[5]a[2]
748 // a[6]a[2]
749 // a[7]a[2]
750 // a[4]a[3] (iv)
751 // a[5]a[3]
752 // a[6]a[3]
753 // a[7]a[3]
754 // a[5]a[4] (v)
755 // a[6]a[4]
756 // a[7]a[4]
757 // a[6]a[5] (vi)
758 // a[7]a[5]
759 // a[7]a[6] (vii)
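 // (28 = 8*7/2 cross products per 8-word block in total; the a[i]*a[i]
 // squares themselves are folded in later, at .Lsqr8x_outer_break)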
760
761 mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
762 mul $t1,$a2,$a0
763 mul $t2,$a3,$a0
764 mul $t3,$a4,$a0
765 adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
766 mul $t0,$a5,$a0
767 adcs $acc2,$acc2,$t1
768 mul $t1,$a6,$a0
769 adcs $acc3,$acc3,$t2
770 mul $t2,$a7,$a0
771 adcs $acc4,$acc4,$t3
772 umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
773 adcs $acc5,$acc5,$t0
774 umulh $t0,$a2,$a0
775 adcs $acc6,$acc6,$t1
776 umulh $t1,$a3,$a0
777 adcs $acc7,$acc7,$t2
778 umulh $t2,$a4,$a0
779 stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
780 adc $acc0,xzr,xzr // t[8]
781 adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
782 umulh $t3,$a5,$a0
783 adcs $acc3,$acc3,$t0
784 umulh $t0,$a6,$a0
785 adcs $acc4,$acc4,$t1
786 umulh $t1,$a7,$a0
787 adcs $acc5,$acc5,$t2
788 mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
789 adcs $acc6,$acc6,$t3
790 mul $t3,$a3,$a1
791 adcs $acc7,$acc7,$t0
792 mul $t0,$a4,$a1
793 adc $acc0,$acc0,$t1
794
795 mul $t1,$a5,$a1
796 adds $acc3,$acc3,$t2
797 mul $t2,$a6,$a1
798 adcs $acc4,$acc4,$t3
799 mul $t3,$a7,$a1
800 adcs $acc5,$acc5,$t0
801 umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
802 adcs $acc6,$acc6,$t1
803 umulh $t1,$a3,$a1
804 adcs $acc7,$acc7,$t2
805 umulh $t2,$a4,$a1
806 adcs $acc0,$acc0,$t3
807 umulh $t3,$a5,$a1
808 stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
809 adc $acc1,xzr,xzr // t[9]
810 adds $acc4,$acc4,$t0
811 umulh $t0,$a6,$a1
812 adcs $acc5,$acc5,$t1
813 umulh $t1,$a7,$a1
814 adcs $acc6,$acc6,$t2
815 mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
816 adcs $acc7,$acc7,$t3
817 mul $t3,$a4,$a2
818 adcs $acc0,$acc0,$t0
819 mul $t0,$a5,$a2
820 adc $acc1,$acc1,$t1
821
822 mul $t1,$a6,$a2
823 adds $acc5,$acc5,$t2
824 mul $t2,$a7,$a2
825 adcs $acc6,$acc6,$t3
826 umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
827 adcs $acc7,$acc7,$t0
828 umulh $t0,$a4,$a2
829 adcs $acc0,$acc0,$t1
830 umulh $t1,$a5,$a2
831 adcs $acc1,$acc1,$t2
832 umulh $t2,$a6,$a2
833 stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
834 adc $acc2,xzr,xzr // t[10]
835 adds $acc6,$acc6,$t3
836 umulh $t3,$a7,$a2
837 adcs $acc7,$acc7,$t0
838 mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
839 adcs $acc0,$acc0,$t1
840 mul $t1,$a5,$a3
841 adcs $acc1,$acc1,$t2
842 mul $t2,$a6,$a3
843 adc $acc2,$acc2,$t3
844
845 mul $t3,$a7,$a3
846 adds $acc7,$acc7,$t0
847 umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
848 adcs $acc0,$acc0,$t1
849 umulh $t1,$a5,$a3
850 adcs $acc1,$acc1,$t2
851 umulh $t2,$a6,$a3
852 adcs $acc2,$acc2,$t3
853 umulh $t3,$a7,$a3
854 stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
855 adc $acc3,xzr,xzr // t[11]
856 adds $acc0,$acc0,$t0
857 mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
858 adcs $acc1,$acc1,$t1
859 mul $t1,$a6,$a4
860 adcs $acc2,$acc2,$t2
861 mul $t2,$a7,$a4
862 adc $acc3,$acc3,$t3
863
864 umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
865 adds $acc1,$acc1,$t0
866 umulh $t0,$a6,$a4
867 adcs $acc2,$acc2,$t1
868 umulh $t1,$a7,$a4
869 adcs $acc3,$acc3,$t2
870 mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
871 adc $acc4,xzr,xzr // t[12]
872 adds $acc2,$acc2,$t3
873 mul $t3,$a7,$a5
874 adcs $acc3,$acc3,$t0
875 umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
876 adc $acc4,$acc4,$t1
877
878 umulh $t1,$a7,$a5
879 adds $acc3,$acc3,$t2
880 mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
881 adcs $acc4,$acc4,$t3
882 umulh $t3,$a7,$a6 // hi(a[7]*a[6])
883 adc $acc5,xzr,xzr // t[13]
884 adds $acc4,$acc4,$t0
885 sub $cnt,$ap_end,$ap // done yet?
886 adc $acc5,$acc5,$t1
887
888 adds $acc5,$acc5,$t2
889 sub $t0,$ap_end,$num // rewinded ap
890 adc $acc6,xzr,xzr // t[14]
891 add $acc6,$acc6,$t3
892
893 cbz $cnt,.Lsqr8x_outer_break
894
895 mov $n0,$a0
896 ldp $a0,$a1,[$tp,#8*0]
897 ldp $a2,$a3,[$tp,#8*2]
898 ldp $a4,$a5,[$tp,#8*4]
899 ldp $a6,$a7,[$tp,#8*6]
900 adds $acc0,$acc0,$a0
901 adcs $acc1,$acc1,$a1
902 ldp $a0,$a1,[$ap,#8*0]
903 adcs $acc2,$acc2,$a2
904 adcs $acc3,$acc3,$a3
905 ldp $a2,$a3,[$ap,#8*2]
906 adcs $acc4,$acc4,$a4
907 adcs $acc5,$acc5,$a5
908 ldp $a4,$a5,[$ap,#8*4]
909 adcs $acc6,$acc6,$a6
910 mov $rp,$ap
911 adcs $acc7,xzr,$a7
912 ldp $a6,$a7,[$ap,#8*6]
913 add $ap,$ap,#8*8
914 //adc $carry,xzr,xzr // moved below
915 mov $cnt,#-8*8
916
917 // a[8]a[0]
918 // a[9]a[0]
919 // a[a]a[0]
920 // a[b]a[0]
921 // a[c]a[0]
922 // a[d]a[0]
923 // a[e]a[0]
924 // a[f]a[0]
925 // a[8]a[1]
926 // a[f]a[1]........................
927 // a[8]a[2]
928 // a[f]a[2]........................
929 // a[8]a[3]
930 // a[f]a[3]........................
931 // a[8]a[4]
932 // a[f]a[4]........................
933 // a[8]a[5]
934 // a[f]a[5]........................
935 // a[8]a[6]
936 // a[f]a[6]........................
937 // a[8]a[7]
938 // a[f]a[7]........................
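 // i.e. each pass of .Lsqr8x_mul multiplies the eight a[] words just loaded
 // into registers by each of the eight earlier a[] words in turn (one per
 // iteration, via the ldr of $n0 below) and accumulates into the t[] window.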
939.Lsqr8x_mul:
940 mul $t0,$a0,$n0
941 adc $carry,xzr,xzr // carry bit, modulo-scheduled
942 mul $t1,$a1,$n0
943 add $cnt,$cnt,#8
944 mul $t2,$a2,$n0
945 mul $t3,$a3,$n0
946 adds $acc0,$acc0,$t0
947 mul $t0,$a4,$n0
948 adcs $acc1,$acc1,$t1
949 mul $t1,$a5,$n0
950 adcs $acc2,$acc2,$t2
951 mul $t2,$a6,$n0
952 adcs $acc3,$acc3,$t3
953 mul $t3,$a7,$n0
954 adcs $acc4,$acc4,$t0
955 umulh $t0,$a0,$n0
956 adcs $acc5,$acc5,$t1
957 umulh $t1,$a1,$n0
958 adcs $acc6,$acc6,$t2
959 umulh $t2,$a2,$n0
960 adcs $acc7,$acc7,$t3
961 umulh $t3,$a3,$n0
962 adc $carry,$carry,xzr
963 str $acc0,[$tp],#8
964 adds $acc0,$acc1,$t0
965 umulh $t0,$a4,$n0
966 adcs $acc1,$acc2,$t1
967 umulh $t1,$a5,$n0
968 adcs $acc2,$acc3,$t2
969 umulh $t2,$a6,$n0
970 adcs $acc3,$acc4,$t3
971 umulh $t3,$a7,$n0
972 ldr $n0,[$rp,$cnt]
973 adcs $acc4,$acc5,$t0
974 adcs $acc5,$acc6,$t1
975 adcs $acc6,$acc7,$t2
976 adcs $acc7,$carry,$t3
977 //adc $carry,xzr,xzr // moved above
978 cbnz $cnt,.Lsqr8x_mul
979 // note that carry flag is guaranteed
980 // to be zero at this point
981 cmp $ap,$ap_end // done yet?
982 b.eq .Lsqr8x_break
983
984 ldp $a0,$a1,[$tp,#8*0]
985 ldp $a2,$a3,[$tp,#8*2]
986 ldp $a4,$a5,[$tp,#8*4]
987 ldp $a6,$a7,[$tp,#8*6]
988 adds $acc0,$acc0,$a0
989 ldur $n0,[$rp,#-8*8]
990 adcs $acc1,$acc1,$a1
991 ldp $a0,$a1,[$ap,#8*0]
992 adcs $acc2,$acc2,$a2
993 adcs $acc3,$acc3,$a3
994 ldp $a2,$a3,[$ap,#8*2]
995 adcs $acc4,$acc4,$a4
996 adcs $acc5,$acc5,$a5
997 ldp $a4,$a5,[$ap,#8*4]
998 adcs $acc6,$acc6,$a6
999 mov $cnt,#-8*8
1000 adcs $acc7,$acc7,$a7
1001 ldp $a6,$a7,[$ap,#8*6]
1002 add $ap,$ap,#8*8
1003 //adc $carry,xzr,xzr // moved above
1004 b .Lsqr8x_mul
1005
1006.align 4
1007.Lsqr8x_break:
1008 ldp $a0,$a1,[$rp,#8*0]
1009 add $ap,$rp,#8*8
1010 ldp $a2,$a3,[$rp,#8*2]
1011 sub $t0,$ap_end,$ap // is it last iteration?
1012 ldp $a4,$a5,[$rp,#8*4]
1013 sub $t1,$tp,$t0
1014 ldp $a6,$a7,[$rp,#8*6]
1015 cbz $t0,.Lsqr8x_outer_loop
1016
1017 stp $acc0,$acc1,[$tp,#8*0]
1018 ldp $acc0,$acc1,[$t1,#8*0]
1019 stp $acc2,$acc3,[$tp,#8*2]
1020 ldp $acc2,$acc3,[$t1,#8*2]
1021 stp $acc4,$acc5,[$tp,#8*4]
1022 ldp $acc4,$acc5,[$t1,#8*4]
1023 stp $acc6,$acc7,[$tp,#8*6]
1024 mov $tp,$t1
1025 ldp $acc6,$acc7,[$t1,#8*6]
1026 b .Lsqr8x_outer_loop
1027
1028.align 4
1029.Lsqr8x_outer_break:
1030 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
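 // i.e. a^2 = 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j)) + sum_i a[i]^2*2^(128*i):
 // the cross products accumulated above are doubled via the extr-based
 // one-bit left shift below, with the a[i]*a[i] terms added in as they go.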
1031 ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
1032 ldp $t1,$t2,[sp,#8*1]
1033 ldp $a5,$a7,[$t0,#8*2]
1034 add $ap,$t0,#8*4
1035 ldp $t3,$t0,[sp,#8*3]
1036
1037 stp $acc0,$acc1,[$tp,#8*0]
1038 mul $acc0,$a1,$a1
1039 stp $acc2,$acc3,[$tp,#8*2]
1040 umulh $a1,$a1,$a1
1041 stp $acc4,$acc5,[$tp,#8*4]
1042 mul $a2,$a3,$a3
1043 stp $acc6,$acc7,[$tp,#8*6]
1044 mov $tp,sp
1045 umulh $a3,$a3,$a3
1046 adds $acc1,$a1,$t1,lsl#1
1047 extr $t1,$t2,$t1,#63
1048 sub $cnt,$num,#8*4
1049
1050.Lsqr4x_shift_n_add:
1051 adcs $acc2,$a2,$t1
1052 extr $t2,$t3,$t2,#63
1053 sub $cnt,$cnt,#8*4
1054 adcs $acc3,$a3,$t2
1055 ldp $t1,$t2,[$tp,#8*5]
1056 mul $a4,$a5,$a5
1057 ldp $a1,$a3,[$ap],#8*2
1058 umulh $a5,$a5,$a5
1059 mul $a6,$a7,$a7
1060 umulh $a7,$a7,$a7
1061 extr $t3,$t0,$t3,#63
1062 stp $acc0,$acc1,[$tp,#8*0]
1063 adcs $acc4,$a4,$t3
1064 extr $t0,$t1,$t0,#63
1065 stp $acc2,$acc3,[$tp,#8*2]
1066 adcs $acc5,$a5,$t0
1067 ldp $t3,$t0,[$tp,#8*7]
1068 extr $t1,$t2,$t1,#63
1069 adcs $acc6,$a6,$t1
1070 extr $t2,$t3,$t2,#63
1071 adcs $acc7,$a7,$t2
1072 ldp $t1,$t2,[$tp,#8*9]
1073 mul $a0,$a1,$a1
1074 ldp $a5,$a7,[$ap],#8*2
1075 umulh $a1,$a1,$a1
1076 mul $a2,$a3,$a3
1077 umulh $a3,$a3,$a3
1078 stp $acc4,$acc5,[$tp,#8*4]
1079 extr $t3,$t0,$t3,#63
1080 stp $acc6,$acc7,[$tp,#8*6]
1081 add $tp,$tp,#8*8
1082 adcs $acc0,$a0,$t3
1083 extr $t0,$t1,$t0,#63
1084 adcs $acc1,$a1,$t0
1085 ldp $t3,$t0,[$tp,#8*3]
1086 extr $t1,$t2,$t1,#63
1087 cbnz $cnt,.Lsqr4x_shift_n_add
1088___
1089my ($np,$np_end)=($ap,$ap_end);
1090$code.=<<___;
1091 ldp $np,$n0,[x29,#104] // pull np and n0
1092
1093 adcs $acc2,$a2,$t1
1094 extr $t2,$t3,$t2,#63
1095 adcs $acc3,$a3,$t2
1096 ldp $t1,$t2,[$tp,#8*5]
1097 mul $a4,$a5,$a5
1098 umulh $a5,$a5,$a5
1099 stp $acc0,$acc1,[$tp,#8*0]
1100 mul $a6,$a7,$a7
1101 umulh $a7,$a7,$a7
1102 stp $acc2,$acc3,[$tp,#8*2]
1103 extr $t3,$t0,$t3,#63
1104 adcs $acc4,$a4,$t3
1105 extr $t0,$t1,$t0,#63
1106 ldp $acc0,$acc1,[sp,#8*0]
1107 adcs $acc5,$a5,$t0
1108 extr $t1,$t2,$t1,#63
1109 ldp $a0,$a1,[$np,#8*0]
1110 adcs $acc6,$a6,$t1
1111 extr $t2,xzr,$t2,#63
1112 ldp $a2,$a3,[$np,#8*2]
1113 adc $acc7,$a7,$t2
1114 ldp $a4,$a5,[$np,#8*4]
1115
1116 // Reduce by 512 bits per iteration
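 // (one 64-bit word of t is eliminated per round of the $cnt=8 loop below,
 // i.e. 8*64 = 512 bits per full pass)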
1117 mul $na0,$n0,$acc0 // t[0]*n0
1118 ldp $a6,$a7,[$np,#8*6]
1119 add $np_end,$np,$num
1120 ldp $acc2,$acc3,[sp,#8*2]
1121 stp $acc4,$acc5,[$tp,#8*4]
1122 ldp $acc4,$acc5,[sp,#8*4]
1123 stp $acc6,$acc7,[$tp,#8*6]
1124 ldp $acc6,$acc7,[sp,#8*6]
1125 add $np,$np,#8*8
1126 mov $topmost,xzr // initial top-most carry
1127 mov $tp,sp
1128 mov $cnt,#8
1129
1130.Lsqr8x_reduction:
1131 // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
1132 mul $t1,$a1,$na0
1133 sub $cnt,$cnt,#1
1134 mul $t2,$a2,$na0
1135 str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
1136 mul $t3,$a3,$na0
1137 // (*) adds xzr,$acc0,$t0
1138 subs xzr,$acc0,#1 // (*)
1139 mul $t0,$a4,$na0
1140 adcs $acc0,$acc1,$t1
1141 mul $t1,$a5,$na0
1142 adcs $acc1,$acc2,$t2
1143 mul $t2,$a6,$na0
1144 adcs $acc2,$acc3,$t3
1145 mul $t3,$a7,$na0
1146 adcs $acc3,$acc4,$t0
1147 umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
1148 adcs $acc4,$acc5,$t1
1149 umulh $t1,$a1,$na0
1150 adcs $acc5,$acc6,$t2
1151 umulh $t2,$a2,$na0
1152 adcs $acc6,$acc7,$t3
1153 umulh $t3,$a3,$na0
1154 adc $acc7,xzr,xzr
1155 adds $acc0,$acc0,$t0
1156 umulh $t0,$a4,$na0
1157 adcs $acc1,$acc1,$t1
1158 umulh $t1,$a5,$na0
1159 adcs $acc2,$acc2,$t2
1160 umulh $t2,$a6,$na0
1161 adcs $acc3,$acc3,$t3
1162 umulh $t3,$a7,$na0
1163 mul $na0,$n0,$acc0 // next t[0]*n0
1164 adcs $acc4,$acc4,$t0
1165 adcs $acc5,$acc5,$t1
1166 adcs $acc6,$acc6,$t2
1167 adc $acc7,$acc7,$t3
1168 cbnz $cnt,.Lsqr8x_reduction
1169
1170 ldp $t0,$t1,[$tp,#8*0]
1171 ldp $t2,$t3,[$tp,#8*2]
1172 mov $rp,$tp
1173 sub $cnt,$np_end,$np // done yet?
1174 adds $acc0,$acc0,$t0
1175 adcs $acc1,$acc1,$t1
1176 ldp $t0,$t1,[$tp,#8*4]
1177 adcs $acc2,$acc2,$t2
1178 adcs $acc3,$acc3,$t3
1179 ldp $t2,$t3,[$tp,#8*6]
1180 adcs $acc4,$acc4,$t0
1181 adcs $acc5,$acc5,$t1
1182 adcs $acc6,$acc6,$t2
1183 adcs $acc7,$acc7,$t3
1184 //adc $carry,xzr,xzr // moved below
1185 cbz $cnt,.Lsqr8x8_post_condition
1186
1187 ldur $n0,[$tp,#-8*8]
1188 ldp $a0,$a1,[$np,#8*0]
1189 ldp $a2,$a3,[$np,#8*2]
1190 ldp $a4,$a5,[$np,#8*4]
1191 mov $cnt,#-8*8
1192 ldp $a6,$a7,[$np,#8*6]
1193 add $np,$np,#8*8
1194
1195.Lsqr8x_tail:
1196 mul $t0,$a0,$n0
1197 adc $carry,xzr,xzr // carry bit, modulo-scheduled
1198 mul $t1,$a1,$n0
1199 add $cnt,$cnt,#8
1200 mul $t2,$a2,$n0
1201 mul $t3,$a3,$n0
1202 adds $acc0,$acc0,$t0
1203 mul $t0,$a4,$n0
1204 adcs $acc1,$acc1,$t1
1205 mul $t1,$a5,$n0
1206 adcs $acc2,$acc2,$t2
1207 mul $t2,$a6,$n0
1208 adcs $acc3,$acc3,$t3
1209 mul $t3,$a7,$n0
1210 adcs $acc4,$acc4,$t0
1211 umulh $t0,$a0,$n0
1212 adcs $acc5,$acc5,$t1
1213 umulh $t1,$a1,$n0
1214 adcs $acc6,$acc6,$t2
1215 umulh $t2,$a2,$n0
1216 adcs $acc7,$acc7,$t3
1217 umulh $t3,$a3,$n0
1218 adc $carry,$carry,xzr
1219 str $acc0,[$tp],#8
1220 adds $acc0,$acc1,$t0
1221 umulh $t0,$a4,$n0
1222 adcs $acc1,$acc2,$t1
1223 umulh $t1,$a5,$n0
1224 adcs $acc2,$acc3,$t2
1225 umulh $t2,$a6,$n0
1226 adcs $acc3,$acc4,$t3
1227 umulh $t3,$a7,$n0
1228 ldr $n0,[$rp,$cnt]
1229 adcs $acc4,$acc5,$t0
1230 adcs $acc5,$acc6,$t1
1231 adcs $acc6,$acc7,$t2
1232 adcs $acc7,$carry,$t3
1233 //adc $carry,xzr,xzr // moved above
1234 cbnz $cnt,.Lsqr8x_tail
1235 // note that carry flag is guaranteed
1236 // to be zero at this point
1237 ldp $a0,$a1,[$tp,#8*0]
1238 sub $cnt,$np_end,$np // done yet?
1239 sub $t2,$np_end,$num // rewinded np
1240 ldp $a2,$a3,[$tp,#8*2]
1241 ldp $a4,$a5,[$tp,#8*4]
1242 ldp $a6,$a7,[$tp,#8*6]
1243 cbz $cnt,.Lsqr8x_tail_break
1244
1245 ldur $n0,[$rp,#-8*8]
1246 adds $acc0,$acc0,$a0
1247 adcs $acc1,$acc1,$a1
1248 ldp $a0,$a1,[$np,#8*0]
1249 adcs $acc2,$acc2,$a2
1250 adcs $acc3,$acc3,$a3
1251 ldp $a2,$a3,[$np,#8*2]
1252 adcs $acc4,$acc4,$a4
1253 adcs $acc5,$acc5,$a5
1254 ldp $a4,$a5,[$np,#8*4]
1255 adcs $acc6,$acc6,$a6
1256 mov $cnt,#-8*8
1257 adcs $acc7,$acc7,$a7
1258 ldp $a6,$a7,[$np,#8*6]
1259 add $np,$np,#8*8
1260 //adc $carry,xzr,xzr // moved above
1261 b .Lsqr8x_tail
1262
1263.align 4
1264.Lsqr8x_tail_break:
1265 ldr $n0,[x29,#112] // pull n0
1266 add $cnt,$tp,#8*8 // end of current t[num] window
1267
1268 subs xzr,$topmost,#1 // "move" top-most carry to carry bit
1269 adcs $t0,$acc0,$a0
1270 adcs $t1,$acc1,$a1
1271 ldp $acc0,$acc1,[$rp,#8*0]
1272 adcs $acc2,$acc2,$a2
1273 ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
1274 adcs $acc3,$acc3,$a3
1275 ldp $a2,$a3,[$t2,#8*2]
1276 adcs $acc4,$acc4,$a4
1277 adcs $acc5,$acc5,$a5
1278 ldp $a4,$a5,[$t2,#8*4]
1279 adcs $acc6,$acc6,$a6
1280 adcs $acc7,$acc7,$a7
1281 ldp $a6,$a7,[$t2,#8*6]
1282 add $np,$t2,#8*8
1283 adc $topmost,xzr,xzr // top-most carry
1284 mul $na0,$n0,$acc0
1285 stp $t0,$t1,[$tp,#8*0]
1286 stp $acc2,$acc3,[$tp,#8*2]
1287 ldp $acc2,$acc3,[$rp,#8*2]
1288 stp $acc4,$acc5,[$tp,#8*4]
1289 ldp $acc4,$acc5,[$rp,#8*4]
1290 cmp $cnt,x29 // did we hit the bottom?
1291 stp $acc6,$acc7,[$tp,#8*6]
1292 mov $tp,$rp // slide the window
1293 ldp $acc6,$acc7,[$rp,#8*6]
1294 mov $cnt,#8
1295 b.ne .Lsqr8x_reduction
1296
1297 // Final step. We see if result is larger than modulus, and
1298 // if it is, subtract the modulus. But comparison implies
1299 // subtraction. So we subtract modulus, see if it borrowed,
1300 // and conditionally copy original value.
1301 ldr $rp,[x29,#96] // pull rp
1302 add $tp,$tp,#8*8
1303 subs $t0,$acc0,$a0
1304 sbcs $t1,$acc1,$a1
1305 sub $cnt,$num,#8*8
1306 mov $ap_end,$rp // $rp copy
1307
1308.Lsqr8x_sub:
1309 sbcs $t2,$acc2,$a2
1310 ldp $a0,$a1,[$np,#8*0]
1311 sbcs $t3,$acc3,$a3
1312 stp $t0,$t1,[$rp,#8*0]
1313 sbcs $t0,$acc4,$a4
1314 ldp $a2,$a3,[$np,#8*2]
1315 sbcs $t1,$acc5,$a5
1316 stp $t2,$t3,[$rp,#8*2]
1317 sbcs $t2,$acc6,$a6
1318 ldp $a4,$a5,[$np,#8*4]
1319 sbcs $t3,$acc7,$a7
1320 ldp $a6,$a7,[$np,#8*6]
1321 add $np,$np,#8*8
1322 ldp $acc0,$acc1,[$tp,#8*0]
1323 sub $cnt,$cnt,#8*8
1324 ldp $acc2,$acc3,[$tp,#8*2]
1325 ldp $acc4,$acc5,[$tp,#8*4]
1326 ldp $acc6,$acc7,[$tp,#8*6]
1327 add $tp,$tp,#8*8
1328 stp $t0,$t1,[$rp,#8*4]
1329 sbcs $t0,$acc0,$a0
1330 stp $t2,$t3,[$rp,#8*6]
1331 add $rp,$rp,#8*8
1332 sbcs $t1,$acc1,$a1
1333 cbnz $cnt,.Lsqr8x_sub
1334
1335 sbcs $t2,$acc2,$a2
1336 mov $tp,sp
1337 add $ap,sp,$num
1338 ldp $a0,$a1,[$ap_end,#8*0]
1339 sbcs $t3,$acc3,$a3
1340 stp $t0,$t1,[$rp,#8*0]
1341 sbcs $t0,$acc4,$a4
1342 ldp $a2,$a3,[$ap_end,#8*2]
1343 sbcs $t1,$acc5,$a5
1344 stp $t2,$t3,[$rp,#8*2]
1345 sbcs $t2,$acc6,$a6
1346 ldp $acc0,$acc1,[$ap,#8*0]
1347 sbcs $t3,$acc7,$a7
1348 ldp $acc2,$acc3,[$ap,#8*2]
1349 sbcs xzr,$topmost,xzr // did it borrow?
1350 ldr x30,[x29,#8] // pull return address
1351 stp $t0,$t1,[$rp,#8*4]
1352 stp $t2,$t3,[$rp,#8*6]
1353
1354 sub $cnt,$num,#8*4
1355.Lsqr4x_cond_copy:
1356 sub $cnt,$cnt,#8*4
1357 csel $t0,$acc0,$a0,lo
1358 stp xzr,xzr,[$tp,#8*0]
1359 csel $t1,$acc1,$a1,lo
1360 ldp $a0,$a1,[$ap_end,#8*4]
1361 ldp $acc0,$acc1,[$ap,#8*4]
1362 csel $t2,$acc2,$a2,lo
1363 stp xzr,xzr,[$tp,#8*2]
1364 add $tp,$tp,#8*4
1365 csel $t3,$acc3,$a3,lo
1366 ldp $a2,$a3,[$ap_end,#8*6]
1367 ldp $acc2,$acc3,[$ap,#8*6]
1368 add $ap,$ap,#8*4
1369 stp $t0,$t1,[$ap_end,#8*0]
1370 stp $t2,$t3,[$ap_end,#8*2]
1371 add $ap_end,$ap_end,#8*4
1372 stp xzr,xzr,[$ap,#8*0]
1373 stp xzr,xzr,[$ap,#8*2]
1374 cbnz $cnt,.Lsqr4x_cond_copy
1375
1376 csel $t0,$acc0,$a0,lo
1377 stp xzr,xzr,[$tp,#8*0]
1378 csel $t1,$acc1,$a1,lo
1379 stp xzr,xzr,[$tp,#8*2]
1380 csel $t2,$acc2,$a2,lo
1381 csel $t3,$acc3,$a3,lo
1382 stp $t0,$t1,[$ap_end,#8*0]
1383 stp $t2,$t3,[$ap_end,#8*2]
1384
1385 b .Lsqr8x_done
1386
1387.align 4
1388.Lsqr8x8_post_condition:
1389 adc $carry,xzr,xzr
1390 ldr x30,[x29,#8] // pull return address
1391 // $acc0-7,$carry hold result, $a0-7 hold modulus
1392 subs $a0,$acc0,$a0
1393 ldr $ap,[x29,#96] // pull rp
1394 sbcs $a1,$acc1,$a1
1395 stp xzr,xzr,[sp,#8*0]
1396 sbcs $a2,$acc2,$a2
1397 stp xzr,xzr,[sp,#8*2]
1398 sbcs $a3,$acc3,$a3
1399 stp xzr,xzr,[sp,#8*4]
1400 sbcs $a4,$acc4,$a4
1401 stp xzr,xzr,[sp,#8*6]
1402 sbcs $a5,$acc5,$a5
1403 stp xzr,xzr,[sp,#8*8]
1404 sbcs $a6,$acc6,$a6
1405 stp xzr,xzr,[sp,#8*10]
1406 sbcs $a7,$acc7,$a7
1407 stp xzr,xzr,[sp,#8*12]
1408 sbcs $carry,$carry,xzr // did it borrow?
1409 stp xzr,xzr,[sp,#8*14]
1410
1411 // $a0-7 hold result-modulus
1412 csel $a0,$acc0,$a0,lo
1413 csel $a1,$acc1,$a1,lo
1414 csel $a2,$acc2,$a2,lo
1415 csel $a3,$acc3,$a3,lo
1416 stp $a0,$a1,[$ap,#8*0]
1417 csel $a4,$acc4,$a4,lo
1418 csel $a5,$acc5,$a5,lo
1419 stp $a2,$a3,[$ap,#8*2]
1420 csel $a6,$acc6,$a6,lo
1421 csel $a7,$acc7,$a7,lo
1422 stp $a4,$a5,[$ap,#8*4]
1423 stp $a6,$a7,[$ap,#8*6]
1424
1425.Lsqr8x_done:
1426 ldp x19,x20,[x29,#16]
1427 mov sp,x29
1428 ldp x21,x22,[x29,#32]
1429 mov x0,#1
1430 ldp x23,x24,[x29,#48]
1431 ldp x25,x26,[x29,#64]
1432 ldp x27,x28,[x29,#80]
1433 ldr x29,[sp],#128
1434 // x30 is loaded earlier
1435 AARCH64_VALIDATE_LINK_REGISTER
1436 ret
1437.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1438___
1439}
1440
1441{
1442########################################################################
1443# Even though this might look like an ARMv8 adaptation of mulx4x_mont from
1444# the x86_64-mont5 module, it's different in the sense that it performs
1445# reduction 256 bits at a time.
1446
1447my ($a0,$a1,$a2,$a3,
1448 $t0,$t1,$t2,$t3,
1449 $m0,$m1,$m2,$m3,
1450 $acc0,$acc1,$acc2,$acc3,$acc4,
1451 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1452my $bp_end=$rp;
1453my ($carry,$topmost) = ($rp,"x30");
1454
1455$code.=<<___;
1456.type __bn_mul4x_mont,%function
1457.align 5
1458__bn_mul4x_mont:
1459 // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
1460 // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
1461 stp x29,x30,[sp,#-128]!
1462 add x29,sp,#0
1463 stp x19,x20,[sp,#16]
1464 stp x21,x22,[sp,#32]
1465 stp x23,x24,[sp,#48]
1466 stp x25,x26,[sp,#64]
1467 stp x27,x28,[sp,#80]
1468
1469 sub $tp,sp,$num,lsl#3
1470 lsl $num,$num,#3
1471 ldr $n0,[$n0] // *n0
1472 sub sp,$tp,#8*4 // alloca
1473
1474 add $t0,$bp,$num
1475 add $ap_end,$ap,$num
1476 stp $rp,$t0,[x29,#96] // offload rp and &b[num]
1477
1478 ldr $bi,[$bp,#8*0] // b[0]
1479 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1480 ldp $a2,$a3,[$ap,#8*2]
1481 add $ap,$ap,#8*4
1482 mov $acc0,xzr
1483 mov $acc1,xzr
1484 mov $acc2,xzr
1485 mov $acc3,xzr
1486 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1487 ldp $m2,$m3,[$np,#8*2]
1488 adds $np,$np,#8*4 // clear carry bit
1489 mov $carry,xzr
1490 mov $cnt,#0
1491 mov $tp,sp
1492
1493.Loop_mul4x_1st_reduction:
1494 mul $t0,$a0,$bi // lo(a[0..3]*b[0])
1495 adc $carry,$carry,xzr // modulo-scheduled
1496 mul $t1,$a1,$bi
1497 add $cnt,$cnt,#8
1498 mul $t2,$a2,$bi
1499 and $cnt,$cnt,#31
1500 mul $t3,$a3,$bi
1501 adds $acc0,$acc0,$t0
1502 umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
1503 adcs $acc1,$acc1,$t1
1504 mul $mi,$acc0,$n0 // t[0]*n0
1505 adcs $acc2,$acc2,$t2
1506 umulh $t1,$a1,$bi
1507 adcs $acc3,$acc3,$t3
1508 umulh $t2,$a2,$bi
1509 adc $acc4,xzr,xzr
1510 umulh $t3,$a3,$bi
1511 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1512 adds $acc1,$acc1,$t0
1513 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
1514 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1515 adcs $acc2,$acc2,$t1
1516 mul $t1,$m1,$mi
1517 adcs $acc3,$acc3,$t2
1518 mul $t2,$m2,$mi
1519 adc $acc4,$acc4,$t3 // can't overflow
1520 mul $t3,$m3,$mi
1521 // (*) adds xzr,$acc0,$t0
1522 subs xzr,$acc0,#1 // (*)
1523 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1524 adcs $acc0,$acc1,$t1
1525 umulh $t1,$m1,$mi
1526 adcs $acc1,$acc2,$t2
1527 umulh $t2,$m2,$mi
1528 adcs $acc2,$acc3,$t3
1529 umulh $t3,$m3,$mi
1530 adcs $acc3,$acc4,$carry
1531 adc $carry,xzr,xzr
1532 adds $acc0,$acc0,$t0
1533 sub $t0,$ap_end,$ap
1534 adcs $acc1,$acc1,$t1
1535 adcs $acc2,$acc2,$t2
1536 adcs $acc3,$acc3,$t3
1537 //adc $carry,$carry,xzr
1538 cbnz $cnt,.Loop_mul4x_1st_reduction
1539
1540 cbz $t0,.Lmul4x4_post_condition
1541
1542 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1543 ldp $a2,$a3,[$ap,#8*2]
1544 add $ap,$ap,#8*4
1545 ldr $mi,[sp] // a[0]*n0
1546 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1547 ldp $m2,$m3,[$np,#8*2]
1548 add $np,$np,#8*4
1549
1550.Loop_mul4x_1st_tail:
1551 mul $t0,$a0,$bi // lo(a[4..7]*b[i])
1552 adc $carry,$carry,xzr // modulo-scheduled
1553 mul $t1,$a1,$bi
1554 add $cnt,$cnt,#8
1555 mul $t2,$a2,$bi
1556 and $cnt,$cnt,#31
1557 mul $t3,$a3,$bi
1558 adds $acc0,$acc0,$t0
1559 umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
1560 adcs $acc1,$acc1,$t1
1561 umulh $t1,$a1,$bi
1562 adcs $acc2,$acc2,$t2
1563 umulh $t2,$a2,$bi
1564 adcs $acc3,$acc3,$t3
1565 umulh $t3,$a3,$bi
1566 adc $acc4,xzr,xzr
1567 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1568 adds $acc1,$acc1,$t0
1569 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
1570 adcs $acc2,$acc2,$t1
1571 mul $t1,$m1,$mi
1572 adcs $acc3,$acc3,$t2
1573 mul $t2,$m2,$mi
1574 adc $acc4,$acc4,$t3 // can't overflow
1575 mul $t3,$m3,$mi
1576 adds $acc0,$acc0,$t0
1577 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
1578 adcs $acc1,$acc1,$t1
1579 umulh $t1,$m1,$mi
1580 adcs $acc2,$acc2,$t2
1581 umulh $t2,$m2,$mi
1582 adcs $acc3,$acc3,$t3
1583 adcs $acc4,$acc4,$carry
1584 umulh $t3,$m3,$mi
1585 adc $carry,xzr,xzr
1586 ldr $mi,[sp,$cnt] // next t[0]*n0
1587 str $acc0,[$tp],#8 // result!!!
1588 adds $acc0,$acc1,$t0
1589 sub $t0,$ap_end,$ap // done yet?
1590 adcs $acc1,$acc2,$t1
1591 adcs $acc2,$acc3,$t2
1592 adcs $acc3,$acc4,$t3
1593 //adc $carry,$carry,xzr
1594 cbnz $cnt,.Loop_mul4x_1st_tail
1595
1596 sub $t1,$ap_end,$num // rewinded $ap
1597 cbz $t0,.Lmul4x_proceed
1598
1599 ldp $a0,$a1,[$ap,#8*0]
1600 ldp $a2,$a3,[$ap,#8*2]
1601 add $ap,$ap,#8*4
1602 ldp $m0,$m1,[$np,#8*0]
1603 ldp $m2,$m3,[$np,#8*2]
1604 add $np,$np,#8*4
1605 b .Loop_mul4x_1st_tail
1606
1607.align 5
1608.Lmul4x_proceed:
1609 ldr $bi,[$bp,#8*4]! // *++b
1610 adc $topmost,$carry,xzr
1611 ldp $a0,$a1,[$t1,#8*0] // a[0..3]
1612 sub $np,$np,$num // rewind np
1613 ldp $a2,$a3,[$t1,#8*2]
1614 add $ap,$t1,#8*4
1615
1616 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1617 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1618 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1619 ldp $acc2,$acc3,[sp,#8*6]
1620
1621 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1622 mov $tp,sp
1623 ldp $m2,$m3,[$np,#8*2]
1624 adds $np,$np,#8*4 // clear carry bit
1625 mov $carry,xzr
1626
1627.align 4
1628.Loop_mul4x_reduction:
1629 mul $t0,$a0,$bi // lo(a[0..3]*b[4])
1630 adc $carry,$carry,xzr // modulo-scheduled
1631 mul $t1,$a1,$bi
1632 add $cnt,$cnt,#8
1633 mul $t2,$a2,$bi
1634 and $cnt,$cnt,#31
1635 mul $t3,$a3,$bi
1636 adds $acc0,$acc0,$t0
1637 umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
1638 adcs $acc1,$acc1,$t1
1639 mul $mi,$acc0,$n0 // t[0]*n0
1640 adcs $acc2,$acc2,$t2
1641 umulh $t1,$a1,$bi
1642 adcs $acc3,$acc3,$t3
1643 umulh $t2,$a2,$bi
1644 adc $acc4,xzr,xzr
1645 umulh $t3,$a3,$bi
1646 ldr $bi,[$bp,$cnt] // next b[i]
1647 adds $acc1,$acc1,$t0
1648 // (*) mul $t0,$m0,$mi
1649 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1650 adcs $acc2,$acc2,$t1
1651 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
1652 adcs $acc3,$acc3,$t2
1653 mul $t2,$m2,$mi
1654 adc $acc4,$acc4,$t3 // can't overflow
1655 mul $t3,$m3,$mi
1656 // (*) adds xzr,$acc0,$t0
1657 subs xzr,$acc0,#1 // (*)
1658 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1659 adcs $acc0,$acc1,$t1
1660 umulh $t1,$m1,$mi
1661 adcs $acc1,$acc2,$t2
1662 umulh $t2,$m2,$mi
1663 adcs $acc2,$acc3,$t3
1664 umulh $t3,$m3,$mi
1665 adcs $acc3,$acc4,$carry
1666 adc $carry,xzr,xzr
1667 adds $acc0,$acc0,$t0
1668 adcs $acc1,$acc1,$t1
1669 adcs $acc2,$acc2,$t2
1670 adcs $acc3,$acc3,$t3
1671 //adc $carry,$carry,xzr
1672 cbnz $cnt,.Loop_mul4x_reduction
1673
1674 adc $carry,$carry,xzr
1675 ldp $t0,$t1,[$tp,#8*4] // t[4..7]
1676 ldp $t2,$t3,[$tp,#8*6]
1677 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1678 ldp $a2,$a3,[$ap,#8*2]
1679 add $ap,$ap,#8*4
1680 adds $acc0,$acc0,$t0
1681 adcs $acc1,$acc1,$t1
1682 adcs $acc2,$acc2,$t2
1683 adcs $acc3,$acc3,$t3
1684 //adc $carry,$carry,xzr
1685
1686 ldr $mi,[sp] // t[0]*n0
1687 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1688 ldp $m2,$m3,[$np,#8*2]
1689 add $np,$np,#8*4
1690
1691.align 4
1692.Loop_mul4x_tail:
1693 mul $t0,$a0,$bi // lo(a[4..7]*b[4])
1694 adc $carry,$carry,xzr // modulo-scheduled
1695 mul $t1,$a1,$bi
1696 add $cnt,$cnt,#8
1697 mul $t2,$a2,$bi
1698 and $cnt,$cnt,#31
1699 mul $t3,$a3,$bi
1700 adds $acc0,$acc0,$t0
1701 umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
1702 adcs $acc1,$acc1,$t1
1703 umulh $t1,$a1,$bi
1704 adcs $acc2,$acc2,$t2
1705 umulh $t2,$a2,$bi
1706 adcs $acc3,$acc3,$t3
1707 umulh $t3,$a3,$bi
1708 adc $acc4,xzr,xzr
1709 ldr $bi,[$bp,$cnt] // next b[i]
1710 adds $acc1,$acc1,$t0
1711 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1712 adcs $acc2,$acc2,$t1
1713 mul $t1,$m1,$mi
1714 adcs $acc3,$acc3,$t2
1715 mul $t2,$m2,$mi
1716 adc $acc4,$acc4,$t3 // can't overflow
1717 mul $t3,$m3,$mi
1718 adds $acc0,$acc0,$t0
1719 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1720 adcs $acc1,$acc1,$t1
1721 umulh $t1,$m1,$mi
1722 adcs $acc2,$acc2,$t2
1723 umulh $t2,$m2,$mi
1724 adcs $acc3,$acc3,$t3
1725 umulh $t3,$m3,$mi
1726 adcs $acc4,$acc4,$carry
1727 ldr $mi,[sp,$cnt] // next a[0]*n0
1728 adc $carry,xzr,xzr
1729 str $acc0,[$tp],#8 // result!!!
1730 adds $acc0,$acc1,$t0
1731 sub $t0,$ap_end,$ap // done yet?
1732 adcs $acc1,$acc2,$t1
1733 adcs $acc2,$acc3,$t2
1734 adcs $acc3,$acc4,$t3
1735 //adc $carry,$carry,xzr
1736 cbnz $cnt,.Loop_mul4x_tail
1737
1738 sub $t1,$np,$num // rewinded np?
1739 adc $carry,$carry,xzr
1740 cbz $t0,.Loop_mul4x_break
1741
1742 ldp $t0,$t1,[$tp,#8*4]
1743 ldp $t2,$t3,[$tp,#8*6]
1744 ldp $a0,$a1,[$ap,#8*0]
1745 ldp $a2,$a3,[$ap,#8*2]
1746 add $ap,$ap,#8*4
1747 adds $acc0,$acc0,$t0
1748 adcs $acc1,$acc1,$t1
1749 adcs $acc2,$acc2,$t2
1750 adcs $acc3,$acc3,$t3
1751 //adc $carry,$carry,xzr
1752 ldp $m0,$m1,[$np,#8*0]
1753 ldp $m2,$m3,[$np,#8*2]
1754 add $np,$np,#8*4
1755 b .Loop_mul4x_tail
1756
1757.align 4
1758.Loop_mul4x_break:
1759 ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
1760 adds $acc0,$acc0,$topmost
1761 add $bp,$bp,#8*4 // bp++
1762 adcs $acc1,$acc1,xzr
1763 sub $ap,$ap,$num // rewind ap
1764 adcs $acc2,$acc2,xzr
1765 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1766 adcs $acc3,$acc3,xzr
1767 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1768 adc $topmost,$carry,xzr
1769 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1770 cmp $bp,$t3 // done yet?
1771 ldp $acc2,$acc3,[sp,#8*6]
1772 ldp $m0,$m1,[$t1,#8*0] // n[0..3]
1773 ldp $m2,$m3,[$t1,#8*2]
1774 add $np,$t1,#8*4
1775 b.eq .Lmul4x_post
1776
1777 ldr $bi,[$bp]
1778 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1779 ldp $a2,$a3,[$ap,#8*2]
1780 adds $ap,$ap,#8*4 // clear carry bit
1781 mov $carry,xzr
1782 mov $tp,sp
1783 b .Loop_mul4x_reduction
1784
1785.align 4
1786.Lmul4x_post:
1787 // Final step. We see if result is larger than modulus, and
1788 // if it is, subtract the modulus. But comparison implies
1789 // subtraction. So we subtract modulus, see if it borrowed,
1790 // and conditionally copy original value.
1791 mov $rp,$t2
1792 mov $ap_end,$t2 // $rp copy
1793 subs $t0,$acc0,$m0
1794 add $tp,sp,#8*8
1795 sbcs $t1,$acc1,$m1
1796 sub $cnt,$num,#8*4
1797
1798.Lmul4x_sub:
1799 sbcs $t2,$acc2,$m2
1800 ldp $m0,$m1,[$np,#8*0]
1801 sub $cnt,$cnt,#8*4
1802 ldp $acc0,$acc1,[$tp,#8*0]
1803 sbcs $t3,$acc3,$m3
1804 ldp $m2,$m3,[$np,#8*2]
1805 add $np,$np,#8*4
1806 ldp $acc2,$acc3,[$tp,#8*2]
1807 add $tp,$tp,#8*4
1808 stp $t0,$t1,[$rp,#8*0]
1809 sbcs $t0,$acc0,$m0
1810 stp $t2,$t3,[$rp,#8*2]
1811 add $rp,$rp,#8*4
1812 sbcs $t1,$acc1,$m1
1813 cbnz $cnt,.Lmul4x_sub
1814
1815 sbcs $t2,$acc2,$m2
1816 mov $tp,sp
1817 add $ap,sp,#8*4
1818 ldp $a0,$a1,[$ap_end,#8*0]
1819 sbcs $t3,$acc3,$m3
1820 stp $t0,$t1,[$rp,#8*0]
1821 ldp $a2,$a3,[$ap_end,#8*2]
1822 stp $t2,$t3,[$rp,#8*2]
1823 ldp $acc0,$acc1,[$ap,#8*0]
1824 ldp $acc2,$acc3,[$ap,#8*2]
1825 sbcs xzr,$topmost,xzr // did it borrow?
1826 ldr x30,[x29,#8] // pull return address
1827
1828 sub $cnt,$num,#8*4
1829.Lmul4x_cond_copy:
1830 sub $cnt,$cnt,#8*4
1831 csel $t0,$acc0,$a0,lo
1832 stp xzr,xzr,[$tp,#8*0]
1833 csel $t1,$acc1,$a1,lo
1834 ldp $a0,$a1,[$ap_end,#8*4]
1835 ldp $acc0,$acc1,[$ap,#8*4]
1836 csel $t2,$acc2,$a2,lo
1837 stp xzr,xzr,[$tp,#8*2]
1838 add $tp,$tp,#8*4
1839 csel $t3,$acc3,$a3,lo
1840 ldp $a2,$a3,[$ap_end,#8*6]
1841 ldp $acc2,$acc3,[$ap,#8*6]
1842 add $ap,$ap,#8*4
1843 stp $t0,$t1,[$ap_end,#8*0]
1844 stp $t2,$t3,[$ap_end,#8*2]
1845 add $ap_end,$ap_end,#8*4
1846 cbnz $cnt,.Lmul4x_cond_copy
1847
1848 csel $t0,$acc0,$a0,lo
1849 stp xzr,xzr,[$tp,#8*0]
1850 csel $t1,$acc1,$a1,lo
1851 stp xzr,xzr,[$tp,#8*2]
1852 csel $t2,$acc2,$a2,lo
1853 stp xzr,xzr,[$tp,#8*3]
1854 csel $t3,$acc3,$a3,lo
1855 stp xzr,xzr,[$tp,#8*4]
1856 stp $t0,$t1,[$ap_end,#8*0]
1857 stp $t2,$t3,[$ap_end,#8*2]
1858
1859 b .Lmul4x_done
1860
1861.align 4
1862.Lmul4x4_post_condition:
1863 adc $carry,$carry,xzr
1864 ldr $ap,[x29,#96] // pull rp
1865 // $acc0-3,$carry hold result, $m0-7 hold modulus
1866 subs $a0,$acc0,$m0
1867 ldr x30,[x29,#8] // pull return address
1868 sbcs $a1,$acc1,$m1
1869 stp xzr,xzr,[sp,#8*0]
1870 sbcs $a2,$acc2,$m2
1871 stp xzr,xzr,[sp,#8*2]
1872 sbcs $a3,$acc3,$m3
1873 stp xzr,xzr,[sp,#8*4]
1874 sbcs xzr,$carry,xzr // did it borrow?
1875 stp xzr,xzr,[sp,#8*6]
1876
1877 // $a0-3 hold result-modulus
1878 csel $a0,$acc0,$a0,lo
1879 csel $a1,$acc1,$a1,lo
1880 csel $a2,$acc2,$a2,lo
1881 csel $a3,$acc3,$a3,lo
1882 stp $a0,$a1,[$ap,#8*0]
1883 stp $a2,$a3,[$ap,#8*2]
1884
1885.Lmul4x_done:
1886 ldp x19,x20,[x29,#16]
1887 mov sp,x29
1888 ldp x21,x22,[x29,#32]
1889 mov x0,#1
1890 ldp x23,x24,[x29,#48]
1891 ldp x25,x26,[x29,#64]
1892 ldp x27,x28,[x29,#80]
1893 ldr x29,[sp],#128
1894 // x30 loaded earlier
1895 AARCH64_VALIDATE_LINK_REGISTER
1896 ret
1897.size __bn_mul4x_mont,.-__bn_mul4x_mont
1898___
1899}
1900$code.=<<___;
1901.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
1902.align 4
1903___
1904
1905print $code;
1906
1907close STDOUT or die "error closing STDOUT: $!";