#! /usr/bin/env perl
# Copyright 2015-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on processor in question, meaning
# that dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue multiplication every cycle, which
# explains moderate improvement coefficients in comparison to
# compiler-generated code. Recall that compiler is instructed to use
# umulh and therefore uses same amount of multiplication instructions
# to do the job. Assembly's edge is to minimize number of "collateral"
# instructions and of course instruction scheduling.
#
# April 2015
#
# Squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 improvement
# is still modest on longest key lengths, while others exhibit e.g.
# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
# on Cortex-A57 and ~60-100% faster on others.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl assembler translator next to this script or
# in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe all generated code through the translator; report the OS error
# ($!, not the stale regex capture $1) if the pipe cannot be opened.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Scratch register assignment for the scalar Montgomery loop.
# x18 is skipped (platform register on some ABIs).
($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);

# Scalar bn_mul_mont: entry point that dispatches to the NEON, sqr8x or
# mul4x variants when profitable, plus the generic word-by-word
# Montgomery multiplication loop used for all other sizes.
$code.=<<___;
#include "arm_arch.h"
#ifndef	__KERNEL__
.extern	OPENSSL_armv8_rsa_neonized
.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
	tst	$num,#3
	b.ne	.Lmul_mont
	cmp	$num,#32
	b.le	.Lscalar_impl
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	$lo0 being non-zero. So that carry can be calculated
	//	by adding -1 to $lo0. That's what next instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	stur	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	stur	xzr,[$tp,#-16]		// wipe tp
	stur	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	stur	xzr,[$tp,#-8]		// wipe tp
	stur	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
# NEON variant: 8-limb (32-bit smashed) Montgomery multiplication.
# The Perl for-loops below unroll the rotation of the @ACC accumulator
# ring; push(@ACC,shift(@ACC)) renames registers between iterations.
my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
my ($Z,$Temp)=("v4.16b","v5");
my @ACC=map("v$_",(6..13));
my ($Bi,$Ni,$M0)=map("v$_",(28..30));
my $sBi="s28";
my $sM0="s30";
my $zero="v14";
my $temp="v15";
my $ACCTemp="v16";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));

$code.=<<___;
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	$num,$num,#1
	eor	$zero.16b,$zero.16b,$zero.16b

.align	4
.LNEON_8n:
	eor	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b
	sub	$toutptr,sp,#128
	eor	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b
	sub	$toutptr,$toutptr,$num,lsl#4
	eor	@ACC[2].16b,@ACC[2].16b,@ACC[2].16b
	and	$toutptr,$toutptr,#-64
	eor	@ACC[3].16b,@ACC[3].16b,@ACC[3].16b
	mov	sp,$toutptr		// alloca
	eor	@ACC[4].16b,@ACC[4].16b,@ACC[4].16b
	add	$toutptr,$toutptr,#256
	eor	@ACC[5].16b,@ACC[5].16b,@ACC[5].16b
	sub	$inner,$num,#8
	eor	@ACC[6].16b,@ACC[6].16b,@ACC[6].16b
	eor	@ACC[7].16b,@ACC[7].16b,@ACC[7].16b

.LNEON_8n_init:
	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	subs	$inner,$inner,#8
	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1	{@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
	bne	.LNEON_8n_init

	add	$tinptr,sp,#256
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	add	$bnptr,sp,#8
	ldr	$sM0,[$n0],#4
	mov	$outer,$num
	b	.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	ldr	$sBi,[$bptr],#4   // *b++
	uxtl	$Bi.4s,$Bi.4h
	add	$toutptr,sp,#128
	ld1	{$N0.4s,$N1.4s},[$nptr],#32

	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	shl	$Ni.2d,@ACC[0].2d,#16
	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	add	$Ni.2d,$Ni.2d,@ACC[0].2d
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	mul	$Ni.2s,$Ni.2s,$M0.2s
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	st1	{$Bi.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl	$Ni.4s,$Ni.4h
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=0; $i<7;) {
$code.=<<___;
	ldr	$sBi,[$bptr],#4   // *b++
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	uxtl	$Bi.4s,$Bi.4h
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	ushr	$temp.2d,@ACC[0].2d,#16
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr	@ACC[0].2d,@ACC[0].2d,#16
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	add	$ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
	ins	@ACC[1].d[0],$ACCTemp.d[0]
	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr],#16
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	shl	$Ni.2d,@ACC[0].2d,#16
	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	add	$Ni.2d,$Ni.2d,@ACC[0].2d
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	mul	$Ni.2s,$Ni.2s,$M0.2s
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	st1	{$Bi.2s},[$bnptr],#8	// put aside smashed b[8*i+$i]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl	$Ni.4s,$Ni.4h
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	mov	$Temp.16b,@ACC[0].16b
	ushr	$Temp.2d,$Temp.2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	add	@ACC[0].2d,@ACC[0].2d,$Temp.2d
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr	@ACC[0].2d,@ACC[0].2d,#16
	eor	$temp.16b,$temp.16b,$temp.16b
	ins	@ACC[0].d[1],$temp.d[0]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	add	@ACC[1].2d,@ACC[1].2d,@ACC[0].2d
	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
	add	$bnptr,sp,#8		// rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub	$inner,$num,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	$inner,$inner,#8
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+0]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	ld1	{$N0.4s,$N1.4s},[$nptr],#32
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	b.eq	.LInner_jump
	add	$tinptr,$tinptr,#16	// don't advance in last iteration
.LInner_jump:
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	ld1	{$Bi.2s},[$bnptr],#8	// pull smashed b[8*i+$i]
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	st1	{@ACC[0].2d},[$toutptr],#16
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+$i]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	b.eq	.LInner_jump$i
	add	$tinptr,$tinptr,#16	// don't advance in last iteration
.LInner_jump$i:
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	b.ne	.LInner_after_rewind$i
	sub	$aptr,$aptr,$num,lsl#2	// rewind
.LInner_after_rewind$i:
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	add	$bnptr,sp,#8		// rewind
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	st1	{@ACC[0].2d},[$toutptr],#16
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]

	bne	.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add	$tinptr,sp,#128
	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	eor	$N0.16b,$N0.16b,$N0.16b	// $N0
	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	eor	$N1.16b,$N1.16b,$N1.16b	// $N1
	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1	{@ACC[6].2d},[$toutptr]

	subs	$outer,$outer,#8
	ld1	{@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
	ld1	{@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
	ld1	{@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
	ld1	{@ACC[6].2d,@ACC[7].2d},[$tinptr],#32

	b.eq	.LInner_8n_jump_2steps
	sub	$nptr,$nptr,$num,lsl#2	// rewind
	b	.LNEON_8n_outer

.LInner_8n_jump_2steps:
	add	$toutptr,sp,#128
	st1	{$N0.2d,$N1.2d}, [sp],#32	// start wiping stack frame
	mov	$Temp.16b,@ACC[0].16b
	ushr	$temp.2d,@ACC[0].2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	st1	{$N0.2d,$N1.2d}, [sp],#32
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	st1	{$N0.2d,$N1.2d}, [sp],#32
	ushr	$temp.2d,@ACC[0].2d,#16
	st1	{$N0.2d,$N1.2d}, [sp],#32
	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins	$temp.d[1],$zero.d[0]

	mov	$inner,$num
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	mov	$Temp.16b,@ACC[0].16b
	ushr	$temp.2d,@ACC[0].2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	ld1	{@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	ld1	{@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
	ushr	$temp.2d,@ACC[0].2d,#16
	ld1	{@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins	$temp.d[1],$zero.d[0]

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
	st1	{@ACC[0].s}[0], [$toutptr],#4
	ushr	$temp.2d,@ACC[1].2d,#16
	mov	$Temp.16b,@ACC[1].16b
	ext	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
	ushr	$temp.2d,@ACC[1].2d,#16
	zip1	@ACC[1].4h,$Temp.4h,@ACC[1].4h
	ins	$temp.d[1],$zero.d[0]
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	ld1	{@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
	subs	$inner,$inner,#8
	st1	{@ACC[7].s}[0], [$toutptr],#4
	bne	.LNEON_tail

	st1	{$temp.s}[0], [$toutptr],#4	// top-most bit
	sub	$nptr,$nptr,$num,lsl#2		// rewind $nptr
	subs	$aptr,sp,#0			// clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	ldp	w8,w9,[$nptr],#8
	ldp	w10,w11,[$nptr],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,$bptr,$aptr
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	cbnz	x17,.LNEON_sub

	ldr	w10, [$aptr]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,$bptr,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	$aptr,sp
	sub	$rptr,$rptr,x11		// rewind $rptr
	mov	$nptr,$bptr		// second 3/4th of frame
	sbcs	w10,w10,wzr		// result is carry flag

.LNEON_copy_n_zap:
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	ldp	w8,w9,[$rptr],#8
	ldp	w10,w11,[$rptr]
	sub	$rptr,$rptr,#8
	b.cs	.LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
.LCopy_1:
	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	sub	$aptr,$aptr,#32
	ldp	w8,w9,[$rptr],#8
	ldp	w10,w11,[$rptr]
	sub	$rptr,$rptr,#8
	b.cs	.LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
.LCopy_2:
	st1	{v0.2d,v1.2d}, [$aptr],#32		// wipe
	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
	sub	x17,$bptr,$aptr		// preserves carry
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	cbnz	x17,.LNEON_copy_n_zap

	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret			// bx lr

.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
___
}
662 | {
|
---|
663 | ########################################################################
|
---|
664 | # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
|
---|
665 |
|
---|
666 | my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
|
---|
667 | my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
|
---|
668 | my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
|
---|
669 | my ($cnt,$carry,$topmost)=("x27","x28","x30");
|
---|
670 | my ($tp,$ap_end,$na0)=($bp,$np,$carry);
|
---|
671 |
|
---|
672 | $code.=<<___;
|
---|
673 | .type __bn_sqr8x_mont,%function
|
---|
674 | .align 5
|
---|
675 | __bn_sqr8x_mont:
|
---|
676 | cmp $ap,$bp
|
---|
677 | b.ne __bn_mul4x_mont
|
---|
678 | .Lsqr8x_mont:
|
---|
679 | // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
|
---|
680 | // only from bn_mul_mont which has already signed the return address.
|
---|
681 | stp x29,x30,[sp,#-128]!
|
---|
682 | add x29,sp,#0
|
---|
683 | stp x19,x20,[sp,#16]
|
---|
684 | stp x21,x22,[sp,#32]
|
---|
685 | stp x23,x24,[sp,#48]
|
---|
686 | stp x25,x26,[sp,#64]
|
---|
687 | stp x27,x28,[sp,#80]
|
---|
688 | stp $rp,$np,[sp,#96] // offload rp and np
|
---|
689 |
|
---|
690 | ldp $a0,$a1,[$ap,#8*0]
|
---|
691 | ldp $a2,$a3,[$ap,#8*2]
|
---|
692 | ldp $a4,$a5,[$ap,#8*4]
|
---|
693 | ldp $a6,$a7,[$ap,#8*6]
|
---|
694 |
|
---|
695 | sub $tp,sp,$num,lsl#4
|
---|
696 | lsl $num,$num,#3
|
---|
697 | ldr $n0,[$n0] // *n0
|
---|
698 | mov sp,$tp // alloca
|
---|
699 | sub $cnt,$num,#8*8
|
---|
700 | b .Lsqr8x_zero_start
|
---|
701 |
|
---|
702 | .Lsqr8x_zero:
|
---|
703 | sub $cnt,$cnt,#8*8
|
---|
704 | stp xzr,xzr,[$tp,#8*0]
|
---|
705 | stp xzr,xzr,[$tp,#8*2]
|
---|
706 | stp xzr,xzr,[$tp,#8*4]
|
---|
707 | stp xzr,xzr,[$tp,#8*6]
|
---|
708 | .Lsqr8x_zero_start:
|
---|
709 | stp xzr,xzr,[$tp,#8*8]
|
---|
710 | stp xzr,xzr,[$tp,#8*10]
|
---|
711 | stp xzr,xzr,[$tp,#8*12]
|
---|
712 | stp xzr,xzr,[$tp,#8*14]
|
---|
713 | add $tp,$tp,#8*16
|
---|
714 | cbnz $cnt,.Lsqr8x_zero
|
---|
715 |
|
---|
716 | add $ap_end,$ap,$num
|
---|
717 | add $ap,$ap,#8*8
|
---|
718 | mov $acc0,xzr
|
---|
719 | mov $acc1,xzr
|
---|
720 | mov $acc2,xzr
|
---|
721 | mov $acc3,xzr
|
---|
722 | mov $acc4,xzr
|
---|
723 | mov $acc5,xzr
|
---|
724 | mov $acc6,xzr
|
---|
725 | mov $acc7,xzr
|
---|
726 | mov $tp,sp
|
---|
727 | str $n0,[x29,#112] // offload n0
|
---|
728 |
|
---|
729 | // Multiply everything but a[i]*a[i]
|
---|
730 | .align 4
|
---|
731 | .Lsqr8x_outer_loop:
|
---|
732 | // a[1]a[0] (i)
|
---|
733 | // a[2]a[0]
|
---|
734 | // a[3]a[0]
|
---|
735 | // a[4]a[0]
|
---|
736 | // a[5]a[0]
|
---|
737 | // a[6]a[0]
|
---|
738 | // a[7]a[0]
|
---|
739 | // a[2]a[1] (ii)
|
---|
740 | // a[3]a[1]
|
---|
741 | // a[4]a[1]
|
---|
742 | // a[5]a[1]
|
---|
743 | // a[6]a[1]
|
---|
744 | // a[7]a[1]
|
---|
745 | // a[3]a[2] (iii)
|
---|
746 | // a[4]a[2]
|
---|
747 | // a[5]a[2]
|
---|
748 | // a[6]a[2]
|
---|
749 | // a[7]a[2]
|
---|
750 | // a[4]a[3] (iv)
|
---|
751 | // a[5]a[3]
|
---|
752 | // a[6]a[3]
|
---|
753 | // a[7]a[3]
|
---|
754 | // a[5]a[4] (v)
|
---|
755 | // a[6]a[4]
|
---|
756 | // a[7]a[4]
|
---|
757 | // a[6]a[5] (vi)
|
---|
758 | // a[7]a[5]
|
---|
759 | // a[7]a[6] (vii)
|
---|
760 |
|
---|
761 | mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
|
---|
762 | mul $t1,$a2,$a0
|
---|
763 | mul $t2,$a3,$a0
|
---|
764 | mul $t3,$a4,$a0
|
---|
765 | adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
|
---|
766 | mul $t0,$a5,$a0
|
---|
767 | adcs $acc2,$acc2,$t1
|
---|
768 | mul $t1,$a6,$a0
|
---|
769 | adcs $acc3,$acc3,$t2
|
---|
770 | mul $t2,$a7,$a0
|
---|
771 | adcs $acc4,$acc4,$t3
|
---|
772 | umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
|
---|
773 | adcs $acc5,$acc5,$t0
|
---|
774 | umulh $t0,$a2,$a0
|
---|
775 | adcs $acc6,$acc6,$t1
|
---|
776 | umulh $t1,$a3,$a0
|
---|
777 | adcs $acc7,$acc7,$t2
|
---|
778 | umulh $t2,$a4,$a0
|
---|
779 | stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
|
---|
780 | adc $acc0,xzr,xzr // t[8]
|
---|
781 | adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
|
---|
782 | umulh $t3,$a5,$a0
|
---|
783 | adcs $acc3,$acc3,$t0
|
---|
784 | umulh $t0,$a6,$a0
|
---|
785 | adcs $acc4,$acc4,$t1
|
---|
786 | umulh $t1,$a7,$a0
|
---|
787 | adcs $acc5,$acc5,$t2
|
---|
788 | mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
|
---|
789 | adcs $acc6,$acc6,$t3
|
---|
790 | mul $t3,$a3,$a1
|
---|
791 | adcs $acc7,$acc7,$t0
|
---|
792 | mul $t0,$a4,$a1
|
---|
793 | adc $acc0,$acc0,$t1
|
---|
794 |
|
---|
795 | mul $t1,$a5,$a1
|
---|
796 | adds $acc3,$acc3,$t2
|
---|
797 | mul $t2,$a6,$a1
|
---|
798 | adcs $acc4,$acc4,$t3
|
---|
799 | mul $t3,$a7,$a1
|
---|
800 | adcs $acc5,$acc5,$t0
|
---|
801 | umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
|
---|
802 | adcs $acc6,$acc6,$t1
|
---|
803 | umulh $t1,$a3,$a1
|
---|
804 | adcs $acc7,$acc7,$t2
|
---|
805 | umulh $t2,$a4,$a1
|
---|
806 | adcs $acc0,$acc0,$t3
|
---|
807 | umulh $t3,$a5,$a1
|
---|
808 | stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
|
---|
809 | adc $acc1,xzr,xzr // t[9]
|
---|
810 | adds $acc4,$acc4,$t0
|
---|
811 | umulh $t0,$a6,$a1
|
---|
812 | adcs $acc5,$acc5,$t1
|
---|
813 | umulh $t1,$a7,$a1
|
---|
814 | adcs $acc6,$acc6,$t2
|
---|
815 | mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
|
---|
816 | adcs $acc7,$acc7,$t3
|
---|
817 | mul $t3,$a4,$a2
|
---|
818 | adcs $acc0,$acc0,$t0
|
---|
819 | mul $t0,$a5,$a2
|
---|
820 | adc $acc1,$acc1,$t1
|
---|
821 |
|
---|
822 | mul $t1,$a6,$a2
|
---|
823 | adds $acc5,$acc5,$t2
|
---|
824 | mul $t2,$a7,$a2
|
---|
825 | adcs $acc6,$acc6,$t3
|
---|
826 | umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
|
---|
827 | adcs $acc7,$acc7,$t0
|
---|
828 | umulh $t0,$a4,$a2
|
---|
829 | adcs $acc0,$acc0,$t1
|
---|
830 | umulh $t1,$a5,$a2
|
---|
831 | adcs $acc1,$acc1,$t2
|
---|
832 | umulh $t2,$a6,$a2
|
---|
833 | stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
|
---|
834 | adc $acc2,xzr,xzr // t[10]
|
---|
835 | adds $acc6,$acc6,$t3
|
---|
836 | umulh $t3,$a7,$a2
|
---|
837 | adcs $acc7,$acc7,$t0
|
---|
838 | mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
|
---|
839 | adcs $acc0,$acc0,$t1
|
---|
840 | mul $t1,$a5,$a3
|
---|
841 | adcs $acc1,$acc1,$t2
|
---|
842 | mul $t2,$a6,$a3
|
---|
843 | adc $acc2,$acc2,$t3
|
---|
844 |
|
---|
845 | mul $t3,$a7,$a3
|
---|
846 | adds $acc7,$acc7,$t0
|
---|
847 | umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
|
---|
848 | adcs $acc0,$acc0,$t1
|
---|
849 | umulh $t1,$a5,$a3
|
---|
850 | adcs $acc1,$acc1,$t2
|
---|
851 | umulh $t2,$a6,$a3
|
---|
852 | adcs $acc2,$acc2,$t3
|
---|
853 | umulh $t3,$a7,$a3
|
---|
854 | stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
|
---|
855 | adc $acc3,xzr,xzr // t[11]
|
---|
856 | adds $acc0,$acc0,$t0
|
---|
857 | mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
|
---|
858 | adcs $acc1,$acc1,$t1
|
---|
859 | mul $t1,$a6,$a4
|
---|
860 | adcs $acc2,$acc2,$t2
|
---|
861 | mul $t2,$a7,$a4
|
---|
862 | adc $acc3,$acc3,$t3
|
---|
863 |
|
---|
864 | umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
|
---|
865 | adds $acc1,$acc1,$t0
|
---|
866 | umulh $t0,$a6,$a4
|
---|
867 | adcs $acc2,$acc2,$t1
|
---|
868 | umulh $t1,$a7,$a4
|
---|
869 | adcs $acc3,$acc3,$t2
|
---|
870 | mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
|
---|
871 | adc $acc4,xzr,xzr // t[12]
|
---|
872 | adds $acc2,$acc2,$t3
|
---|
873 | mul $t3,$a7,$a5
|
---|
874 | adcs $acc3,$acc3,$t0
|
---|
875 | umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
|
---|
876 | adc $acc4,$acc4,$t1
|
---|
877 |
|
---|
878 | umulh $t1,$a7,$a5
|
---|
879 | adds $acc3,$acc3,$t2
|
---|
880 | mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
|
---|
881 | adcs $acc4,$acc4,$t3
|
---|
882 | umulh $t3,$a7,$a6 // hi(a[7]*a[6])
|
---|
883 | adc $acc5,xzr,xzr // t[13]
|
---|
884 | adds $acc4,$acc4,$t0
|
---|
885 | sub $cnt,$ap_end,$ap // done yet?
|
---|
886 | adc $acc5,$acc5,$t1
|
---|
887 |
|
---|
888 | adds $acc5,$acc5,$t2
|
---|
889 | sub $t0,$ap_end,$num // rewinded ap
|
---|
890 | adc $acc6,xzr,xzr // t[14]
|
---|
891 | add $acc6,$acc6,$t3
|
---|
892 |
|
---|
893 | cbz $cnt,.Lsqr8x_outer_break
|
---|
894 |
|
---|
895 | mov $n0,$a0
|
---|
896 | ldp $a0,$a1,[$tp,#8*0]
|
---|
897 | ldp $a2,$a3,[$tp,#8*2]
|
---|
898 | ldp $a4,$a5,[$tp,#8*4]
|
---|
899 | ldp $a6,$a7,[$tp,#8*6]
|
---|
900 | adds $acc0,$acc0,$a0
|
---|
901 | adcs $acc1,$acc1,$a1
|
---|
902 | ldp $a0,$a1,[$ap,#8*0]
|
---|
903 | adcs $acc2,$acc2,$a2
|
---|
904 | adcs $acc3,$acc3,$a3
|
---|
905 | ldp $a2,$a3,[$ap,#8*2]
|
---|
906 | adcs $acc4,$acc4,$a4
|
---|
907 | adcs $acc5,$acc5,$a5
|
---|
908 | ldp $a4,$a5,[$ap,#8*4]
|
---|
909 | adcs $acc6,$acc6,$a6
|
---|
910 | mov $rp,$ap
|
---|
911 | adcs $acc7,xzr,$a7
|
---|
912 | ldp $a6,$a7,[$ap,#8*6]
|
---|
913 | add $ap,$ap,#8*8
|
---|
914 | //adc $carry,xzr,xzr // moved below
|
---|
915 | mov $cnt,#-8*8
|
---|
916 |
|
---|
917 | // a[8]a[0]
|
---|
918 | // a[9]a[0]
|
---|
919 | // a[a]a[0]
|
---|
920 | // a[b]a[0]
|
---|
921 | // a[c]a[0]
|
---|
922 | // a[d]a[0]
|
---|
923 | // a[e]a[0]
|
---|
924 | // a[f]a[0]
|
---|
925 | // a[8]a[1]
|
---|
926 | // a[f]a[1]........................
|
---|
927 | // a[8]a[2]
|
---|
928 | // a[f]a[2]........................
|
---|
929 | // a[8]a[3]
|
---|
930 | // a[f]a[3]........................
|
---|
931 | // a[8]a[4]
|
---|
932 | // a[f]a[4]........................
|
---|
933 | // a[8]a[5]
|
---|
934 | // a[f]a[5]........................
|
---|
935 | // a[8]a[6]
|
---|
936 | // a[f]a[6]........................
|
---|
937 | // a[8]a[7]
|
---|
938 | // a[f]a[7]........................
|
---|
939 | .Lsqr8x_mul:
|
---|
940 | mul $t0,$a0,$n0
|
---|
941 | adc $carry,xzr,xzr // carry bit, modulo-scheduled
|
---|
942 | mul $t1,$a1,$n0
|
---|
943 | add $cnt,$cnt,#8
|
---|
944 | mul $t2,$a2,$n0
|
---|
945 | mul $t3,$a3,$n0
|
---|
946 | adds $acc0,$acc0,$t0
|
---|
947 | mul $t0,$a4,$n0
|
---|
948 | adcs $acc1,$acc1,$t1
|
---|
949 | mul $t1,$a5,$n0
|
---|
950 | adcs $acc2,$acc2,$t2
|
---|
951 | mul $t2,$a6,$n0
|
---|
952 | adcs $acc3,$acc3,$t3
|
---|
953 | mul $t3,$a7,$n0
|
---|
954 | adcs $acc4,$acc4,$t0
|
---|
955 | umulh $t0,$a0,$n0
|
---|
956 | adcs $acc5,$acc5,$t1
|
---|
957 | umulh $t1,$a1,$n0
|
---|
958 | adcs $acc6,$acc6,$t2
|
---|
959 | umulh $t2,$a2,$n0
|
---|
960 | adcs $acc7,$acc7,$t3
|
---|
961 | umulh $t3,$a3,$n0
|
---|
962 | adc $carry,$carry,xzr
|
---|
963 | str $acc0,[$tp],#8
|
---|
964 | adds $acc0,$acc1,$t0
|
---|
965 | umulh $t0,$a4,$n0
|
---|
966 | adcs $acc1,$acc2,$t1
|
---|
967 | umulh $t1,$a5,$n0
|
---|
968 | adcs $acc2,$acc3,$t2
|
---|
969 | umulh $t2,$a6,$n0
|
---|
970 | adcs $acc3,$acc4,$t3
|
---|
971 | umulh $t3,$a7,$n0
|
---|
972 | ldr $n0,[$rp,$cnt]
|
---|
973 | adcs $acc4,$acc5,$t0
|
---|
974 | adcs $acc5,$acc6,$t1
|
---|
975 | adcs $acc6,$acc7,$t2
|
---|
976 | adcs $acc7,$carry,$t3
|
---|
977 | //adc $carry,xzr,xzr // moved above
|
---|
978 | cbnz $cnt,.Lsqr8x_mul
|
---|
979 | // note that carry flag is guaranteed
|
---|
980 | // to be zero at this point
|
---|
981 | cmp $ap,$ap_end // done yet?
|
---|
982 | b.eq .Lsqr8x_break
|
---|
983 |
|
---|
984 | ldp $a0,$a1,[$tp,#8*0]
|
---|
985 | ldp $a2,$a3,[$tp,#8*2]
|
---|
986 | ldp $a4,$a5,[$tp,#8*4]
|
---|
987 | ldp $a6,$a7,[$tp,#8*6]
|
---|
988 | adds $acc0,$acc0,$a0
|
---|
989 | ldur $n0,[$rp,#-8*8]
|
---|
990 | adcs $acc1,$acc1,$a1
|
---|
991 | ldp $a0,$a1,[$ap,#8*0]
|
---|
992 | adcs $acc2,$acc2,$a2
|
---|
993 | adcs $acc3,$acc3,$a3
|
---|
994 | ldp $a2,$a3,[$ap,#8*2]
|
---|
995 | adcs $acc4,$acc4,$a4
|
---|
996 | adcs $acc5,$acc5,$a5
|
---|
997 | ldp $a4,$a5,[$ap,#8*4]
|
---|
998 | adcs $acc6,$acc6,$a6
|
---|
999 | mov $cnt,#-8*8
|
---|
1000 | adcs $acc7,$acc7,$a7
|
---|
1001 | ldp $a6,$a7,[$ap,#8*6]
|
---|
1002 | add $ap,$ap,#8*8
|
---|
1003 | //adc $carry,xzr,xzr // moved above
|
---|
1004 | b .Lsqr8x_mul
|
---|
1005 |
|
---|
1006 | .align 4
|
---|
1007 | .Lsqr8x_break:
|
---|
1008 | ldp $a0,$a1,[$rp,#8*0]
|
---|
1009 | add $ap,$rp,#8*8
|
---|
1010 | ldp $a2,$a3,[$rp,#8*2]
|
---|
1011 | sub $t0,$ap_end,$ap // is it last iteration?
|
---|
1012 | ldp $a4,$a5,[$rp,#8*4]
|
---|
1013 | sub $t1,$tp,$t0
|
---|
1014 | ldp $a6,$a7,[$rp,#8*6]
|
---|
1015 | cbz $t0,.Lsqr8x_outer_loop
|
---|
1016 |
|
---|
1017 | stp $acc0,$acc1,[$tp,#8*0]
|
---|
1018 | ldp $acc0,$acc1,[$t1,#8*0]
|
---|
1019 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1020 | ldp $acc2,$acc3,[$t1,#8*2]
|
---|
1021 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1022 | ldp $acc4,$acc5,[$t1,#8*4]
|
---|
1023 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1024 | mov $tp,$t1
|
---|
1025 | ldp $acc6,$acc7,[$t1,#8*6]
|
---|
1026 | b .Lsqr8x_outer_loop
|
---|
1027 |
|
---|
1028 | .align 4
|
---|
1029 | .Lsqr8x_outer_break:
|
---|
1030 | // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
|
---|
1031 | ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
|
---|
1032 | ldp $t1,$t2,[sp,#8*1]
|
---|
1033 | ldp $a5,$a7,[$t0,#8*2]
|
---|
1034 | add $ap,$t0,#8*4
|
---|
1035 | ldp $t3,$t0,[sp,#8*3]
|
---|
1036 |
|
---|
1037 | stp $acc0,$acc1,[$tp,#8*0]
|
---|
1038 | mul $acc0,$a1,$a1
|
---|
1039 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1040 | umulh $a1,$a1,$a1
|
---|
1041 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1042 | mul $a2,$a3,$a3
|
---|
1043 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1044 | mov $tp,sp
|
---|
1045 | umulh $a3,$a3,$a3
|
---|
1046 | adds $acc1,$a1,$t1,lsl#1
|
---|
1047 | extr $t1,$t2,$t1,#63
|
---|
1048 | sub $cnt,$num,#8*4
|
---|
1049 |
|
---|
1050 | .Lsqr4x_shift_n_add:
|
---|
1051 | adcs $acc2,$a2,$t1
|
---|
1052 | extr $t2,$t3,$t2,#63
|
---|
1053 | sub $cnt,$cnt,#8*4
|
---|
1054 | adcs $acc3,$a3,$t2
|
---|
1055 | ldp $t1,$t2,[$tp,#8*5]
|
---|
1056 | mul $a4,$a5,$a5
|
---|
1057 | ldp $a1,$a3,[$ap],#8*2
|
---|
1058 | umulh $a5,$a5,$a5
|
---|
1059 | mul $a6,$a7,$a7
|
---|
1060 | umulh $a7,$a7,$a7
|
---|
1061 | extr $t3,$t0,$t3,#63
|
---|
1062 | stp $acc0,$acc1,[$tp,#8*0]
|
---|
1063 | adcs $acc4,$a4,$t3
|
---|
1064 | extr $t0,$t1,$t0,#63
|
---|
1065 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1066 | adcs $acc5,$a5,$t0
|
---|
1067 | ldp $t3,$t0,[$tp,#8*7]
|
---|
1068 | extr $t1,$t2,$t1,#63
|
---|
1069 | adcs $acc6,$a6,$t1
|
---|
1070 | extr $t2,$t3,$t2,#63
|
---|
1071 | adcs $acc7,$a7,$t2
|
---|
1072 | ldp $t1,$t2,[$tp,#8*9]
|
---|
1073 | mul $a0,$a1,$a1
|
---|
1074 | ldp $a5,$a7,[$ap],#8*2
|
---|
1075 | umulh $a1,$a1,$a1
|
---|
1076 | mul $a2,$a3,$a3
|
---|
1077 | umulh $a3,$a3,$a3
|
---|
1078 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1079 | extr $t3,$t0,$t3,#63
|
---|
1080 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1081 | add $tp,$tp,#8*8
|
---|
1082 | adcs $acc0,$a0,$t3
|
---|
1083 | extr $t0,$t1,$t0,#63
|
---|
1084 | adcs $acc1,$a1,$t0
|
---|
1085 | ldp $t3,$t0,[$tp,#8*3]
|
---|
1086 | extr $t1,$t2,$t1,#63
|
---|
1087 | cbnz $cnt,.Lsqr4x_shift_n_add
|
---|
1088 | ___
|
---|
# Register aliasing: from here on, the a[] pointer registers are reused as
# the modulus pointers ($np/$np_end) for the reduction phase emitted below.
# The squaring input is no longer addressed through $ap at this point, so
# the alias is safe -- confirm against the heredoc text that follows.
1089 | my ($np,$np_end)=($ap,$ap_end);
|
---|
# Resume appending assembly text to $code (heredoc runs to the matching
# "___" terminator).
1090 | $code.=<<___;
|
---|
1091 | ldp $np,$n0,[x29,#104] // pull np and n0
|
---|
1092 |
|
---|
1093 | adcs $acc2,$a2,$t1
|
---|
1094 | extr $t2,$t3,$t2,#63
|
---|
1095 | adcs $acc3,$a3,$t2
|
---|
1096 | ldp $t1,$t2,[$tp,#8*5]
|
---|
1097 | mul $a4,$a5,$a5
|
---|
1098 | umulh $a5,$a5,$a5
|
---|
1099 | stp $acc0,$acc1,[$tp,#8*0]
|
---|
1100 | mul $a6,$a7,$a7
|
---|
1101 | umulh $a7,$a7,$a7
|
---|
1102 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1103 | extr $t3,$t0,$t3,#63
|
---|
1104 | adcs $acc4,$a4,$t3
|
---|
1105 | extr $t0,$t1,$t0,#63
|
---|
1106 | ldp $acc0,$acc1,[sp,#8*0]
|
---|
1107 | adcs $acc5,$a5,$t0
|
---|
1108 | extr $t1,$t2,$t1,#63
|
---|
1109 | ldp $a0,$a1,[$np,#8*0]
|
---|
1110 | adcs $acc6,$a6,$t1
|
---|
1111 | extr $t2,xzr,$t2,#63
|
---|
1112 | ldp $a2,$a3,[$np,#8*2]
|
---|
1113 | adc $acc7,$a7,$t2
|
---|
1114 | ldp $a4,$a5,[$np,#8*4]
|
---|
1115 |
|
---|
1116 | // Reduce by 512 bits per iteration
|
---|
1117 | mul $na0,$n0,$acc0 // t[0]*n0
|
---|
1118 | ldp $a6,$a7,[$np,#8*6]
|
---|
1119 | add $np_end,$np,$num
|
---|
1120 | ldp $acc2,$acc3,[sp,#8*2]
|
---|
1121 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1122 | ldp $acc4,$acc5,[sp,#8*4]
|
---|
1123 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1124 | ldp $acc6,$acc7,[sp,#8*6]
|
---|
1125 | add $np,$np,#8*8
|
---|
1126 | mov $topmost,xzr // initial top-most carry
|
---|
1127 | mov $tp,sp
|
---|
1128 | mov $cnt,#8
|
---|
1129 |
|
---|
1130 | .Lsqr8x_reduction:
|
---|
1131 | // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
|
---|
1132 | mul $t1,$a1,$na0
|
---|
1133 | sub $cnt,$cnt,#1
|
---|
1134 | mul $t2,$a2,$na0
|
---|
1135 | str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
|
---|
1136 | mul $t3,$a3,$na0
|
---|
1137 | // (*) adds xzr,$acc0,$t0
|
---|
1138 | subs xzr,$acc0,#1 // (*)
|
---|
1139 | mul $t0,$a4,$na0
|
---|
1140 | adcs $acc0,$acc1,$t1
|
---|
1141 | mul $t1,$a5,$na0
|
---|
1142 | adcs $acc1,$acc2,$t2
|
---|
1143 | mul $t2,$a6,$na0
|
---|
1144 | adcs $acc2,$acc3,$t3
|
---|
1145 | mul $t3,$a7,$na0
|
---|
1146 | adcs $acc3,$acc4,$t0
|
---|
1147 | umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
|
---|
1148 | adcs $acc4,$acc5,$t1
|
---|
1149 | umulh $t1,$a1,$na0
|
---|
1150 | adcs $acc5,$acc6,$t2
|
---|
1151 | umulh $t2,$a2,$na0
|
---|
1152 | adcs $acc6,$acc7,$t3
|
---|
1153 | umulh $t3,$a3,$na0
|
---|
1154 | adc $acc7,xzr,xzr
|
---|
1155 | adds $acc0,$acc0,$t0
|
---|
1156 | umulh $t0,$a4,$na0
|
---|
1157 | adcs $acc1,$acc1,$t1
|
---|
1158 | umulh $t1,$a5,$na0
|
---|
1159 | adcs $acc2,$acc2,$t2
|
---|
1160 | umulh $t2,$a6,$na0
|
---|
1161 | adcs $acc3,$acc3,$t3
|
---|
1162 | umulh $t3,$a7,$na0
|
---|
1163 | mul $na0,$n0,$acc0 // next t[0]*n0
|
---|
1164 | adcs $acc4,$acc4,$t0
|
---|
1165 | adcs $acc5,$acc5,$t1
|
---|
1166 | adcs $acc6,$acc6,$t2
|
---|
1167 | adc $acc7,$acc7,$t3
|
---|
1168 | cbnz $cnt,.Lsqr8x_reduction
|
---|
1169 |
|
---|
1170 | ldp $t0,$t1,[$tp,#8*0]
|
---|
1171 | ldp $t2,$t3,[$tp,#8*2]
|
---|
1172 | mov $rp,$tp
|
---|
1173 | sub $cnt,$np_end,$np // done yet?
|
---|
1174 | adds $acc0,$acc0,$t0
|
---|
1175 | adcs $acc1,$acc1,$t1
|
---|
1176 | ldp $t0,$t1,[$tp,#8*4]
|
---|
1177 | adcs $acc2,$acc2,$t2
|
---|
1178 | adcs $acc3,$acc3,$t3
|
---|
1179 | ldp $t2,$t3,[$tp,#8*6]
|
---|
1180 | adcs $acc4,$acc4,$t0
|
---|
1181 | adcs $acc5,$acc5,$t1
|
---|
1182 | adcs $acc6,$acc6,$t2
|
---|
1183 | adcs $acc7,$acc7,$t3
|
---|
1184 | //adc $carry,xzr,xzr // moved below
|
---|
1185 | cbz $cnt,.Lsqr8x8_post_condition
|
---|
1186 |
|
---|
1187 | ldur $n0,[$tp,#-8*8]
|
---|
1188 | ldp $a0,$a1,[$np,#8*0]
|
---|
1189 | ldp $a2,$a3,[$np,#8*2]
|
---|
1190 | ldp $a4,$a5,[$np,#8*4]
|
---|
1191 | mov $cnt,#-8*8
|
---|
1192 | ldp $a6,$a7,[$np,#8*6]
|
---|
1193 | add $np,$np,#8*8
|
---|
1194 |
|
---|
1195 | .Lsqr8x_tail:
|
---|
1196 | mul $t0,$a0,$n0
|
---|
1197 | adc $carry,xzr,xzr // carry bit, modulo-scheduled
|
---|
1198 | mul $t1,$a1,$n0
|
---|
1199 | add $cnt,$cnt,#8
|
---|
1200 | mul $t2,$a2,$n0
|
---|
1201 | mul $t3,$a3,$n0
|
---|
1202 | adds $acc0,$acc0,$t0
|
---|
1203 | mul $t0,$a4,$n0
|
---|
1204 | adcs $acc1,$acc1,$t1
|
---|
1205 | mul $t1,$a5,$n0
|
---|
1206 | adcs $acc2,$acc2,$t2
|
---|
1207 | mul $t2,$a6,$n0
|
---|
1208 | adcs $acc3,$acc3,$t3
|
---|
1209 | mul $t3,$a7,$n0
|
---|
1210 | adcs $acc4,$acc4,$t0
|
---|
1211 | umulh $t0,$a0,$n0
|
---|
1212 | adcs $acc5,$acc5,$t1
|
---|
1213 | umulh $t1,$a1,$n0
|
---|
1214 | adcs $acc6,$acc6,$t2
|
---|
1215 | umulh $t2,$a2,$n0
|
---|
1216 | adcs $acc7,$acc7,$t3
|
---|
1217 | umulh $t3,$a3,$n0
|
---|
1218 | adc $carry,$carry,xzr
|
---|
1219 | str $acc0,[$tp],#8
|
---|
1220 | adds $acc0,$acc1,$t0
|
---|
1221 | umulh $t0,$a4,$n0
|
---|
1222 | adcs $acc1,$acc2,$t1
|
---|
1223 | umulh $t1,$a5,$n0
|
---|
1224 | adcs $acc2,$acc3,$t2
|
---|
1225 | umulh $t2,$a6,$n0
|
---|
1226 | adcs $acc3,$acc4,$t3
|
---|
1227 | umulh $t3,$a7,$n0
|
---|
1228 | ldr $n0,[$rp,$cnt]
|
---|
1229 | adcs $acc4,$acc5,$t0
|
---|
1230 | adcs $acc5,$acc6,$t1
|
---|
1231 | adcs $acc6,$acc7,$t2
|
---|
1232 | adcs $acc7,$carry,$t3
|
---|
1233 | //adc $carry,xzr,xzr // moved above
|
---|
1234 | cbnz $cnt,.Lsqr8x_tail
|
---|
1235 | // note that carry flag is guaranteed
|
---|
1236 | // to be zero at this point
|
---|
1237 | ldp $a0,$a1,[$tp,#8*0]
|
---|
1238 | sub $cnt,$np_end,$np // done yet?
|
---|
1239 | sub $t2,$np_end,$num // rewinded np
|
---|
1240 | ldp $a2,$a3,[$tp,#8*2]
|
---|
1241 | ldp $a4,$a5,[$tp,#8*4]
|
---|
1242 | ldp $a6,$a7,[$tp,#8*6]
|
---|
1243 | cbz $cnt,.Lsqr8x_tail_break
|
---|
1244 |
|
---|
1245 | ldur $n0,[$rp,#-8*8]
|
---|
1246 | adds $acc0,$acc0,$a0
|
---|
1247 | adcs $acc1,$acc1,$a1
|
---|
1248 | ldp $a0,$a1,[$np,#8*0]
|
---|
1249 | adcs $acc2,$acc2,$a2
|
---|
1250 | adcs $acc3,$acc3,$a3
|
---|
1251 | ldp $a2,$a3,[$np,#8*2]
|
---|
1252 | adcs $acc4,$acc4,$a4
|
---|
1253 | adcs $acc5,$acc5,$a5
|
---|
1254 | ldp $a4,$a5,[$np,#8*4]
|
---|
1255 | adcs $acc6,$acc6,$a6
|
---|
1256 | mov $cnt,#-8*8
|
---|
1257 | adcs $acc7,$acc7,$a7
|
---|
1258 | ldp $a6,$a7,[$np,#8*6]
|
---|
1259 | add $np,$np,#8*8
|
---|
1260 | //adc $carry,xzr,xzr // moved above
|
---|
1261 | b .Lsqr8x_tail
|
---|
1262 |
|
---|
1263 | .align 4
|
---|
1264 | .Lsqr8x_tail_break:
|
---|
1265 | ldr $n0,[x29,#112] // pull n0
|
---|
1266 | add $cnt,$tp,#8*8 // end of current t[num] window
|
---|
1267 |
|
---|
1268 | subs xzr,$topmost,#1 // "move" top-most carry to carry bit
|
---|
1269 | adcs $t0,$acc0,$a0
|
---|
1270 | adcs $t1,$acc1,$a1
|
---|
1271 | ldp $acc0,$acc1,[$rp,#8*0]
|
---|
1272 | adcs $acc2,$acc2,$a2
|
---|
1273 | ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
|
---|
1274 | adcs $acc3,$acc3,$a3
|
---|
1275 | ldp $a2,$a3,[$t2,#8*2]
|
---|
1276 | adcs $acc4,$acc4,$a4
|
---|
1277 | adcs $acc5,$acc5,$a5
|
---|
1278 | ldp $a4,$a5,[$t2,#8*4]
|
---|
1279 | adcs $acc6,$acc6,$a6
|
---|
1280 | adcs $acc7,$acc7,$a7
|
---|
1281 | ldp $a6,$a7,[$t2,#8*6]
|
---|
1282 | add $np,$t2,#8*8
|
---|
1283 | adc $topmost,xzr,xzr // top-most carry
|
---|
1284 | mul $na0,$n0,$acc0
|
---|
1285 | stp $t0,$t1,[$tp,#8*0]
|
---|
1286 | stp $acc2,$acc3,[$tp,#8*2]
|
---|
1287 | ldp $acc2,$acc3,[$rp,#8*2]
|
---|
1288 | stp $acc4,$acc5,[$tp,#8*4]
|
---|
1289 | ldp $acc4,$acc5,[$rp,#8*4]
|
---|
1290 | cmp $cnt,x29 // did we hit the bottom?
|
---|
1291 | stp $acc6,$acc7,[$tp,#8*6]
|
---|
1292 | mov $tp,$rp // slide the window
|
---|
1293 | ldp $acc6,$acc7,[$rp,#8*6]
|
---|
1294 | mov $cnt,#8
|
---|
1295 | b.ne .Lsqr8x_reduction
|
---|
1296 |
|
---|
1297 | // Final step. We see if result is larger than modulus, and
|
---|
1298 | // if it is, subtract the modulus. But comparison implies
|
---|
1299 | // subtraction. So we subtract modulus, see if it borrowed,
|
---|
1300 | // and conditionally copy original value.
|
---|
1301 | ldr $rp,[x29,#96] // pull rp
|
---|
1302 | add $tp,$tp,#8*8
|
---|
1303 | subs $t0,$acc0,$a0
|
---|
1304 | sbcs $t1,$acc1,$a1
|
---|
1305 | sub $cnt,$num,#8*8
|
---|
1306 | mov $ap_end,$rp // $rp copy
|
---|
1307 |
|
---|
1308 | .Lsqr8x_sub:
|
---|
1309 | sbcs $t2,$acc2,$a2
|
---|
1310 | ldp $a0,$a1,[$np,#8*0]
|
---|
1311 | sbcs $t3,$acc3,$a3
|
---|
1312 | stp $t0,$t1,[$rp,#8*0]
|
---|
1313 | sbcs $t0,$acc4,$a4
|
---|
1314 | ldp $a2,$a3,[$np,#8*2]
|
---|
1315 | sbcs $t1,$acc5,$a5
|
---|
1316 | stp $t2,$t3,[$rp,#8*2]
|
---|
1317 | sbcs $t2,$acc6,$a6
|
---|
1318 | ldp $a4,$a5,[$np,#8*4]
|
---|
1319 | sbcs $t3,$acc7,$a7
|
---|
1320 | ldp $a6,$a7,[$np,#8*6]
|
---|
1321 | add $np,$np,#8*8
|
---|
1322 | ldp $acc0,$acc1,[$tp,#8*0]
|
---|
1323 | sub $cnt,$cnt,#8*8
|
---|
1324 | ldp $acc2,$acc3,[$tp,#8*2]
|
---|
1325 | ldp $acc4,$acc5,[$tp,#8*4]
|
---|
1326 | ldp $acc6,$acc7,[$tp,#8*6]
|
---|
1327 | add $tp,$tp,#8*8
|
---|
1328 | stp $t0,$t1,[$rp,#8*4]
|
---|
1329 | sbcs $t0,$acc0,$a0
|
---|
1330 | stp $t2,$t3,[$rp,#8*6]
|
---|
1331 | add $rp,$rp,#8*8
|
---|
1332 | sbcs $t1,$acc1,$a1
|
---|
1333 | cbnz $cnt,.Lsqr8x_sub
|
---|
1334 |
|
---|
1335 | sbcs $t2,$acc2,$a2
|
---|
1336 | mov $tp,sp
|
---|
1337 | add $ap,sp,$num
|
---|
1338 | ldp $a0,$a1,[$ap_end,#8*0]
|
---|
1339 | sbcs $t3,$acc3,$a3
|
---|
1340 | stp $t0,$t1,[$rp,#8*0]
|
---|
1341 | sbcs $t0,$acc4,$a4
|
---|
1342 | ldp $a2,$a3,[$ap_end,#8*2]
|
---|
1343 | sbcs $t1,$acc5,$a5
|
---|
1344 | stp $t2,$t3,[$rp,#8*2]
|
---|
1345 | sbcs $t2,$acc6,$a6
|
---|
1346 | ldp $acc0,$acc1,[$ap,#8*0]
|
---|
1347 | sbcs $t3,$acc7,$a7
|
---|
1348 | ldp $acc2,$acc3,[$ap,#8*2]
|
---|
1349 | sbcs xzr,$topmost,xzr // did it borrow?
|
---|
1350 | ldr x30,[x29,#8] // pull return address
|
---|
1351 | stp $t0,$t1,[$rp,#8*4]
|
---|
1352 | stp $t2,$t3,[$rp,#8*6]
|
---|
1353 |
|
---|
1354 | sub $cnt,$num,#8*4
|
---|
1355 | .Lsqr4x_cond_copy:
|
---|
1356 | sub $cnt,$cnt,#8*4
|
---|
1357 | csel $t0,$acc0,$a0,lo
|
---|
1358 | stp xzr,xzr,[$tp,#8*0]
|
---|
1359 | csel $t1,$acc1,$a1,lo
|
---|
1360 | ldp $a0,$a1,[$ap_end,#8*4]
|
---|
1361 | ldp $acc0,$acc1,[$ap,#8*4]
|
---|
1362 | csel $t2,$acc2,$a2,lo
|
---|
1363 | stp xzr,xzr,[$tp,#8*2]
|
---|
1364 | add $tp,$tp,#8*4
|
---|
1365 | csel $t3,$acc3,$a3,lo
|
---|
1366 | ldp $a2,$a3,[$ap_end,#8*6]
|
---|
1367 | ldp $acc2,$acc3,[$ap,#8*6]
|
---|
1368 | add $ap,$ap,#8*4
|
---|
1369 | stp $t0,$t1,[$ap_end,#8*0]
|
---|
1370 | stp $t2,$t3,[$ap_end,#8*2]
|
---|
1371 | add $ap_end,$ap_end,#8*4
|
---|
1372 | stp xzr,xzr,[$ap,#8*0]
|
---|
1373 | stp xzr,xzr,[$ap,#8*2]
|
---|
1374 | cbnz $cnt,.Lsqr4x_cond_copy
|
---|
1375 |
|
---|
1376 | csel $t0,$acc0,$a0,lo
|
---|
1377 | stp xzr,xzr,[$tp,#8*0]
|
---|
1378 | csel $t1,$acc1,$a1,lo
|
---|
1379 | stp xzr,xzr,[$tp,#8*2]
|
---|
1380 | csel $t2,$acc2,$a2,lo
|
---|
1381 | csel $t3,$acc3,$a3,lo
|
---|
1382 | stp $t0,$t1,[$ap_end,#8*0]
|
---|
1383 | stp $t2,$t3,[$ap_end,#8*2]
|
---|
1384 |
|
---|
1385 | b .Lsqr8x_done
|
---|
1386 |
|
---|
1387 | .align 4
|
---|
1388 | .Lsqr8x8_post_condition:
|
---|
1389 | adc $carry,xzr,xzr
|
---|
1390 | ldr x30,[x29,#8] // pull return address
|
---|
1391 | // $acc0-7,$carry hold result, $a0-7 hold modulus
|
---|
1392 | subs $a0,$acc0,$a0
|
---|
1393 | ldr $ap,[x29,#96] // pull rp
|
---|
1394 | sbcs $a1,$acc1,$a1
|
---|
1395 | stp xzr,xzr,[sp,#8*0]
|
---|
1396 | sbcs $a2,$acc2,$a2
|
---|
1397 | stp xzr,xzr,[sp,#8*2]
|
---|
1398 | sbcs $a3,$acc3,$a3
|
---|
1399 | stp xzr,xzr,[sp,#8*4]
|
---|
1400 | sbcs $a4,$acc4,$a4
|
---|
1401 | stp xzr,xzr,[sp,#8*6]
|
---|
1402 | sbcs $a5,$acc5,$a5
|
---|
1403 | stp xzr,xzr,[sp,#8*8]
|
---|
1404 | sbcs $a6,$acc6,$a6
|
---|
1405 | stp xzr,xzr,[sp,#8*10]
|
---|
1406 | sbcs $a7,$acc7,$a7
|
---|
1407 | stp xzr,xzr,[sp,#8*12]
|
---|
1408 | sbcs $carry,$carry,xzr // did it borrow?
|
---|
1409 | stp xzr,xzr,[sp,#8*14]
|
---|
1410 |
|
---|
1411 | // $a0-7 hold result-modulus
|
---|
1412 | csel $a0,$acc0,$a0,lo
|
---|
1413 | csel $a1,$acc1,$a1,lo
|
---|
1414 | csel $a2,$acc2,$a2,lo
|
---|
1415 | csel $a3,$acc3,$a3,lo
|
---|
1416 | stp $a0,$a1,[$ap,#8*0]
|
---|
1417 | csel $a4,$acc4,$a4,lo
|
---|
1418 | csel $a5,$acc5,$a5,lo
|
---|
1419 | stp $a2,$a3,[$ap,#8*2]
|
---|
1420 | csel $a6,$acc6,$a6,lo
|
---|
1421 | csel $a7,$acc7,$a7,lo
|
---|
1422 | stp $a4,$a5,[$ap,#8*4]
|
---|
1423 | stp $a6,$a7,[$ap,#8*6]
|
---|
1424 |
|
---|
1425 | .Lsqr8x_done:
|
---|
1426 | ldp x19,x20,[x29,#16]
|
---|
1427 | mov sp,x29
|
---|
1428 | ldp x21,x22,[x29,#32]
|
---|
1429 | mov x0,#1
|
---|
1430 | ldp x23,x24,[x29,#48]
|
---|
1431 | ldp x25,x26,[x29,#64]
|
---|
1432 | ldp x27,x28,[x29,#80]
|
---|
1433 | ldr x29,[sp],#128
|
---|
1434 | // x30 is loaded earlier
|
---|
1435 | AARCH64_VALIDATE_LINK_REGISTER
|
---|
1436 | ret
|
---|
1437 | .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
|
---|
1438 | ___
|
---|
1439 | }
|
---|
1440 |
|
---|
1441 | {
|
---|
1442 | ########################################################################
|
---|
1443 | # Even though this might look like an ARMv8 adaptation of mulx4x_mont from
|
---|
1444 | # the x86_64-mont5 module, it differs in the sense that it performs
|
---|
1445 | # reduction 256 bits at a time.
|
---|
1446 |
|
---|
# Register map for __bn_mul4x_mont: 22 symbolic names are assigned to the
# AArch64 GPRs x6-x17 and x19-x28.  Note that x18 is deliberately skipped
# (reserved as the platform register in the AArch64 procedure-call
# standard).  x0-x5 are presumed to hold the subroutine arguments
# ($rp,$ap,$bp,$np,$n0,$num declared earlier in the file) -- TODO confirm
# against the top of the script, which is outside this view.
1447 | my ($a0,$a1,$a2,$a3,
|
---|
1448 | $t0,$t1,$t2,$t3,
|
---|
1449 | $m0,$m1,$m2,$m3,
|
---|
1450 | $acc0,$acc1,$acc2,$acc3,$acc4,
|
---|
1451 | $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
|
---|
# $bp_end aliases $rp: the result pointer register is free while the
# &b[num] bound is needed ($rp and &b[num] are offloaded to the stack
# frame at [x29,#96] and pulled back later).
1452 | my $bp_end=$rp;
|
---|
# $carry also aliases $rp; $topmost lives in x30 (the link register).
# This is legal because the prologue saves x29/x30 to the stack frame
# ("stp x29,x30,[sp,#-128]!") and the return address is reloaded from
# [x29,#8] before returning.
1453 | my ($carry,$topmost) = ($rp,"x30");
|
---|
1454 |
|
---|
# Emit the mul4x body; the heredoc runs to the matching "___" terminator
# (past the end of this view).
1455 | $code.=<<___;
|
---|
1456 | .type __bn_mul4x_mont,%function
|
---|
1457 | .align 5
|
---|
1458 | __bn_mul4x_mont:
|
---|
1459 | // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
|
---|
1460 | // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
|
---|
1461 | stp x29,x30,[sp,#-128]!
|
---|
1462 | add x29,sp,#0
|
---|
1463 | stp x19,x20,[sp,#16]
|
---|
1464 | stp x21,x22,[sp,#32]
|
---|
1465 | stp x23,x24,[sp,#48]
|
---|
1466 | stp x25,x26,[sp,#64]
|
---|
1467 | stp x27,x28,[sp,#80]
|
---|
1468 |
|
---|
1469 | sub $tp,sp,$num,lsl#3
|
---|
1470 | lsl $num,$num,#3
|
---|
1471 | ldr $n0,[$n0] // *n0
|
---|
1472 | sub sp,$tp,#8*4 // alloca
|
---|
1473 |
|
---|
1474 | add $t0,$bp,$num
|
---|
1475 | add $ap_end,$ap,$num
|
---|
1476 | stp $rp,$t0,[x29,#96] // offload rp and &b[num]
|
---|
1477 |
|
---|
1478 | ldr $bi,[$bp,#8*0] // b[0]
|
---|
1479 | ldp $a0,$a1,[$ap,#8*0] // a[0..3]
|
---|
1480 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1481 | add $ap,$ap,#8*4
|
---|
1482 | mov $acc0,xzr
|
---|
1483 | mov $acc1,xzr
|
---|
1484 | mov $acc2,xzr
|
---|
1485 | mov $acc3,xzr
|
---|
1486 | ldp $m0,$m1,[$np,#8*0] // n[0..3]
|
---|
1487 | ldp $m2,$m3,[$np,#8*2]
|
---|
1488 | adds $np,$np,#8*4 // clear carry bit
|
---|
1489 | mov $carry,xzr
|
---|
1490 | mov $cnt,#0
|
---|
1491 | mov $tp,sp
|
---|
1492 |
|
---|
1493 | .Loop_mul4x_1st_reduction:
|
---|
1494 | mul $t0,$a0,$bi // lo(a[0..3]*b[0])
|
---|
1495 | adc $carry,$carry,xzr // modulo-scheduled
|
---|
1496 | mul $t1,$a1,$bi
|
---|
1497 | add $cnt,$cnt,#8
|
---|
1498 | mul $t2,$a2,$bi
|
---|
1499 | and $cnt,$cnt,#31
|
---|
1500 | mul $t3,$a3,$bi
|
---|
1501 | adds $acc0,$acc0,$t0
|
---|
1502 | umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
|
---|
1503 | adcs $acc1,$acc1,$t1
|
---|
1504 | mul $mi,$acc0,$n0 // t[0]*n0
|
---|
1505 | adcs $acc2,$acc2,$t2
|
---|
1506 | umulh $t1,$a1,$bi
|
---|
1507 | adcs $acc3,$acc3,$t3
|
---|
1508 | umulh $t2,$a2,$bi
|
---|
1509 | adc $acc4,xzr,xzr
|
---|
1510 | umulh $t3,$a3,$bi
|
---|
1511 | ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
|
---|
1512 | adds $acc1,$acc1,$t0
|
---|
1513 | // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
|
---|
1514 | str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
|
---|
1515 | adcs $acc2,$acc2,$t1
|
---|
1516 | mul $t1,$m1,$mi
|
---|
1517 | adcs $acc3,$acc3,$t2
|
---|
1518 | mul $t2,$m2,$mi
|
---|
1519 | adc $acc4,$acc4,$t3 // can't overflow
|
---|
1520 | mul $t3,$m3,$mi
|
---|
1521 | // (*) adds xzr,$acc0,$t0
|
---|
1522 | subs xzr,$acc0,#1 // (*)
|
---|
1523 | umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
|
---|
1524 | adcs $acc0,$acc1,$t1
|
---|
1525 | umulh $t1,$m1,$mi
|
---|
1526 | adcs $acc1,$acc2,$t2
|
---|
1527 | umulh $t2,$m2,$mi
|
---|
1528 | adcs $acc2,$acc3,$t3
|
---|
1529 | umulh $t3,$m3,$mi
|
---|
1530 | adcs $acc3,$acc4,$carry
|
---|
1531 | adc $carry,xzr,xzr
|
---|
1532 | adds $acc0,$acc0,$t0
|
---|
1533 | sub $t0,$ap_end,$ap
|
---|
1534 | adcs $acc1,$acc1,$t1
|
---|
1535 | adcs $acc2,$acc2,$t2
|
---|
1536 | adcs $acc3,$acc3,$t3
|
---|
1537 | //adc $carry,$carry,xzr
|
---|
1538 | cbnz $cnt,.Loop_mul4x_1st_reduction
|
---|
1539 |
|
---|
1540 | cbz $t0,.Lmul4x4_post_condition
|
---|
1541 |
|
---|
1542 | ldp $a0,$a1,[$ap,#8*0] // a[4..7]
|
---|
1543 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1544 | add $ap,$ap,#8*4
|
---|
1545 | ldr $mi,[sp] // a[0]*n0
|
---|
1546 | ldp $m0,$m1,[$np,#8*0] // n[4..7]
|
---|
1547 | ldp $m2,$m3,[$np,#8*2]
|
---|
1548 | add $np,$np,#8*4
|
---|
1549 |
|
---|
1550 | .Loop_mul4x_1st_tail:
|
---|
1551 | mul $t0,$a0,$bi // lo(a[4..7]*b[i])
|
---|
1552 | adc $carry,$carry,xzr // modulo-scheduled
|
---|
1553 | mul $t1,$a1,$bi
|
---|
1554 | add $cnt,$cnt,#8
|
---|
1555 | mul $t2,$a2,$bi
|
---|
1556 | and $cnt,$cnt,#31
|
---|
1557 | mul $t3,$a3,$bi
|
---|
1558 | adds $acc0,$acc0,$t0
|
---|
1559 | umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
|
---|
1560 | adcs $acc1,$acc1,$t1
|
---|
1561 | umulh $t1,$a1,$bi
|
---|
1562 | adcs $acc2,$acc2,$t2
|
---|
1563 | umulh $t2,$a2,$bi
|
---|
1564 | adcs $acc3,$acc3,$t3
|
---|
1565 | umulh $t3,$a3,$bi
|
---|
1566 | adc $acc4,xzr,xzr
|
---|
1567 | ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
|
---|
1568 | adds $acc1,$acc1,$t0
|
---|
1569 | mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
|
---|
1570 | adcs $acc2,$acc2,$t1
|
---|
1571 | mul $t1,$m1,$mi
|
---|
1572 | adcs $acc3,$acc3,$t2
|
---|
1573 | mul $t2,$m2,$mi
|
---|
1574 | adc $acc4,$acc4,$t3 // can't overflow
|
---|
1575 | mul $t3,$m3,$mi
|
---|
1576 | adds $acc0,$acc0,$t0
|
---|
1577 | umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
|
---|
1578 | adcs $acc1,$acc1,$t1
|
---|
1579 | umulh $t1,$m1,$mi
|
---|
1580 | adcs $acc2,$acc2,$t2
|
---|
1581 | umulh $t2,$m2,$mi
|
---|
1582 | adcs $acc3,$acc3,$t3
|
---|
1583 | adcs $acc4,$acc4,$carry
|
---|
1584 | umulh $t3,$m3,$mi
|
---|
1585 | adc $carry,xzr,xzr
|
---|
1586 | ldr $mi,[sp,$cnt] // next t[0]*n0
|
---|
1587 | str $acc0,[$tp],#8 // result!!!
|
---|
1588 | adds $acc0,$acc1,$t0
|
---|
1589 | sub $t0,$ap_end,$ap // done yet?
|
---|
1590 | adcs $acc1,$acc2,$t1
|
---|
1591 | adcs $acc2,$acc3,$t2
|
---|
1592 | adcs $acc3,$acc4,$t3
|
---|
1593 | //adc $carry,$carry,xzr
|
---|
1594 | cbnz $cnt,.Loop_mul4x_1st_tail
|
---|
1595 |
|
---|
1596 | sub $t1,$ap_end,$num // rewinded $ap
|
---|
1597 | cbz $t0,.Lmul4x_proceed
|
---|
1598 |
|
---|
1599 | ldp $a0,$a1,[$ap,#8*0]
|
---|
1600 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1601 | add $ap,$ap,#8*4
|
---|
1602 | ldp $m0,$m1,[$np,#8*0]
|
---|
1603 | ldp $m2,$m3,[$np,#8*2]
|
---|
1604 | add $np,$np,#8*4
|
---|
1605 | b .Loop_mul4x_1st_tail
|
---|
1606 |
|
---|
1607 | .align 5
|
---|
1608 | .Lmul4x_proceed:
|
---|
1609 | ldr $bi,[$bp,#8*4]! // *++b
|
---|
1610 | adc $topmost,$carry,xzr
|
---|
1611 | ldp $a0,$a1,[$t1,#8*0] // a[0..3]
|
---|
1612 | sub $np,$np,$num // rewind np
|
---|
1613 | ldp $a2,$a3,[$t1,#8*2]
|
---|
1614 | add $ap,$t1,#8*4
|
---|
1615 |
|
---|
1616 | stp $acc0,$acc1,[$tp,#8*0] // result!!!
|
---|
1617 | ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
|
---|
1618 | stp $acc2,$acc3,[$tp,#8*2] // result!!!
|
---|
1619 | ldp $acc2,$acc3,[sp,#8*6]
|
---|
1620 |
|
---|
1621 | ldp $m0,$m1,[$np,#8*0] // n[0..3]
|
---|
1622 | mov $tp,sp
|
---|
1623 | ldp $m2,$m3,[$np,#8*2]
|
---|
1624 | adds $np,$np,#8*4 // clear carry bit
|
---|
1625 | mov $carry,xzr
|
---|
1626 |
|
---|
1627 | .align 4
|
---|
1628 | .Loop_mul4x_reduction:
|
---|
1629 | mul $t0,$a0,$bi // lo(a[0..3]*b[4])
|
---|
1630 | adc $carry,$carry,xzr // modulo-scheduled
|
---|
1631 | mul $t1,$a1,$bi
|
---|
1632 | add $cnt,$cnt,#8
|
---|
1633 | mul $t2,$a2,$bi
|
---|
1634 | and $cnt,$cnt,#31
|
---|
1635 | mul $t3,$a3,$bi
|
---|
1636 | adds $acc0,$acc0,$t0
|
---|
1637 | umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
|
---|
1638 | adcs $acc1,$acc1,$t1
|
---|
1639 | mul $mi,$acc0,$n0 // t[0]*n0
|
---|
1640 | adcs $acc2,$acc2,$t2
|
---|
1641 | umulh $t1,$a1,$bi
|
---|
1642 | adcs $acc3,$acc3,$t3
|
---|
1643 | umulh $t2,$a2,$bi
|
---|
1644 | adc $acc4,xzr,xzr
|
---|
1645 | umulh $t3,$a3,$bi
|
---|
1646 | ldr $bi,[$bp,$cnt] // next b[i]
|
---|
1647 | adds $acc1,$acc1,$t0
|
---|
1648 | // (*) mul $t0,$m0,$mi
|
---|
1649 | str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
|
---|
1650 | adcs $acc2,$acc2,$t1
|
---|
1651 | mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
|
---|
1652 | adcs $acc3,$acc3,$t2
|
---|
1653 | mul $t2,$m2,$mi
|
---|
1654 | adc $acc4,$acc4,$t3 // can't overflow
|
---|
1655 | mul $t3,$m3,$mi
|
---|
1656 | // (*) adds xzr,$acc0,$t0
|
---|
1657 | subs xzr,$acc0,#1 // (*)
|
---|
1658 | umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
|
---|
1659 | adcs $acc0,$acc1,$t1
|
---|
1660 | umulh $t1,$m1,$mi
|
---|
1661 | adcs $acc1,$acc2,$t2
|
---|
1662 | umulh $t2,$m2,$mi
|
---|
1663 | adcs $acc2,$acc3,$t3
|
---|
1664 | umulh $t3,$m3,$mi
|
---|
1665 | adcs $acc3,$acc4,$carry
|
---|
1666 | adc $carry,xzr,xzr
|
---|
1667 | adds $acc0,$acc0,$t0
|
---|
1668 | adcs $acc1,$acc1,$t1
|
---|
1669 | adcs $acc2,$acc2,$t2
|
---|
1670 | adcs $acc3,$acc3,$t3
|
---|
1671 | //adc $carry,$carry,xzr
|
---|
1672 | cbnz $cnt,.Loop_mul4x_reduction
|
---|
1673 |
|
---|
1674 | adc $carry,$carry,xzr
|
---|
1675 | ldp $t0,$t1,[$tp,#8*4] // t[4..7]
|
---|
1676 | ldp $t2,$t3,[$tp,#8*6]
|
---|
1677 | ldp $a0,$a1,[$ap,#8*0] // a[4..7]
|
---|
1678 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1679 | add $ap,$ap,#8*4
|
---|
1680 | adds $acc0,$acc0,$t0
|
---|
1681 | adcs $acc1,$acc1,$t1
|
---|
1682 | adcs $acc2,$acc2,$t2
|
---|
1683 | adcs $acc3,$acc3,$t3
|
---|
1684 | //adc $carry,$carry,xzr
|
---|
1685 |
|
---|
1686 | ldr $mi,[sp] // t[0]*n0
|
---|
1687 | ldp $m0,$m1,[$np,#8*0] // n[4..7]
|
---|
1688 | ldp $m2,$m3,[$np,#8*2]
|
---|
1689 | add $np,$np,#8*4
|
---|
1690 |
|
---|
1691 | .align 4
|
---|
1692 | .Loop_mul4x_tail:
|
---|
1693 | mul $t0,$a0,$bi // lo(a[4..7]*b[4])
|
---|
1694 | adc $carry,$carry,xzr // modulo-scheduled
|
---|
1695 | mul $t1,$a1,$bi
|
---|
1696 | add $cnt,$cnt,#8
|
---|
1697 | mul $t2,$a2,$bi
|
---|
1698 | and $cnt,$cnt,#31
|
---|
1699 | mul $t3,$a3,$bi
|
---|
1700 | adds $acc0,$acc0,$t0
|
---|
1701 | umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
|
---|
1702 | adcs $acc1,$acc1,$t1
|
---|
1703 | umulh $t1,$a1,$bi
|
---|
1704 | adcs $acc2,$acc2,$t2
|
---|
1705 | umulh $t2,$a2,$bi
|
---|
1706 | adcs $acc3,$acc3,$t3
|
---|
1707 | umulh $t3,$a3,$bi
|
---|
1708 | adc $acc4,xzr,xzr
|
---|
1709 | ldr $bi,[$bp,$cnt] // next b[i]
|
---|
1710 | adds $acc1,$acc1,$t0
|
---|
1711 | mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
|
---|
1712 | adcs $acc2,$acc2,$t1
|
---|
1713 | mul $t1,$m1,$mi
|
---|
1714 | adcs $acc3,$acc3,$t2
|
---|
1715 | mul $t2,$m2,$mi
|
---|
1716 | adc $acc4,$acc4,$t3 // can't overflow
|
---|
1717 | mul $t3,$m3,$mi
|
---|
1718 | adds $acc0,$acc0,$t0
|
---|
1719 | umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
|
---|
1720 | adcs $acc1,$acc1,$t1
|
---|
1721 | umulh $t1,$m1,$mi
|
---|
1722 | adcs $acc2,$acc2,$t2
|
---|
1723 | umulh $t2,$m2,$mi
|
---|
1724 | adcs $acc3,$acc3,$t3
|
---|
1725 | umulh $t3,$m3,$mi
|
---|
1726 | adcs $acc4,$acc4,$carry
|
---|
1727 | ldr $mi,[sp,$cnt] // next a[0]*n0
|
---|
1728 | adc $carry,xzr,xzr
|
---|
1729 | str $acc0,[$tp],#8 // result!!!
|
---|
1730 | adds $acc0,$acc1,$t0
|
---|
1731 | sub $t0,$ap_end,$ap // done yet?
|
---|
1732 | adcs $acc1,$acc2,$t1
|
---|
1733 | adcs $acc2,$acc3,$t2
|
---|
1734 | adcs $acc3,$acc4,$t3
|
---|
1735 | //adc $carry,$carry,xzr
|
---|
1736 | cbnz $cnt,.Loop_mul4x_tail
|
---|
1737 |
|
---|
1738 | sub $t1,$np,$num // rewinded np?
|
---|
1739 | adc $carry,$carry,xzr
|
---|
1740 | cbz $t0,.Loop_mul4x_break
|
---|
1741 |
|
---|
1742 | ldp $t0,$t1,[$tp,#8*4]
|
---|
1743 | ldp $t2,$t3,[$tp,#8*6]
|
---|
1744 | ldp $a0,$a1,[$ap,#8*0]
|
---|
1745 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1746 | add $ap,$ap,#8*4
|
---|
1747 | adds $acc0,$acc0,$t0
|
---|
1748 | adcs $acc1,$acc1,$t1
|
---|
1749 | adcs $acc2,$acc2,$t2
|
---|
1750 | adcs $acc3,$acc3,$t3
|
---|
1751 | //adc $carry,$carry,xzr
|
---|
1752 | ldp $m0,$m1,[$np,#8*0]
|
---|
1753 | ldp $m2,$m3,[$np,#8*2]
|
---|
1754 | add $np,$np,#8*4
|
---|
1755 | b .Loop_mul4x_tail
|
---|
1756 |
|
---|
1757 | .align 4
|
---|
1758 | .Loop_mul4x_break:
|
---|
1759 | ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
|
---|
1760 | adds $acc0,$acc0,$topmost
|
---|
1761 | add $bp,$bp,#8*4 // bp++
|
---|
1762 | adcs $acc1,$acc1,xzr
|
---|
1763 | sub $ap,$ap,$num // rewind ap
|
---|
1764 | adcs $acc2,$acc2,xzr
|
---|
1765 | stp $acc0,$acc1,[$tp,#8*0] // result!!!
|
---|
1766 | adcs $acc3,$acc3,xzr
|
---|
1767 | ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
|
---|
1768 | adc $topmost,$carry,xzr
|
---|
1769 | stp $acc2,$acc3,[$tp,#8*2] // result!!!
|
---|
1770 | cmp $bp,$t3 // done yet?
|
---|
1771 | ldp $acc2,$acc3,[sp,#8*6]
|
---|
1772 | ldp $m0,$m1,[$t1,#8*0] // n[0..3]
|
---|
1773 | ldp $m2,$m3,[$t1,#8*2]
|
---|
1774 | add $np,$t1,#8*4
|
---|
1775 | b.eq .Lmul4x_post
|
---|
1776 |
|
---|
1777 | ldr $bi,[$bp]
|
---|
1778 | ldp $a0,$a1,[$ap,#8*0] // a[0..3]
|
---|
1779 | ldp $a2,$a3,[$ap,#8*2]
|
---|
1780 | adds $ap,$ap,#8*4 // clear carry bit
|
---|
1781 | mov $carry,xzr
|
---|
1782 | mov $tp,sp
|
---|
1783 | b .Loop_mul4x_reduction
|
---|
1784 |
|
---|
1785 | .align 4
|
---|
1786 | .Lmul4x_post:
|
---|
1787 | // Final step. We see if result is larger than modulus, and
|
---|
1788 | // if it is, subtract the modulus. But comparison implies
|
---|
1789 | // subtraction. So we subtract modulus, see if it borrowed,
|
---|
1790 | // and conditionally copy original value.
|
---|
1791 | mov $rp,$t2
|
---|
1792 | mov $ap_end,$t2 // $rp copy
|
---|
1793 | subs $t0,$acc0,$m0
|
---|
1794 | add $tp,sp,#8*8
|
---|
1795 | sbcs $t1,$acc1,$m1
|
---|
1796 | sub $cnt,$num,#8*4
|
---|
1797 |
|
---|
1798 | .Lmul4x_sub:
|
---|
1799 | sbcs $t2,$acc2,$m2
|
---|
1800 | ldp $m0,$m1,[$np,#8*0]
|
---|
1801 | sub $cnt,$cnt,#8*4
|
---|
1802 | ldp $acc0,$acc1,[$tp,#8*0]
|
---|
1803 | sbcs $t3,$acc3,$m3
|
---|
1804 | ldp $m2,$m3,[$np,#8*2]
|
---|
1805 | add $np,$np,#8*4
|
---|
1806 | ldp $acc2,$acc3,[$tp,#8*2]
|
---|
1807 | add $tp,$tp,#8*4
|
---|
1808 | stp $t0,$t1,[$rp,#8*0]
|
---|
1809 | sbcs $t0,$acc0,$m0
|
---|
1810 | stp $t2,$t3,[$rp,#8*2]
|
---|
1811 | add $rp,$rp,#8*4
|
---|
1812 | sbcs $t1,$acc1,$m1
|
---|
1813 | cbnz $cnt,.Lmul4x_sub
|
---|
1814 |
|
---|
1815 | sbcs $t2,$acc2,$m2
|
---|
1816 | mov $tp,sp
|
---|
1817 | add $ap,sp,#8*4
|
---|
1818 | ldp $a0,$a1,[$ap_end,#8*0]
|
---|
1819 | sbcs $t3,$acc3,$m3
|
---|
1820 | stp $t0,$t1,[$rp,#8*0]
|
---|
1821 | ldp $a2,$a3,[$ap_end,#8*2]
|
---|
1822 | stp $t2,$t3,[$rp,#8*2]
|
---|
1823 | ldp $acc0,$acc1,[$ap,#8*0]
|
---|
1824 | ldp $acc2,$acc3,[$ap,#8*2]
|
---|
1825 | sbcs xzr,$topmost,xzr // did it borrow?
|
---|
1826 | ldr x30,[x29,#8] // pull return address
|
---|
1827 |
|
---|
1828 | sub $cnt,$num,#8*4
|
---|
1829 | .Lmul4x_cond_copy:
|
---|
1830 | sub $cnt,$cnt,#8*4
|
---|
1831 | csel $t0,$acc0,$a0,lo
|
---|
1832 | stp xzr,xzr,[$tp,#8*0]
|
---|
1833 | csel $t1,$acc1,$a1,lo
|
---|
1834 | ldp $a0,$a1,[$ap_end,#8*4]
|
---|
1835 | ldp $acc0,$acc1,[$ap,#8*4]
|
---|
1836 | csel $t2,$acc2,$a2,lo
|
---|
1837 | stp xzr,xzr,[$tp,#8*2]
|
---|
1838 | add $tp,$tp,#8*4
|
---|
1839 | csel $t3,$acc3,$a3,lo
|
---|
1840 | ldp $a2,$a3,[$ap_end,#8*6]
|
---|
1841 | ldp $acc2,$acc3,[$ap,#8*6]
|
---|
1842 | add $ap,$ap,#8*4
|
---|
1843 | stp $t0,$t1,[$ap_end,#8*0]
|
---|
1844 | stp $t2,$t3,[$ap_end,#8*2]
|
---|
1845 | add $ap_end,$ap_end,#8*4
|
---|
1846 | cbnz $cnt,.Lmul4x_cond_copy
|
---|
1847 |
|
---|
1848 | csel $t0,$acc0,$a0,lo
|
---|
1849 | stp xzr,xzr,[$tp,#8*0]
|
---|
1850 | csel $t1,$acc1,$a1,lo
|
---|
1851 | stp xzr,xzr,[$tp,#8*2]
|
---|
1852 | csel $t2,$acc2,$a2,lo
|
---|
1853 | stp xzr,xzr,[$tp,#8*3]
|
---|
1854 | csel $t3,$acc3,$a3,lo
|
---|
1855 | stp xzr,xzr,[$tp,#8*4]
|
---|
1856 | stp $t0,$t1,[$ap_end,#8*0]
|
---|
1857 | stp $t2,$t3,[$ap_end,#8*2]
|
---|
1858 |
|
---|
1859 | b .Lmul4x_done
|
---|
1860 |
|
---|
1861 | .align 4
|
---|
1862 | .Lmul4x4_post_condition:
|
---|
1863 | adc $carry,$carry,xzr
|
---|
1864 | ldr $ap,[x29,#96] // pull rp
|
---|
1865 | // $acc0-3,$carry hold result, $m0-7 hold modulus
|
---|
1866 | subs $a0,$acc0,$m0
|
---|
1867 | ldr x30,[x29,#8] // pull return address
|
---|
1868 | sbcs $a1,$acc1,$m1
|
---|
1869 | stp xzr,xzr,[sp,#8*0]
|
---|
1870 | sbcs $a2,$acc2,$m2
|
---|
1871 | stp xzr,xzr,[sp,#8*2]
|
---|
1872 | sbcs $a3,$acc3,$m3
|
---|
1873 | stp xzr,xzr,[sp,#8*4]
|
---|
1874 | sbcs xzr,$carry,xzr // did it borrow?
|
---|
1875 | stp xzr,xzr,[sp,#8*6]
|
---|
1876 |
|
---|
1877 | // $a0-3 hold result-modulus
|
---|
1878 | csel $a0,$acc0,$a0,lo
|
---|
1879 | csel $a1,$acc1,$a1,lo
|
---|
1880 | csel $a2,$acc2,$a2,lo
|
---|
1881 | csel $a3,$acc3,$a3,lo
|
---|
1882 | stp $a0,$a1,[$ap,#8*0]
|
---|
1883 | stp $a2,$a3,[$ap,#8*2]
|
---|
1884 |
|
---|
1885 | .Lmul4x_done:
|
---|
1886 | ldp x19,x20,[x29,#16]
|
---|
1887 | mov sp,x29
|
---|
1888 | ldp x21,x22,[x29,#32]
|
---|
1889 | mov x0,#1
|
---|
1890 | ldp x23,x24,[x29,#48]
|
---|
1891 | ldp x25,x26,[x29,#64]
|
---|
1892 | ldp x27,x28,[x29,#80]
|
---|
1893 | ldr x29,[sp],#128
|
---|
1894 | // x30 loaded earlier
|
---|
1895 | AARCH64_VALIDATE_LINK_REGISTER
|
---|
1896 | ret
|
---|
1897 | .size __bn_mul4x_mont,.-__bn_mul4x_mont
|
---|
1898 | ___
|
---|
1899 | }
|
---|
# Append the module identification string and alignment padding to the
# generated assembly, then finish the perlasm script by emitting the
# accumulated text. (Reconstructed from a table-mangled export: embedded
# line-number prefixes and `---|` separator rows removed to restore
# valid Perl.)
$code.=<<___;
.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___

print $code;

# Check close() explicitly: buffered-write errors on STDOUT only surface
# at close time, and a silently truncated .S file must be a hard failure.
close STDOUT or die "error closing STDOUT: $!";