VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/bn/asm/alpha-mont.pl@ 94081

此檔案自 94081 以來的最後變更為 91772,由 vboxsync 於 3 年前提交

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

檔案大小: 5.8 KB
 
1#! /usr/bin/env perl
2# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# On 21264 RSA sign performance improves by 70/35/20/15 percent for
18# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
19# instructed to '-tune host' code with in-line assembler. Other
20# benchmarks improve by 15-20%. To anchor it to something else, the
21# code provides approximately the same performance per GHz as AMD64.
22# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
23# difference.
24
# The output file name is the last command-line argument.  When no name is
# given, STDOUT is left alone and the generated assembly goes to the
# inherited standard output.
$output=pop;
if (defined($output) && $output ne "") {
    # Three-argument open treats the name purely as a file name, and a failed
    # open aborts here instead of letting the assembly go to the wrong stream
    # (the script already dies on a failed close at the end).
    open(STDOUT,">",$output) or die "can't open $output: $!";
}
27
# Register names interpolated into the assembly text further down.
#
# C prototype of the generated routine:
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# The six arguments are bound to a0..a5 in declaration order.
($rp,$ap,$bp,$np,$n0,$num)=map("a$_",0..5);

# Working values are bound to t0..t12 in the order listed.
($lo0,$hi0,$lo1,$hi1,$aj,$bi,$nj,$tp,$alo,$ahi,$nlo,$nhi,$tj)=map("t$_",0..12);

# $i and $j are the loop indices, $m1 is the per-pass multiplier; they are
# bound to s3..s5.
($i,$j,$m1)=map("s$_",3..5);
52
# Assembly text for bn_mul_mont, held in a here-document and printed at the
# end of the script.  Perl interpolates the register-name variables ($rp,
# $ap, ...) into the text, so the emitted file contains plain register names.
#
# Flow of the emitted routine, as written below:
#   1. Save ra/s3/s4/s5/fp in a 48-byte frame and keep the frame base in fp.
#   2. Return 0 immediately when num < 4.
#   3. Reserve num*8+16 bytes below the frame, round sp down to a multiple
#      of 4096, and use that area as the temporary vector tp[].
#   4. .L1st: first pass over ap[] and np[] using bp[0] and m1, where m1 is
#      the low word of lo0*n0; results go to tp[].
#   5. .Louter/.Linner: for i = 1 .. num-1, add ap[]*bp[i] and np[]*m1 to
#      tp[]; each result word is stored one slot below the tp word it used.
#   6. .Lsub: rp[] = tp[] - np[] with borrow propagation.
#   7. .Lcopy: per word, keep the rp word or overwrite it with the tp word
#      depending on the final borrow, zero tp[], and set the return value 1.
#   8. .Lexit: restore sp from fp, reload the saved registers and return.
#
# NOTE(review): the #L0/#L1/#U0/#U1 tags inside .Linner are kept as written;
# they look like issue-slot annotations, but this file does not say so.
53$code=<<___;
54#ifdef __linux__
55#include <asm/regdef.h>
56#else
57#include <asm.h>
58#include <regdef.h>
59#endif
60
61.text
62
63.set noat # AT is used as an explicit scratch register below
64.set noreorder
65
66.globl bn_mul_mont
67.align 5
68.ent bn_mul_mont
69bn_mul_mont:
70 lda sp,-48(sp) # open a 48-byte frame
71 stq ra,0(sp) # save ra
72 stq s3,8(sp) # save s3, s4, s5: they hold i, j and m1 below
73 stq s4,16(sp)
74 stq s5,24(sp)
75 stq fp,32(sp) # save fp
76 mov sp,fp # fp = frame base, used to restore sp at .Lexit
77 .mask 0x0400f000,-48
78 .frame fp,48,ra
79 .prologue 0
80
81 .align 4
82 .set reorder
83 sextl $num,$num # sign-extend the 32-bit num argument
84 mov 0,v0 # return value 0 for the early-exit path
85 cmplt $num,4,AT # AT = (num < 4)
86 bne AT,.Lexit # fewer than 4 words: return 0 without touching rp
87
88 ldq $hi0,0($ap) # ap[0]
89 s8addq $num,16,AT # AT = num*8+16, byte size reserved for tp[]
90 ldq $aj,8($ap) # ap[1]
91 subq sp,AT,sp # reserve the tp[] area below the frame
92 ldq $bi,0($bp) # bp[0]
93 lda AT,-4096(zero) # mov -4096,AT
94 ldq $n0,0($n0) # replace the n0 pointer with the word it points to
95 and sp,AT,sp # round sp down to a multiple of 4096
96
97 mulq $hi0,$bi,$lo0 # lo0 = low word of ap[0]*bp[0]
98 ldq $hi1,0($np) # np[0]
99 umulh $hi0,$bi,$hi0 # hi0 = high word of ap[0]*bp[0]
100 ldq $nj,8($np) # np[1]
101
102 mulq $lo0,$n0,$m1 # m1 = low word of lo0*n0
103
104 mulq $hi1,$m1,$lo1 # lo1 = low word of np[0]*m1
105 umulh $hi1,$m1,$hi1 # hi1 = high word of np[0]*m1
106
107 addq $lo1,$lo0,$lo1 # lo1 += lo0
108 cmpult $lo1,$lo0,AT # AT = carry out of that addition
109 addq $hi1,AT,$hi1 # fold the carry into hi1
110
111 mulq $aj,$bi,$alo # alo = low word of ap[1]*bp[0]
112 mov 2,$j # j = 2
113 umulh $aj,$bi,$ahi # ahi = high word of ap[1]*bp[0]
114 mov sp,$tp # tp = start of the temporary vector
115
116 mulq $nj,$m1,$nlo # nlo = low word of np[1]*m1
117 s8addq $j,$ap,$aj # aj = address of ap[j]
118 umulh $nj,$m1,$nhi # nhi = high word of np[1]*m1
119 s8addq $j,$np,$nj # nj = address of np[j]
120.align 4
121.L1st: # first pass: one result word per iteration, using bp[0]
122 .set noreorder
123 ldq $aj,0($aj) # load the ap word whose address was formed earlier
124 addl $j,1,$j # j = j + 1
125 ldq $nj,0($nj) # load the matching np word
126 lda $tp,8($tp) # advance tp by one word
127
128 addq $alo,$hi0,$lo0 # lo0 = alo + previous high word
129 mulq $aj,$bi,$alo # next ap word times bp[0], low word
130 cmpult $lo0,$hi0,AT # AT = carry out of lo0
131 addq $nlo,$hi1,$lo1 # lo1 = nlo + previous high word
132
133 mulq $nj,$m1,$nlo # next np word times m1, low word
134 addq $ahi,AT,$hi0 # hi0 = ahi + carry
135 cmpult $lo1,$hi1,v0 # v0 = carry out of lo1
136 cmplt $j,$num,$tj # tj = (j < num), the loop condition
137
138 umulh $aj,$bi,$ahi # next ap word times bp[0], high word
139 addq $nhi,v0,$hi1 # hi1 = nhi + carry
140 addq $lo1,$lo0,$lo1 # lo1 += lo0
141 s8addq $j,$ap,$aj # aj = address of ap[j] for the next iteration
142
143 umulh $nj,$m1,$nhi # next np word times m1, high word
144 cmpult $lo1,$lo0,v0 # v0 = carry out of lo1 += lo0
145 addq $hi1,v0,$hi1 # fold the carry into hi1
146 s8addq $j,$np,$nj # nj = address of np[j] for the next iteration
147
148 stq $lo1,-8($tp) # store the finished word one slot behind tp
149 nop
150 unop
151 bne $tj,.L1st # repeat while j < num
152 .set reorder
153
154 addq $alo,$hi0,$lo0 # loop tail: combine the last pending products
155 addq $nlo,$hi1,$lo1
156 cmpult $lo0,$hi0,AT
157 cmpult $lo1,$hi1,v0
158 addq $ahi,AT,$hi0
159 addq $nhi,v0,$hi1
160
161 addq $lo1,$lo0,$lo1
162 cmpult $lo1,$lo0,v0
163 addq $hi1,v0,$hi1
164
165 stq $lo1,0($tp) # store the last low word
166
167 addq $hi1,$hi0,$hi1 # sum of the two high words
168 cmpult $hi1,$hi0,AT # AT = carry out of that sum
169 stq $hi1,8($tp) # store the top word
170 stq AT,16($tp) # store the carry above it
171
172 mov 1,$i # i = 1
173.align 4
174.Louter: # one pass per remaining bp word
175 s8addq $i,$bp,$bi # bi = address of bp[i]
176 ldq $hi0,0($ap) # ap[0]
177 ldq $aj,8($ap) # ap[1]
178 ldq $bi,0($bi) # bi = bp[i]
179 ldq $hi1,0($np) # np[0]
180 ldq $nj,8($np) # np[1]
181 ldq $tj,0(sp) # tp[0]
182
183 mulq $hi0,$bi,$lo0 # lo0 = low word of ap[0]*bp[i]
184 umulh $hi0,$bi,$hi0 # hi0 = high word of ap[0]*bp[i]
185
186 addq $lo0,$tj,$lo0 # lo0 += tp[0]
187 cmpult $lo0,$tj,AT # AT = carry out of that addition
188 addq $hi0,AT,$hi0 # fold the carry into hi0
189
190 mulq $lo0,$n0,$m1 # m1 = low word of lo0*n0 for this pass
191
192 mulq $hi1,$m1,$lo1 # lo1 = low word of np[0]*m1
193 umulh $hi1,$m1,$hi1 # hi1 = high word of np[0]*m1
194
195 addq $lo1,$lo0,$lo1 # lo1 += lo0
196 cmpult $lo1,$lo0,AT # AT = carry out of that addition
197 mov 2,$j # j = 2
198 addq $hi1,AT,$hi1 # fold the carry into hi1
199
200 mulq $aj,$bi,$alo # alo = low word of ap[1]*bp[i]
201 mov sp,$tp # tp = start of the temporary vector
202 umulh $aj,$bi,$ahi # ahi = high word of ap[1]*bp[i]
203
204 mulq $nj,$m1,$nlo # nlo = low word of np[1]*m1
205 s8addq $j,$ap,$aj # aj = address of ap[j]
206 umulh $nj,$m1,$nhi # nhi = high word of np[1]*m1
207.align 4
208.Linner: # same word step as .L1st, plus the tp word left by the previous pass
209 .set noreorder
210 ldq $tj,8($tp) #L0 tp word to add in
211 nop #U1
212 ldq $aj,0($aj) #L1 load the ap word whose address was formed earlier
213 s8addq $j,$np,$nj #U0 nj = address of np[j]
214
215 ldq $nj,0($nj) #L0 load np[j]
216 nop #U1
217 addq $alo,$hi0,$lo0 #L1 lo0 = alo + previous high word
218 lda $tp,8($tp) # advance tp by one word
219
220 mulq $aj,$bi,$alo #U1 next ap word times bp[i], low word
221 cmpult $lo0,$hi0,AT #L0 AT = carry out of lo0
222 addq $nlo,$hi1,$lo1 #L1 lo1 = nlo + previous high word
223 addl $j,1,$j # j = j + 1
224
225 mulq $nj,$m1,$nlo #U1 next np word times m1, low word
226 addq $ahi,AT,$hi0 #L0 hi0 = ahi + carry
227 addq $lo0,$tj,$lo0 #L1 lo0 += tp word
228 cmpult $lo1,$hi1,v0 #U0 v0 = carry out of lo1
229
230 umulh $aj,$bi,$ahi #U1 next ap word times bp[i], high word
231 cmpult $lo0,$tj,AT #L0 AT = carry out of lo0 += tp word
232 addq $lo1,$lo0,$lo1 #L1 lo1 += lo0
233 addq $nhi,v0,$hi1 #U0 hi1 = nhi + carry
234
235 umulh $nj,$m1,$nhi #U1 next np word times m1, high word
236 s8addq $j,$ap,$aj #L0 aj = address of ap[j] for the next iteration
237 cmpult $lo1,$lo0,v0 #L1 v0 = carry out of lo1 += lo0
238 cmplt $j,$num,$tj #U0 # borrow $tj
239
240 addq $hi0,AT,$hi0 #L0 fold the tp-word carry into hi0
241 addq $hi1,v0,$hi1 #U1 fold the carry into hi1
242 stq $lo1,-8($tp) #L1 store the finished word one slot behind tp
243 bne $tj,.Linner #U0 repeat while j < num
244 .set reorder
245
246 ldq $tj,8($tp) # loop tail: tp word for the last position
247 addq $alo,$hi0,$lo0
248 addq $nlo,$hi1,$lo1
249 cmpult $lo0,$hi0,AT
250 cmpult $lo1,$hi1,v0
251 addq $ahi,AT,$hi0
252 addq $nhi,v0,$hi1
253
254 addq $lo0,$tj,$lo0 # lo0 += tp word
255 cmpult $lo0,$tj,AT
256 addq $hi0,AT,$hi0
257
258 ldq $tj,16($tp) # carry word stored at the end of the previous pass
259 addq $lo1,$lo0,$j # j is reused here as scratch for the sum
260 cmpult $j,$lo0,v0
261 addq $hi1,v0,$hi1
262
263 addq $hi1,$hi0,$lo1 # lo1 = sum of the two high words
264 stq $j,0($tp) # store the last low word
265 cmpult $lo1,$hi0,$hi1 # hi1 = carry out of that sum
266 addq $lo1,$tj,$lo1 # add the previous carry word
267 cmpult $lo1,$tj,AT # AT = carry out of that addition
268 addl $i,1,$i # i = i + 1
269 addq $hi1,AT,$hi1 # hi1 = combined carry
270 stq $lo1,8($tp) # store the top word
271 cmplt $i,$num,$tj # borrow $tj
272 stq $hi1,16($tp) # store the carry above it
273 bne $tj,.Louter # repeat while i < num
274
275
276 s8addq $num,sp,$tj # &tp[num]
277 mov $rp,$bp # put rp aside
278 mov sp,$tp # tp = start of the temporary vector
279 mov sp,$ap # ap is overwritten here; it is not read again below
280 mov 0,$hi0 # clear borrow bit
281
282.align 4
283.Lsub: ldq $lo0,0($tp)
284 ldq $lo1,0($np)
285 lda $tp,8($tp)
286 lda $np,8($np)
287 subq $lo0,$lo1,$lo1 # tp[i]-np[i]
288 cmpult $lo0,$lo1,AT # AT = borrow from that subtraction
289 subq $lo1,$hi0,$lo0 # subtract the incoming borrow
290 cmpult $lo1,$lo0,$hi0 # borrow from the second subtraction
291 or $hi0,AT,$hi0 # outgoing borrow
292 stq $lo0,0($rp) # store the difference word to rp
293 cmpult $tp,$tj,v0 # v0 = (tp is still below &tp[num])
294 lda $rp,8($rp)
295 bne v0,.Lsub
296
297 subq $hi1,$hi0,$hi0 # handle upmost overflow bit
298 mov sp,$tp # rewind tp to the start of the temporary vector
299 mov $bp,$rp # restore rp
300
301.align 4
302.Lcopy: ldq $aj,0($tp) # conditional copy
303 ldq $nj,0($rp)
304 lda $tp,8($tp)
305 lda $rp,8($rp)
306 cmoveq $hi0,$nj,$aj # if hi0 is zero take the rp word, else keep the tp word
307 stq zero,-8($tp) # zap tp
308 cmpult $tp,$tj,AT
309 stq $aj,-8($rp) # write the selected word to rp
310 bne AT,.Lcopy
311 mov 1,v0 # return value 1
312
313.Lexit:
314 .set noreorder
315 mov fp,sp # sp = frame base saved in fp, dropping the tp[] area
316 /*ldq ra,0(sp)*/
317 ldq s3,8(sp)
318 ldq s4,16(sp)
319 ldq s5,24(sp)
320 ldq fp,32(sp)
321 lda sp,48(sp) # release the 48-byte frame
322 ret (ra)
323.end bn_mul_mont
324.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
325.align 2
326___
327
# Write the generated assembly to STDOUT (redirected to the output file when
# one was opened) and fail loudly if the final flush/close does not succeed.
print STDOUT $code;
unless (close(STDOUT)) {
    die "error closing STDOUT: $!";
}
注意:請參閱 TracBrowser,以取得儲存庫瀏覽器的使用說明。

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette