VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/sha/asm/keccak1600-x86_64.pl@94081

Last change on this file since 94081 was 91772, checked in by vboxsync, 3 years ago:

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

File size: 13.9 KB

#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# June 2017.
#
# The code below is a [lane complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Instead
# of actually unrolling the loop pair-wise, though, I simply flip the
# pointers to T[][] and A[][] at the end of each round. Since the number
# of rounds is even, the last round writes to A[][] and everything works
# out. How does it compare to the x86_64 assembly module in the Keccak
# Code Package? Depending on processor it's either as fast or faster by
# up to 15%...
########################################################################
# Numbers are cycles per processed byte on a large message.
#
#			r=1088(*)
#
# P4			25.8
# Core 2		12.9
# Westmere		13.7
# Sandy Bridge		12.9(**)
# Haswell		9.6
# Skylake		9.4
# Silvermont		22.8
# Goldmont		15.8
# VIA Nano		17.3
# Sledgehammer		13.3
# Bulldozer		16.5
# Ryzen			8.8
#
# (*)	Corresponds to SHA3-256. The improvement over compiler-generated
#	code varies a lot; the most common coefficient is 15% in comparison
#	to gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**)	Sandy Bridge has a broken rotate instruction. Performance can be
#	improved by 14% by replacing the rotates with a double-precision
#	shift using the same register as source and destination.
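#
#	For example, "rol $14,%rax" becomes the equivalent
#	"shld $14,%rax,%rax"; the first commented-out substitution in the
#	output loop at the bottom of this file applies exactly that rewrite.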

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));

my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my $iotas = "%r15";

my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);
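
# Each $A[$i][$j] is the byte offset 8*(5*$i + $j) - 100 of that lane within
# the flat A[5][5] state. The -100 bias (compensated by the "lea 100(%rdi)"
# size optimization below) keeps every displacement within the signed 8-bit
# range, shaving a byte off most memory operands. @rhotates holds the Keccak
# rho rotation counts, indexed the same way as @A.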

$code.=<<___;
.text

.type   __KeccakF1600,\@abi-omnipotent
.align  32
__KeccakF1600:
.cfi_startproc
        mov     $A[4][0](%rdi),@C[0]
        mov     $A[4][1](%rdi),@C[1]
        mov     $A[4][2](%rdi),@C[2]
        mov     $A[4][3](%rdi),@C[3]
        mov     $A[4][4](%rdi),@C[4]
        jmp     .Loop

.align  32
.Loop:
        mov     $A[0][0](%rdi),@D[0]
        mov     $A[1][1](%rdi),@D[1]
        mov     $A[2][2](%rdi),@D[2]
        mov     $A[3][3](%rdi),@D[3]

        xor     $A[0][2](%rdi),@C[2]
        xor     $A[0][3](%rdi),@C[3]
        xor     @D[0],@C[0]
        xor     $A[0][1](%rdi),@C[1]
        xor     $A[1][2](%rdi),@C[2]
        xor     $A[1][0](%rdi),@C[0]
        mov     @C[4],@D[4]
        xor     $A[0][4](%rdi),@C[4]

        xor     @D[2],@C[2]
        xor     $A[2][0](%rdi),@C[0]
        xor     $A[1][3](%rdi),@C[3]
        xor     @D[1],@C[1]
        xor     $A[1][4](%rdi),@C[4]

        xor     $A[3][2](%rdi),@C[2]
        xor     $A[3][0](%rdi),@C[0]
        xor     $A[2][3](%rdi),@C[3]
        xor     $A[2][1](%rdi),@C[1]
        xor     $A[2][4](%rdi),@C[4]

        mov     @C[2],@T[0]
        rol     \$1,@C[2]
        xor     @C[0],@C[2]             # D[1] = ROL64(C[2], 1) ^ C[0]
        xor     @D[3],@C[3]

        rol     \$1,@C[0]
        xor     @C[3],@C[0]             # D[4] = ROL64(C[0], 1) ^ C[3]
        xor     $A[3][1](%rdi),@C[1]

        rol     \$1,@C[3]
        xor     @C[1],@C[3]             # D[2] = ROL64(C[3], 1) ^ C[1]
        xor     $A[3][4](%rdi),@C[4]

        rol     \$1,@C[1]
        xor     @C[4],@C[1]             # D[0] = ROL64(C[1], 1) ^ C[4]

        rol     \$1,@C[4]
        xor     @T[0],@C[4]             # D[3] = ROL64(C[4], 1) ^ C[2]
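
        # i.e. theta's D[x] = C[x-1] ^ ROL64(C[x+1], 1) with indices mod 5;
        # the D values are left in the C register bank, rotated one position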
___
        (@D[0..4], @C) = (@C[1..4,0], @D);
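        # The five D lanes just computed sit in @C[1..4,0] and the old @D
        # registers are now free, so swap the two register banks by renaming
        # them instead of moving any data.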
$code.=<<___;
        xor     @D[1],@C[1]
        xor     @D[2],@C[2]
        rol     \$$rhotates[1][1],@C[1]
        xor     @D[3],@C[3]
        xor     @D[4],@C[4]
        rol     \$$rhotates[2][2],@C[2]
        xor     @D[0],@C[0]
        mov     @C[1],@T[0]
        rol     \$$rhotates[3][3],@C[3]
        or      @C[2],@C[1]
        xor     @C[0],@C[1]             #           C[0] ^ ( C[1] | C[2])
        rol     \$$rhotates[4][4],@C[4]

        xor     ($iotas),@C[1]
        lea     8($iotas),$iotas

        mov     @C[4],@T[1]
        and     @C[3],@C[4]
        mov     @C[1],$A[0][0](%rsi)    # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
        xor     @C[2],@C[4]             #           C[2] ^ ( C[4] & C[3])
        not     @C[2]
        mov     @C[4],$A[0][2](%rsi)    # R[0][2] = C[2] ^ ( C[4] & C[3])

        or      @C[3],@C[2]
        mov     $A[4][2](%rdi),@C[4]
        xor     @T[0],@C[2]             #           C[1] ^ (~C[2] | C[3])
        mov     @C[2],$A[0][1](%rsi)    # R[0][1] = C[1] ^ (~C[2] | C[3])

        and     @C[0],@T[0]
        mov     $A[1][4](%rdi),@C[1]
        xor     @T[1],@T[0]             #           C[4] ^ ( C[1] & C[0])
        mov     $A[2][0](%rdi),@C[2]
        mov     @T[0],$A[0][4](%rsi)    # R[0][4] = C[4] ^ ( C[1] & C[0])

        or      @C[0],@T[1]
        mov     $A[0][3](%rdi),@C[0]
        xor     @C[3],@T[1]             #           C[3] ^ ( C[4] | C[0])
        mov     $A[3][1](%rdi),@C[3]
        mov     @T[1],$A[0][3](%rsi)    # R[0][3] = C[3] ^ ( C[4] | C[0])
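
        # Plain chi would be R[x] = C[x] ^ (~C[x+1] & C[x+2]) lane by lane.
        # Because selected lanes are kept bitwise-complemented, De Morgan's
        # law (~p & q == ~(p | ~q)) folds most of the 25 per-round NOTs into
        # the OR/AND mix above, e.g. R[0][0] = C[0] ^ (C[1] | C[2]).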


        xor     @D[3],@C[0]
        xor     @D[2],@C[4]
        rol     \$$rhotates[0][3],@C[0]
        xor     @D[1],@C[3]
        xor     @D[4],@C[1]
        rol     \$$rhotates[4][2],@C[4]
        rol     \$$rhotates[3][1],@C[3]
        xor     @D[0],@C[2]
        rol     \$$rhotates[1][4],@C[1]
        mov     @C[0],@T[0]
        or      @C[4],@C[0]
        rol     \$$rhotates[2][0],@C[2]

        xor     @C[3],@C[0]             #           C[3] ^ (C[0] | C[4])
        mov     @C[0],$A[1][3](%rsi)    # R[1][3] = C[3] ^ (C[0] | C[4])

        mov     @C[1],@T[1]
        and     @T[0],@C[1]
        mov     $A[0][1](%rdi),@C[0]
        xor     @C[4],@C[1]             #           C[4] ^ (C[1] & C[0])
        not     @C[4]
        mov     @C[1],$A[1][4](%rsi)    # R[1][4] = C[4] ^ (C[1] & C[0])

        or      @C[3],@C[4]
        mov     $A[1][2](%rdi),@C[1]
        xor     @C[2],@C[4]             #           C[2] ^ (~C[4] | C[3])
        mov     @C[4],$A[1][2](%rsi)    # R[1][2] = C[2] ^ (~C[4] | C[3])

        and     @C[2],@C[3]
        mov     $A[4][0](%rdi),@C[4]
        xor     @T[1],@C[3]             #           C[1] ^ (C[3] & C[2])
        mov     @C[3],$A[1][1](%rsi)    # R[1][1] = C[1] ^ (C[3] & C[2])

        or      @C[2],@T[1]
        mov     $A[2][3](%rdi),@C[2]
        xor     @T[0],@T[1]             #           C[0] ^ (C[1] | C[2])
        mov     $A[3][4](%rdi),@C[3]
        mov     @T[1],$A[1][0](%rsi)    # R[1][0] = C[0] ^ (C[1] | C[2])


        xor     @D[3],@C[2]
        xor     @D[4],@C[3]
        rol     \$$rhotates[2][3],@C[2]
        xor     @D[2],@C[1]
        rol     \$$rhotates[3][4],@C[3]
        xor     @D[0],@C[4]
        rol     \$$rhotates[1][2],@C[1]
        xor     @D[1],@C[0]
        rol     \$$rhotates[4][0],@C[4]
        mov     @C[2],@T[0]
        and     @C[3],@C[2]
        rol     \$$rhotates[0][1],@C[0]

        not     @C[3]
        xor     @C[1],@C[2]             #            C[1] ^ ( C[2] & C[3])
        mov     @C[2],$A[2][1](%rsi)    # R[2][1] =  C[1] ^ ( C[2] & C[3])

        mov     @C[4],@T[1]
        and     @C[3],@C[4]
        mov     $A[2][1](%rdi),@C[2]
        xor     @T[0],@C[4]             #            C[2] ^ ( C[4] & ~C[3])
        mov     @C[4],$A[2][2](%rsi)    # R[2][2] =  C[2] ^ ( C[4] & ~C[3])

        or      @C[1],@T[0]
        mov     $A[4][3](%rdi),@C[4]
        xor     @C[0],@T[0]             #            C[0] ^ ( C[2] | C[1])
        mov     @T[0],$A[2][0](%rsi)    # R[2][0] =  C[0] ^ ( C[2] | C[1])

        and     @C[0],@C[1]
        xor     @T[1],@C[1]             #            C[4] ^ ( C[1] & C[0])
        mov     @C[1],$A[2][4](%rsi)    # R[2][4] =  C[4] ^ ( C[1] & C[0])

        or      @C[0],@T[1]
        mov     $A[1][0](%rdi),@C[1]
        xor     @C[3],@T[1]             #           ~C[3] ^ ( C[0] | C[4])
        mov     $A[3][2](%rdi),@C[3]
        mov     @T[1],$A[2][3](%rsi)    # R[2][3] = ~C[3] ^ ( C[0] | C[4])

        mov     $A[0][4](%rdi),@C[0]

        xor     @D[1],@C[2]
        xor     @D[2],@C[3]
        rol     \$$rhotates[2][1],@C[2]
        xor     @D[0],@C[1]
        rol     \$$rhotates[3][2],@C[3]
        xor     @D[3],@C[4]
        rol     \$$rhotates[1][0],@C[1]
        xor     @D[4],@C[0]
        rol     \$$rhotates[4][3],@C[4]
        mov     @C[2],@T[0]
        or      @C[3],@C[2]
        rol     \$$rhotates[0][4],@C[0]

        not     @C[3]
        xor     @C[1],@C[2]             #            C[1] ^ ( C[2] | C[3])
        mov     @C[2],$A[3][1](%rsi)    # R[3][1] =  C[1] ^ ( C[2] | C[3])

        mov     @C[4],@T[1]
        or      @C[3],@C[4]
        xor     @T[0],@C[4]             #            C[2] ^ ( C[4] | ~C[3])
        mov     @C[4],$A[3][2](%rsi)    # R[3][2] =  C[2] ^ ( C[4] | ~C[3])

        and     @C[1],@T[0]
        xor     @C[0],@T[0]             #            C[0] ^ ( C[2] & C[1])
        mov     @T[0],$A[3][0](%rsi)    # R[3][0] =  C[0] ^ ( C[2] & C[1])

        or      @C[0],@C[1]
        xor     @T[1],@C[1]             #            C[4] ^ ( C[1] | C[0])
        mov     @C[1],$A[3][4](%rsi)    # R[3][4] =  C[4] ^ ( C[1] | C[0])

        and     @T[1],@C[0]
        xor     @C[3],@C[0]             #           ~C[3] ^ ( C[0] & C[4])
        mov     @C[0],$A[3][3](%rsi)    # R[3][3] = ~C[3] ^ ( C[0] & C[4])


        xor     $A[0][2](%rdi),@D[2]
        xor     $A[1][3](%rdi),@D[3]
        rol     \$$rhotates[0][2],@D[2]
        xor     $A[4][1](%rdi),@D[1]
        rol     \$$rhotates[1][3],@D[3]
        xor     $A[2][4](%rdi),@D[4]
        rol     \$$rhotates[4][1],@D[1]
        xor     $A[3][0](%rdi),@D[0]
        xchg    %rsi,%rdi               # swap source A[][] and destination T[][]
        rol     \$$rhotates[2][4],@D[4]
        rol     \$$rhotates[3][0],@D[0]
___
        # The rho-rotated lanes held in @D become the chi inputs for the
        # last row, reordered here to match the R[4][*] comments below.
        @C = @D[2..4,0,1];
$code.=<<___;
        mov     @C[0],@T[0]
        and     @C[1],@C[0]
        not     @C[1]
        xor     @C[4],@C[0]             #            C[4] ^ ( C[0] & C[1])
        mov     @C[0],$A[4][4](%rdi)    # R[4][4] =  C[4] ^ ( C[0] & C[1])

        mov     @C[2],@T[1]
        and     @C[1],@C[2]
        xor     @T[0],@C[2]             #            C[0] ^ ( C[2] & ~C[1])
        mov     @C[2],$A[4][0](%rdi)    # R[4][0] =  C[0] ^ ( C[2] & ~C[1])

        or      @C[4],@T[0]
        xor     @C[3],@T[0]             #            C[3] ^ ( C[0] | C[4])
        mov     @T[0],$A[4][3](%rdi)    # R[4][3] =  C[3] ^ ( C[0] | C[4])

        and     @C[3],@C[4]
        xor     @T[1],@C[4]             #            C[2] ^ ( C[4] & C[3])
        mov     @C[4],$A[4][2](%rdi)    # R[4][2] =  C[2] ^ ( C[4] & C[3])

        or      @T[1],@C[3]
        xor     @C[1],@C[3]             #           ~C[1] ^ ( C[2] | C[3])
        mov     @C[3],$A[4][1](%rdi)    # R[4][1] = ~C[1] ^ ( C[2] | C[3])

        mov     @C[0],@C[1]             # harmonize with the loop top
        mov     @T[0],@C[0]

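        # The iota pointer starts 64 bytes past a 256-byte boundary (see the
        # .align 256 and the eight zero quads just before the table at the
        # bottom), so after 24 rounds of stepping 8 bytes its low byte wraps
        # around to zero and the loop falls through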
        test    \$255,$iotas
        jnz     .Loop

        lea     -192($iotas),$iotas     # rewind iotas
        ret
.cfi_endproc
.size   __KeccakF1600,.-__KeccakF1600

.type   KeccakF1600,\@abi-omnipotent
.align  32
KeccakF1600:
.cfi_startproc
        push    %rbx
.cfi_push       %rbx
        push    %rbp
.cfi_push       %rbp
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14
        push    %r15
.cfi_push       %r15

        lea     100(%rdi),%rdi          # size optimization
        sub     \$200,%rsp              # allocate T[5][5]
.cfi_adjust_cfa_offset  200

        notq    $A[0][1](%rdi)          # enter the complemented-lane
        notq    $A[0][2](%rdi)          # representation expected by
        notq    $A[1][3](%rdi)          # __KeccakF1600
        notq    $A[2][2](%rdi)
        notq    $A[3][2](%rdi)
        notq    $A[4][0](%rdi)

        lea     iotas(%rip),$iotas
        lea     100(%rsp),%rsi          # size optimization

        call    __KeccakF1600

        notq    $A[0][1](%rdi)          # undo the complementing, back to
        notq    $A[0][2](%rdi)          # the canonical representation
        notq    $A[1][3](%rdi)
        notq    $A[2][2](%rdi)
        notq    $A[3][2](%rdi)
        notq    $A[4][0](%rdi)
        lea     -100(%rdi),%rdi         # preserve A[][]

        add     \$200,%rsp
.cfi_adjust_cfa_offset  -200

        pop     %r15
.cfi_pop        %r15
        pop     %r14
.cfi_pop        %r14
        pop     %r13
.cfi_pop        %r13
        pop     %r12
.cfi_pop        %r12
        pop     %rbp
.cfi_pop        %rbp
        pop     %rbx
.cfi_pop        %rbx
        ret
.cfi_endproc
.size   KeccakF1600,.-KeccakF1600
___

{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
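
# SHA3_absorb(A_flat, inp, len, bsz) XORs whole bsz-byte blocks of the input
# into the flat A[5][5] state, permuting after each block, and returns the
# number of leftover bytes (< bsz) for the caller to buffer. A rough C model
# of that contract (not code from this module; load64le() is a hypothetical
# little-endian 64-bit load):
#
#	size_t SHA3_absorb(uint64_t A[25], const unsigned char *inp,
#	                   size_t len, size_t bsz)
#	{
#	    while (len >= bsz) {
#	        for (size_t i = 0; i < bsz / 8; i++)
#	            A[i] ^= load64le(inp + 8 * i);
#	        KeccakF1600(A);
#	        inp += bsz;
#	        len -= bsz;
#	    }
#	    return len;	/* tail the caller must hold on to */
#	}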
$code.=<<___;
.globl  SHA3_absorb
.type   SHA3_absorb,\@function,4
.align  32
SHA3_absorb:
.cfi_startproc
        push    %rbx
.cfi_push       %rbx
        push    %rbp
.cfi_push       %rbp
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14
        push    %r15
.cfi_push       %r15

        lea     100(%rdi),%rdi          # size optimization
        sub     \$232,%rsp              # T[5][5] plus room to spill inp, len and bsz
.cfi_adjust_cfa_offset  232

        mov     %rsi,$inp
        lea     100(%rsp),%rsi          # size optimization

        notq    $A[0][1](%rdi)
        notq    $A[0][2](%rdi)
        notq    $A[1][3](%rdi)
        notq    $A[2][2](%rdi)
        notq    $A[3][2](%rdi)
        notq    $A[4][0](%rdi)
        lea     iotas(%rip),$iotas

        mov     $bsz,216-100(%rsi)      # save bsz

.Loop_absorb:
        cmp     $bsz,$len               # any more full blocks?
        jc      .Ldone_absorb

        shr     \$3,$bsz                # bsz /= 8, count in lanes
        lea     -100(%rdi),$A_flat

.Lblock_absorb:
        mov     ($inp),%rax
        lea     8($inp),$inp
        xor     ($A_flat),%rax          # absorb one 64-bit lane
        lea     8($A_flat),$A_flat
        sub     \$8,$len
        mov     %rax,-8($A_flat)
        sub     \$1,$bsz
        jnz     .Lblock_absorb

        mov     $inp,200-100(%rsi)      # save inp
        mov     $len,208-100(%rsi)      # save len
        call    __KeccakF1600
        mov     200-100(%rsi),$inp      # pull inp
        mov     208-100(%rsi),$len      # pull len
        mov     216-100(%rsi),$bsz      # pull bsz
        jmp     .Loop_absorb

.align  32
.Ldone_absorb:
        mov     $len,%rax               # return value: bytes left over

        notq    $A[0][1](%rdi)
        notq    $A[0][2](%rdi)
        notq    $A[1][3](%rdi)
        notq    $A[2][2](%rdi)
        notq    $A[3][2](%rdi)
        notq    $A[4][0](%rdi)

        add     \$232,%rsp
.cfi_adjust_cfa_offset  -232

        pop     %r15
.cfi_pop        %r15
        pop     %r14
.cfi_pop        %r14
        pop     %r13
.cfi_pop        %r13
        pop     %r12
.cfi_pop        %r12
        pop     %rbp
.cfi_pop        %rbp
        pop     %rbx
.cfi_pop        %rbx
        ret
.cfi_endproc
.size   SHA3_absorb,.-SHA3_absorb
___
}
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");
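
# SHA3_squeeze(A_flat, out, len, bsz) copies len bytes of output out of the
# state, re-permuting whenever a full bsz-byte block has been consumed and
# more output is still wanted. A rough C model of that contract (not code
# from this module; bytes are taken little-endian straight from A[], as on
# x86_64):
#
#	void SHA3_squeeze(uint64_t A[25], unsigned char *out,
#	                  size_t len, size_t bsz)
#	{
#	    size_t avail = bsz;
#	    while (len != 0) {
#	        size_t n = len < avail ? len : avail;
#	        memcpy(out, (unsigned char *)A + (bsz - avail), n);
#	        out += n; len -= n; avail -= n;
#	        if (len != 0 && avail == 0) {	/* block used up, more wanted */
#	            KeccakF1600(A);
#	            avail = bsz;
#	        }
#	    }
#	}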

$code.=<<___;
.globl  SHA3_squeeze
.type   SHA3_squeeze,\@function,4
.align  32
SHA3_squeeze:
.cfi_startproc
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14

        shr     \$3,%rcx                # bsz /= 8, count in lanes
        mov     $A_flat,%r8
        mov     %rsi,$out
        mov     %rdx,$len
        mov     %rcx,$bsz
        jmp     .Loop_squeeze

.align  32
.Loop_squeeze:
        cmp     \$8,$len
        jb      .Ltail_squeeze

        mov     (%r8),%rax
        lea     8(%r8),%r8
        mov     %rax,($out)
        lea     8($out),$out
        sub     \$8,$len                # len -= 8
        jz      .Ldone_squeeze

        sub     \$1,%rcx                # bsz--
        jnz     .Loop_squeeze

        call    KeccakF1600             # block exhausted, re-permute
        mov     $A_flat,%r8
        mov     $bsz,%rcx
        jmp     .Loop_squeeze

.Ltail_squeeze:
        mov     %r8,%rsi
        mov     $out,%rdi
        mov     $len,%rcx
        .byte   0xf3,0xa4               # rep movsb

.Ldone_squeeze:
        pop     %r14
.cfi_pop        %r14
        pop     %r13
.cfi_pop        %r13
        pop     %r12
.cfi_pop        %r12
        ret
.cfi_endproc
.size   SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align  256
        .quad   0,0,0,0,0,0,0,0         # 64 bytes of zero padding so that the
                                        # 24-entry table below ends on a
                                        # 256-byte boundary, which the round-
                                        # counter test in __KeccakF1600 relies on
.type   iotas,\@object
iotas:                                  # the 24 Keccak-f[1600] round constants
        .quad   0x0000000000000001
        .quad   0x0000000000008082
        .quad   0x800000000000808a
        .quad   0x8000000080008000
        .quad   0x000000000000808b
        .quad   0x0000000080000001
        .quad   0x8000000080008081
        .quad   0x8000000000008009
        .quad   0x000000000000008a
        .quad   0x0000000000000088
        .quad   0x0000000080008009
        .quad   0x000000008000000a
        .quad   0x000000008000808b
        .quad   0x800000000000008b
        .quad   0x8000000000008089
        .quad   0x8000000000008003
        .quad   0x8000000000008002
        .quad   0x8000000000000080
        .quad   0x000000000000800a
        .quad   0x800000008000000a
        .quad   0x8000000080008081
        .quad   0x8000000000008080
        .quad   0x0000000080000001
        .quad   0x8000000080008008
.size   iotas,.-iotas
.asciz  "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

foreach (split("\n",$code)) {
        # The substitution below yields 11.2 cpb on Sandy Bridge and 9.4 on
        # Haswell, but hurts other processors by as much as 2-4x...
        #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
        # The substitution below yields 9.3 cpb on Haswell, but also 9.3 on
        # Ryzen, i.e. it *hurts* Ryzen (8.8 with plain rotates)...
        #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

        print $_, "\n";
}

close STDOUT or die "error closing STDOUT: $!";