VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.7/crypto/sha/asm/keccak1600-armv8.pl@ 99507

最後變更 在這個檔案從99507是 94082,由 vboxsync 提交於 3 年 前

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

檔案大小: 21.9 KB
 
1#!/usr/bin/env perl
2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <[email protected]> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv8.
17#
18# June 2017.
19#
20# This is a straightforward KECCAK_1X_ALT implementation. It makes no
21# sense to attempt a SIMD/NEON implementation for the following reason.
22# 64-bit lanes of vector registers can't be addressed as easily as in
23# 32-bit mode. This means that 64-bit NEON is bound to be slower than
24# 32-bit NEON, and this implementation is faster than 32-bit NEON on
25# same processor. Even though it takes more scalar xor's and andn's,
26# it gets compensated by availability of rotate. Not to forget that
27# most processors achieve higher issue rate with scalar instructions.
28#
29# February 2018.
30#
31# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
32# variant with register permutation/rotation twist that allows to
33# eliminate copies to temporary registers. If you look closely you'll
34# notice that it uses only one lane of vector registers. The new
35# instructions effectively facilitate parallel hashing, which we don't
36# support [yet?]. But lowest-level core procedure is prepared for it.
37# The inner round is 67 [vector] instructions, so it's not actually
38# obvious that it will provide performance improvement [in serial
39# hash] as long as vector instructions issue rate is limited to 1 per
40# cycle...
41#
42######################################################################
43# Numbers are cycles per processed byte.
44#
45# r=1088(*)
46#
47# Cortex-A53 13
48# Cortex-A57 12
49# X-Gene 14
50# Mongoose 10
51# Kryo 12
52# Denver 7.8
53# Apple A7 7.2
54# ThunderX2 9.7
55#
56# (*) Corresponds to SHA3-256. No improvement coefficients are listed
57# because they vary too much from compiler to compiler. Newer
58# compiler does much better and improvement varies from 5% on
59# Cortex-A57 to 25% on Cortex-A53. While in comparison to older
60# compiler this code is at least 2x faster...
61
# Command line handling.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator: first next to this script, then in the
# shared perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on everything printed to STDOUT is piped through the translator.
# The translator path is quoted, like the interpreter and output paths, so
# that a source tree located under a directory with spaces in its name works.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
75
# Per-lane rotation counts for the Rho+Pi step.  The table is indexed like the
# @A register tables: the count at [i][j] is applied to the lane held in
# $A[i][j].  The generated code expresses each one as a right rotation by
# 64 minus the count.
76my @rhotates = ([ 0, 1, 62, 28, 27 ],
77 [ 36, 44, 6, 55, 20 ],
78 [ 3, 10, 43, 25, 39 ],
79 [ 41, 45, 15, 21, 8 ],
80 [ 18, 2, 61, 56, 14 ]);
81
82$code.=<<___;
83.text
84
85.align 8 // strategic alignment and padding that allows to use
86 // address value as loop termination condition...
87 .quad 0,0,0,0,0,0,0,0
88.type iotas,%object
89iotas:
90 .quad 0x0000000000000001
91 .quad 0x0000000000008082
92 .quad 0x800000000000808a
93 .quad 0x8000000080008000
94 .quad 0x000000000000808b
95 .quad 0x0000000080000001
96 .quad 0x8000000080008081
97 .quad 0x8000000000008009
98 .quad 0x000000000000008a
99 .quad 0x0000000000000088
100 .quad 0x0000000080008009
101 .quad 0x000000008000000a
102 .quad 0x000000008000808b
103 .quad 0x800000000000008b
104 .quad 0x8000000000008089
105 .quad 0x8000000000008003
106 .quad 0x8000000000008002
107 .quad 0x8000000000000080
108 .quad 0x000000000000800a
109 .quad 0x800000008000000a
110 .quad 0x8000000080008081
111 .quad 0x8000000000008080
112 .quad 0x0000000080000001
113 .quad 0x8000000080008008
114.size iotas,.-iotas
115___
116 {{{
# Scalar register allocation for the 5x5 state: the lane at [row][col] is
# kept in general-purpose register x(5*row+col), with one exception below.
my @A = map {
    my $first = $_;
    [ map { sprintf("x%d", $first + $_) } (0 .. 4) ]
} (0, 5, 10, 15, 20);
$A[3][3] = "x25"; # x18 is reserved

# Temporaries used by the round code.
my @C = map { "x$_" } (26, 27, 28, 30);
122
123$code.=<<___;
124.type KeccakF1600_int,%function
125.align 5
126KeccakF1600_int:
127 adr $C[2],iotas
128 .inst 0xd503233f // paciasp
129 stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
130 b .Loop
131.align 4
132.Loop:
133 ////////////////////////////////////////// Theta
134 eor $C[0],$A[0][0],$A[1][0]
135 stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
136 eor $C[1],$A[0][1],$A[1][1]
137 eor $C[2],$A[0][2],$A[1][2]
138 eor $C[3],$A[0][3],$A[1][3]
139___
# Borrow the registers of A[0][4] and A[1][4] as two extra temporaries for the
# rest of Theta; the "offload pair" stp emitted just above has already saved
# their contents on the stack.
140 $C[4]=$A[0][4];
141 $C[5]=$A[1][4];
142$code.=<<___;
143 eor $C[4],$A[0][4],$A[1][4]
144 eor $C[0],$C[0],$A[2][0]
145 eor $C[1],$C[1],$A[2][1]
146 eor $C[2],$C[2],$A[2][2]
147 eor $C[3],$C[3],$A[2][3]
148 eor $C[4],$C[4],$A[2][4]
149 eor $C[0],$C[0],$A[3][0]
150 eor $C[1],$C[1],$A[3][1]
151 eor $C[2],$C[2],$A[3][2]
152 eor $C[3],$C[3],$A[3][3]
153 eor $C[4],$C[4],$A[3][4]
154 eor $C[0],$C[0],$A[4][0]
155 eor $C[2],$C[2],$A[4][2]
156 eor $C[1],$C[1],$A[4][1]
157 eor $C[3],$C[3],$A[4][3]
158 eor $C[4],$C[4],$A[4][4]
159
160 eor $C[5],$C[0],$C[2],ror#63
161
162 eor $A[0][1],$A[0][1],$C[5]
163 eor $A[1][1],$A[1][1],$C[5]
164 eor $A[2][1],$A[2][1],$C[5]
165 eor $A[3][1],$A[3][1],$C[5]
166 eor $A[4][1],$A[4][1],$C[5]
167
168 eor $C[5],$C[1],$C[3],ror#63
169 eor $C[2],$C[2],$C[4],ror#63
170 eor $C[3],$C[3],$C[0],ror#63
171 eor $C[4],$C[4],$C[1],ror#63
172
173 eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
174 eor $A[1][2],$A[1][2],$C[5]
175 eor $A[2][2],$A[2][2],$C[5]
176 eor $A[3][2],$A[3][2],$C[5]
177 eor $A[4][2],$A[4][2],$C[5]
178
179 eor $A[0][0],$A[0][0],$C[4]
180 eor $A[1][0],$A[1][0],$C[4]
181 eor $A[2][0],$A[2][0],$C[4]
182 eor $A[3][0],$A[3][0],$C[4]
183 eor $A[4][0],$A[4][0],$C[4]
184___
# Drop the two borrowed aliases: the next emitted instruction re-loads
# A[0][4] and A[1][4] from the stack into those registers.
185 $C[4]=undef;
186 $C[5]=undef;
187$code.=<<___;
188 ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
189 eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
190 eor $A[1][3],$A[1][3],$C[2]
191 eor $A[2][3],$A[2][3],$C[2]
192 eor $A[3][3],$A[3][3],$C[2]
193 eor $A[4][3],$A[4][3],$C[2]
194
195 eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
196 eor $A[1][4],$A[1][4],$C[3]
197 eor $A[2][4],$A[2][4],$C[3]
198 eor $A[3][4],$A[3][4],$C[3]
199 eor $A[4][4],$A[4][4],$C[3]
200
201 ////////////////////////////////////////// Rho+Pi
202 mov $C[3],$A[0][1]
203 ror $A[0][1],$A[1][1],#64-$rhotates[1][1]
204 //mov $C[1],$A[0][2]
205 ror $A[0][2],$A[2][2],#64-$rhotates[2][2]
206 //mov $C[0],$A[0][3]
207 ror $A[0][3],$A[3][3],#64-$rhotates[3][3]
208 //mov $C[2],$A[0][4]
209 ror $A[0][4],$A[4][4],#64-$rhotates[4][4]
210
211 ror $A[1][1],$A[1][4],#64-$rhotates[1][4]
212 ror $A[2][2],$A[2][3],#64-$rhotates[2][3]
213 ror $A[3][3],$A[3][2],#64-$rhotates[3][2]
214 ror $A[4][4],$A[4][1],#64-$rhotates[4][1]
215
216 ror $A[1][4],$A[4][2],#64-$rhotates[4][2]
217 ror $A[2][3],$A[3][4],#64-$rhotates[3][4]
218 ror $A[3][2],$A[2][1],#64-$rhotates[2][1]
219 ror $A[4][1],$A[1][3],#64-$rhotates[1][3]
220
221 ror $A[4][2],$A[2][4],#64-$rhotates[2][4]
222 ror $A[3][4],$A[4][3],#64-$rhotates[4][3]
223 ror $A[2][1],$A[1][2],#64-$rhotates[1][2]
224 ror $A[1][3],$A[3][1],#64-$rhotates[3][1]
225
226 ror $A[2][4],$A[4][0],#64-$rhotates[4][0]
227 ror $A[4][3],$A[3][0],#64-$rhotates[3][0]
228 ror $A[1][2],$A[2][0],#64-$rhotates[2][0]
229 ror $A[3][1],$A[1][0],#64-$rhotates[1][0]
230
231 ror $A[1][0],$C[0],#64-$rhotates[0][3]
232 ror $A[2][0],$C[3],#64-$rhotates[0][1]
233 ror $A[3][0],$C[2],#64-$rhotates[0][4]
234 ror $A[4][0],$C[1],#64-$rhotates[0][2]
235
236 ////////////////////////////////////////// Chi+Iota
237 bic $C[0],$A[0][2],$A[0][1]
238 bic $C[1],$A[0][3],$A[0][2]
239 bic $C[2],$A[0][0],$A[0][4]
240 bic $C[3],$A[0][1],$A[0][0]
241 eor $A[0][0],$A[0][0],$C[0]
242 bic $C[0],$A[0][4],$A[0][3]
243 eor $A[0][1],$A[0][1],$C[1]
244 ldr $C[1],[sp,#16]
245 eor $A[0][3],$A[0][3],$C[2]
246 eor $A[0][4],$A[0][4],$C[3]
247 eor $A[0][2],$A[0][2],$C[0]
248 ldr $C[3],[$C[1]],#8 // Iota[i++]
249
250 bic $C[0],$A[1][2],$A[1][1]
251 tst $C[1],#255 // are we done?
252 str $C[1],[sp,#16]
253 bic $C[1],$A[1][3],$A[1][2]
254 bic $C[2],$A[1][0],$A[1][4]
255 eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
256 bic $C[3],$A[1][1],$A[1][0]
257 eor $A[1][0],$A[1][0],$C[0]
258 bic $C[0],$A[1][4],$A[1][3]
259 eor $A[1][1],$A[1][1],$C[1]
260 eor $A[1][3],$A[1][3],$C[2]
261 eor $A[1][4],$A[1][4],$C[3]
262 eor $A[1][2],$A[1][2],$C[0]
263
264 bic $C[0],$A[2][2],$A[2][1]
265 bic $C[1],$A[2][3],$A[2][2]
266 bic $C[2],$A[2][0],$A[2][4]
267 bic $C[3],$A[2][1],$A[2][0]
268 eor $A[2][0],$A[2][0],$C[0]
269 bic $C[0],$A[2][4],$A[2][3]
270 eor $A[2][1],$A[2][1],$C[1]
271 eor $A[2][3],$A[2][3],$C[2]
272 eor $A[2][4],$A[2][4],$C[3]
273 eor $A[2][2],$A[2][2],$C[0]
274
275 bic $C[0],$A[3][2],$A[3][1]
276 bic $C[1],$A[3][3],$A[3][2]
277 bic $C[2],$A[3][0],$A[3][4]
278 bic $C[3],$A[3][1],$A[3][0]
279 eor $A[3][0],$A[3][0],$C[0]
280 bic $C[0],$A[3][4],$A[3][3]
281 eor $A[3][1],$A[3][1],$C[1]
282 eor $A[3][3],$A[3][3],$C[2]
283 eor $A[3][4],$A[3][4],$C[3]
284 eor $A[3][2],$A[3][2],$C[0]
285
286 bic $C[0],$A[4][2],$A[4][1]
287 bic $C[1],$A[4][3],$A[4][2]
288 bic $C[2],$A[4][0],$A[4][4]
289 bic $C[3],$A[4][1],$A[4][0]
290 eor $A[4][0],$A[4][0],$C[0]
291 bic $C[0],$A[4][4],$A[4][3]
292 eor $A[4][1],$A[4][1],$C[1]
293 eor $A[4][3],$A[4][3],$C[2]
294 eor $A[4][4],$A[4][4],$C[3]
295 eor $A[4][2],$A[4][2],$C[0]
296
297 bne .Loop
298
299 ldr x30,[sp,#24]
300 .inst 0xd50323bf // autiasp
301 ret
302.size KeccakF1600_int,.-KeccakF1600_int
303
304.type KeccakF1600,%function
305.align 5
306KeccakF1600:
307 .inst 0xd503233f // paciasp
308 stp x29,x30,[sp,#-128]!
309 add x29,sp,#0
310 stp x19,x20,[sp,#16]
311 stp x21,x22,[sp,#32]
312 stp x23,x24,[sp,#48]
313 stp x25,x26,[sp,#64]
314 stp x27,x28,[sp,#80]
315 sub sp,sp,#48
316
317 str x0,[sp,#32] // offload argument
318 mov $C[0],x0
319 ldp $A[0][0],$A[0][1],[x0,#16*0]
320 ldp $A[0][2],$A[0][3],[$C[0],#16*1]
321 ldp $A[0][4],$A[1][0],[$C[0],#16*2]
322 ldp $A[1][1],$A[1][2],[$C[0],#16*3]
323 ldp $A[1][3],$A[1][4],[$C[0],#16*4]
324 ldp $A[2][0],$A[2][1],[$C[0],#16*5]
325 ldp $A[2][2],$A[2][3],[$C[0],#16*6]
326 ldp $A[2][4],$A[3][0],[$C[0],#16*7]
327 ldp $A[3][1],$A[3][2],[$C[0],#16*8]
328 ldp $A[3][3],$A[3][4],[$C[0],#16*9]
329 ldp $A[4][0],$A[4][1],[$C[0],#16*10]
330 ldp $A[4][2],$A[4][3],[$C[0],#16*11]
331 ldr $A[4][4],[$C[0],#16*12]
332
333 bl KeccakF1600_int
334
335 ldr $C[0],[sp,#32]
336 stp $A[0][0],$A[0][1],[$C[0],#16*0]
337 stp $A[0][2],$A[0][3],[$C[0],#16*1]
338 stp $A[0][4],$A[1][0],[$C[0],#16*2]
339 stp $A[1][1],$A[1][2],[$C[0],#16*3]
340 stp $A[1][3],$A[1][4],[$C[0],#16*4]
341 stp $A[2][0],$A[2][1],[$C[0],#16*5]
342 stp $A[2][2],$A[2][3],[$C[0],#16*6]
343 stp $A[2][4],$A[3][0],[$C[0],#16*7]
344 stp $A[3][1],$A[3][2],[$C[0],#16*8]
345 stp $A[3][3],$A[3][4],[$C[0],#16*9]
346 stp $A[4][0],$A[4][1],[$C[0],#16*10]
347 stp $A[4][2],$A[4][3],[$C[0],#16*11]
348 str $A[4][4],[$C[0],#16*12]
349
350 ldp x19,x20,[x29,#16]
351 add sp,sp,#48
352 ldp x21,x22,[x29,#32]
353 ldp x23,x24,[x29,#48]
354 ldp x25,x26,[x29,#64]
355 ldp x27,x28,[x29,#80]
356 ldp x29,x30,[sp],#128
357 .inst 0xd50323bf // autiasp
358 ret
359.size KeccakF1600,.-KeccakF1600
360
361.globl SHA3_absorb
362.type SHA3_absorb,%function
363.align 5
364SHA3_absorb:
365 .inst 0xd503233f // paciasp
366 stp x29,x30,[sp,#-128]!
367 add x29,sp,#0
368 stp x19,x20,[sp,#16]
369 stp x21,x22,[sp,#32]
370 stp x23,x24,[sp,#48]
371 stp x25,x26,[sp,#64]
372 stp x27,x28,[sp,#80]
373 sub sp,sp,#64
374
375 stp x0,x1,[sp,#32] // offload arguments
376 stp x2,x3,[sp,#48]
377
378 mov $C[0],x0 // uint64_t A[5][5]
379 mov $C[1],x1 // const void *inp
380 mov $C[2],x2 // size_t len
381 mov $C[3],x3 // size_t bsz
382 ldp $A[0][0],$A[0][1],[$C[0],#16*0]
383 ldp $A[0][2],$A[0][3],[$C[0],#16*1]
384 ldp $A[0][4],$A[1][0],[$C[0],#16*2]
385 ldp $A[1][1],$A[1][2],[$C[0],#16*3]
386 ldp $A[1][3],$A[1][4],[$C[0],#16*4]
387 ldp $A[2][0],$A[2][1],[$C[0],#16*5]
388 ldp $A[2][2],$A[2][3],[$C[0],#16*6]
389 ldp $A[2][4],$A[3][0],[$C[0],#16*7]
390 ldp $A[3][1],$A[3][2],[$C[0],#16*8]
391 ldp $A[3][3],$A[3][4],[$C[0],#16*9]
392 ldp $A[4][0],$A[4][1],[$C[0],#16*10]
393 ldp $A[4][2],$A[4][3],[$C[0],#16*11]
394 ldr $A[4][4],[$C[0],#16*12]
395 b .Loop_absorb
396
397.align 4
398.Loop_absorb:
399 subs $C[0],$C[2],$C[3] // len - bsz
400 blo .Labsorbed
401
402 str $C[0],[sp,#48] // save len - bsz
403___
# Emit the unrolled absorb sequence for lanes 0..23, two lanes per iteration.
# Each lane is loaded through the post-incremented input pointer, byte-swapped
# on big-endian builds and XORed into its state register ($i/5 is fractional,
# the array subscript truncates it to the row number).  After the even lane
# the block size is compared with 8*($i+2): "blo" leaves the sequence when the
# block ended with that lane, and the "beq" after the odd lane reuses the same
# flags to leave when the block ended exactly there.  Lane 24 is emitted after
# the loop.
404for (my $i=0; $i<24; $i+=2) {
405my $j = $i+1;
406$code.=<<___;
407 ldr $C[0],[$C[1]],#8 // *inp++
408#ifdef __AARCH64EB__
409 rev $C[0],$C[0]
410#endif
411 eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
412 cmp $C[3],#8*($i+2)
413 blo .Lprocess_block
414 ldr $C[0],[$C[1]],#8 // *inp++
415#ifdef __AARCH64EB__
416 rev $C[0],$C[0]
417#endif
418 eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
419 beq .Lprocess_block
420___
421}
412 cmp $C[3],#8*($i+2)
413 blo .Lprocess_block
414 ldr $C[0],[$C[1]],#8 // *inp++
415#ifdef __AARCH64EB__
416 rev $C[0],$C[0]
417#endif
418 eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
419 beq .Lprocess_block
420___
421}
422$code.=<<___;
423 ldr $C[0],[$C[1]],#8 // *inp++
424#ifdef __AARCH64EB__
425 rev $C[0],$C[0]
426#endif
427 eor $A[4][4],$A[4][4],$C[0]
428
429.Lprocess_block:
430 str $C[1],[sp,#40] // save inp
431
432 bl KeccakF1600_int
433
434 ldr $C[1],[sp,#40] // restore arguments
435 ldp $C[2],$C[3],[sp,#48]
436 b .Loop_absorb
437
438.align 4
439.Labsorbed:
440 ldr $C[1],[sp,#32]
441 stp $A[0][0],$A[0][1],[$C[1],#16*0]
442 stp $A[0][2],$A[0][3],[$C[1],#16*1]
443 stp $A[0][4],$A[1][0],[$C[1],#16*2]
444 stp $A[1][1],$A[1][2],[$C[1],#16*3]
445 stp $A[1][3],$A[1][4],[$C[1],#16*4]
446 stp $A[2][0],$A[2][1],[$C[1],#16*5]
447 stp $A[2][2],$A[2][3],[$C[1],#16*6]
448 stp $A[2][4],$A[3][0],[$C[1],#16*7]
449 stp $A[3][1],$A[3][2],[$C[1],#16*8]
450 stp $A[3][3],$A[3][4],[$C[1],#16*9]
451 stp $A[4][0],$A[4][1],[$C[1],#16*10]
452 stp $A[4][2],$A[4][3],[$C[1],#16*11]
453 str $A[4][4],[$C[1],#16*12]
454
455 mov x0,$C[2] // return value
456 ldp x19,x20,[x29,#16]
457 add sp,sp,#64
458 ldp x21,x22,[x29,#32]
459 ldp x23,x24,[x29,#48]
460 ldp x25,x26,[x29,#64]
461 ldp x27,x28,[x29,#80]
462 ldp x29,x30,[sp],#128
463 .inst 0xd50323bf // autiasp
464 ret
465.size SHA3_absorb,.-SHA3_absorb
466___
467{
# SHA3_squeeze keeps its four arguments in x19..x22.  The generated prologue
# saves those registers and copies x0..x3 into them, and the loop restores
# x0/x3 from the copies after its call to KeccakF1600.
468my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
469$code.=<<___;
470.globl SHA3_squeeze
471.type SHA3_squeeze,%function
472.align 5
473SHA3_squeeze:
474 .inst 0xd503233f // paciasp
475 stp x29,x30,[sp,#-48]!
476 add x29,sp,#0
477 stp x19,x20,[sp,#16]
478 stp x21,x22,[sp,#32]
479
480 mov $A_flat,x0 // put aside arguments
481 mov $out,x1
482 mov $len,x2
483 mov $bsz,x3
484
485.Loop_squeeze:
486 ldr x4,[x0],#8
487 cmp $len,#8
488 blo .Lsqueeze_tail
489#ifdef __AARCH64EB__
490 rev x4,x4
491#endif
492 str x4,[$out],#8
493 subs $len,$len,#8
494 beq .Lsqueeze_done
495
496 subs x3,x3,#8
497 bhi .Loop_squeeze
498
499 mov x0,$A_flat
500 bl KeccakF1600
501 mov x0,$A_flat
502 mov x3,$bsz
503 b .Loop_squeeze
504
505.align 4
506.Lsqueeze_tail:
507 strb w4,[$out],#1
508 lsr x4,x4,#8
509 subs $len,$len,#1
510 beq .Lsqueeze_done
511 strb w4,[$out],#1
512 lsr x4,x4,#8
513 subs $len,$len,#1
514 beq .Lsqueeze_done
515 strb w4,[$out],#1
516 lsr x4,x4,#8
517 subs $len,$len,#1
518 beq .Lsqueeze_done
519 strb w4,[$out],#1
520 lsr x4,x4,#8
521 subs $len,$len,#1
522 beq .Lsqueeze_done
523 strb w4,[$out],#1
524 lsr x4,x4,#8
525 subs $len,$len,#1
526 beq .Lsqueeze_done
527 strb w4,[$out],#1
528 lsr x4,x4,#8
529 subs $len,$len,#1
530 beq .Lsqueeze_done
531 strb w4,[$out],#1
532
533.Lsqueeze_done:
534 ldp x19,x20,[sp,#16]
535 ldp x21,x22,[sp,#32]
536 ldp x29,x30,[sp],#48
537 .inst 0xd50323bf // autiasp
538 ret
539.size SHA3_squeeze,.-SHA3_squeeze
540___
541} }}}
542 {{{
# Vector register allocation for the 5x5 state: the lane at [row][col] is
# kept in v(5*row+col), always referred to with the .16b arrangement.
my @A = map {
    my $first = $_;
    [ map { sprintf("v%d.16b", $first + $_) } (0 .. 4) ]
} (0, 5, 10, 15, 20);

# v25..v31 are the temporaries.
my @C = map { sprintf("v%d.16b", $_) } (25 .. 31);
# @D does not name additional registers, it is a reordered view of five of
# the @C temporaries.
my @D = ($C[4], $C[5], $C[6], $C[2], $C[3]);
549
550$code.=<<___;
551.type KeccakF1600_ce,%function
552.align 5
553KeccakF1600_ce:
554 mov x9,#24
555 adr x10,iotas
556 b .Loop_ce
557.align 4
558.Loop_ce:
559 ////////////////////////////////////////////////// Theta
560 eor3 $C[0],$A[4][0],$A[3][0],$A[2][0]
561 eor3 $C[1],$A[4][1],$A[3][1],$A[2][1]
562 eor3 $C[2],$A[4][2],$A[3][2],$A[2][2]
563 eor3 $C[3],$A[4][3],$A[3][3],$A[2][3]
564 eor3 $C[4],$A[4][4],$A[3][4],$A[2][4]
565 eor3 $C[0],$C[0], $A[1][0],$A[0][0]
566 eor3 $C[1],$C[1], $A[1][1],$A[0][1]
567 eor3 $C[2],$C[2], $A[1][2],$A[0][2]
568 eor3 $C[3],$C[3], $A[1][3],$A[0][3]
569 eor3 $C[4],$C[4], $A[1][4],$A[0][4]
570
571 rax1 $C[5],$C[0],$C[2] // D[1]
572 rax1 $C[6],$C[1],$C[3] // D[2]
573 rax1 $C[2],$C[2],$C[4] // D[3]
574 rax1 $C[3],$C[3],$C[0] // D[4]
575 rax1 $C[4],$C[4],$C[1] // D[0]
576
577 ////////////////////////////////////////////////// Theta+Rho+Pi
578 xar $C[0], $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]
579
580 xar $A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
581 xar $A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
582 xar $A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
583 xar $A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
584 xar $A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]
585
586 xar $C[1], $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]
587
588 xar $A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
589 xar $A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
590 xar $A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
591 xar $A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
592 xar $A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]
593
594 xar $A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]
595
596 xar $D[4], $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
597 xar $A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
598 xar $A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
599 xar $A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
600 xar $A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]
601
602 xar $A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]
603
604 eor $A[0][0],$A[0][0],$D[0]
605
606 xar $D[3], $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
607 xar $A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
608 xar $D[1], $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
609 xar $D[2], $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
610 xar $D[0], $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]
611
612 ////////////////////////////////////////////////// Chi+Iota
613 bcax $A[4][0],$C[1], $A[4][2],$A[1][3] // A[1][3]=A[4][1]
614 bcax $A[4][1],$A[1][3],$A[4][3],$A[4][2] // A[1][3]=A[4][1]
615 bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3]
616 bcax $A[4][3],$A[4][3],$C[1], $A[4][4]
617 bcax $A[4][4],$A[4][4],$A[1][3],$C[1] // A[1][3]=A[4][1]
618
619 ld1r {$C[1]},[x10],#8
620
621 bcax $A[3][2],$D[1], $A[3][4],$A[0][3] // A[0][3]=A[3][3]
622 bcax $A[3][3],$A[0][3],$A[3][0],$A[3][4] // A[0][3]=A[3][3]
623 bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0]
624 bcax $A[3][0],$A[3][0],$D[1], $A[3][1]
625 bcax $A[3][1],$A[3][1],$A[0][3],$D[1] // A[0][3]=A[3][3]
626
627 bcax $A[2][0],$C[0], $A[2][2],$D[2]
628 bcax $A[2][1],$D[2], $A[2][3],$A[2][2]
629 bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3]
630 bcax $A[2][3],$A[2][3],$C[0], $A[2][4]
631 bcax $A[2][4],$A[2][4],$D[2], $C[0]
632
633 bcax $A[1][2],$D[0], $A[1][4],$A[0][4] // A[0][4]=A[1][3]
634 bcax $A[1][3],$A[0][4],$A[1][0],$A[1][4] // A[0][4]=A[1][3]
635 bcax $A[1][4],$A[1][4],$A[1][1],$A[1][0]
636 bcax $A[1][0],$A[1][0],$D[0], $A[1][1]
637 bcax $A[1][1],$A[1][1],$A[0][4],$D[0] // A[0][4]=A[1][3]
638
639 bcax $A[0][3],$D[3], $A[0][0],$D[4]
640 bcax $A[0][4],$D[4], $A[0][1],$A[0][0]
641 bcax $A[0][0],$A[0][0],$A[0][2],$A[0][1]
642 bcax $A[0][1],$A[0][1],$D[3], $A[0][2]
643 bcax $A[0][2],$A[0][2],$D[4], $D[3]
644
645 eor $A[0][0],$A[0][0],$C[1]
646
647 subs x9,x9,#1
648 bne .Loop_ce
649
650 ret
651.size KeccakF1600_ce,.-KeccakF1600_ce
652
653.type KeccakF1600_cext,%function
654.align 5
655KeccakF1600_cext:
656 .inst 0xd503233f // paciasp
657 stp x29,x30,[sp,#-80]!
658 add x29,sp,#0
659 stp d8,d9,[sp,#16] // per ABI requirement
660 stp d10,d11,[sp,#32]
661 stp d12,d13,[sp,#48]
662 stp d14,d15,[sp,#64]
663___
# Emit the ldp instructions that load state lanes 0..23 pairwise into
# d0..d23.  $i is intentionally not declared with "my": the heredoc right
# after the loop interpolates its final value (24) to address the last lane.
664for($i=0; $i<24; $i+=2) { # load A[5][5]
665my $j=$i+1;
666$code.=<<___;
667 ldp d$i,d$j,[x0,#8*$i]
668___
669}
670$code.=<<___;
671 ldr d24,[x0,#8*$i]
672 bl KeccakF1600_ce
673 ldr x30,[sp,#8]
674___
# Emit the stp instructions that write d0..d23 back to state lanes 0..23.
# $i is intentionally not declared with "my": the heredoc right after the
# loop interpolates its final value (24) to address the last lane.
675for($i=0; $i<24; $i+=2) { # store A[5][5]
676my $j=$i+1;
677$code.=<<___;
678 stp d$i,d$j,[x0,#8*$i]
679___
680}
681$code.=<<___;
682 str d24,[x0,#8*$i]
683
684 ldp d8,d9,[sp,#16]
685 ldp d10,d11,[sp,#32]
686 ldp d12,d13,[sp,#48]
687 ldp d14,d15,[sp,#64]
688 ldr x29,[sp],#80
689 .inst 0xd50323bf // autiasp
690 ret
691.size KeccakF1600_cext,.-KeccakF1600_cext
692___
693
694{
# SHA3_absorb_cext works directly on x0..x3; the names mirror the argument
# order documented for SHA3_absorb (state, input pointer, length, block size).
695my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
696
697$code.=<<___;
698.globl SHA3_absorb_cext
699.type SHA3_absorb_cext,%function
700.align 5
701SHA3_absorb_cext:
702 .inst 0xd503233f // paciasp
703 stp x29,x30,[sp,#-80]!
704 add x29,sp,#0
705 stp d8,d9,[sp,#16] // per ABI requirement
706 stp d10,d11,[sp,#32]
707 stp d12,d13,[sp,#48]
708 stp d14,d15,[sp,#64]
709___
# Emit the ldp instructions that load state lanes 0..23 pairwise into
# d0..d23.  $i is intentionally not declared with "my": the heredoc right
# after the loop interpolates its final value (24) to address the last lane.
710for($i=0; $i<24; $i+=2) { # load A[5][5]
711my $j=$i+1;
712$code.=<<___;
713 ldp d$i,d$j,[x0,#8*$i]
714___
715}
716$code.=<<___;
717 ldr d24,[x0,#8*$i]
718 b .Loop_absorb_ce
719
720.align 4
721.Loop_absorb_ce:
722 subs $len,$len,$bsz // len - bsz
723 blo .Labsorbed_ce
724___
# Emit the unrolled absorb sequence for lanes 0..23 of the vector variant,
# two lanes per iteration, using v31/d31 as the input staging register.  The
# exit logic is the same pair of tests as in the scalar variant: after the
# even lane the block size is compared with 8*($i+2) and "blo" leaves when
# the block ended with that lane; the "beq" after the odd lane reuses those
# flags.  This loop has its own lexical $i, so the package variable $i used
# by the load/store generator loops is not touched.  Lane 24 is emitted after
# the loop.
725for (my $i=0; $i<24; $i+=2) {
726my $j = $i+1;
727$code.=<<___;
728 ldr d31,[$inp],#8 // *inp++
729#ifdef __AARCH64EB__
730 rev64 v31.16b,v31.16b
731#endif
732 eor $A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
733 cmp $bsz,#8*($i+2)
734 blo .Lprocess_block_ce
735 ldr d31,[$inp],#8 // *inp++
736#ifdef __AARCH64EB__
737 rev64 v31.16b,v31.16b
738#endif
739 eor $A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
740 beq .Lprocess_block_ce
741___
742}
743$code.=<<___;
744 ldr d31,[$inp],#8 // *inp++
745#ifdef __AARCH64EB__
746 rev64 v31.16b,v31.16b
747#endif
748 eor $A[4][4],$A[4][4],v31.16b
749
750.Lprocess_block_ce:
751
752 bl KeccakF1600_ce
753
754 b .Loop_absorb_ce
755
756.align 4
757.Labsorbed_ce:
758___
# Emit the stp instructions that write d0..d23 back to state lanes 0..23.
# $i is intentionally not declared with "my": the heredoc right after the
# loop interpolates its final value (24) to address the last lane.
759for($i=0; $i<24; $i+=2) { # store A[5][5]
760my $j=$i+1;
761$code.=<<___;
762 stp d$i,d$j,[x0,#8*$i]
763___
764}
765$code.=<<___;
766 str d24,[x0,#8*$i]
767 add x0,$len,$bsz // return value
768
769 ldp d8,d9,[sp,#16]
770 ldp d10,d11,[sp,#32]
771 ldp d12,d13,[sp,#48]
772 ldp d14,d15,[sp,#64]
773 ldp x29,x30,[sp],#80
774 .inst 0xd50323bf // autiasp
775 ret
776.size SHA3_absorb_cext,.-SHA3_absorb_cext
777___
778}
779{
# SHA3_squeeze_cext uses its arguments in place in x0..x3; the generated code
# takes working copies of the state pointer and block size in x9 and x10.
780my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
781$code.=<<___;
782.globl SHA3_squeeze_cext
783.type SHA3_squeeze_cext,%function
784.align 5
785SHA3_squeeze_cext:
786 .inst 0xd503233f // paciasp
787 stp x29,x30,[sp,#-16]!
788 add x29,sp,#0
789 mov x9,$ctx
790 mov x10,$bsz
791
792.Loop_squeeze_ce:
793 ldr x4,[x9],#8
794 cmp $len,#8
795 blo .Lsqueeze_tail_ce
796#ifdef __AARCH64EB__
797 rev x4,x4
798#endif
799 str x4,[$out],#8
800 beq .Lsqueeze_done_ce
801
802 sub $len,$len,#8
803 subs x10,x10,#8
804 bhi .Loop_squeeze_ce
805
806 bl KeccakF1600_cext
807 ldr x30,[sp,#8]
808 mov x9,$ctx
809 mov x10,$bsz
810 b .Loop_squeeze_ce
811
812.align 4
813.Lsqueeze_tail_ce:
814 strb w4,[$out],#1
815 lsr x4,x4,#8
816 subs $len,$len,#1
817 beq .Lsqueeze_done_ce
818 strb w4,[$out],#1
819 lsr x4,x4,#8
820 subs $len,$len,#1
821 beq .Lsqueeze_done_ce
822 strb w4,[$out],#1
823 lsr x4,x4,#8
824 subs $len,$len,#1
825 beq .Lsqueeze_done_ce
826 strb w4,[$out],#1
827 lsr x4,x4,#8
828 subs $len,$len,#1
829 beq .Lsqueeze_done_ce
830 strb w4,[$out],#1
831 lsr x4,x4,#8
832 subs $len,$len,#1
833 beq .Lsqueeze_done_ce
834 strb w4,[$out],#1
835 lsr x4,x4,#8
836 subs $len,$len,#1
837 beq .Lsqueeze_done_ce
838 strb w4,[$out],#1
839
840.Lsqueeze_done_ce:
841 ldr x29,[sp],#16
842 .inst 0xd50323bf // autiasp
843 ret
844.size SHA3_squeeze_cext,.-SHA3_squeeze_cext
845___
846} }}}
847$code.=<<___;
848.asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
849___
850
{   # Base encodings of the ARMv8.2 SHA3 instructions.  unsha3() ORs the
    # register numbers and the rotate amount into them.
    my %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3($mnemonic,$arg)
    #
    # Translate one SHA3 instruction into a raw ".inst" directive so that the
    # output assembles with toolchains that do not know the mnemonics.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       operand text as it appears after the mnemonic: two to four
    #              comma-separated operands, the last of which may be either a
    #              register or an immediate written as "#N" or "#N-M"
    #
    # Returns the ".inst" line with the original instruction kept as a
    # trailing comment.  If the mnemonic is unknown or the operands cannot be
    # parsed the instruction text is returned unchanged, so the assembler
    # reports it instead of the instruction silently vanishing from the
    # output.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	if (exists $opcode{$mnemonic} &&
	    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/)
	{
	    my ($rd,$rn,$rm,$last) = ($1,$2,$3,$4);

	    $rm = 0 if !defined($rm);

	    # The optional 4th operand is a register number or a literal
	    # difference such as "64-44"; work it out without string eval.
	    my $field = 0;
	    if (defined($last) && $last ne "") {
		my ($head,@tail) = split(/-/,$last);
		$field = ($head eq "") ? 0 : int($head);
		$field -= int($_) for (@tail);
	    }
	    # The field starting at bit 10 is at most 6 bits wide (the xar
	    # rotate amount); masking keeps a rotate of 64 from leaking into
	    # the Rm field and encodes it as the equivalent rotate of 0.
	    $field &= 0x3f;

	    return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|int($rd)|(int($rn)<<5)|(int($rm)<<16)|($field<<10),
			$mnemonic,$arg;
	}

	return "$mnemonic\t$arg";
    }
}
865
# Post-process the accumulated assembly line by line and send it to the
# translator pipe:
#  - `...` spans are replaced by the result of evaluating them as Perl;
#  - ld1r lines get the .2d arrangement instead of .16b;
#  - all other lines have SHA3 mnemonics replaced through unsha3().
for my $line (split("\n",$code)) {

    $line =~ s/\`([^\`]*)\`/eval($1)/ge;

    # The mnemonic rewrite is skipped only when the line is an ld1r whose
    # arrangement was actually rewritten.
    unless ($line =~ m/\bld1r\b/ and $line =~ s/\.16b/.2d/g) {
	$line =~ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
    }

    print $line,"\n";
}

# Closing the pipe is what surfaces a failure of the translator process.
close STDOUT or die "error closing STDOUT: $!";
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette