VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.7/crypto/sha/asm/keccak1600p8-ppc.pl@ 108344

Last change on this file since 108344 was 104078, checked in by vboxsync, 12 months ago

openssl-3.1.5: Applied and adjusted our OpenSSL changes to 3.1.4. bugref:10638

File size: 18.9 KB
 
#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PowerISA 2.07.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT SIMD implementation, but with
# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
# POWER8 processor spends 9.8 cycles to process byte out of large
# buffer for r=1088, which matches SHA3-256. This is 17% better than
# scalar PPC64 code. It probably should be noted that if POWER8's
# successor can achieve higher scalar instruction issue rate, then
# this module will loose... And it does on POWER9 with 12.0 vs. 9.4.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Select ABI-dependent instruction mnemonics and sizes from the requested
# flavour ("linux64le", "linux32", ...). Anything else is a hard error.
if ($flavour =~ /64/) {
	$SIZE_T	=8;			# size of a pointer/GPR in bytes
	$LRSAVE	=2*$SIZE_T;		# LR save slot offset in caller frame
	$UCMP	="cmpld";		# unsigned compare
	$STU	="stdu";		# store-with-update (frame push)
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

# Locate the ppc-xlate.pl assembler translator next to this script or in
# the perlasm directory, then pipe all generated code through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

my $sp ="r1";

my $iotas = "r12";

########################################################################
# Register layout:
#
# v0		A[0][0] A[1][0]
# v1		A[0][1] A[1][1]
# v2		A[0][2] A[1][2]
# v3		A[0][3] A[1][3]
# v4		A[0][4] A[1][4]
#
# v5		A[2][0] A[3][0]
# v6		A[2][1] A[3][1]
# v7		A[2][2] A[3][2]
# v8		A[2][3] A[3][3]
# v9		A[2][4] A[3][4]
#
# v10		A[4][0] A[4][1]
# v11		A[4][2] A[4][3]
# v12		A[4][4] A[4][4]
#
# v13..25	rhotates[][]
# v26..31	volatile
#
$code.=<<___;
.machine	"any"
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24
	mtctr	r0
	li	r0,0
	b	.Loop

.align	4
.Loop:
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
	vxor	v26,v0, v5		; A[0..1][0]^A[2..3][0]
	vxor	v27,v1, v6		; A[0..1][1]^A[2..3][1]
	vxor	v28,v2, v7		; A[0..1][2]^A[2..3][2]
	vxor	v29,v3, v8		; A[0..1][3]^A[2..3][3]
	vxor	v30,v4, v9		; A[0..1][4]^A[2..3][4]
	vpermdi	v31,v26,v27,0b00	; A[0][0..1]^A[2][0..1]
	vpermdi	v26,v26,v27,0b11	; A[1][0..1]^A[3][0..1]
	vpermdi	v27,v28,v29,0b00	; A[0][2..3]^A[2][2..3]
	vpermdi	v28,v28,v29,0b11	; A[1][2..3]^A[3][2..3]
	vpermdi	v29,v30,v30,0b10	; A[1..0][4]^A[3..2][4]
	vxor	v26,v26,v31		; C[0..1]
	vxor	v27,v27,v28		; C[2..3]
	vxor	v28,v29,v30		; C[4..4]
	vspltisb v31,1
	vxor	v26,v26,v10		; C[0..1] ^= A[4][0..1]
	vxor	v27,v27,v11		; C[2..3] ^= A[4][2..3]
	vxor	v28,v28,v12		; C[4..4] ^= A[4][4..4], low!

	vrld	v29,v26,v31		; ROL64(C[0..1],1)
	vrld	v30,v27,v31		; ROL64(C[2..3],1)
	vrld	v31,v28,v31		; ROL64(C[4..4],1)
	vpermdi	v31,v31,v29,0b10
	vxor	v26,v26,v30		; C[0..1] ^= ROL64(C[2..3],1)
	vxor	v27,v27,v31		; C[2..3] ^= ROL64(C[4..0],1)
	vxor	v28,v28,v29		; C[4..4] ^= ROL64(C[0..1],1), low!

	vpermdi	v29,v26,v26,0b00	; C[0..0]
	vpermdi	v30,v28,v26,0b10	; C[4..0]
	vpermdi	v31,v28,v28,0b11	; C[4..4]
	vxor	v1, v1, v29		; A[0..1][1] ^= C[0..0]
	vxor	v6, v6, v29		; A[2..3][1] ^= C[0..0]
	vxor	v10,v10,v30		; A[4][0..1] ^= C[4..0]
	vxor	v0, v0, v31		; A[0..1][0] ^= C[4..4]
	vxor	v5, v5, v31		; A[2..3][0] ^= C[4..4]

	vpermdi	v29,v27,v27,0b00	; C[2..2]
	vpermdi	v30,v26,v26,0b11	; C[1..1]
	vpermdi	v31,v26,v27,0b10	; C[1..2]
	vxor	v3, v3, v29		; A[0..1][3] ^= C[2..2]
	vxor	v8, v8, v29		; A[2..3][3] ^= C[2..2]
	vxor	v2, v2, v30		; A[0..1][2] ^= C[1..1]
	vxor	v7, v7, v30		; A[2..3][2] ^= C[1..1]
	vxor	v11,v11,v31		; A[4][2..3] ^= C[1..2]

	vpermdi	v29,v27,v27,0b11	; C[3..3]
	vxor	v4, v4, v29		; A[0..1][4] ^= C[3..3]
	vxor	v9, v9, v29		; A[2..3][4] ^= C[3..3]
	vxor	v12,v12,v29		; A[4..4][4] ^= C[3..3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
	vrld	v26,v0, v13		; v0
	vrld	v1, v1, v14
	vrld	v27,v2, v15		; v2
	vrld	v28,v3, v16		; v3
	vrld	v4, v4, v17
	vrld	v5, v5, v18
	vrld	v6, v6, v19
	vrld	v29,v7, v20		; v7
	vrld	v8, v8, v21
	vrld	v9, v9, v22
	vrld	v10,v10,v23
	vrld	v30,v11,v24		; v11
	vrld	v12,v12,v25

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
	vpermdi	v0, v26,v28,0b00	; [0][0] [1][0] < [0][0] [0][3]
	vpermdi	v2, v29,v5, 0b00	; [0][2] [1][2] < [2][2] [2][0]
	vpermdi	v11,v9, v5, 0b01	; [4][2] [4][3] < [2][4] [3][0]
	vpermdi	v5, v1, v4, 0b00	; [2][0] [3][0] < [0][1] [0][4]
	vpermdi	v1, v1, v4, 0b11	; [0][1] [1][1] < [1][1] [1][4]
	vpermdi	v3, v8, v6, 0b11	; [0][3] [1][3] < [3][3] [3][1]
	vpermdi	v4, v12,v30,0b10	; [0][4] [1][4] < [4][4] [4][2]
	vpermdi	v7, v8, v6, 0b00	; [2][2] [3][2] < [2][3] [2][1]
	vpermdi	v6, v27,v26,0b11	; [2][1] [3][1] < [1][2] [1][0]
	vpermdi	v8, v9, v29,0b11	; [2][3] [3][3] < [3][4] [3][2]
	vpermdi	v12,v10,v10,0b11	; [4][4] [4][4] < [4][1] [4][1]
	vpermdi	v9, v10,v30,0b01	; [2][4] [3][4] < [4][0] [4][3]
	vpermdi	v10,v27,v28,0b01	; [4][0] [4][1] < [0][2] [1][3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
	lvx_u	v31,$iotas,r0		; iotas[index]
	addic	r0,r0,16		; index++

	vandc	v26,v2, v1		; (~A[0..1][1] & A[0..1][2])
	vandc	v27,v3, v2		; (~A[0..1][2] & A[0..1][3])
	vandc	v28,v4, v3		; (~A[0..1][3] & A[0..1][4])
	vandc	v29,v0, v4		; (~A[0..1][4] & A[0..1][0])
	vandc	v30,v1, v0		; (~A[0..1][0] & A[0..1][1])
	vxor	v0, v0, v26		; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
	vxor	v1, v1, v27		; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
	vxor	v2, v2, v28		; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vxor	v3, v3, v29		; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	vxor	v4, v4, v30		; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vandc	v26,v7, v6		; (~A[2..3][1] & A[2..3][2])
	vandc	v27,v8, v7		; (~A[2..3][2] & A[2..3][3])
	vandc	v28,v9, v8		; (~A[2..3][3] & A[2..3][4])
	vandc	v29,v5, v9		; (~A[2..3][4] & A[2..3][0])
	vandc	v30,v6, v5		; (~A[2..3][0] & A[2..3][1])
	vxor	v5, v5, v26		; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vxor	v6, v6, v27		; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vxor	v7, v7, v28		; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vxor	v8, v8, v29		; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vxor	v9, v9, v30		; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vxor	v0, v0, v31		; A[0][0] ^= iotas[index++]

	vpermdi	v26,v10,v11,0b10	; A[4][1..2]
	vpermdi	v27,v12,v10,0b00	; A[4][4..0]
	vpermdi	v28,v11,v12,0b10	; A[4][3..4]
	vpermdi	v29,v10,v10,0b10	; A[4][1..0]
	vandc	v26,v11,v26		; (~A[4][1..2] & A[4][2..3])
	vandc	v27,v27,v28		; (~A[4][3..4] & A[4][4..0])
	vandc	v28,v10,v29		; (~A[4][1..0] & A[4][0..1])
	vxor	v10,v10,v26		; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
	vxor	v11,v11,v27		; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
	vxor	v12,v12,v28		; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])

	bdnz	.Loop

	vpermdi	v12,v12,v12,0b11	; broadcast A[4][4]
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers

	li	r11,16
	lvx_4w	v0,0,r3			; load A[5][5]
	li	r10,32
	lvx_4w	v1,r11,r3
	addi	r11,r11,32
	lvx_4w	v2,r10,r3
	addi	r10,r10,32
	lvx_4w	v3,r11,r3
	addi	r11,r11,32
	lvx_4w	v4,r10,r3
	addi	r10,r10,32
	lvx_4w	v5,r11,r3
	addi	r11,r11,32
	lvx_4w	v6,r10,r3
	addi	r10,r10,32
	lvx_4w	v7,r11,r3
	addi	r11,r11,32
	lvx_4w	v8,r10,r3
	addi	r10,r10,32
	lvx_4w	v9,r11,r3
	addi	r11,r11,32
	lvx_4w	v10,r10,r3
	addi	r10,r10,32
	lvx_4w	v11,r11,r3
	lvx_splt v12,r10,r3

	bl	PICmeup

	li	r11,16
	lvx_u	v13,0,r12		; load rhotates
	li	r10,32
	lvx_u	v14,r11,r12
	addi	r11,r11,32
	lvx_u	v15,r10,r12
	addi	r10,r10,32
	lvx_u	v16,r11,r12
	addi	r11,r11,32
	lvx_u	v17,r10,r12
	addi	r10,r10,32
	lvx_u	v18,r11,r12
	addi	r11,r11,32
	lvx_u	v19,r10,r12
	addi	r10,r10,32
	lvx_u	v20,r11,r12
	addi	r11,r11,32
	lvx_u	v21,r10,r12
	addi	r10,r10,32
	lvx_u	v22,r11,r12
	addi	r11,r11,32
	lvx_u	v23,r10,r12
	addi	r10,r10,32
	lvx_u	v24,r11,r12
	lvx_u	v25,r10,r12
	addi	r12,r12,`16*16`		; points at iotas

	bl	KeccakF1600_int

	li	r11,16
	stvx_4w	v0,0,r3			; return A[5][5]
	li	r10,32
	stvx_4w	v1,r11,r3
	addi	r11,r11,32
	stvx_4w	v2,r10,r3
	addi	r10,r10,32
	stvx_4w	v3,r11,r3
	addi	r11,r11,32
	stvx_4w	v4,r10,r3
	addi	r10,r10,32
	stvx_4w	v5,r11,r3
	addi	r11,r11,32
	stvx_4w	v6,r10,r3
	addi	r10,r10,32
	stvx_4w	v7,r11,r3
	addi	r11,r11,32
	stvx_4w	v8,r10,r3
	addi	r10,r10,32
	stvx_4w	v9,r11,r3
	addi	r11,r11,32
	stvx_4w	v10,r10,r3
	addi	r10,r10,32
	stvx_4w	v11,r11,r3
	stvdx_u	v12,r10,r3

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
___
{
my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers

	li	r11,16
	lvx_4w	v0,0,$A_jagged		; load A[5][5]
	li	r10,32
	lvx_4w	v1,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v2,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v3,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v4,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v5,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v6,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v7,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v8,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v9,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v10,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v11,r11,$A_jagged
	lvx_splt v12,r10,$A_jagged

	bl	PICmeup

	li	r11,16
	lvx_u	v13,0,r12		; load rhotates
	li	r10,32
	lvx_u	v14,r11,r12
	addi	r11,r11,32
	lvx_u	v15,r10,r12
	addi	r10,r10,32
	lvx_u	v16,r11,r12
	addi	r11,r11,32
	lvx_u	v17,r10,r12
	addi	r10,r10,32
	lvx_u	v18,r11,r12
	addi	r11,r11,32
	lvx_u	v19,r10,r12
	addi	r10,r10,32
	lvx_u	v20,r11,r12
	addi	r11,r11,32
	lvx_u	v21,r10,r12
	addi	r10,r10,32
	lvx_u	v22,r11,r12
	addi	r11,r11,32
	lvx_u	v23,r10,r12
	addi	r10,r10,32
	lvx_u	v24,r11,r12
	lvx_u	v25,r10,r12
	li	r10,-32
	li	r11,-16
	addi	r12,r12,`16*16`		; points at iotas
	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	$len,$bsz		; len < bsz?
	blt	.Labsorbed

	sub	$len,$len,$bsz		; len -= bsz
	srwi	r0,$bsz,3
	mtctr	r0

	lvx_u	v30,r10,r12		; permutation masks
	lvx_u	v31,r11,r12
	?vspltisb v27,7			; prepare masks for byte swap
	?vxor	v30,v30,v27		; on big-endian
	?vxor	v31,v31,v27

	vxor	v27,v27,v27		; zero
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v0, v0, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v1, v1, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v2, v2, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v3, v3, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v4, v4, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v0, v0, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v1, v1, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v2, v2, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v3, v3, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v4, v4, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v5, v5, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v6, v6, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v7, v7, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v8, v8, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v9, v9, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v5, v5, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v6, v6, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v7, v7, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v8, v8, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v9, v9, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v10, v10, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v10, v10, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v11, v11, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v11, v11, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v12, v12, v26

.Lprocess_block:
	bl	KeccakF1600_int

	b	.Loop_absorb

.align	4
.Labsorbed:
	li	r11,16
	stvx_4w	v0,0,$A_jagged		; return A[5][5]
	li	r10,32
	stvx_4w	v1,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v2,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v3,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v4,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v5,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v6,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v7,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v8,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v9,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v10,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v11,r11,$A_jagged
	stvdx_u	v12,r10,$A_jagged

	mr	r3,$len			; return value
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
}
{
my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	mflr	r9			; r9 is not touched by KeccakF1600
	subi	$out,$out,1		; prepare for stbu
	addi	r8,$A_jagged,4		; prepare volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze
.align	4
.Loop_squeeze:
	lwzx	r7,r11,r8		; lo
	lwzx	r0,r11,$A_jagged	; hi
	${UCMP}i $len,8
	blt	.Lsqueeze_tail

	stbu	r7,1($out)		; write lo
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	stbu	r0,1($out)		; write hi
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)

	subic.	$len,$len,8
	beqlr				; return if done

	subic.	r10,r10,8
	ble	.Loutput_expand

	addi	r11,r11,16		; calculate jagged index
	cmplwi	r11,`16*5`
	blt	.Loop_squeeze
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	cmplwi	r11,`16*5+8`
	subi	r11,r11,8
	beq	.Loop_squeeze
	addi	r11,r11,8
	cmplwi	r11,`16*10`
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	blt	.Loop_squeeze
	subi	r11,r11,8
	b	.Loop_squeeze

.align	4
.Loutput_expand:
	bl	KeccakF1600
	mtlr	r9

	addi	r8,$A_jagged,4		; restore volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	mtctr	$len
	subic.	$len,$len,4
	ble	.Loop_tail_lo
	li	r8,4
	mtctr	r8
.Loop_tail_lo:
	stbu	r7,1($out)
	srdi	r7,r7,8
	bdnz	.Loop_tail_lo
	ble	.Lsqueeze_done
	mtctr	$len
.Loop_tail_hi:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail_hi

.Lsqueeze_done:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	rhotates,\@object
.align	6
rhotates:
	.quad	0,  36
	.quad	1,  44
	.quad	62,  6
	.quad	28, 55
	.quad	27, 20
	.quad	3,  41
	.quad	10, 45
	.quad	43, 15
	.quad	25, 21
	.quad	39,  8
	.quad	18,  2
	.quad	61, 56
	.quad	14, 14
.size	rhotates,.-rhotates
	.quad	0,0
	.quad	0x0001020304050607,0x1011121314151617
	.quad	0x1011121314151617,0x0001020304050607
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001,0
	.quad	0x0000000000008082,0
	.quad	0x800000000000808a,0
	.quad	0x8000000080008000,0
	.quad	0x000000000000808b,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008009,0
	.quad	0x000000000000008a,0
	.quad	0x0000000000000088,0
	.quad	0x0000000080008009,0
	.quad	0x000000008000000a,0
	.quad	0x000000008000808b,0
	.quad	0x800000000000008b,0
	.quad	0x8000000000008089,0
	.quad	0x8000000000008003,0
	.quad	0x8000000000008002,0
	.quad	0x8000000000000080,0
	.quad	0x000000000000800a,0
	.quad	0x800000008000000a,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008080,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008008,0
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the accumulated code: evaluate `...` arithmetic, then either
# keep or comment out the '?'-prefixed byte-swap instructions depending on
# target endianness, and emit each line for ppc-xlate.pl to translate.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	if ($flavour =~ /le$/) {	# little-endian
		s/\?([a-z]+)/;$1/;
	} else {			# big-endian
		s/\?([a-z]+)/$1/;
	}

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette