VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/poly1305/asm/poly1305-s390x.pl@ 94083

最後變更 在這個檔案從94083是 94082,由 vboxsync 提交於 3 年 前

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

  • 屬性 svn:executable 設為 *
檔案大小: 24.1 KB
 
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for s390x.
#
# June 2015
#
# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
# code. For older compilers the improvement coefficient is >3x, because
# then base 2^64 and base 2^32 implementations are compared.
#
# On a side note, z13 enables a vector base 2^26 implementation...

#
# January 2019
#
# Add vx code path (base 2^26).
#
# Copyright IBM Corp. 2019
# Author: Patrick Steuer <[email protected]>

#
# January 2019
#
# Add vector base 2^26 implementation. It's problematic to accurately
# measure performance, because the reference system is hardly idle. But
# it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's
# >=20% faster than IBM's submission on long inputs, and much faster on
# short ones, because calculation of key powers is postponed till we
# know that input is long enough to justify the additional overhead.

use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :GE :EI :MI1 :VX AUTOLOAD LABEL INCLUDE);

# Command-line handling:
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# $z selects the ABI the generated code targets and $SIZE_T is the
# pointer/register-save slot size used in every stack-offset expression
# below. A missing flavour selects the 64-bit (zSeries) ABI.
my ($z,$SIZE_T);
if (defined($flavour) && $flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

# Size of the stack frame allocated by __poly1305_blocks_vx:
# 16 register-sized slots plus 4 eight-byte slots.
my $stdframe=16*$SIZE_T+4*8;
my $sp="%r15";

# The four argument registers, named after the parameters of
# poly1305_blocks(ctx, inp, len, padbit).
my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));

PERLASM_BEGIN($output);

INCLUDE	("s390x_arch.h");
TEXT	();

################
# static void poly1305_init(void *ctx, const unsigned char key[16])
#
# Context layout as used by the code in this file:
#    0..23  hash value (three 64-bit words in base 2^64, or five 32-bit
#           limbs in base 2^26 when is_base2_26 is set)
#   24      is_base2_26 flag (32-bit)
#   32..47  clamped key r0,r1 (base 2^64)
#   48...   key powers in base 2^26 (written by __poly1305_blocks_vx)
#
# The hash value and the flag are always cleared. If the key pointer is
# NULL the routine returns 0 in %r2 without touching anything else.
# Otherwise it stores the clamped key, writes two code addresses at
# 0(%r4) -- the blocks routine (scalar or vector, chosen from the vx
# capability bit) followed by the emit routine -- and returns 1 in %r2.
# NOTE(review): %r4 is presumably a caller-supplied function table passed
# as a third argument that the C-style prototype above omits -- confirm
# against the caller.
{
GLOBL	("poly1305_init");
TYPE	("poly1305_init","\@function");
ALIGN	(16);
LABEL	("poly1305_init");
	lghi	("%r0",0);
	lghi	("%r1",-1);
	stg	("%r0","0($ctx)");	# zero hash value
	stg	("%r0","8($ctx)");
	stg	("%r0","16($ctx)");
	st	("%r0","24($ctx)");	# clear is_base2_26
	lgr	("%r5",$ctx);		# reassign $ctx
	lghi	("%r2",0);		# return value for the no-key case

&{$z?	\&clgr:\&clr}	($inp,"%r0");	# NULL key pointer?
	je	(".Lno_key");

	lrvg	("%r2","0($inp)");	# load little-endian key
	lrvg	("%r3","8($inp)");

	# build the two clamping masks from the all-ones value in %r1
	nihl	("%r1",0xffc0);		# 0xffffffc0ffffffff
	srlg	("%r0","%r1",4);	# 0x0ffffffc0fffffff
	srlg	("%r1","%r1",4);
	nill	("%r1",0xfffc);		# 0x0ffffffc0ffffffc

	ngr	("%r2","%r0");
	ngr	("%r3","%r1");

	stmg	("%r2","%r3","32(%r5)");	# store clamped key

	larl	("%r1","OPENSSL_s390xcap_P");
	lg	("%r0","16(%r1)");
	srlg	("%r0","%r0",62);
	nill	("%r0",1);		# extract vx bit
	lcgr	("%r0","%r0");		# 0 or all-ones selection mask
	larl	("%r1",".Lpoly1305_blocks");
	larl	("%r2",".Lpoly1305_blocks_vx");
	larl	("%r3",".Lpoly1305_emit");
&{$z?	\&xgr:\&xr}	("%r2","%r1");	# select between scalar and vector
&{$z?	\&ngr:\&nr}	("%r2","%r0");
&{$z?	\&xgr:\&xr}	("%r2","%r1");
&{$z?	\&stmg:\&stm}	("%r2","%r3","0(%r4)");
	lghi	("%r2",1);		# return value for the key case
LABEL	(".Lno_key");
	br	("%r14");
SIZE	("poly1305_init",".-poly1305_init");
}

################
# static void poly1305_blocks(void *ctx, const unsigned char *inp,
#                             size_t len, u32 padbit)
#
# Scalar base 2^64 code path. Returns immediately when len is zero;
# otherwise processes len/16 sixteen-byte blocks, updating the hash value
# stored at 0..23($ctx). .Lpoly1305_blocks_entry is also the join point
# for poly1305_blocks_vx, which arrives with %r6-%r14 already saved and
# the hash already loaded in $h0,$h1,$h2.
{
# mlgr works on even/odd register pairs, hence the hi:lo pairing
# %r6:%r7, %r8:%r9, %r10:%r11 and %r12:%r13.
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
# $s1 is %r2, the same register as $ctx, which is why $ctx is off-loaded
# to the stack for the duration of the loop.
my ($r0,$r1,$s1) = map("%r$_",(0..2));

GLOBL	("poly1305_blocks");
TYPE	("poly1305_blocks","\@function");
ALIGN	(16);
LABEL	("poly1305_blocks");
LABEL	(".Lpoly1305_blocks");
&{$z?	\&ltgr:\&ltr}	("%r0",$len);
	jz	(".Lno_data");

&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");

	lg	($h0,"0($ctx)");	# load hash value
	lg	($h1,"8($ctx)");
	lg	($h2,"16($ctx)");

LABEL	(".Lpoly1305_blocks_entry");
if ($z) {
	srlg	($len,$len,4);		# byte count -> block count
} else {
	srl	($len,4);
}
	llgfr	($padbit,$padbit);	# clear upper half, much needed with
					# non-64-bit ABI
	lg	($r0,"32($ctx)");	# load key
	lg	($r1,"40($ctx)");

&{$z?	\&stg:\&st}	($ctx,"2*$SIZE_T($sp)");	# off-load $ctx
	srlg	($s1,$r1,2);
	algr	($s1,$r1);		# s1 = r1 + r1>>2
	j	(".Loop");

ALIGN	(16);
LABEL	(".Loop");
	lrvg	($d0lo,"0($inp)");	# load little-endian input
	lrvg	($d1lo,"8($inp)");
	la	($inp,"16($inp)");

	algr	($d0lo,$h0);		# accumulate input
	alcgr	($d1lo,$h1);
	alcgr	($h2,$padbit);

	lgr	($h0,$d0lo);
	mlgr	($d0hi,$r0);		# h0*r0 -> $d0hi:$d0lo
	lgr	($h1,$d1lo);
	mlgr	($d1hi,$s1);		# h1*5*r1 -> $d1hi:$d1lo

	mlgr	($t0,$r1);		# h0*r1 -> $t0:$h0
	mlgr	($t1,$r0);		# h1*r0 -> $t1:$h1

	algr	($d0lo,$d1lo);
	lgr	($d1lo,$h2);
	alcgr	($d0hi,$d1hi);
	lghi	($d1hi,0);

	algr	($h1,$h0);
	alcgr	($t1,$t0);

	msgr	($d1lo,$s1);		# h2*s1
	msgr	($h2,$r0);		# h2*r0

	algr	($h1,$d1lo);
	alcgr	($t1,$d1hi);		# $d1hi is zero

	algr	($h1,$d0hi);
	alcgr	($h2,$t1);

	lghi	($h0,-4);		# final reduction step
	ngr	($h0,$h2);
	srlg	($t0,$h2,2);
	algr	($h0,$t0);
	lghi	($t1,3);
	ngr	($h2,$t1);

	algr	($h0,$d0lo);
	alcgr	($h1,$d1hi);		# $d1hi is still zero
	alcgr	($h2,$d1hi);		# $d1hi is still zero

&{$z?	\&brctg:\&brct}	($len,".Loop");

&{$z?	\&lg:\&l}	($ctx,"2*$SIZE_T($sp)");	# restore $ctx

	stg	($h0,"0($ctx)");	# store hash value
	stg	($h1,"8($ctx)");
	stg	($h2,"16($ctx)");

&{$z?	\&lmg:\&lm}	("%r6","%r14","6*$SIZE_T($sp)");
LABEL	(".Lno_data");
	br	("%r14");
SIZE	("poly1305_blocks",".-poly1305_blocks");
}

################
# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
#                                size_t len, u32 padbit)
#
# Entry point selected by poly1305_init when the vx bit is set. Inputs
# of 128 bytes or more go to __poly1305_blocks_vx. Shorter inputs are
# handled by the scalar loop: the stored hash is converted from base 2^26
# to base 2^64 if is_base2_26 is set, the flag is cleared, and control
# jumps to .Lpoly1305_blocks_entry.
#
# This brace block stays open past the end of this routine: the register
# names declared here are shared with __poly1305_mul and
# __poly1305_blocks_vx, which follow.
{
my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4));
my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9));
my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14));
my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18));
my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23));
my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27));
my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31));

# $h0,$h1,$h2 land on %r11,%r13,%r14, the same registers the scalar
# poly1305_blocks code uses for them, so the jump to
# .Lpoly1305_blocks_entry needs no register shuffling.
my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14));

TYPE	("poly1305_blocks_vx","\@function");
ALIGN	(16);
LABEL	("poly1305_blocks_vx");
LABEL	(".Lpoly1305_blocks_vx");
&{$z?	\&clgfi:\&clfi}	($len,128);
	jhe	("__poly1305_blocks_vx");

&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");

	lg	($d0,"0($ctx)");
	lg	($d1,"8($ctx)");
	lg	($d2,"16($ctx)");

	llgfr	("%r0",$d0);		# base 2^26 -> base 2^64
	srlg	($h0,$d0,32);
	llgfr	("%r1",$d1);
	srlg	($h1,$d1,32);
	srlg	($h2,$d2,32);

	sllg	("%r0","%r0",26);
	algr	($h0,"%r0");
	sllg	("%r0",$h1,52);
	srlg	($h1,$h1,12);
	sllg	("%r1","%r1",14);
	algr	($h0,"%r0");
	alcgr	($h1,"%r1");
	sllg	("%r0",$h2,40);
	srlg	($h2,$h2,24);
	lghi	("%r1",0);
	algr	($h1,"%r0");
	alcgr	($h2,"%r1");

	llgf	("%r0","24($ctx)");	# is_base2_26
	lcgr	("%r0","%r0");		# 0 or all-ones selection mask

	# h = ((converted ^ stored) & mask) ^ stored: the converted value
	# when the flag is set, the stored value unchanged otherwise
	xgr	($h0,$d0);		# choose between radixes
	xgr	($h1,$d1);
	xgr	($h2,$d2);
	ngr	($h0,"%r0");
	ngr	($h1,"%r0");
	ngr	($h2,"%r0");
	xgr	($h0,$d0);
	xgr	($h1,$d1);
	xgr	($h2,$d2);

	lhi	("%r0",0);
	st	("%r0","24($ctx)");	# clear is_base2_26

	j	(".Lpoly1305_blocks_entry");
SIZE	("poly1305_blocks_vx",".-poly1305_blocks_vx");

# __poly1305_mul: internal helper called with brasl from
# __poly1305_blocks_vx while the key powers are being computed.
# Multiplies the five limbs in $H0..$H4 by $R0..$R4 (with $S1..$S4
# holding the corresponding 5*R values), accumulating into $ACC0..$ACC4,
# then performs the lazy reduction and leaves the result in $H0..$H4.
# Uses only vector registers and returns through %r14; expects $mask26
# to be set up by the caller.
TYPE	("__poly1305_mul","\@function");
ALIGN	(16);
LABEL	("__poly1305_mul");
	vmlof	($ACC0,$H0,$R0);
	vmlof	($ACC1,$H0,$R1);
	vmlof	($ACC2,$H0,$R2);
	vmlof	($ACC3,$H0,$R3);
	vmlof	($ACC4,$H0,$R4);

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# lazy reduction

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);	# h3 -> h4
	vag	($H1,$H1,$ACC1);	# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);	# h1 -> h2

	veslg	($ACC4,$ACC4,2);	# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);	# h4 -> h0
	vag	($H3,$H3,$ACC2);	# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);	# h0 -> h1
	vag	($H4,$H4,$ACC3);	# h3 -> h4
	br	("%r14");
SIZE	("__poly1305_mul",".-__poly1305_mul");

# __poly1305_blocks_vx: vector base 2^26 code path, reached from
# poly1305_blocks_vx when len >= 128.
#
# This first part sets up the stack frame and, when the context is not
# yet in base 2^26 (is_base2_26 == 0), computes the key powers, converts
# the stored hash to base 2^26, sets the flag and saves the key schedule
# at 48($ctx). When the flag is already set it branches to .Lskip_init.
TYPE	("__poly1305_blocks_vx","\@function");
ALIGN	(16);
LABEL	("__poly1305_blocks_vx");
&{$z?	\&lgr:\&lr}	("%r0",$sp);	# keep old $sp for the back-chain
&{$z?	\&stmg:\&stm}	("%r10","%r15","10*$SIZE_T($sp)");
if (!$z) {
	# 31-bit ABI: save %f4 and %f6 in the caller's frame, then
	# allocate $stdframe bytes
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
	ahi	($sp,-$stdframe);
	st	("%r0","0($sp)");	# back-chain

	llgfr	($len,$len);		# so that srlg works on $len
} else {
	# 64-bit ABI: allocate $stdframe plus eight 8-byte slots and
	# save %f8-%f15 in those slots
	aghi	($sp,"-($stdframe+8*8)");
	stg	("%r0","0($sp)");	# back-chain

	std	("%f8","$stdframe+0*8($sp)");
	std	("%f9","$stdframe+1*8($sp)");
	std	("%f10","$stdframe+2*8($sp)");
	std	("%f11","$stdframe+3*8($sp)");
	std	("%f12","$stdframe+4*8($sp)");
	std	("%f13","$stdframe+5*8($sp)");
	std	("%f14","$stdframe+6*8($sp)");
	std	("%f15","$stdframe+7*8($sp)");
}
	larl	("%r1",".Lconst");	# %r1 stays the constant-table base
	vgmg	($mask26,38,63);
	vlm	($bswaplo,$bswapmi,"16(%r1)");	# three byte-swap masks

	# 'lt' is a Perl operator, so the mnemonic needs the explicit
	# &-call form
	&lt	("%r0","24($ctx)");	# is_base2_26?
	jnz	(".Lskip_init");

	lg	($h0,"32($ctx)");	# load key base 2^64
	lg	($h1,"40($ctx)");

	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
	srlg	($d1,$h0,52);
	risbg	($h0,$h0,38,0x80+63,0);
	vlvgg	($R0,$h0,0);
	risbg	($d1,$h1,38,51,12);
	vlvgg	($R1,$d0,0);
	risbg	($d0,$h1,38,63,50);
	vlvgg	($R2,$d1,0);
	srlg	($d1,$h1,40);
	vlvgg	($R3,$d0,0);
	vlvgg	($R4,$d1,0);

	veslg	($S1,$R1,2);
	veslg	($S2,$R2,2);
	veslg	($S3,$R3,2);
	veslg	($S4,$R4,2);
	vlr	($H0,$R0);
	vlr	($H1,$R1);
	vlr	($H2,$R2);
	vlr	($H3,$R3);
	vlr	($H4,$R4);
	vag	($S1,$S1,$R1);		# * 5
	vag	($S2,$S2,$R2);
	vag	($S3,$S3,$R3);
	vag	($S4,$S4,$R4);

	brasl	("%r14","__poly1305_mul");	# r^1:- * r^1:-

	vpdi	($R0,$H0,$R0,0);	# r^2:r^1
	vpdi	($R1,$H1,$R1,0);
	vpdi	($R2,$H2,$R2,0);
	vpdi	($R3,$H3,$R3,0);
	vpdi	($R4,$H4,$R4,0);
	vpdi	($H0,$H0,$H0,0);	# r^2:r^2
	vpdi	($H1,$H1,$H1,0);
	vpdi	($H2,$H2,$H2,0);
	vpdi	($H3,$H3,$H3,0);
	vpdi	($H4,$H4,$H4,0);
	veslg	($S1,$R1,2);
	veslg	($S2,$R2,2);
	veslg	($S3,$R3,2);
	veslg	($S4,$R4,2);
	vag	($S1,$S1,$R1);		# * 5
	vag	($S2,$S2,$R2);
	vag	($S3,$S3,$R3);
	vag	($S4,$S4,$R4);

	# register and target passed as two operands, matching the first
	# call above
	brasl	("%r14","__poly1305_mul");	# r^2:r^2 * r^2:r^1

	vl	($I0,"0(%r1)");		# borrow $I0
	vperm	($R0,$R0,$H0,$I0);	# r^2:r^4:r^1:r^3
	vperm	($R1,$R1,$H1,$I0);
	vperm	($R2,$R2,$H2,$I0);
	vperm	($R3,$R3,$H3,$I0);
	vperm	($R4,$R4,$H4,$I0);
	veslf	($S1,$R1,2);
	veslf	($S2,$R2,2);
	veslf	($S3,$R3,2);
	veslf	($S4,$R4,2);
	vaf	($S1,$S1,$R1);		# * 5
	vaf	($S2,$S2,$R2);
	vaf	($S3,$S3,$R3);
	vaf	($S4,$S4,$R4);

	lg	($h0,"0($ctx)");	# load hash base 2^64
	lg	($h1,"8($ctx)");
	lg	($h2,"16($ctx)");

	vzero	($H0);
	vzero	($H1);
	vzero	($H2);
	vzero	($H3);
	vzero	($H4);

	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
	srlg	($d1,$h0,52);
	risbg	($h0,$h0,38,0x80+63,0);
	vlvgg	($H0,$h0,0);
	risbg	($d1,$h1,38,51,12);
	vlvgg	($H1,$d0,0);
	risbg	($d0,$h1,38,63,50);
	vlvgg	($H2,$d1,0);
	srlg	($d1,$h1,40);
	vlvgg	($H3,$d0,0);
	risbg	($d1,$h2,37,39,24);
	vlvgg	($H4,$d1,0);

	lhi	("%r0",1);
	st	("%r0","24($ctx)");	# set is_base2_26

	vstm	($R0,$S4,"48($ctx)");	# save key schedule base 2^26

	vpdi	($R0,$R0,$R0,0);	# broadcast r^2:r^4
	vpdi	($R1,$R1,$R1,0);
	vpdi	($S1,$S1,$S1,0);
	vpdi	($R2,$R2,$R2,0);
	vpdi	($S2,$S2,$S2,0);
	vpdi	($R3,$R3,$R3,0);
	vpdi	($S3,$S3,$S3,0);
	vpdi	($R4,$R4,$R4,0);
	vpdi	($S4,$S4,$S4,0);

	j	(".Loaded_hash");

# .Lskip_init: taken when is_base2_26 is already set. Loads the five
# 32-bit hash limbs from 0..19($ctx) and the broadcast r^2:r^4 key powers
# from the schedule saved at 0x30($ctx) onwards.
ALIGN	(16);
LABEL	(".Lskip_init");
	vllezf	($H0,"0($ctx)");	# load hash base 2^26
	vllezf	($H1,"4($ctx)");
	vllezf	($H2,"8($ctx)");
	vllezf	($H3,"12($ctx)");
	vllezf	($H4,"16($ctx)");

	vlrepg	($R0,"0x30($ctx)");	# broadcast r^2:r^4
	vlrepg	($R1,"0x40($ctx)");
	vlrepg	($S1,"0x50($ctx)");
	vlrepg	($R2,"0x60($ctx)");
	vlrepg	($S2,"0x70($ctx)");
	vlrepg	($R3,"0x80($ctx)");
	vlrepg	($S3,"0x90($ctx)");
	vlrepg	($R4,"0xa0($ctx)");
	vlrepg	($S4,"0xb0($ctx)");

# .Loaded_hash: both setup paths join here. Loads the first 64 bytes of
# input, byte-swaps them and splits them into the limbs $I0..$I4, then
# derives the .Loop_vx iteration count in %r0 as len/64 - 1.
LABEL	(".Loaded_hash");
	vzero	($I1);
	vzero	($I3);

	vlm	($T1,$T4,"0x00($inp)");	# load first input block
	la	($inp,"0x40($inp)");
	vgmg	($mask26,6,31);
	vgmf	($I4,5,5);		# padbit<<2

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	verimg	($I1,$I0,$mask26,6);	# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);		# >>4
	verimg	($I3,$T3,$mask26,18);	# >>14
	verimg	($I4,$T3,$mask26,58);	# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);		# >>2

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);	# >>26
	verimg	($I2,$T2,$mask26,60);	# >>4
	verimg	($I3,$T4,$mask26,50);	# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	srlg	("%r0",$len,6);
&{$z?	\&aghi:\&ahi}	("%r0",-1);

# .Loop_vx: main vector loop, one 64-byte group per iteration, counted
# down in %r0. The pending input limbs $I0..$I4 (vmlef/vmalef) and the
# running hash $H0..$H4 (vmalof) are multiplied by the key powers into
# $ACC0..$ACC4. Loading and limb-splitting of the next 64 bytes is
# interleaved with the multiplications, so statement order matters here.
ALIGN	(16);
LABEL	(".Loop_vx");
	vmlef	($ACC0,$I0,$R0);
	vmlef	($ACC1,$I0,$R1);
	vmlef	($ACC2,$I0,$R2);
	vmlef	($ACC3,$I0,$R3);
	vmlef	($ACC4,$I0,$R4);

	vmalef	($ACC0,$I1,$S4,$ACC0);
	vmalef	($ACC1,$I1,$R0,$ACC1);
	vmalef	($ACC2,$I1,$R1,$ACC2);
	vmalef	($ACC3,$I1,$R2,$ACC3);
	vmalef	($ACC4,$I1,$R3,$ACC4);

	vaf	($H2,$H2,$I2);
	vaf	($H0,$H0,$I0);
	vaf	($H3,$H3,$I3);
	vaf	($H1,$H1,$I1);
	vaf	($H4,$H4,$I4);

	vmalef	($ACC0,$I2,$S3,$ACC0);
	vmalef	($ACC1,$I2,$S4,$ACC1);
	vmalef	($ACC2,$I2,$R0,$ACC2);
	vmalef	($ACC3,$I2,$R1,$ACC3);
	vmalef	($ACC4,$I2,$R2,$ACC4);

	vlm	($T1,$T4,"0x00($inp)");	# load next input block
	la	($inp,"0x40($inp)");
	vgmg	($mask26,6,31);

	vmalef	($ACC0,$I3,$S2,$ACC0);
	vmalef	($ACC1,$I3,$S3,$ACC1);
	vmalef	($ACC2,$I3,$S4,$ACC2);
	vmalef	($ACC3,$I3,$R0,$ACC3);
	vmalef	($ACC4,$I3,$R1,$ACC4);

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	vmalef	($ACC0,$I4,$S1,$ACC0);
	vmalef	($ACC1,$I4,$S2,$ACC1);
	vmalef	($ACC2,$I4,$S3,$ACC2);
	vmalef	($ACC3,$I4,$S4,$ACC3);
	vmalef	($ACC4,$I4,$R0,$ACC4);

	verimg	($I1,$I0,$mask26,6);	# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);		# >>4
	verimg	($I3,$T3,$mask26,18);	# >>14

	vmalof	($ACC0,$H0,$R0,$ACC0);
	vmalof	($ACC1,$H0,$R1,$ACC1);
	vmalof	($ACC2,$H0,$R2,$ACC2);
	vmalof	($ACC3,$H0,$R3,$ACC3);
	vmalof	($ACC4,$H0,$R4,$ACC4);

	vgmf	($I4,5,5);		# padbit<<2
	verimg	($I4,$T3,$mask26,58);	# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);		# >>2

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);	# >>26
	verimg	($I2,$T2,$mask26,60);	# >>4

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	verimg	($I3,$T4,$mask26,50);	# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);	# h3 -> h4
	vag	($H1,$H1,$ACC1);	# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);	# h1 -> h2

	veslg	($ACC4,$ACC4,2);	# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);	# h4 -> h0
	vag	($H3,$H3,$ACC2);	# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);	# h0 -> h1
	vag	($H4,$H4,$ACC3);	# h3 -> h4

&{$z?	\&brctg:\&brct}	("%r0",".Loop_vx");

	vlm	($R0,$S4,"48($ctx)");	# load all powers

	# $len = (-$len) & 0x30, and $inp is stepped back by that many
	# bytes, so the 64-byte load of a partial tail group in .Last
	# stays within the input.
	# NOTE(review): assumes $len is a multiple of 16 here -- confirm
	# against the caller.
	lghi	("%r0",0x30);
&{$z?	\&lcgr:\&lcr}	($len,$len);
&{$z?	\&ngr:\&nr}	($len,"%r0");
&{$z?	\&slgr:\&slr}	($inp,$len);

# .Last: final multiplication pass using the full set of key powers
# reloaded from 48($ctx), followed by horizontal addition and lazy
# reduction. If $len is non-zero afterwards, the last partial 64-byte
# group is loaded, masked with the "magic tail masks" from .Lconst
# (selected by $len), merged with the hash, $len is cleared and the pass
# is executed once more; otherwise control continues at .Ldone.
LABEL	(".Last");
	vmlef	($ACC0,$I0,$R0);
	vmlef	($ACC1,$I0,$R1);
	vmlef	($ACC2,$I0,$R2);
	vmlef	($ACC3,$I0,$R3);
	vmlef	($ACC4,$I0,$R4);

	vmalef	($ACC0,$I1,$S4,$ACC0);
	vmalef	($ACC1,$I1,$R0,$ACC1);
	vmalef	($ACC2,$I1,$R1,$ACC2);
	vmalef	($ACC3,$I1,$R2,$ACC3);
	vmalef	($ACC4,$I1,$R3,$ACC4);

	vaf	($H0,$H0,$I0);
	vaf	($H1,$H1,$I1);
	vaf	($H2,$H2,$I2);
	vaf	($H3,$H3,$I3);
	vaf	($H4,$H4,$I4);

	vmalef	($ACC0,$I2,$S3,$ACC0);
	vmalef	($ACC1,$I2,$S4,$ACC1);
	vmalef	($ACC2,$I2,$R0,$ACC2);
	vmalef	($ACC3,$I2,$R1,$ACC3);
	vmalef	($ACC4,$I2,$R2,$ACC4);

	vmalef	($ACC0,$I3,$S2,$ACC0);
	vmalef	($ACC1,$I3,$S3,$ACC1);
	vmalef	($ACC2,$I3,$S4,$ACC2);
	vmalef	($ACC3,$I3,$R0,$ACC3);
	vmalef	($ACC4,$I3,$R1,$ACC4);

	vmalef	($ACC0,$I4,$S1,$ACC0);
	vmalef	($ACC1,$I4,$S2,$ACC1);
	vmalef	($ACC2,$I4,$S3,$ACC2);
	vmalef	($ACC3,$I4,$S4,$ACC3);
	vmalef	($ACC4,$I4,$R0,$ACC4);

	vmalof	($ACC0,$H0,$R0,$ACC0);
	vmalof	($ACC1,$H0,$R1,$ACC1);
	vmalof	($ACC2,$H0,$R2,$ACC2);
	vmalof	($ACC3,$H0,$R3,$ACC3);
	vmalof	($ACC4,$H0,$R4,$ACC4);

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# horizontal addition

	vzero	($H0);
	vsumqg	($ACC0,$ACC0,$H0);
	vsumqg	($ACC1,$ACC1,$H0);
	vsumqg	($ACC2,$ACC2,$H0);
	vsumqg	($ACC3,$ACC3,$H0);
	vsumqg	($ACC4,$ACC4,$H0);

	################################################################
	# lazy reduction

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);	# h3 -> h4
	vag	($H1,$H1,$ACC1);	# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);	# h1 -> h2

	veslg	($ACC4,$ACC4,2);	# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);	# h4 -> h0
	vag	($H3,$H3,$ACC2);	# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);	# h0 -> h1
	vag	($H4,$H4,$ACC3);	# h3 -> h4

&{$z?	\&clgfi:\&clfi}	($len,0);	# any partial group left?
	je	(".Ldone");

	vlm	($T1,$T4,"0x00($inp)");	# load last partial block
	vgmg	($mask26,6,31);
	vgmf	($I4,5,5);		# padbit<<2

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	# %r1 still points at .Lconst; $len indexes the tail-mask tables
	vl	($ACC0,"0x30($len,%r1)");	# borrow $ACC0,1
	vl	($ACC1,"0x60($len,%r1)");

	verimg	($I1,$I0,$mask26,6);	# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);		# >>4
	verimg	($I3,$T3,$mask26,18);	# >>14
	verimg	($I4,$T3,$mask26,58);	# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);		# >>2

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);	# >>26
	verimg	($I2,$T2,$mask26,60);	# >>4
	verimg	($I3,$T4,$mask26,50);	# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	vperm	($H0,$H0,$H0,$ACC0);	# move hash to right lane
	vn	($I0,$I0,$ACC1);	# mask redundant lane[s]
	vperm	($H1,$H1,$H1,$ACC0);
	vn	($I1,$I1,$ACC1);
	vperm	($H2,$H2,$H2,$ACC0);
	vn	($I2,$I2,$ACC1);
	vperm	($H3,$H3,$H3,$ACC0);
	vn	($I3,$I3,$ACC1);
	vperm	($H4,$H4,$H4,$ACC0);
	vn	($I4,$I4,$ACC1);

	vaf	($I0,$I0,$H0);		# accumulate hash
	vzero	($H0);			# wipe hash value
	vaf	($I1,$I1,$H1);
	vzero	($H1);
	vaf	($I2,$I2,$H2);
	vzero	($H2);
	vaf	($I3,$I3,$H3);
	vzero	($H3);
	vaf	($I4,$I4,$H4);
	vzero	($H4);

&{$z?	\&lghi:\&lhi}	($len,0);	# so the next pass ends at .Ldone
	j	(".Last");
	# I don't bother to tell apart cases when only one multiplication
	# pass is sufficient, because I argue that mispredicted branch
	# penalties are comparable to overhead of sometimes redundant
	# multiplication pass...

# .Ldone: store the five base 2^26 hash limbs back at 0..19($ctx),
# restore the floating-point and general registers saved in the
# prologue of __poly1305_blocks_vx, and return. Reloading %r15 as part
# of the %r10-%r15 range also releases the stack frame.
LABEL	(".Ldone");
	vstef	($H0,"0($ctx)",3);	# store hash base 2^26
	vstef	($H1,"4($ctx)",3);
	vstef	($H2,"8($ctx)",3);
	vstef	($H3,"12($ctx)",3);
	vstef	($H4,"16($ctx)",3);

if ($z) {
	# 64-bit ABI: %f8-%f15 live in this routine's own frame
	ld	("%f8","$stdframe+0*8($sp)");
	ld	("%f9","$stdframe+1*8($sp)");
	ld	("%f10","$stdframe+2*8($sp)");
	ld	("%f11","$stdframe+3*8($sp)");
	ld	("%f12","$stdframe+4*8($sp)");
	ld	("%f13","$stdframe+5*8($sp)");
	ld	("%f14","$stdframe+6*8($sp)");
	ld	("%f15","$stdframe+7*8($sp)");
	lmg	("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)");
} else {
	# 31-bit ABI: %f4 and %f6 were saved in the caller's frame
	ld	("%f4","$stdframe+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$stdframe+16*$SIZE_T+3*8($sp)");
	lm	("%r10","%r15","$stdframe+10*$SIZE_T($sp)");
}
	br	("%r14");
SIZE	("__poly1305_blocks_vx",".-__poly1305_blocks_vx");
}

################
# static void poly1305_emit(void *ctx, unsigned char mac[16],
#                           const u32 nonce[4])
#
# Produces the final 16-byte tag. The stored hash is converted from
# base 2^26 to base 2^64 when is_base2_26 is set, 5 is added to compare
# against the modulus, the sum is selected when the addition carries out
# of bit 130, the nonce is added, and the low 128 bits are written to
# mac in little-endian order.
{
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10));

GLOBL	("poly1305_emit");
TYPE	("poly1305_emit","\@function");
ALIGN	(16);
LABEL	("poly1305_emit");
LABEL	(".Lpoly1305_emit");
&{$z?	\&stmg:\&stm}	("%r6","%r10","6*$SIZE_T($sp)");

	lg	($d0,"0($ctx)");
	lg	($d1,"8($ctx)");
	lg	($d2,"16($ctx)");

	llgfr	("%r0",$d0);		# base 2^26 -> base 2^64
	srlg	($h0,$d0,32);
	llgfr	("%r1",$d1);
	srlg	($h1,$d1,32);
	srlg	($h2,$d2,32);

	sllg	("%r0","%r0",26);
	algr	($h0,"%r0");
	sllg	("%r0",$h1,52);
	srlg	($h1,$h1,12);
	sllg	("%r1","%r1",14);
	algr	($h0,"%r0");
	alcgr	($h1,"%r1");
	sllg	("%r0",$h2,40);
	srlg	($h2,$h2,24);
	lghi	("%r1",0);
	algr	($h1,"%r0");
	alcgr	($h2,"%r1");

	llgf	("%r0","24($ctx)");	# is_base2_26
	lcgr	("%r0","%r0");		# 0 or all-ones selection mask

	xgr	($h0,$d0);		# choose between radixes
	xgr	($h1,$d1);
	xgr	($h2,$d2);
	ngr	($h0,"%r0");
	ngr	($h1,"%r0");
	ngr	($h2,"%r0");
	xgr	($h0,$d0);
	xgr	($h1,$d1);
	xgr	($h2,$d2);

	lghi	("%r0",5);
	lgr	($d0,$h0);		# keep the unmodified hash
	lgr	($d1,$h1);

	algr	($h0,"%r0");		# compare to modulus
	alcgr	($h1,"%r1");		# %r1 is still zero
	alcgr	($h2,"%r1");

	srlg	($h2,$h2,2);		# did it borrow/carry?
	slgr	("%r1",$h2);		# 0-$h2>>2
	lg	($d2,"0($nonce)");	# load nonce
	lg	($ctx,"8($nonce)");	# $ctx is reused as scratch from here

	# keep h+5 where the mask in %r1 is set, the original h otherwise
	xgr	($h0,$d0);
	xgr	($h1,$d1);
	ngr	($h0,"%r1");
	ngr	($h1,"%r1");
	xgr	($h0,$d0);
	rllg	($d0,$d2,32);		# flip nonce words
	xgr	($h1,$d1);
	rllg	($d1,$ctx,32);

	algr	($h0,$d0);		# accumulate nonce
	alcgr	($h1,$d1);

	strvg	($h0,"0($mac)");	# write little-endian result
	strvg	($h1,"8($mac)");

&{$z?	\&lmg:\&lm}	("%r6","%r10","6*$SIZE_T($sp)");
	br	("%r14");
SIZE	("poly1305_emit",".-poly1305_emit");
}

################
# Constant table used by __poly1305_blocks_vx (addressed through %r1):
#   0x00        permutation used to merge the key powers ("merge odd")
#   0x10..0x3f  three byte-swap permutations, loaded into
#               $bswaplo,$bswaphi,$bswapmi
#   0x40..0x6f  tail permutations, read as 0x30($len,%r1)
#   0x70..0x9f  tail lane masks, read as 0x60($len,%r1)

ALIGN	(16);
LABEL	(".Lconst");
LONG	(0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f);	# merge odd
LONG	(0x07060504,0x03020100,0x17161514,0x13121110);	# byte swap masks
LONG	(0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918);
LONG	(0x00000000,0x09080706,0x00000000,0x19181716);

LONG	(0x00000000,0x00000000,0x00000000,0x0c0d0e0f);	# magic tail masks
LONG	(0x0c0d0e0f,0x00000000,0x00000000,0x00000000);
LONG	(0x00000000,0x00000000,0x0c0d0e0f,0x00000000);

LONG	(0xffffffff,0x00000000,0xffffffff,0xffffffff);
LONG	(0xffffffff,0x00000000,0xffffffff,0x00000000);
LONG	(0x00000000,0x00000000,0xffffffff,0x00000000);

STRING	("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");

PERLASM_END();
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette