1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # This module implements Poly1305 hash for s390x.
|
---|
18 | #
|
---|
19 | # June 2015
|
---|
20 | #
|
---|
21 | # ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
|
---|
22 | # code. For older compiler improvement coefficient is >3x, because
|
---|
23 | # then base 2^64 and base 2^32 implementations are compared.
|
---|
24 | #
|
---|
25 | # On side note, z13 enables vector base 2^26 implementation...
|
---|
26 |
|
---|
27 | #
|
---|
28 | # January 2019
|
---|
29 | #
|
---|
30 | # Add vx code path (base 2^26).
|
---|
31 | #
|
---|
32 | # Copyright IBM Corp. 2019
|
---|
33 | # Author: Patrick Steuer <[email protected]>
|
---|
34 |
|
---|
35 | #
|
---|
36 | # January 2019
|
---|
37 | #
|
---|
38 | # Add vector base 2^26 implementation. It's problematic to accurately
|
---|
39 | # measure performance, because reference system is hardly idle. But
|
---|
40 | # it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's
|
---|
41 | # >=20% faster than IBM's submission on long inputs, and much faster on
|
---|
42 | # short ones, because calculation of key powers is postponed till we
|
---|
43 | # know that input is long enough to justify the additional overhead.
|
---|
44 |
|
---|
45 | use strict;
|
---|
46 | use FindBin qw($Bin);
|
---|
47 | use lib "$Bin/../..";
|
---|
48 | use perlasm::s390x qw(:DEFAULT :GE :EI :MI1 :VX AUTOLOAD LABEL INCLUDE);
|
---|
49 |
|
---|
# Command line handling.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Pick the ABI from the flavour string: anything matching /3[12]/ selects
# the 31-bit S/390 ABI, everything else (including no flavour at all)
# selects the 64-bit zSeries ABI.  $SIZE_T is the size of a saved GPR slot.
my ($z,$SIZE_T);
if (defined($flavour) && $flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

# Frame size used when a function allocates its own stack frame:
# 16 GPR-sized slots plus 4 eight-byte slots.
my $stdframe=16*$SIZE_T+4*8;
my $sp="%r15";

# Registers %r2..%r5 as named by the C prototypes of the entry points.
my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));

PERLASM_BEGIN($output);

INCLUDE	("s390x_arch.h");
TEXT	();
73 |
|
---|
################
# static void poly1305_init(void *ctx, const unsigned char key[16])
#
# NOTE(review): the prototype above is the file's original annotation.
# The code below additionally (a) stores two code pointers through %r4
# and (b) leaves 0 or 1 in %r2 before returning, so the real contract
# looks like "int poly1305_init(ctx, key, void *func[2])" - confirm
# against the C caller.
#
# Effects visible in this block:
#   0..23($ctx)   hash value, zeroed
#   24($ctx)      is_base2_26 flag, cleared
#   32..47($ctx)  key r, loaded little-endian and clamped
#   0(%r4)        pointer to the selected blocks routine (scalar or vx)
#   SIZE_T(%r4)   pointer to .Lpoly1305_emit
#   %r2           0 if $inp (key) is NULL, 1 otherwise
{
GLOBL	("poly1305_init");
TYPE	("poly1305_init","\@function");
ALIGN	(16);
LABEL	("poly1305_init");
	lghi	("%r0",0);
	lghi	("%r1",-1);
	stg	("%r0","0($ctx)");	# zero hash value
	stg	("%r0","8($ctx)");
	stg	("%r0","16($ctx)");
	st	("%r0","24($ctx)");	# clear is_base2_26
	lgr	("%r5",$ctx);		# reassign $ctx
	lghi	("%r2",0);		# result for the no-key case

&{$z?	\&clgr:\&clr}	($inp,"%r0");	# key == NULL?
	je	(".Lno_key");

	lrvg	("%r2","0($inp)");	# load little-endian key
	lrvg	("%r3","8($inp)");

	# build the two clamp masks out of the all-ones value in %r1
	nihl	("%r1",0xffc0);		# 0xffffffc0ffffffff
	srlg	("%r0","%r1",4);	# 0x0ffffffc0fffffff
	srlg	("%r1","%r1",4);
	nill	("%r1",0xfffc);		# 0x0ffffffc0ffffffc

	ngr	("%r2","%r0");		# clamp low half of r
	ngr	("%r3","%r1");		# clamp high half of r

	stmg	("%r2","%r3","32(%r5)");	# %r5 holds the original $ctx

	larl	("%r1","OPENSSL_s390xcap_P");
	lg	("%r0","16(%r1)");
	srlg	("%r0","%r0",62);
	nill	("%r0",1);		# extract vx bit
	lcgr	("%r0","%r0");		# 0 -> 0, 1 -> all-ones mask
	larl	("%r1",".Lpoly1305_blocks");
	larl	("%r2",".Lpoly1305_blocks_vx");
	larl	("%r3",".Lpoly1305_emit");
	# branch-free select: %r2 = ((vx ^ scalar) & mask) ^ scalar
&{$z?	\&xgr:\&xr}	("%r2","%r1");	# select between scalar and vector
&{$z?	\&ngr:\&nr}	("%r2","%r0");
&{$z?	\&xgr:\&xr}	("%r2","%r1");
&{$z?	\&stmg:\&stm}	("%r2","%r3","0(%r4)");	# blocks and emit pointers
	lghi	("%r2",1);		# result: function table was filled in
LABEL	(".Lno_key");
	br	("%r14");
SIZE	("poly1305_init",".-poly1305_init");
}
123 |
|
---|
################
# static void poly1305_blocks(void *ctx, const unsigned char *inp,
#                             size_t len, u32 padbit)
#
# Scalar (base 2^64) code path.  Returns immediately when $len is zero;
# otherwise processes $len>>4 sixteen-byte blocks, adding $padbit to the
# top limb for every block.  The hash is read from and written back to
# 0/8/16($ctx); the key is read from 32/40($ctx).
#
# .Lpoly1305_blocks_entry is a secondary entry point: it expects
# %r6-%r14 to be already saved at 6*SIZE_T($sp) and the hash to be
# already loaded into $h0,$h1,$h2.
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));

GLOBL	("poly1305_blocks");
TYPE	("poly1305_blocks","\@function");
ALIGN	(16);
LABEL	("poly1305_blocks");
LABEL	(".Lpoly1305_blocks");
	# "lt*" must be called through a reference: a bare "lt" would be
	# parsed as Perl's string comparison operator
&{$z?	\&ltgr:\&ltr}	("%r0",$len);
	jz	(".Lno_data");

&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");

	lg	($h0,"0($ctx)");	# load hash value
	lg	($h1,"8($ctx)");
	lg	($h2,"16($ctx)");

LABEL	(".Lpoly1305_blocks_entry");
if ($z) {
	srlg	($len,$len,4);		# number of 16-byte blocks
} else {
	srl	($len,4);
}
	llgfr	($padbit,$padbit);	# clear upper half, much needed with
					# non-64-bit ABI
	lg	($r0,"32($ctx)");	# load key
	lg	($r1,"40($ctx)");

	# $s1 is the same register as $ctx (%r2), hence the spill
&{$z?	\&stg:\&st}	($ctx,"2*$SIZE_T($sp)");	# off-load $ctx
	srlg	($s1,$r1,2);
	algr	($s1,$r1);		# s1 = r1 + r1>>2
	j	(".Loop");

ALIGN	(16);
LABEL	(".Loop");
	lrvg	($d0lo,"0($inp)");	# load little-endian input
	lrvg	($d1lo,"8($inp)");
	la	($inp,"16($inp)");

	algr	($d0lo,$h0);		# accumulate input
	alcgr	($d1lo,$h1);
	alcgr	($h2,$padbit);

	lgr	($h0,$d0lo);
	mlgr	($d0hi,$r0);		# h0*r0   -> $d0hi:$d0lo
	lgr	($h1,$d1lo);
	mlgr	($d1hi,$s1);		# h1*5*r1 -> $d1hi:$d1lo

	mlgr	($t0,$r1);		# h0*r1   -> $t0:$h0
	mlgr	($t1,$r0);		# h1*r0   -> $t1:$h1

	algr	($d0lo,$d1lo);
	lgr	($d1lo,$h2);
	alcgr	($d0hi,$d1hi);
	lghi	($d1hi,0);

	algr	($h1,$h0);
	alcgr	($t1,$t0);

	msgr	($d1lo,$s1);		# h2*s1
	msgr	($h2,$r0);		# h2*r0

	algr	($h1,$d1lo);
	alcgr	($t1,$d1hi);		# $d1hi is zero

	algr	($h1,$d0hi);
	alcgr	($h2,$t1);

	lghi	($h0,-4);		# final reduction step
	ngr	($h0,$h2);
	srlg	($t0,$h2,2);
	algr	($h0,$t0);
	lghi	($t1,3);
	ngr	($h2,$t1);

	algr	($h0,$d0lo);
	alcgr	($h1,$d1hi);		# $d1hi is still zero
	alcgr	($h2,$d1hi);		# $d1hi is still zero

&{$z?	\&brctg:\&brct}	($len,".Loop");

&{$z?	\&lg:\&l}	($ctx,"2*$SIZE_T($sp)");# restore $ctx

	stg	($h0,"0($ctx)");	# store hash value
	stg	($h1,"8($ctx)");
	stg	($h2,"16($ctx)");

&{$z?	\&lmg:\&lm}	("%r6","%r14","6*$SIZE_T($sp)");
LABEL	(".Lno_data");
	br	("%r14");
SIZE	("poly1305_blocks",".-poly1305_blocks");
}
220 |
|
---|
################
# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
#                                size_t len, u32 padbit)
#
# Vector (base 2^26) code path.  This scope emits three routines that
# share the register maps declared below:
#
#   poly1305_blocks_vx    entry point.  Inputs of 128 bytes or more go
#                         to __poly1305_blocks_vx; shorter inputs have
#                         the hash converted back to base 2^64 (when
#                         is_base2_26 is set) and are handed to the
#                         scalar loop at .Lpoly1305_blocks_entry.
#   __poly1305_mul        H0..H4 *= R0..R4 followed by lazy reduction;
#                         called with brasl, returns through %r14.
#   __poly1305_blocks_vx  the vector loop proper, 64 bytes per pass.
#
# Context layout as used here: hash at 0($ctx), is_base2_26 flag at
# 24($ctx), base 2^64 key at 32($ctx), base 2^26 key powers (R0..S4,
# nine vectors) at 48($ctx).
{
my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4));
my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9));
my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14));
my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18));
my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23));
my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27));
my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31));

# $h0,$h1,$h2 land in %r11,%r13,%r14, the registers in which the scalar
# loop expects the hash at .Lpoly1305_blocks_entry
my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14));

TYPE	("poly1305_blocks_vx","\@function");
ALIGN	(16);
LABEL	("poly1305_blocks_vx");
LABEL	(".Lpoly1305_blocks_vx");
&{$z?	\&clgfi:\&clfi} ($len,128);
	jhe	("__poly1305_blocks_vx");

	# short input: fall back to the scalar code
&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");

	lg	($d0,"0($ctx)");
	lg	($d1,"8($ctx)");
	lg	($d2,"16($ctx)");

	llgfr	("%r0",$d0);		# base 2^26 -> base 2^64
	srlg	($h0,$d0,32);
	llgfr	("%r1",$d1);
	srlg	($h1,$d1,32);
	srlg	($h2,$d2,32);

	sllg	("%r0","%r0",26);
	algr	($h0,"%r0");
	sllg	("%r0",$h1,52);
	srlg	($h1,$h1,12);
	sllg	("%r1","%r1",14);
	algr	($h0,"%r0");
	alcgr	($h1,"%r1");
	sllg	("%r0",$h2,40);
	srlg	($h2,$h2,24);
	lghi	("%r1",0);
	algr	($h1,"%r0");
	alcgr	($h2,"%r1");

	llgf	("%r0","24($ctx)");	# is_base2_26
	lcgr	("%r0","%r0");		# 0 -> 0, 1 -> all-ones mask

	# branch-free select: h = ((converted ^ d) & mask) ^ d, i.e. the
	# converted value if is_base2_26 was set, else the value as loaded
	xgr	($h0,$d0);		# choose between radixes
	xgr	($h1,$d1);
	xgr	($h2,$d2);
	ngr	($h0,"%r0");
	ngr	($h1,"%r0");
	ngr	($h2,"%r0");
	xgr	($h0,$d0);
	xgr	($h1,$d1);
	xgr	($h2,$d2);

	lhi	("%r0",0);
	st	("%r0","24($ctx)");	# clear is_base2_26

	j	(".Lpoly1305_blocks_entry");
SIZE	("poly1305_blocks_vx",".-poly1305_blocks_vx");

# __poly1305_mul: ACC0..ACC4 = H0..H4 * R0..R4 with S1..S4 (= 5*R1..5*R4)
# supplying the wrapped-around terms, then lazy reduction back into
# H0..H4.  Uses $mask26; overwrites ACC0..ACC4 and H0..H4.
TYPE	("__poly1305_mul","\@function");
ALIGN	(16);
LABEL	("__poly1305_mul");
	vmlof	($ACC0,$H0,$R0);
	vmlof	($ACC1,$H0,$R1);
	vmlof	($ACC2,$H0,$R2);
	vmlof	($ACC3,$H0,$R3);
	vmlof	($ACC4,$H0,$R4);

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# lazy reduction

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);	# h3 -> h4
	vag	($H1,$H1,$ACC1);	# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);	# h1 -> h2

	veslg	($ACC4,$ACC4,2);	# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);	# h4 -> h0
	vag	($H3,$H3,$ACC2);	# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);	# h0 -> h1
	vag	($H4,$H4,$ACC3);	# h3 -> h4
	br	("%r14");
SIZE	("__poly1305_mul",".-__poly1305_mul");

# __poly1305_blocks_vx: reached from poly1305_blocks_vx for len >= 128.
# Allocates its own frame, saving %r10-%r15 plus %f4/%f6 (31-bit ABI)
# or %f8-%f15 (64-bit ABI), and releases it at .Ldone.
TYPE	("__poly1305_blocks_vx","\@function");
ALIGN	(16);
LABEL	("__poly1305_blocks_vx");
&{$z?	\&lgr:\&lr}	("%r0",$sp);
&{$z?	\&stmg:\&stm}	("%r10","%r15","10*$SIZE_T($sp)");
if (!$z) {
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
	ahi	($sp,-$stdframe);
	st	("%r0","0($sp)");	# back-chain

	llgfr	($len,$len);		# so that srlg works on $len
} else {
	aghi	($sp,"-($stdframe+8*8)");
	stg	("%r0","0($sp)");	# back-chain

	std	("%f8","$stdframe+0*8($sp)");
	std	("%f9","$stdframe+1*8($sp)");
	std	("%f10","$stdframe+2*8($sp)");
	std	("%f11","$stdframe+3*8($sp)");
	std	("%f12","$stdframe+4*8($sp)");
	std	("%f13","$stdframe+5*8($sp)");
	std	("%f14","$stdframe+6*8($sp)");
	std	("%f15","$stdframe+7*8($sp)");
}
	# .Lconst is defined elsewhere in the file; %r1 keeps pointing at
	# it for the whole routine
	larl	("%r1",".Lconst");
	vgmg	($mask26,38,63);
	vlm	($bswaplo,$bswapmi,"16(%r1)");

	# "lt" must be called as &lt: a bare "lt" would be parsed as
	# Perl's string comparison operator
	&lt	("%r0","24($ctx)");	# is_base2_26?
	jnz	(".Lskip_init");

	################################################################
	# first vector call for this context: compute the key powers

	lg	($h0,"32($ctx)");	# load key base 2^64
	lg	($h1,"40($ctx)");

	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
	srlg	($d1,$h0,52);
	risbg	($h0,$h0,38,0x80+63,0);
	vlvgg	($R0,$h0,0);
	risbg	($d1,$h1,38,51,12);
	vlvgg	($R1,$d0,0);
	risbg	($d0,$h1,38,63,50);
	vlvgg	($R2,$d1,0);
	srlg	($d1,$h1,40);
	vlvgg	($R3,$d0,0);
	vlvgg	($R4,$d1,0);

	veslg	($S1,$R1,2);
	veslg	($S2,$R2,2);
	veslg	($S3,$R3,2);
	veslg	($S4,$R4,2);
	vlr	($H0,$R0);
	vlr	($H1,$R1);
	vlr	($H2,$R2);
	vlr	($H3,$R3);
	vlr	($H4,$R4);
	vag	($S1,$S1,$R1);			# * 5
	vag	($S2,$S2,$R2);
	vag	($S3,$S3,$R3);
	vag	($S4,$S4,$R4);

	brasl	("%r14","__poly1305_mul");	# r^1:- * r^1:-

	vpdi	($R0,$H0,$R0,0);		# r^2:r^1
	vpdi	($R1,$H1,$R1,0);
	vpdi	($R2,$H2,$R2,0);
	vpdi	($R3,$H3,$R3,0);
	vpdi	($R4,$H4,$R4,0);
	vpdi	($H0,$H0,$H0,0);		# r^2:r^2
	vpdi	($H1,$H1,$H1,0);
	vpdi	($H2,$H2,$H2,0);
	vpdi	($H3,$H3,$H3,0);
	vpdi	($H4,$H4,$H4,0);
	veslg	($S1,$R1,2);
	veslg	($S2,$R2,2);
	veslg	($S3,$R3,2);
	veslg	($S4,$R4,2);
	vag	($S1,$S1,$R1);			# * 5
	vag	($S2,$S2,$R2);
	vag	($S3,$S3,$R3);
	vag	($S4,$S4,$R4);

	brasl	("%r14","__poly1305_mul");	# r^2:r^2 * r^2:r^1

	vl	($I0,"0(%r1)");			# borrow $I0
	vperm	($R0,$R0,$H0,$I0);		# r^2:r^4:r^1:r^3
	vperm	($R1,$R1,$H1,$I0);
	vperm	($R2,$R2,$H2,$I0);
	vperm	($R3,$R3,$H3,$I0);
	vperm	($R4,$R4,$H4,$I0);
	veslf	($S1,$R1,2);
	veslf	($S2,$R2,2);
	veslf	($S3,$R3,2);
	veslf	($S4,$R4,2);
	vaf	($S1,$S1,$R1);			# * 5
	vaf	($S2,$S2,$R2);
	vaf	($S3,$S3,$R3);
	vaf	($S4,$S4,$R4);

	lg	($h0,"0($ctx)");		# load hash base 2^64
	lg	($h1,"8($ctx)");
	lg	($h2,"16($ctx)");

	vzero	($H0);
	vzero	($H1);
	vzero	($H2);
	vzero	($H3);
	vzero	($H4);

	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
	srlg	($d1,$h0,52);
	risbg	($h0,$h0,38,0x80+63,0);
	vlvgg	($H0,$h0,0);
	risbg	($d1,$h1,38,51,12);
	vlvgg	($H1,$d0,0);
	risbg	($d0,$h1,38,63,50);
	vlvgg	($H2,$d1,0);
	srlg	($d1,$h1,40);
	vlvgg	($H3,$d0,0);
	risbg	($d1,$h2,37,39,24);
	vlvgg	($H4,$d1,0);

	lhi	("%r0",1);
	st	("%r0","24($ctx)");		# set is_base2_26

	vstm	($R0,$S4,"48($ctx)");		# save key schedule base 2^26

	vpdi	($R0,$R0,$R0,0);		# broadcast r^2:r^4
	vpdi	($R1,$R1,$R1,0);
	vpdi	($S1,$S1,$S1,0);
	vpdi	($R2,$R2,$R2,0);
	vpdi	($S2,$S2,$S2,0);
	vpdi	($R3,$R3,$R3,0);
	vpdi	($S3,$S3,$S3,0);
	vpdi	($R4,$R4,$R4,0);
	vpdi	($S4,$S4,$S4,0);

	j	(".Loaded_hash");

ALIGN	(16);
LABEL	(".Lskip_init");
	# key powers already saved in the context by an earlier call
	vllezf	($H0,"0($ctx)");		# load hash base 2^26
	vllezf	($H1,"4($ctx)");
	vllezf	($H2,"8($ctx)");
	vllezf	($H3,"12($ctx)");
	vllezf	($H4,"16($ctx)");

	vlrepg	($R0,"0x30($ctx)");		# broadcast r^2:r^4
	vlrepg	($R1,"0x40($ctx)");
	vlrepg	($S1,"0x50($ctx)");
	vlrepg	($R2,"0x60($ctx)");
	vlrepg	($S2,"0x70($ctx)");
	vlrepg	($R3,"0x80($ctx)");
	vlrepg	($S3,"0x90($ctx)");
	vlrepg	($R4,"0xa0($ctx)");
	vlrepg	($S4,"0xb0($ctx)");

LABEL	(".Loaded_hash");
	vzero	($I1);
	vzero	($I3);

	vlm	($T1,$T4,"0x00($inp)");		# load first input block
	la	($inp,"0x40($inp)");
	vgmg	($mask26,6,31);
	vgmf	($I4,5,5);			# padbit<<2

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	verimg	($I1,$I0,$mask26,6);		# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);			# >>4
	verimg	($I3,$T3,$mask26,18);		# >>14
	verimg	($I4,$T3,$mask26,58);		# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);			# >>2

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);		# >>26
	verimg	($I2,$T2,$mask26,60);		# >>4
	verimg	($I3,$T4,$mask26,50);		# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	# loop count: one 64-byte block is already loaded, the last one
	# is handled at .Last
	srlg	("%r0",$len,6);
&{$z?	\&aghi:\&ahi}	("%r0",-1);

ALIGN	(16);
LABEL	(".Loop_vx");
	# multiplication of the current input/hash is interleaved with
	# loading and radix conversion of the next 64-byte block
	vmlef	($ACC0,$I0,$R0);
	vmlef	($ACC1,$I0,$R1);
	vmlef	($ACC2,$I0,$R2);
	vmlef	($ACC3,$I0,$R3);
	vmlef	($ACC4,$I0,$R4);

	vmalef	($ACC0,$I1,$S4,$ACC0);
	vmalef	($ACC1,$I1,$R0,$ACC1);
	vmalef	($ACC2,$I1,$R1,$ACC2);
	vmalef	($ACC3,$I1,$R2,$ACC3);
	vmalef	($ACC4,$I1,$R3,$ACC4);

	vaf	($H2,$H2,$I2);
	vaf	($H0,$H0,$I0);
	vaf	($H3,$H3,$I3);
	vaf	($H1,$H1,$I1);
	vaf	($H4,$H4,$I4);

	vmalef	($ACC0,$I2,$S3,$ACC0);
	vmalef	($ACC1,$I2,$S4,$ACC1);
	vmalef	($ACC2,$I2,$R0,$ACC2);
	vmalef	($ACC3,$I2,$R1,$ACC3);
	vmalef	($ACC4,$I2,$R2,$ACC4);

	vlm	($T1,$T4,"0x00($inp)");		# load next input block
	la	($inp,"0x40($inp)");
	vgmg	($mask26,6,31);

	vmalef	($ACC0,$I3,$S2,$ACC0);
	vmalef	($ACC1,$I3,$S3,$ACC1);
	vmalef	($ACC2,$I3,$S4,$ACC2);
	vmalef	($ACC3,$I3,$R0,$ACC3);
	vmalef	($ACC4,$I3,$R1,$ACC4);

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	vmalef	($ACC0,$I4,$S1,$ACC0);
	vmalef	($ACC1,$I4,$S2,$ACC1);
	vmalef	($ACC2,$I4,$S3,$ACC2);
	vmalef	($ACC3,$I4,$S4,$ACC3);
	vmalef	($ACC4,$I4,$R0,$ACC4);

	verimg	($I1,$I0,$mask26,6);		# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);			# >>4
	verimg	($I3,$T3,$mask26,18);		# >>14

	vmalof	($ACC0,$H0,$R0,$ACC0);
	vmalof	($ACC1,$H0,$R1,$ACC1);
	vmalof	($ACC2,$H0,$R2,$ACC2);
	vmalof	($ACC3,$H0,$R3,$ACC3);
	vmalof	($ACC4,$H0,$R4,$ACC4);

	vgmf	($I4,5,5);			# padbit<<2
	verimg	($I4,$T3,$mask26,58);		# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);			# >>2

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);		# >>26
	verimg	($I2,$T2,$mask26,60);		# >>4

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	verimg	($I3,$T4,$mask26,50);		# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);		# h3 -> h4
	vag	($H1,$H1,$ACC1);		# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);		# h1 -> h2

	veslg	($ACC4,$ACC4,2);		# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);		# h4 -> h0
	vag	($H3,$H3,$ACC2);		# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);		# h0 -> h1
	vag	($H4,$H4,$ACC3);		# h3 -> h4

&{$z?	\&brctg:\&brct}	("%r0",".Loop_vx");

	vlm	($R0,$S4,"48($ctx)");		# load all powers

	# $len = (-$len) & 0x30; $inp is moved back by that amount so the
	# final 64-byte load below ends exactly at the end of the input
	lghi	("%r0",0x30);
&{$z?	\&lcgr:\&lcr}	($len,$len);
&{$z?	\&ngr:\&nr}	($len,"%r0");
&{$z?	\&slgr:\&slr}	($inp,$len);

LABEL	(".Last");
	vmlef	($ACC0,$I0,$R0);
	vmlef	($ACC1,$I0,$R1);
	vmlef	($ACC2,$I0,$R2);
	vmlef	($ACC3,$I0,$R3);
	vmlef	($ACC4,$I0,$R4);

	vmalef	($ACC0,$I1,$S4,$ACC0);
	vmalef	($ACC1,$I1,$R0,$ACC1);
	vmalef	($ACC2,$I1,$R1,$ACC2);
	vmalef	($ACC3,$I1,$R2,$ACC3);
	vmalef	($ACC4,$I1,$R3,$ACC4);

	vaf	($H0,$H0,$I0);
	vaf	($H1,$H1,$I1);
	vaf	($H2,$H2,$I2);
	vaf	($H3,$H3,$I3);
	vaf	($H4,$H4,$I4);

	vmalef	($ACC0,$I2,$S3,$ACC0);
	vmalef	($ACC1,$I2,$S4,$ACC1);
	vmalef	($ACC2,$I2,$R0,$ACC2);
	vmalef	($ACC3,$I2,$R1,$ACC3);
	vmalef	($ACC4,$I2,$R2,$ACC4);

	vmalef	($ACC0,$I3,$S2,$ACC0);
	vmalef	($ACC1,$I3,$S3,$ACC1);
	vmalef	($ACC2,$I3,$S4,$ACC2);
	vmalef	($ACC3,$I3,$R0,$ACC3);
	vmalef	($ACC4,$I3,$R1,$ACC4);

	vmalef	($ACC0,$I4,$S1,$ACC0);
	vmalef	($ACC1,$I4,$S2,$ACC1);
	vmalef	($ACC2,$I4,$S3,$ACC2);
	vmalef	($ACC3,$I4,$S4,$ACC3);
	vmalef	($ACC4,$I4,$R0,$ACC4);

	vmalof	($ACC0,$H0,$R0,$ACC0);
	vmalof	($ACC1,$H0,$R1,$ACC1);
	vmalof	($ACC2,$H0,$R2,$ACC2);
	vmalof	($ACC3,$H0,$R3,$ACC3);
	vmalof	($ACC4,$H0,$R4,$ACC4);

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# horizontal addition

	vzero	($H0);
	vsumqg	($ACC0,$ACC0,$H0);
	vsumqg	($ACC1,$ACC1,$H0);
	vsumqg	($ACC2,$ACC2,$H0);
	vsumqg	($ACC3,$ACC3,$H0);
	vsumqg	($ACC4,$ACC4,$H0);

	################################################################
	# lazy reduction

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);		# h3 -> h4
	vag	($H1,$H1,$ACC1);		# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);		# h1 -> h2

	veslg	($ACC4,$ACC4,2);		# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);		# h4 -> h0
	vag	($H3,$H3,$ACC2);		# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);		# h0 -> h1
	vag	($H4,$H4,$ACC3);		# h3 -> h4

	# $len is non-zero only on the first pass through .Last, and only
	# if the input length is not a multiple of 64
&{$z?	\&clgfi:\&clfi} ($len,0);
	je	(".Ldone");

	vlm	($T1,$T4,"0x00($inp)");		# load last partial block
	vgmg	($mask26,6,31);
	vgmf	($I4,5,5);			# padbit<<2

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	# permutation and lane mask are picked from .Lconst by $len
	vl	($ACC0,"0x30($len,%r1)");	# borrow $ACC0,1
	vl	($ACC1,"0x60($len,%r1)");

	verimg	($I1,$I0,$mask26,6);		# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);			# >>4
	verimg	($I3,$T3,$mask26,18);		# >>14
	verimg	($I4,$T3,$mask26,58);		# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);			# >>2

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);		# >>26
	verimg	($I2,$T2,$mask26,60);		# >>4
	verimg	($I3,$T4,$mask26,50);		# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	vperm	($H0,$H0,$H0,$ACC0);		# move hash to right lane
	vn	($I0,$I0,$ACC1);		# mask redundant lane[s]
	vperm	($H1,$H1,$H1,$ACC0);
	vn	($I1,$I1,$ACC1);
	vperm	($H2,$H2,$H2,$ACC0);
	vn	($I2,$I2,$ACC1);
	vperm	($H3,$H3,$H3,$ACC0);
	vn	($I3,$I3,$ACC1);
	vperm	($H4,$H4,$H4,$ACC0);
	vn	($I4,$I4,$ACC1);

	vaf	($I0,$I0,$H0);			# accumulate hash
	vzero	($H0);				# wipe hash value
	vaf	($I1,$I1,$H1);
	vzero	($H1);
	vaf	($I2,$I2,$H2);
	vzero	($H2);
	vaf	($I3,$I3,$H3);
	vzero	($H3);
	vaf	($I4,$I4,$H4);
	vzero	($H4);

&{$z?	\&lghi:\&lhi}	($len,0);		# second pass ends at .Ldone
	j	(".Last");
	# I don't bother to tell apart cases when only one multiplication
	# pass is sufficient, because I argue that mispredicted branch
	# penalties are comparable to overhead of sometimes redundant
	# multiplication pass...

LABEL	(".Ldone");
	vstef	($H0,"0($ctx)",3);		# store hash base 2^26
	vstef	($H1,"4($ctx)",3);
	vstef	($H2,"8($ctx)",3);
	vstef	($H3,"12($ctx)",3);
	vstef	($H4,"16($ctx)",3);

	# restore callee-saved registers; reloading %r15 drops the frame
if ($z) {
	ld	("%f8","$stdframe+0*8($sp)");
	ld	("%f9","$stdframe+1*8($sp)");
	ld	("%f10","$stdframe+2*8($sp)");
	ld	("%f11","$stdframe+3*8($sp)");
	ld	("%f12","$stdframe+4*8($sp)");
	ld	("%f13","$stdframe+5*8($sp)");
	ld	("%f14","$stdframe+6*8($sp)");
	ld	("%f15","$stdframe+7*8($sp)");
&{$z?	\&lmg:\&lm}	("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)");
} else {
	ld	("%f4","$stdframe+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$stdframe+16*$SIZE_T+3*8($sp)");
&{$z?	\&lmg:\&lm}	("%r10","%r15","$stdframe+10*$SIZE_T($sp)");
}
	br	("%r14");
SIZE	("__poly1305_blocks_vx",".-__poly1305_blocks_vx");
}
881 |
|
---|
882 | ################
|
---|
883 | # static void poly1305_emit(void *ctx, unsigned char mac[16],
|
---|
884 | # const u32 nonce[4])
|
---|
885 | { # Emits poly1305_emit: fully reduce the hash held in ctx, add the 128-bit nonce, store the 16-byte result at mac.
|
---|
886 | my ($mac,$nonce)=($inp,$len); # mac/nonce live in the registers the rest of the file names $inp/$len
|
---|
887 | my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10)); # %r5..%r10; %r0 and %r1 are used as scratch below
|
---|
888 |
|
---|
889 | GLOBL ("poly1305_emit");
|
---|
890 | TYPE ("poly1305_emit","\@function");
|
---|
891 | ALIGN (16);
|
---|
892 | LABEL ("poly1305_emit");
|
---|
893 | LABEL (".Lpoly1305_emit"); # second label at the same address
|
---|
894 | &{$z? \&stmg:\&stm} ("%r6","%r10","6*$SIZE_T($sp)"); # save %r6-%r10; reloaded from the same slot before returning
|
---|
895 | # Load the first 24 bytes of the context as three 64-bit words; how they are interpreted depends on is_base2_26.
|
---|
896 | lg ($d0,"0($ctx)");
|
---|
897 | lg ($d1,"8($ctx)");
|
---|
898 | lg ($d2,"16($ctx)");
|
---|
899 |
|
---|
900 | llgfr ("%r0",$d0); # base 2^26 -> base 2^64; %r0 = low 32 bits of $d0 (limb 1, stored at offset 4)
|
---|
901 | srlg ($h0,$d0,32); # $h0 = high 32 bits of $d0 (limb 0, offset 0)
|
---|
902 | llgfr ("%r1",$d1); # %r1 = limb 3 (offset 12)
|
---|
903 | srlg ($h1,$d1,32); # $h1 = limb 2 (offset 8)
|
---|
904 | srlg ($h2,$d2,32); # $h2 = limb 4 (offset 16); the low half of $d2 is discarded
|
---|
905 | # Recombine h = limb0 + limb1<<26 + limb2<<52 + limb3<<78 + limb4<<104 into the 64-bit words $h0,$h1,$h2.
|
---|
906 | sllg ("%r0","%r0",26); # limb1<<26
|
---|
907 | algr ($h0,"%r0"); # $h0 = limb0 + limb1<<26
|
---|
908 | sllg ("%r0",$h1,52); # low 12 bits of limb 2 go to bits 52..63 of word 0
|
---|
909 | srlg ($h1,$h1,12); # remaining bits of limb 2 start word 1
|
---|
910 | sllg ("%r1","%r1",14); # limb3<<(78-64)
|
---|
911 | algr ($h0,"%r0"); # add limb 2's low part; sets the carry consumed by the next alcgr
|
---|
912 | alcgr ($h1,"%r1"); # $h1 = limb2>>12 + limb3<<14 + carry
|
---|
913 | sllg ("%r0",$h2,40); # limb4<<(104-64)
|
---|
914 | srlg ($h2,$h2,24); # bits of limb 4 at position 128 and above
|
---|
915 | lghi ("%r1",0); # %r1 = 0, stays zero until the slgr below and serves as carry-only addend
|
---|
916 | algr ($h1,"%r0");
|
---|
917 | alcgr ($h2,"%r1"); # $h2 += carry out of $h1
|
---|
918 | # Branch-free select between the converted value ($h*) and the raw words ($d*, presumably already base 2^64).
|
---|
919 | llgf ("%r0","24($ctx)"); # is_base2_26 (32-bit flag, zero-extended)
|
---|
920 | lcgr ("%r0","%r0"); # negate: all-ones mask if the flag is 1, zero if it is 0
|
---|
921 |
|
---|
922 | xgr ($h0,$d0); # choose between radixes: $h = (($h ^ $d) & mask) ^ $d
|
---|
923 | xgr ($h1,$d1);
|
---|
924 | xgr ($h2,$d2);
|
---|
925 | ngr ($h0,"%r0");
|
---|
926 | ngr ($h1,"%r0");
|
---|
927 | ngr ($h2,"%r0");
|
---|
928 | xgr ($h0,$d0);
|
---|
929 | xgr ($h1,$d1);
|
---|
930 | xgr ($h2,$d2);
|
---|
931 | # Final reduction: compute h+5 and check whether the sum reaches 2^130, i.e. whether h >= 2^130-5.
|
---|
932 | lghi ("%r0",5);
|
---|
933 | lgr ($d0,$h0); # keep the unmodified low 128 bits of h in $d0:$d1
|
---|
934 | lgr ($d1,$h1);
|
---|
935 |
|
---|
936 | algr ($h0,"%r0"); # compare to modulus
|
---|
937 | alcgr ($h1,"%r1"); # %r1 is still 0 here: carry propagation only
|
---|
938 | alcgr ($h2,"%r1");
|
---|
939 |
|
---|
940 | srlg ($h2,$h2,2); # did it borrow/carry? (non-zero when h+5 has bit 130 or above set)
|
---|
941 | slgr ("%r1",$h2); # 0-$h2>>2: all-ones mask if h+5 reached 2^130, else 0 (assumes $h2>>2 is 0 or 1)
|
---|
942 | lg ($d2,"0($nonce)"); # load nonce
|
---|
943 | lg ($ctx,"8($nonce)"); # $ctx is not used as a pointer after this; its register holds the second nonce doubleword
|
---|
944 | # Select $h0:$h1 = mask ? low 128 bits of h+5 : original h, again without branching.
|
---|
945 | xgr ($h0,$d0);
|
---|
946 | xgr ($h1,$d1);
|
---|
947 | ngr ($h0,"%r1");
|
---|
948 | ngr ($h1,"%r1");
|
---|
949 | xgr ($h0,$d0);
|
---|
950 | rllg ($d0,$d2,32); # flip nonce words (rotate by 32 swaps the two 32-bit halves)
|
---|
951 | xgr ($h1,$d1);
|
---|
952 | rllg ($d1,$ctx,32);
|
---|
953 |
|
---|
954 | algr ($h0,$d0); # accumulate nonce (128-bit add; carry out of $h1 is not used)
|
---|
955 | alcgr ($h1,$d1);
|
---|
956 |
|
---|
957 | strvg ($h0,"0($mac)"); # write little-endian result
|
---|
958 | strvg ($h1,"8($mac)");
|
---|
959 |
|
---|
960 | &{$z? \&lmg:\&lm} ("%r6","%r10","6*$SIZE_T($sp)"); # restore %r6-%r10 saved on entry
|
---|
961 | br ("%r14"); # return to caller
|
---|
962 | SIZE ("poly1305_emit",".-poly1305_emit");
|
---|
963 | }
|
---|
964 |
|
---|
965 | ################
|
---|
966 | # Constant pool .Lconst, 16-byte aligned. Each LONG row lists four 32-bit values (one 16-byte vector, assuming LONG emits 4-byte words).
|
---|
967 | ALIGN (16);
|
---|
968 | LABEL (".Lconst");
|
---|
969 | LONG (0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f); # merge odd -- NOTE(review): looks like a byte selector taking words 1 and 3 of two sources; confirm at the use site
|
---|
970 | LONG (0x07060504,0x03020100,0x17161514,0x13121110); # byte swap masks
|
---|
971 | LONG (0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918);
|
---|
972 | LONG (0x00000000,0x09080706,0x00000000,0x19181716);
|
---|
973 |
|
---|
974 | LONG (0x00000000,0x00000000,0x00000000,0x0c0d0e0f); # magic tail masks
|
---|
975 | LONG (0x0c0d0e0f,0x00000000,0x00000000,0x00000000);
|
---|
976 | LONG (0x00000000,0x00000000,0x0c0d0e0f,0x00000000);
|
---|
977 |
|
---|
978 | LONG (0xffffffff,0x00000000,0xffffffff,0xffffffff); # NOTE(review): presumably AND masks clearing unused 32-bit lanes (cf. "mask redundant lane[s]"); confirm where .Lconst is loaded
|
---|
979 | LONG (0xffffffff,0x00000000,0xffffffff,0x00000000);
|
---|
980 | LONG (0x00000000,0x00000000,0xffffffff,0x00000000);
|
---|
981 |
|
---|
982 | STRING ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\""); # identification string placed in the generated output
|
---|
983 |
|
---|
984 | PERLASM_END(); # NOTE(review): defined outside this chunk; presumably finalizes and writes out the generated assembly
|
---|