VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/aes/asm/aesv8-armx.pl@ 94081

Last change on this file since 94081 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

  • Property svn:executable set to *
File size: 21.8 KB
 
1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. As does it support both 32- and 64-bit modes
20# of operation. Latter is achieved by limiting amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# Performance in cycles per byte processed with 128-bit key:
31#
32# CBC enc CBC dec CTR
33# Apple A7 2.39 1.20 1.20
34# Cortex-A53 1.32 1.29 1.46
35# Cortex-A57(*) 1.95 0.85 0.93
36# Denver 1.96 0.86 0.80
37# Mongoose 1.33 1.20 1.20
38# Kryo 1.26 0.94 1.00
39#
40# (*) original 3.64/1.34/1.32 results were for r0p0 revision
41# and are still same even for updated module;
42
# Command line: the assembly "flavour" (e.g. linux64, linux32, ios64)
# and the output file name, both forwarded to the arm-xlate.pl wrapper.
$flavour = shift;
$output = shift;

# Locate the perlasm transliteration helper relative to this script,
# trying the local directory first, then the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe all generated code through arm-xlate.pl; quote $output so paths
# containing spaces survive the shell, and fail loudly if the pipe
# cannot be started (an unchecked open here would silently emit nothing).
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Symbol prefix for every exported function (aes_v8_set_encrypt_key, ...).
$prefix="aes_v8";
# Common file prologue.  arm_arch.h supplies __ARM_MAX_ARCH__; the
# whole module is preprocessed away on targets below ARMv7.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# 64-bit flavours can request the crypto extension by name ...
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
# ... while 32-bit builds stay on plain armv7-a/NEON and later emit the
# AES instructions as raw bytes (see the 32-bit unaes encoder below).
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu neon
.code 32
#undef __thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex vodoo.
#
{{{
# Register aliases for the key-schedule routines, written in AArch64
# names (the 32-bit flavour gets them rewritten by the regex pass at
# the bottom of the file).  In 32-bit mode the NEON temporaries skip
# q4-q7, whose d8-d15 halves are callee-saved (the bulk routines below
# save d8-d15 "as ABI specification says").
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table (.Lrcon) plus the key-expansion entry point.
# .Lenc_key is a second label on the same code so that
# ${prefix}_set_decrypt_key below can reach it with a plain bl.
# Return value convention (see .Ldone/.Lenc_key_abort): 0 on success,
# -1 for a NULL input/output pointer, -2 for an unsupported bit count.
$code.=<<___;
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b

.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# 64-bit only: minimal stack frame.
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
# Validate arguments (non-NULL pointers, bits in {128,192,256}), then
# dispatch to the per-key-size expansion loop.  $bits is reused as the
# iteration counter once its value has been classified.
$code.=<<___;
 mov $ptr,#-1
 cmp $inp,#0
 b.eq .Lenc_key_abort
 cmp $out,#0
 b.eq .Lenc_key_abort
 mov $ptr,#-2
 cmp $bits,#128
 b.lt .Lenc_key_abort
 cmp $bits,#256
 b.gt .Lenc_key_abort
 tst $bits,#0x3f
 b.ne .Lenc_key_abort

 adr $ptr,.Lrcon
 cmp $bits,#192

 veor $zero,$zero,$zero
 vld1.8 {$in0},[$inp],#16
 mov $bits,#8 // reuse $bits
 vld1.32 {$rcon,$mask},[$ptr],#32

 b.lt .Loop128
 b.eq .L192
 b .L256

.align 4
.Loop128:
 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key
 b.ne .Loop128

 vld1.32 {$rcon},[$ptr]

 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key

 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 veor $in0,$in0,$key
 vst1.32 {$in0},[$out]
 add $out,$out,#0x50

 mov $rounds,#10
 b .Ldone

.align 4
.L192:
 vld1.8 {$in1},[$inp],#8
 vmov.i8 $key,#8 // borrow $key
 vst1.32 {$in0},[$out],#16
 vsub.i8 $mask,$mask,$key // adjust the mask

.Loop192:
 vtbl.8 $key,{$in1},$mask
 vext.8 $tmp,$zero,$in0,#12
#ifdef __ARMEB__
 vst1.32 {$in1},[$out],#16
 sub $out,$out,#8
#else
 vst1.32 {$in1},[$out],#8
#endif
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp

 vdup.32 $tmp,${in0}[3]
 veor $tmp,$tmp,$in1
 veor $key,$key,$rcon
 vext.8 $in1,$zero,$in1,#12
 vshl.u8 $rcon,$rcon,#1
 veor $in1,$in1,$tmp
 veor $in0,$in0,$key
 veor $in1,$in1,$key
 vst1.32 {$in0},[$out],#16
 b.ne .Loop192

 mov $rounds,#12
 add $out,$out,#0x20
 b .Ldone

.align 4
.L256:
 vld1.8 {$in1},[$inp]
 mov $bits,#7
 mov $rounds,#14
 vst1.32 {$in0},[$out],#16

.Loop256:
 vtbl.8 $key,{$in1},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in1},[$out],#16
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key
 vst1.32 {$in0},[$out],#16
 b.eq .Ldone

 vdup.32 $key,${in0}[3] // just splat
 vext.8 $tmp,$zero,$in1,#12
 aese $key,$zero

 veor $in1,$in1,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in1,$in1,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in1,$in1,$tmp

 veor $in1,$in1,$key
 b .Loop256

.Ldone:
 str $rounds,[$out]
 mov $ptr,#0

.Lenc_key_abort:
 mov x0,$ptr // return value
 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
 ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
___
# 64-bit only: sign the return address (paciasp) and set up a frame;
# needed here because this function makes a call (bl .Lenc_key).
$code.=<<___ if ($flavour =~ /64/);
 .inst 0xd503233f // paciasp
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 stmdb sp!,{r4,lr}
___
# set_decrypt_key = run the encrypt expansion in place, then walk the
# schedule from both ends, swapping round keys and applying aesimc
# (InvMixColumns) so the result suits the equivalent-inverse-cipher
# decryption used by the bulk routines.
$code.=<<___;
 bl .Lenc_key

 cmp x0,#0
 b.ne .Ldec_key_abort

 sub $out,$out,#240 // restore original $out
 mov x4,#-16
 add $inp,$out,x12,lsl#4 // end of key schedule

 vld1.32 {v0.16b},[$out]
 vld1.32 {v1.16b},[$inp]
 vst1.32 {v0.16b},[$inp],x4
 vst1.32 {v1.16b},[$out],#16

.Loop_imc:
 vld1.32 {v0.16b},[$out]
 vld1.32 {v1.16b},[$inp]
 aesimc v0.16b,v0.16b
 aesimc v1.16b,v1.16b
 vst1.32 {v0.16b},[$inp],x4
 vst1.32 {v1.16b},[$out],#16
 cmp $inp,$out
 b.hi .Loop_imc

 vld1.32 {v0.16b},[$out]
 aesimc v0.16b,v0.16b
 vst1.32 {v0.16b},[$inp]

 eor x0,x0,x0 // return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
 ldmia sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldp x29,x30,[sp],#16
 .inst 0xd50323bf // autiasp
 ret
___
$code.=<<___;
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit one single-block routine.  $dir is "en" or "de"; it selects the
# aese/aesmc vs aesd/aesimc mnemonics and the exported symbol name
# (${prefix}_encrypt / ${prefix}_decrypt).  Register usage: x0 = input
# block, x1 = output block, x2 = key schedule (rounds at offset 240).
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

# The loop consumes two round keys per iteration; the final two rounds
# are peeled off after it (last round has no MixColumns step, only the
# closing veor with the final round key).
$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
 ldr $rounds,[$key,#240]
 vld1.32 {$rndkey0},[$key],#16
 vld1.8 {$inout},[$inp]
 sub $rounds,$rounds,#2
 vld1.32 {$rndkey1},[$key],#16

.Loop_${dir}c:
 aes$e $inout,$rndkey0
 aes$mc $inout,$inout
 vld1.32 {$rndkey0},[$key],#16
 subs $rounds,$rounds,#2
 aes$e $inout,$rndkey1
 aes$mc $inout,$inout
 vld1.32 {$rndkey1},[$key],#16
 b.gt .Loop_${dir}c

 aes$e $inout,$rndkey0
 aes$mc $inout,$inout
 vld1.32 {$rndkey0},[$key]
 aes$e $inout,$rndkey1
 veor $inout,$inout,$rndkey0

 vst1.8 {$inout},[$out]
 ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
# CBC bulk routine: ${prefix}_cbc_encrypt(inp=x0, out=x1, len=x2,
# key=x3, ivp=x4, enc=w5).  On 32-bit, the last two arguments arrive
# on the stack and are loaded below ("load remaining args").
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r8,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldmia ip,{r4-r5} @ load remaining args
___
# Common setup: bail out for len<16, preload the whole key schedule
# into q8-q15/$rndlast, then branch to either the encrypt paths
# (sequential, with a dedicated 128-bit variant) or the 3x-interleaved
# decrypt path.  $step becomes 0 on the last block so the final load
# does not run past the input buffer.
$code.=<<___;
 subs $len,$len,#16
 mov $step,#16
 b.lo .Lcbc_abort
 cclr $step,eq

 cmp $enc,#0 // en- or decrypting?
 ldr $rounds,[$key,#240]
 and $len,$len,#-16
 vld1.8 {$ivec},[$ivp]
 vld1.8 {$dat},[$inp],$step

 vld1.32 {q8-q9},[$key] // load key schedule...
 sub $rounds,$rounds,#6
 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q10-q11},[$key_],#32
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]

 add $key_,$key,#32
 mov $cnt,$rounds
 b.eq .Lcbc_dec

 cmp $rounds,#2
 veor $dat,$dat,$ivec
 veor $rndzero_n_last,q8,$rndlast
 b.eq .Lcbc_enc128

 vld1.32 {$in0-$in1},[$key_]
 add $key_,$key,#16
 add $key4,$key,#16*4
 add $key5,$key,#16*5
 aese $dat,q8
 aesmc $dat,$dat
 add $key6,$key,#16*6
 add $key7,$key,#16*7
 b .Lenter_cbc_enc

.align 4
.Loop_cbc_enc:
 aese $dat,q8
 aesmc $dat,$dat
 vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc:
 aese $dat,q9
 aesmc $dat,$dat
 aese $dat,$in0
 aesmc $dat,$dat
 vld1.32 {q8},[$key4]
 cmp $rounds,#4
 aese $dat,$in1
 aesmc $dat,$dat
 vld1.32 {q9},[$key5]
 b.eq .Lcbc_enc192

 aese $dat,q8
 aesmc $dat,$dat
 vld1.32 {q8},[$key6]
 aese $dat,q9
 aesmc $dat,$dat
 vld1.32 {q9},[$key7]
 nop

.Lcbc_enc192:
 aese $dat,q8
 aesmc $dat,$dat
 subs $len,$len,#16
 aese $dat,q9
 aesmc $dat,$dat
 cclr $step,eq
 aese $dat,q10
 aesmc $dat,$dat
 aese $dat,q11
 aesmc $dat,$dat
 vld1.8 {q8},[$inp],$step
 aese $dat,q12
 aesmc $dat,$dat
 veor q8,q8,$rndzero_n_last
 aese $dat,q13
 aesmc $dat,$dat
 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
 aese $dat,q14
 aesmc $dat,$dat
 aese $dat,q15
 veor $ivec,$dat,$rndlast
 b.hs .Loop_cbc_enc

 vst1.8 {$ivec},[$out],#16
 b .Lcbc_done

.align 5
.Lcbc_enc128:
 vld1.32 {$in0-$in1},[$key_]
 aese $dat,q8
 aesmc $dat,$dat
 b .Lenter_cbc_enc128
.Loop_cbc_enc128:
 aese $dat,q8
 aesmc $dat,$dat
 vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
 aese $dat,q9
 aesmc $dat,$dat
 subs $len,$len,#16
 aese $dat,$in0
 aesmc $dat,$dat
 cclr $step,eq
 aese $dat,$in1
 aesmc $dat,$dat
 aese $dat,q10
 aesmc $dat,$dat
 aese $dat,q11
 aesmc $dat,$dat
 vld1.8 {q8},[$inp],$step
 aese $dat,q12
 aesmc $dat,$dat
 aese $dat,q13
 aesmc $dat,$dat
 aese $dat,q14
 aesmc $dat,$dat
 veor q8,q8,$rndzero_n_last
 aese $dat,q15
 veor $ivec,$dat,$rndlast
 b.hs .Loop_cbc_enc128

 vst1.8 {$ivec},[$out],#16
 b .Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# CBC decryption is parallelizable: process three blocks per iteration
# ($dat0-$dat2), with a tail path for the final one or two blocks.
$code.=<<___;
.align 5
.Lcbc_dec:
 vld1.8 {$dat2},[$inp],#16
 subs $len,$len,#32 // bias
 add $cnt,$rounds,#2
 vorr $in1,$dat,$dat
 vorr $dat1,$dat,$dat
 vorr $in2,$dat2,$dat2
 b.lo .Lcbc_dec_tail

 vorr $dat1,$dat2,$dat2
 vld1.8 {$dat2},[$inp],#16
 vorr $in0,$dat,$dat
 vorr $in1,$dat1,$dat1
 vorr $in2,$dat2,$dat2

.Loop3x_cbc_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_cbc_dec

 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 veor $tmp0,$ivec,$rndlast
 subs $len,$len,#0x30
 veor $tmp1,$in0,$rndlast
 mov.lo x6,$len // x6, $cnt, is zero at this point
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 veor $tmp2,$in1,$rndlast
 add $inp,$inp,x6 // $inp is adjusted in such way that
 // at exit from the loop $dat1-$dat2
 // are loaded with last "words"
 vorr $ivec,$in2,$in2
 mov $key_,$key
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 vld1.8 {$in0},[$inp],#16
 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aesd $dat0,q15
 aesd $dat1,q15
 aesd $dat2,q15
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 add $cnt,$rounds,#2
 veor $tmp0,$tmp0,$dat0
 veor $tmp1,$tmp1,$dat1
 veor $dat2,$dat2,$tmp2
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp0},[$out],#16
 vorr $dat0,$in0,$in0
 vst1.8 {$tmp1},[$out],#16
 vorr $dat1,$in1,$in1
 vst1.8 {$dat2},[$out],#16
 vorr $dat2,$in2,$in2
 b.hs .Loop3x_cbc_dec

 cmn $len,#0x30
 b.eq .Lcbc_done
 nop

.Lcbc_dec_tail:
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Lcbc_dec_tail

 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 cmn $len,#0x20
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 veor $tmp1,$ivec,$rndlast
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 veor $tmp2,$in1,$rndlast
 aesd $dat1,q15
 aesd $dat2,q15
 b.eq .Lcbc_dec_one
 veor $tmp1,$tmp1,$dat1
 veor $tmp2,$tmp2,$dat2
 vorr $ivec,$in2,$in2
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 b .Lcbc_done

.Lcbc_dec_one:
 veor $tmp1,$tmp1,$dat2
 vorr $ivec,$in2,$in2
 vst1.8 {$tmp1},[$out],#16

.Lcbc_done:
 vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
}
# Epilogues: restore callee-saved NEON/integer registers (32-bit) or
# pop the frame (64-bit).
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
 ret
___
$code.=<<___;
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# CTR bulk routine: ${prefix}_ctr32_encrypt_blocks(inp=x0, out=x1,
# len=x2 blocks, key=x3, ivp=x4).  Only the low 32 bits of the counter
# (last word of the IV) are incremented, as the name says.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r10,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldr r4, [ip] @ load remaining arg
___
# Setup: keep the counter word in $ctr in host order (rev on LE), seed
# three counter blocks ($dat0-$dat2) and run the 3x-interleaved main
# loop; one or two leftover blocks go through .Lctr32_tail.
$code.=<<___;
 ldr $rounds,[$key,#240]

 ldr $ctr, [$ivp, #12]
#ifdef __ARMEB__
 vld1.8 {$dat0},[$ivp]
#else
 vld1.32 {$dat0},[$ivp]
#endif
 vld1.32 {q8-q9},[$key] // load key schedule...
 sub $rounds,$rounds,#4
 mov $step,#16
 cmp $len,#2
 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]
 add $key_,$key,#32
 mov $cnt,$rounds
 cclr $step,lo
#ifndef __ARMEB__
 rev $ctr, $ctr
#endif
 add $tctr1, $ctr, #1
 vorr $ivec,$dat0,$dat0
 rev $tctr1, $tctr1
 vmov.32 ${ivec}[3],$tctr1
 add $ctr, $ctr, #2
 vorr $dat1,$ivec,$ivec
 b.ls .Lctr32_tail
 rev $tctr2, $ctr
 vmov.32 ${ivec}[3],$tctr2
 sub $len,$len,#3 // bias
 vorr $dat2,$ivec,$ivec
 b .Loop3x_ctr32

.align 4
.Loop3x_ctr32:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_ctr32

 aese $dat0,q8
 aesmc $tmp0,$dat0
 aese $dat1,q8
 aesmc $tmp1,$dat1
 vld1.8 {$in0},[$inp],#16
 add $tctr0,$ctr,#1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 rev $tctr0,$tctr0
 aese $tmp0,q9
 aesmc $tmp0,$tmp0
 aese $tmp1,q9
 aesmc $tmp1,$tmp1
 vld1.8 {$in2},[$inp],#16
 mov $key_,$key
 aese $dat2,q9
 aesmc $tmp2,$dat2
 aese $tmp0,q12
 aesmc $tmp0,$tmp0
 aese $tmp1,q12
 aesmc $tmp1,$tmp1
 veor $in0,$in0,$rndlast
 add $tctr1,$ctr,#2
 aese $tmp2,q12
 aesmc $tmp2,$tmp2
 veor $in1,$in1,$rndlast
 add $ctr,$ctr,#3
 aese $tmp0,q13
 aesmc $tmp0,$tmp0
 aese $tmp1,q13
 aesmc $tmp1,$tmp1
 veor $in2,$in2,$rndlast
 vmov.32 ${ivec}[3], $tctr0
 aese $tmp2,q13
 aesmc $tmp2,$tmp2
 vorr $dat0,$ivec,$ivec
 rev $tctr1,$tctr1
 aese $tmp0,q14
 aesmc $tmp0,$tmp0
 vmov.32 ${ivec}[3], $tctr1
 rev $tctr2,$ctr
 aese $tmp1,q14
 aesmc $tmp1,$tmp1
 vorr $dat1,$ivec,$ivec
 vmov.32 ${ivec}[3], $tctr2
 aese $tmp2,q14
 aesmc $tmp2,$tmp2
 vorr $dat2,$ivec,$ivec
 subs $len,$len,#3
 aese $tmp0,q15
 aese $tmp1,q15
 aese $tmp2,q15

 veor $in0,$in0,$tmp0
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 vst1.8 {$in0},[$out],#16
 veor $in1,$in1,$tmp1
 mov $cnt,$rounds
 vst1.8 {$in1},[$out],#16
 veor $in2,$in2,$tmp2
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$in2},[$out],#16
 b.hs .Loop3x_ctr32

 adds $len,$len,#3
 b.eq .Lctr32_done
 cmp $len,#1
 mov $step,#16
 cclr $step,eq

.Lctr32_tail:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 vld1.32 {q9},[$key_],#16
 b.gt .Lctr32_tail

 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 vld1.8 {$in0},[$inp],$step
 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat1,q12
 aesmc $dat1,$dat1
 vld1.8 {$in1},[$inp]
 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 veor $in0,$in0,$rndlast
 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat1,q14
 aesmc $dat1,$dat1
 veor $in1,$in1,$rndlast
 aese $dat0,q15
 aese $dat1,q15

 cmp $len,#1
 veor $in0,$in0,$dat0
 veor $in1,$in1,$dat1
 vst1.8 {$in0},[$out],#16
 b.eq .Lctr32_done
 vst1.8 {$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
 ret
___
$code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Close the __ARM_MAX_ARCH__>=7 guard opened in the file prologue.
$code.=<<___;
#endif
___
########################################
# Post-processing: the common code above is written in a mixed
# 32-/64-bit dialect; the branch below transliterates it into the
# requested flavour, line by line, before printing to the output pipe.
if ($flavour =~ /64/) { ######## 64-bit code
 my %opcode = (
 "aesd" => 0x4e285800, "aese" => 0x4e284800,
 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );

 # Fallback encoder emitting AES instructions as raw .inst words for
 # assemblers without crypto support; currently unused (the
 # substitution that would invoke it is commented out below).
 local *unaes = sub {
 my ($mnemonic,$arg)=@_;

 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
 sprintf ".inst\t0x%08x\t//%s %s",
 $opcode{$mnemonic}|$1|($2<<5),
 $mnemonic,$arg;
 };

 foreach(split("\n",$code)) {
 s/\`([^\`]*)\`/eval($1)/geo;

 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
 s/@\s/\/\//o; # old->new style commentary

 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
 s/vmov\.i8/movi/o or # fix up legacy mnemonics
 s/vext\.8/ext/o or
 s/vrev32\.8/rev32/o or
 s/vtst\.8/cmtst/o or
 s/vshr/ushr/o or
 s/^(\s+)v/$1/o or # strip off v prefix
 s/\bbx\s+lr\b/ret/o;

 # fix up remaining legacy suffixes
 s/\.[ui]?8//o;
 m/\],#8/o and s/\.16b/\.8b/go;
 s/\.[ui]?32//o and s/\.16b/\.4s/go;
 s/\.[ui]?64//o and s/\.16b/\.2d/go;
 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

 print $_,"\n";
 }
} else { ######## 32-bit code
 my %opcode = (
 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );

 # Encode an AES instruction as raw .byte data so that pre-crypto
 # binutils can still assemble the module.
 local *unaes = sub {
 my ($mnemonic,$arg)=@_;

 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
 |(($2&7)<<1) |(($2&8)<<2);
 # since ARMv7 instructions are always encoded little-endian.
 # correct solution is to use .inst directive, but older
 # assemblers don't implement it:-(
 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 $word&0xff,($word>>8)&0xff,
 ($word>>16)&0xff,($word>>24)&0xff,
 $mnemonic,$arg;
 }
 };

973 sub unvtbl {
974 my $arg=shift;
975
976 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
977 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
978 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
979 }
980
981 sub unvdup32 {
982 my $arg=shift;
983
984 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
985 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
986 }
987
988 sub unvmov32 {
989 my $arg=shift;
990
991 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
992 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
993 }
994
 # Transliterate the common code into 32-bit syntax and print it.
 # Order matters: addressing-mode and suffix fixups must run before
 # the mnemonic substitutions that consume the rewritten operands.
 foreach(split("\n",$code)) {
 s/\`([^\`]*)\`/eval($1)/geo;

 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
 s/\/\/\s?/@ /o; # new->old style commentary

 # fix up remaining new-style suffixes
 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
 s/\],#[0-9]+/]!/o;

 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
 s/^(\s+)b\./$1b/o or
 s/^(\s+)mov\./$1mov/o or
 s/^(\s+)ret/$1bx\tlr/o;

 print $_,"\n";
 }
}

# Flush and close the pipe to arm-xlate.pl; a failure here means the
# transliterator itself failed, so propagate it as a fatal error.
close STDOUT or die "error closing STDOUT: $!";
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette