VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.7/crypto/aes/asm/aesv8-armx.pl@ 99507

最後變更 在這個檔案從99507是 97372,由 vboxsync 提交於 2 年 前

libs: Switch to openssl-3.0.7, bugref:10317

  • 屬性 svn:executable 設為 *
檔案大小: 80.7 KB
 
1#! /usr/bin/env perl
2# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. As does it support both 32- and 64-bit modes
20# of operation. Latter is achieved by limiting amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# April 2019
31#
32# Key to performance of parallelize-able modes is round instruction
33# interleaving. But which factor to use? There is optimal one for
34# each combination of instruction latency and issue rate, beyond
35# which increasing interleave factor doesn't pay off. While on cons
36# side we have code size increase and resource waste on platforms for
37# which interleave factor is too high. In other words you want it to
38# be just right. So far interleave factor of 3x was serving well all
39# platforms. But for ThunderX2 optimal interleave factor was measured
40# to be 5x...
41#
42# Performance in cycles per byte processed with 128-bit key:
43#
44# CBC enc CBC dec CTR
45# Apple A7 2.39 1.20 1.20
46# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48# Cortex-A72 1.33 0.85/0.88 0.92/0.96
49# Denver 1.96 0.65/0.86 0.76/0.80
50# Mongoose 1.33 1.23/1.20 1.30/1.20
51# Kryo 1.26 0.87/0.94 1.00/1.00
52# ThunderX2 5.95 1.25 1.30
53#
54# (*) original 3.64/1.34/1.32 results were for r0p0 revision
55# and are still same even for updated module;
56# (**) numbers after slash are for 32-bit code, which is 3x-
57# interleaved;
58
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm ARM translator next to this script first, then in the
# canonical ../../perlasm location of the OpenSSL tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything this script prints through arm-xlate.pl, which turns the
# mixed 32/64-bit mnemonics below into real assembler for $flavour.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Public symbols are emitted as aes_v8_* (e.g. aes_v8_set_encrypt_key).
$prefix="aes_v8";

# Windows ARMASM spells the byte directive DCB; GNU as uses .byte.
$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
76
# Common file header: the whole module is compiled out unless the target
# supports ARMv7+ NEON (arm_arch.h provides __ARM_MAX_ARCH__).
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
# 64-bit flavour: just enable the crypto extension for the assembler.
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
# 32-bit flavour: claim armv7-a (older binutils choke on armv8) and define
# INST() so AES instructions can be emitted as raw bytes; the Thumb-2
# encoding differs from the ARM one, hence the two variants.
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,d|0xc,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,d
#endif

.text
___
97
98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100# maintain both 32- and 64-bit codes within single module and
101# transliterate common code to either flavour with regex voodoo.
102#
{{{
# Key-schedule generators: aes_v8_set_encrypt_key / aes_v8_set_decrypt_key.
# Register map: $inp/$bits/$out are the C arguments (userKey, bits, key),
# $ptr/$rounds are scratch.  The vector registers are chosen so that the
# same names work in both flavours; in 32-bit mode q4-q7 (d8-d15) are
# callee-saved, hence the q8-q10 substitutes.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
    $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# .Lrcon holds the splatted round constant 0x01, a byte-permutation mask
# implementing SubWord+RotWord via vtbl, and the 0x1b constant used once
# the rcon sequence overflows GF(2^8) (for AES-128 rounds 9/10).
$code.=<<___;
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b

.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
# Argument validation returns -1 for NULL pointers and -2 for a bit count
# outside {128,192,256}.  The key expansion itself leans on the aese
# instruction with an all-zero round key to get SubBytes; vtbl with $mask
# performs the rotate-and-splat of the previous word.
$code.=<<___;
 mov $ptr,#-1
 cmp $inp,#0
 b.eq .Lenc_key_abort
 cmp $out,#0
 b.eq .Lenc_key_abort
 mov $ptr,#-2
 cmp $bits,#128
 b.lt .Lenc_key_abort
 cmp $bits,#256
 b.gt .Lenc_key_abort
 tst $bits,#0x3f
 b.ne .Lenc_key_abort

 adr $ptr,.Lrcon
 cmp $bits,#192

 veor $zero,$zero,$zero
 vld1.8 {$in0},[$inp],#16
 mov $bits,#8  // reuse $bits
 vld1.32 {$rcon,$mask},[$ptr],#32

 b.lt .Loop128
 b.eq .L192
 b .L256

.align 4
.Loop128:
 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key
 b.ne .Loop128

 vld1.32 {$rcon},[$ptr]

 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key

 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 veor $in0,$in0,$key
 vst1.32 {$in0},[$out]
 add $out,$out,#0x50

 mov $rounds,#10
 b .Ldone

.align 4
.L192:
 vld1.8 {$in1},[$inp],#8
 vmov.i8 $key,#8   // borrow $key
 vst1.32 {$in0},[$out],#16
 vsub.i8 $mask,$mask,$key // adjust the mask

.Loop192:
 vtbl.8 $key,{$in1},$mask
 vext.8 $tmp,$zero,$in0,#12
#ifdef __ARMEB__
 vst1.32 {$in1},[$out],#16
 sub $out,$out,#8
#else
 vst1.32 {$in1},[$out],#8
#endif
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp

 vdup.32 $tmp,${in0}[3]
 veor $tmp,$tmp,$in1
 veor $key,$key,$rcon
 vext.8 $in1,$zero,$in1,#12
 vshl.u8 $rcon,$rcon,#1
 veor $in1,$in1,$tmp
 veor $in0,$in0,$key
 veor $in1,$in1,$key
 vst1.32 {$in0},[$out],#16
 b.ne .Loop192

 mov $rounds,#12
 add $out,$out,#0x20
 b .Ldone

.align 4
.L256:
 vld1.8 {$in1},[$inp]
 mov $bits,#7
 mov $rounds,#14
 vst1.32 {$in0},[$out],#16

.Loop256:
 vtbl.8 $key,{$in1},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in1},[$out],#16
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key
 vst1.32 {$in0},[$out],#16
 b.eq .Ldone

 vdup.32 $key,${in0}[3]  // just splat
 vext.8 $tmp,$zero,$in1,#12
 aese $key,$zero

 veor $in1,$in1,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in1,$in1,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in1,$in1,$tmp

 veor $in1,$in1,$key
 b .Loop256

.Ldone:
 str $rounds,[$out]
 mov $ptr,#0

.Lenc_key_abort:
 mov x0,$ptr   // return value
 `"ldr x29,[sp],#16"  if ($flavour =~ /64/)`
 ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
 .inst 0xd503233f  // paciasp
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 stmdb sp!,{r4,lr}
___
# Decrypt schedule = encrypt schedule reversed, with aesimc (InvMixColumns)
# applied to every round key except the first and last.  The loop swaps
# entries from both ends ($out walks up, $inp walks down via x4=-16) until
# the pointers cross, then fixes up the middle entry.
$code.=<<___;
 bl .Lenc_key

 cmp x0,#0
 b.ne .Ldec_key_abort

 sub $out,$out,#240  // restore original $out
 mov x4,#-16
 add $inp,$out,x12,lsl#4 // end of key schedule

 vld1.32 {v0.16b},[$out]
 vld1.32 {v1.16b},[$inp]
 vst1.32 {v0.16b},[$inp],x4
 vst1.32 {v1.16b},[$out],#16

.Loop_imc:
 vld1.32 {v0.16b},[$out]
 vld1.32 {v1.16b},[$inp]
 aesimc v0.16b,v0.16b
 aesimc v1.16b,v1.16b
 vst1.32 {v0.16b},[$inp],x4
 vst1.32 {v1.16b},[$out],#16
 cmp $inp,$out
 b.hi .Loop_imc

 vld1.32 {v0.16b},[$out]
 aesimc v0.16b,v0.16b
 vst1.32 {v0.16b},[$inp]

 eor x0,x0,x0   // return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
 ldmia sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldp x29,x30,[sp],#16
 .inst 0xd50323bf  // autiasp
 ret
___
$code.=<<___;
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit a single-block ECB primitive: aes_v8_encrypt or aes_v8_decrypt.
# $dir is "en" or "de"; it selects the aese/aesmc vs aesd/aesimc mnemonic
# pair, everything else is shared.  The loop peels rounds two at a time
# ($rounds-2 iterations' worth), with the final two rounds done outside
# the loop because the last round has no MixColumns and folds in the last
# round key with a plain veor.
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
 ldr $rounds,[$key,#240]
 vld1.32 {$rndkey0},[$key],#16
 vld1.8 {$inout},[$inp]
 sub $rounds,$rounds,#2
 vld1.32 {$rndkey1},[$key],#16

.Loop_${dir}c:
 aes$e $inout,$rndkey0
 aes$mc $inout,$inout
 vld1.32 {$rndkey0},[$key],#16
 subs $rounds,$rounds,#2
 aes$e $inout,$rndkey1
 aes$mc $inout,$inout
 vld1.32 {$rndkey1},[$key],#16
 b.gt .Loop_${dir}c

 aes$e $inout,$rndkey0
 aes$mc $inout,$inout
 vld1.32 {$rndkey0},[$key]
 aes$e $inout,$rndkey1
 veor $inout,$inout,$rndkey0

 vst1.8 {$inout},[$out]
 ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
392
393# Performance in cycles per byte.
394# Processed with AES-ECB different key size.
395# It shows the value before and after optimization as below:
396# (before/after):
397#
398# AES-128-ECB AES-192-ECB AES-256-ECB
399# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
400# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
401
402# Optimization is implemented by loop unrolling and interleaving.
403# Commonly, we choose the unrolling factor as 5, if the input
404# data size smaller than 5 blocks, but not smaller than 3 blocks,
405# choose 3 as the unrolling factor.
406# If the input data size dsize >= 5*16 bytes, then take 5 blocks
407# as one iteration, every loop the left size lsize -= 5*16.
408# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
409# every loop lsize -=3*16.
410# If lsize < 3*16 bytes, treat them as the tail, interleave the
411# two blocks AES instructions.
412# There is one special case, if the original input data size dsize
413# = 16 bytes, we will treat it separately to improve the
414# performance: one independent code block without LR, FP load and
415# store, just looks like what the original ECB implementation does.
416
{{{
# ECB generator: aes_v8_ecb_encrypt(inp, out, len, key, enc).
# Strategy (see the comment block above): a dedicated 16-byte fast path
# (64-bit only), then a 5x-interleaved main loop (64-bit only), a 3x loop,
# and a 1/2-block tail, for both encrypt and decrypt directions.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7 last round key
### q10-q15 q7 Last 7 round keys
### q8-q9 preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# The 4th/5th interleave lanes exist only in 64-bit mode, where q16-q23
# are available; in 32-bit mode only the 3x path is emitted.
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
___
# 64-bit-only single-block fast path: no frame is set up, keys are loaded
# on the fly, and a short loop covers the extra rounds of AES-192/256.
$code.=<<___ if ($flavour =~ /64/);
 subs $len,$len,#16
 // Original input data size bigger than 16, jump to big size processing.
 b.ne .Lecb_big_size
 vld1.8 {$dat0},[$inp]
 cmp $enc,#0   // en- or decrypting?
 ldr $rounds,[$key,#240]
 vld1.32 {q5-q6},[$key],#32  // load key schedule...

 b.eq .Lecb_small_dec
 aese $dat0,q5
 aesmc $dat0,$dat0
 vld1.32 {q8-q9},[$key],#32  // load key schedule...
 aese $dat0,q6
 aesmc $dat0,$dat0
 subs $rounds,$rounds,#10  // if rounds==10, jump to aes-128-ecb processing
 b.eq .Lecb_128_enc
.Lecb_round_loop:
 aese $dat0,q8
 aesmc $dat0,$dat0
 vld1.32 {q8},[$key],#16  // load key schedule...
 aese $dat0,q9
 aesmc $dat0,$dat0
 vld1.32 {q9},[$key],#16  // load key schedule...
 subs $rounds,$rounds,#2  // bias
 b.gt .Lecb_round_loop
.Lecb_128_enc:
 vld1.32 {q10-q11},[$key],#32 // load key schedule...
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat0,q9
 aesmc $dat0,$dat0
 vld1.32 {q12-q13},[$key],#32 // load key schedule...
 aese $dat0,q10
 aesmc $dat0,$dat0
 aese $dat0,q11
 aesmc $dat0,$dat0
 vld1.32 {q14-q15},[$key],#32 // load key schedule...
 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat0,q13
 aesmc $dat0,$dat0
 vld1.32 {$rndlast},[$key]
 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat0,q15
 veor $dat0,$dat0,$rndlast
 vst1.8 {$dat0},[$out]
 b .Lecb_Final_abort
.Lecb_small_dec:
 aesd $dat0,q5
 aesimc $dat0,$dat0
 vld1.32 {q8-q9},[$key],#32  // load key schedule...
 aesd $dat0,q6
 aesimc $dat0,$dat0
 subs $rounds,$rounds,#10  // bias
 b.eq .Lecb_128_dec
.Lecb_dec_round_loop:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 vld1.32 {q8},[$key],#16  // load key schedule...
 aesd $dat0,q9
 aesimc $dat0,$dat0
 vld1.32 {q9},[$key],#16  // load key schedule...
 subs $rounds,$rounds,#2  // bias
 b.gt .Lecb_dec_round_loop
.Lecb_128_dec:
 vld1.32 {q10-q11},[$key],#32 // load key schedule...
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat0,q9
 aesimc $dat0,$dat0
 vld1.32 {q12-q13},[$key],#32 // load key schedule...
 aesd $dat0,q10
 aesimc $dat0,$dat0
 aesd $dat0,q11
 aesimc $dat0,$dat0
 vld1.32 {q14-q15},[$key],#32 // load key schedule...
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat0,q13
 aesimc $dat0,$dat0
 vld1.32 {$rndlast},[$key]
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat0,q15
 veor $dat0,$dat0,$rndlast
 vst1.8 {$dat0},[$out]
 b .Lecb_Final_abort
.Lecb_big_size:
___
# Multi-block entry: set up a frame (64-bit: frame record only; 32-bit:
# save r4-r8 plus d8-d15 as the AAPCS requires) and pick up the 5th C
# argument in 32-bit mode from the caller's stack.
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r8,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldmia ip,{r4-r5} @ load remaining args
 subs $len,$len,#16
___
# Preload the last 7 round keys into q10-q15/$rndlast once; q8-q9 are
# streamed through the remaining keys inside the loops.  $step is zeroed
# when exactly one block remains so the final load doesn't walk past $inp.
$code.=<<___;
 mov $step,#16
 b.lo .Lecb_done
 cclr $step,eq

 cmp $enc,#0   // en- or decrypting?
 ldr $rounds,[$key,#240]
 and $len,$len,#-16
 vld1.8 {$dat},[$inp],$step

 vld1.32 {q8-q9},[$key]  // load key schedule...
 sub $rounds,$rounds,#6
 add $key_,$key,x5,lsl#4  // pointer to last 7 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q10-q11},[$key_],#32
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]

 add $key_,$key,#32
 mov $cnt,$rounds
 b.eq .Lecb_dec

 vld1.8 {$dat1},[$inp],#16
 subs $len,$len,#32  // bias
 add $cnt,$rounds,#2
 vorr $in1,$dat1,$dat1
 vorr $dat2,$dat1,$dat1
 vorr $dat1,$dat,$dat
 b.lo .Lecb_enc_tail

 vorr $dat1,$in1,$in1
 vld1.8 {$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved encrypt main loop.  x6 is used both as the
# residual-length fixup for $inp at loop exit and as the "last iteration"
# flag tested by cbz.
$code.=<<___ if ($flavour =~ /64/);
 cmp $len,#32
 b.lo .Loop3x_ecb_enc

 vld1.8 {$dat3},[$inp],#16
 vld1.8 {$dat4},[$inp],#16
 sub $len,$len,#32  // bias
 mov $cnt,$rounds

.Loop5x_ecb_enc:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat3,q8
 aesmc $dat3,$dat3
 aese $dat4,q8
 aesmc $dat4,$dat4
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat3,q9
 aesmc $dat3,$dat3
 aese $dat4,q9
 aesmc $dat4,$dat4
 vld1.32 {q9},[$key_],#16
 b.gt .Loop5x_ecb_enc

 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat3,q8
 aesmc $dat3,$dat3
 aese $dat4,q8
 aesmc $dat4,$dat4
 cmp $len,#0x40   // because .Lecb_enc_tail4x
 sub $len,$len,#0x50

 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat3,q9
 aesmc $dat3,$dat3
 aese $dat4,q9
 aesmc $dat4,$dat4
 csel x6,xzr,$len,gt  // borrow x6, $cnt, "gt" is not typo
 mov $key_,$key

 aese $dat0,q10
 aesmc $dat0,$dat0
 aese $dat1,q10
 aesmc $dat1,$dat1
 aese $dat2,q10
 aesmc $dat2,$dat2
 aese $dat3,q10
 aesmc $dat3,$dat3
 aese $dat4,q10
 aesmc $dat4,$dat4
 add $inp,$inp,x6  // $inp is adjusted in such way that
     // at exit from the loop $dat1-$dat4
     // are loaded with last "words"
 add x6,$len,#0x60  // because .Lecb_enc_tail4x

 aese $dat0,q11
 aesmc $dat0,$dat0
 aese $dat1,q11
 aesmc $dat1,$dat1
 aese $dat2,q11
 aesmc $dat2,$dat2
 aese $dat3,q11
 aesmc $dat3,$dat3
 aese $dat4,q11
 aesmc $dat4,$dat4

 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat1,q12
 aesmc $dat1,$dat1
 aese $dat2,q12
 aesmc $dat2,$dat2
 aese $dat3,q12
 aesmc $dat3,$dat3
 aese $dat4,q12
 aesmc $dat4,$dat4

 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 aese $dat2,q13
 aesmc $dat2,$dat2
 aese $dat3,q13
 aesmc $dat3,$dat3
 aese $dat4,q13
 aesmc $dat4,$dat4

 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat1,q14
 aesmc $dat1,$dat1
 aese $dat2,q14
 aesmc $dat2,$dat2
 aese $dat3,q14
 aesmc $dat3,$dat3
 aese $dat4,q14
 aesmc $dat4,$dat4

 aese $dat0,q15
 vld1.8 {$in0},[$inp],#16
 aese $dat1,q15
 vld1.8 {$in1},[$inp],#16
 aese $dat2,q15
 vld1.8 {$in2},[$inp],#16
 aese $dat3,q15
 vld1.8 {$in3},[$inp],#16
 aese $dat4,q15
 vld1.8 {$in4},[$inp],#16
 cbz x6,.Lecb_enc_tail4x
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 veor $tmp0,$rndlast,$dat0
 vorr $dat0,$in0,$in0
 veor $tmp1,$rndlast,$dat1
 vorr $dat1,$in1,$in1
 veor $tmp2,$rndlast,$dat2
 vorr $dat2,$in2,$in2
 veor $tmp3,$rndlast,$dat3
 vorr $dat3,$in3,$in3
 veor $tmp4,$rndlast,$dat4
 vst1.8 {$tmp0},[$out],#16
 vorr $dat4,$in4,$in4
 vst1.8 {$tmp1},[$out],#16
 mov $cnt,$rounds
 vst1.8 {$tmp2},[$out],#16
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16
 b.hs .Loop5x_ecb_enc

 add $len,$len,#0x50
 cbz $len,.Lecb_done

 add $cnt,$rounds,#2
 subs $len,$len,#0x30
 vorr $dat0,$in2,$in2
 vorr $dat1,$in3,$in3
 vorr $dat2,$in4,$in4
 b.lo .Lecb_enc_tail

 b .Loop3x_ecb_enc

.align 4
.Lecb_enc_tail4x:
 veor $tmp1,$rndlast,$dat1
 veor $tmp2,$rndlast,$dat2
 veor $tmp3,$rndlast,$dat3
 veor $tmp4,$rndlast,$dat4
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16

 b .Lecb_done
.align 4
___
# 3x-interleaved encrypt loop (both flavours) plus the 1/2-block tail.
$code.=<<___;
.Loop3x_ecb_enc:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_ecb_enc

 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 subs $len,$len,#0x30
 mov.lo x6,$len   // x6, $cnt, is zero at this point
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 add $inp,$inp,x6  // $inp is adjusted in such way that
     // at exit from the loop $dat1-$dat2
     // are loaded with last "words"
 mov $key_,$key
 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat1,q12
 aesmc $dat1,$dat1
 aese $dat2,q12
 aesmc $dat2,$dat2
 vld1.8 {$in0},[$inp],#16
 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 aese $dat2,q13
 aesmc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat1,q14
 aesmc $dat1,$dat1
 aese $dat2,q14
 aesmc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aese $dat0,q15
 aese $dat1,q15
 aese $dat2,q15
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 add $cnt,$rounds,#2
 veor $tmp0,$rndlast,$dat0
 veor $tmp1,$rndlast,$dat1
 veor $dat2,$dat2,$rndlast
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp0},[$out],#16
 vorr $dat0,$in0,$in0
 vst1.8 {$tmp1},[$out],#16
 vorr $dat1,$in1,$in1
 vst1.8 {$dat2},[$out],#16
 vorr $dat2,$in2,$in2
 b.hs .Loop3x_ecb_enc

 cmn $len,#0x30
 b.eq .Lecb_done
 nop

.Lecb_enc_tail:
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Lecb_enc_tail

 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat1,q12
 aesmc $dat1,$dat1
 aese $dat2,q12
 aesmc $dat2,$dat2
 cmn $len,#0x20
 aese $dat1,q13
 aesmc $dat1,$dat1
 aese $dat2,q13
 aesmc $dat2,$dat2
 aese $dat1,q14
 aesmc $dat1,$dat1
 aese $dat2,q14
 aesmc $dat2,$dat2
 aese $dat1,q15
 aese $dat2,q15
 b.eq .Lecb_enc_one
 veor $tmp1,$rndlast,$dat1
 veor $tmp2,$rndlast,$dat2
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 b .Lecb_done

.Lecb_enc_one:
 veor $tmp1,$rndlast,$dat2
 vst1.8 {$tmp1},[$out],#16
 b .Lecb_done
___

# Decrypt side: same structure as the encrypt side (5x loop is 64-bit
# only, then 3x loop and tail) with aesd/aesimc substituted.
$code.=<<___;
.align 5
.Lecb_dec:
 vld1.8 {$dat1},[$inp],#16
 subs $len,$len,#32  // bias
 add $cnt,$rounds,#2
 vorr $in1,$dat1,$dat1
 vorr $dat2,$dat1,$dat1
 vorr $dat1,$dat,$dat
 b.lo .Lecb_dec_tail

 vorr $dat1,$in1,$in1
 vld1.8 {$dat2},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
 cmp $len,#32
 b.lo .Loop3x_ecb_dec

 vld1.8 {$dat3},[$inp],#16
 vld1.8 {$dat4},[$inp],#16
 sub $len,$len,#32  // bias
 mov $cnt,$rounds

.Loop5x_ecb_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat3,q8
 aesimc $dat3,$dat3
 aesd $dat4,q8
 aesimc $dat4,$dat4
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat3,q9
 aesimc $dat3,$dat3
 aesd $dat4,q9
 aesimc $dat4,$dat4
 vld1.32 {q9},[$key_],#16
 b.gt .Loop5x_ecb_dec

 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat3,q8
 aesimc $dat3,$dat3
 aesd $dat4,q8
 aesimc $dat4,$dat4
 cmp $len,#0x40   // because .Lecb_tail4x
 sub $len,$len,#0x50

 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat3,q9
 aesimc $dat3,$dat3
 aesd $dat4,q9
 aesimc $dat4,$dat4
 csel x6,xzr,$len,gt  // borrow x6, $cnt, "gt" is not typo
 mov $key_,$key

 aesd $dat0,q10
 aesimc $dat0,$dat0
 aesd $dat1,q10
 aesimc $dat1,$dat1
 aesd $dat2,q10
 aesimc $dat2,$dat2
 aesd $dat3,q10
 aesimc $dat3,$dat3
 aesd $dat4,q10
 aesimc $dat4,$dat4
 add $inp,$inp,x6  // $inp is adjusted in such way that
     // at exit from the loop $dat1-$dat4
     // are loaded with last "words"
 add x6,$len,#0x60  // because .Lecb_tail4x

 aesd $dat0,q11
 aesimc $dat0,$dat0
 aesd $dat1,q11
 aesimc $dat1,$dat1
 aesd $dat2,q11
 aesimc $dat2,$dat2
 aesd $dat3,q11
 aesimc $dat3,$dat3
 aesd $dat4,q11
 aesimc $dat4,$dat4

 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 aesd $dat3,q12
 aesimc $dat3,$dat3
 aesd $dat4,q12
 aesimc $dat4,$dat4

 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 aesd $dat3,q13
 aesimc $dat3,$dat3
 aesd $dat4,q13
 aesimc $dat4,$dat4

 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 aesd $dat3,q14
 aesimc $dat3,$dat3
 aesd $dat4,q14
 aesimc $dat4,$dat4

 aesd $dat0,q15
 vld1.8 {$in0},[$inp],#16
 aesd $dat1,q15
 vld1.8 {$in1},[$inp],#16
 aesd $dat2,q15
 vld1.8 {$in2},[$inp],#16
 aesd $dat3,q15
 vld1.8 {$in3},[$inp],#16
 aesd $dat4,q15
 vld1.8 {$in4},[$inp],#16
 cbz x6,.Lecb_tail4x
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 veor $tmp0,$rndlast,$dat0
 vorr $dat0,$in0,$in0
 veor $tmp1,$rndlast,$dat1
 vorr $dat1,$in1,$in1
 veor $tmp2,$rndlast,$dat2
 vorr $dat2,$in2,$in2
 veor $tmp3,$rndlast,$dat3
 vorr $dat3,$in3,$in3
 veor $tmp4,$rndlast,$dat4
 vst1.8 {$tmp0},[$out],#16
 vorr $dat4,$in4,$in4
 vst1.8 {$tmp1},[$out],#16
 mov $cnt,$rounds
 vst1.8 {$tmp2},[$out],#16
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16
 b.hs .Loop5x_ecb_dec

 add $len,$len,#0x50
 cbz $len,.Lecb_done

 add $cnt,$rounds,#2
 subs $len,$len,#0x30
 vorr $dat0,$in2,$in2
 vorr $dat1,$in3,$in3
 vorr $dat2,$in4,$in4
 b.lo .Lecb_dec_tail

 b .Loop3x_ecb_dec

.align 4
.Lecb_tail4x:
 veor $tmp1,$rndlast,$dat1
 veor $tmp2,$rndlast,$dat2
 veor $tmp3,$rndlast,$dat3
 veor $tmp4,$rndlast,$dat4
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16

 b .Lecb_done
.align 4
___
$code.=<<___;
.Loop3x_ecb_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_ecb_dec

 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 subs $len,$len,#0x30
 mov.lo x6,$len   // x6, $cnt, is zero at this point
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 add $inp,$inp,x6  // $inp is adjusted in such way that
     // at exit from the loop $dat1-$dat2
     // are loaded with last "words"
 mov $key_,$key
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 vld1.8 {$in0},[$inp],#16
 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aesd $dat0,q15
 aesd $dat1,q15
 aesd $dat2,q15
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 add $cnt,$rounds,#2
 veor $tmp0,$rndlast,$dat0
 veor $tmp1,$rndlast,$dat1
 veor $dat2,$dat2,$rndlast
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp0},[$out],#16
 vorr $dat0,$in0,$in0
 vst1.8 {$tmp1},[$out],#16
 vorr $dat1,$in1,$in1
 vst1.8 {$dat2},[$out],#16
 vorr $dat2,$in2,$in2
 b.hs .Loop3x_ecb_dec

 cmn $len,#0x30
 b.eq .Lecb_done
 nop

.Lecb_dec_tail:
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Lecb_dec_tail

 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 cmn $len,#0x20
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 aesd $dat1,q15
 aesd $dat2,q15
 b.eq .Lecb_dec_one
 veor $tmp1,$rndlast,$dat1
 veor $tmp2,$rndlast,$dat2
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 b .Lecb_done

.Lecb_dec_one:
 veor $tmp1,$rndlast,$dat2
 vst1.8 {$tmp1},[$out],#16

.Lecb_done:
___
}
# Epilogue: restore the 32-bit callee-saved registers, or pop the 64-bit
# frame record; .Lecb_Final_abort is the frameless fast-path exit.
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
___
$code.=<<___ if ($flavour =~ /64/);
.Lecb_Final_abort:
 ret
___
$code.=<<___;
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
1222{{{
# AES-CBC entry point: ${prefix}_cbc_encrypt(inp, out, len, key, ivp, enc).
# x0-x4 carry the first five arguments; w5 is the en-/decrypt selector and
# is later reused as the round counter ($rounds aliases $enc below).
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1226
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1229
### q8-q15 preloaded key schedule
1231
# Emit the function label (shared by both flavours).
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
___
# AArch64 prologue: save frame pointer and link register.
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
# AArch32 prologue: save callee-saved core registers and d8-d15 (per ABI),
# then fetch the 5th/6th arguments from the caller's stack.
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r8,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldmia ip,{r4-r5} @ load remaining args
___
# Common setup (load IV, first block, round keys) plus the encrypt paths:
# a generic path and a dedicated 128-bit key path (.Lcbc_enc128).
$code.=<<___;
 subs $len,$len,#16
 mov $step,#16
 b.lo .Lcbc_abort
 cclr $step,eq
1253
 cmp $enc,#0 // en- or decrypting?
 ldr $rounds,[$key,#240]
 and $len,$len,#-16
 vld1.8 {$ivec},[$ivp]
 vld1.8 {$dat},[$inp],$step
1259
 vld1.32 {q8-q9},[$key] // load key schedule...
 sub $rounds,$rounds,#6
 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q10-q11},[$key_],#32
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]
1268
 add $key_,$key,#32
 mov $cnt,$rounds
 b.eq .Lcbc_dec
1272
 cmp $rounds,#2
 veor $dat,$dat,$ivec
 veor $rndzero_n_last,q8,$rndlast
 b.eq .Lcbc_enc128
1277
 vld1.32 {$in0-$in1},[$key_]
 add $key_,$key,#16
 add $key4,$key,#16*4
 add $key5,$key,#16*5
 aese $dat,q8
 aesmc $dat,$dat
 add $key6,$key,#16*6
 add $key7,$key,#16*7
 b .Lenter_cbc_enc
1287
.align 4
.Loop_cbc_enc:
 aese $dat,q8
 aesmc $dat,$dat
 vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc:
 aese $dat,q9
 aesmc $dat,$dat
 aese $dat,$in0
 aesmc $dat,$dat
 vld1.32 {q8},[$key4]
 cmp $rounds,#4
 aese $dat,$in1
 aesmc $dat,$dat
 vld1.32 {q9},[$key5]
 b.eq .Lcbc_enc192
1304
 aese $dat,q8
 aesmc $dat,$dat
 vld1.32 {q8},[$key6]
 aese $dat,q9
 aesmc $dat,$dat
 vld1.32 {q9},[$key7]
 nop
1312
.Lcbc_enc192:
 aese $dat,q8
 aesmc $dat,$dat
 subs $len,$len,#16
 aese $dat,q9
 aesmc $dat,$dat
 cclr $step,eq
 aese $dat,q10
 aesmc $dat,$dat
 aese $dat,q11
 aesmc $dat,$dat
 vld1.8 {q8},[$inp],$step
 aese $dat,q12
 aesmc $dat,$dat
 veor q8,q8,$rndzero_n_last
 aese $dat,q13
 aesmc $dat,$dat
 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
 aese $dat,q14
 aesmc $dat,$dat
 aese $dat,q15
 veor $ivec,$dat,$rndlast
 b.hs .Loop_cbc_enc
1336
 vst1.8 {$ivec},[$out],#16
 b .Lcbc_done
1339
.align 5
.Lcbc_enc128:
 vld1.32 {$in0-$in1},[$key_]
 aese $dat,q8
 aesmc $dat,$dat
 b .Lenter_cbc_enc128
.Loop_cbc_enc128:
 aese $dat,q8
 aesmc $dat,$dat
 vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
 aese $dat,q9
 aesmc $dat,$dat
 subs $len,$len,#16
 aese $dat,$in0
 aesmc $dat,$dat
 cclr $step,eq
 aese $dat,$in1
 aesmc $dat,$dat
 aese $dat,q10
 aesmc $dat,$dat
 aese $dat,q11
 aesmc $dat,$dat
 vld1.8 {q8},[$inp],$step
 aese $dat,q12
 aesmc $dat,$dat
 aese $dat,q13
 aesmc $dat,$dat
 aese $dat,q14
 aesmc $dat,$dat
 veor q8,q8,$rndzero_n_last
 aese $dat,q15
 veor $ivec,$dat,$rndlast
 b.hs .Loop_cbc_enc128
1374
 vst1.8 {$ivec},[$out],#16
 b .Lcbc_done
___
# Decrypt path. Decryption parallelizes across blocks, so extra data
# registers are allocated; on 64-bit, q16-q23 enable a 5x interleaved loop.
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1380
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
1386
# Decrypt setup and (64-bit only) the 5x interleaved loop with its 4-block
# tail (.Lcbc_tail4x).
$code.=<<___;
.align 5
.Lcbc_dec:
 vld1.8 {$dat2},[$inp],#16
 subs $len,$len,#32 // bias
 add $cnt,$rounds,#2
 vorr $in1,$dat,$dat
 vorr $dat1,$dat,$dat
 vorr $in2,$dat2,$dat2
 b.lo .Lcbc_dec_tail
1397
 vorr $dat1,$dat2,$dat2
 vld1.8 {$dat2},[$inp],#16
 vorr $in0,$dat,$dat
 vorr $in1,$dat1,$dat1
 vorr $in2,$dat2,$dat2
___
$code.=<<___ if ($flavour =~ /64/);
 cmp $len,#32
 b.lo .Loop3x_cbc_dec
1407
 vld1.8 {$dat3},[$inp],#16
 vld1.8 {$dat4},[$inp],#16
 sub $len,$len,#32 // bias
 mov $cnt,$rounds
 vorr $in3,$dat3,$dat3
 vorr $in4,$dat4,$dat4
1414
.Loop5x_cbc_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat3,q8
 aesimc $dat3,$dat3
 aesd $dat4,q8
 aesimc $dat4,$dat4
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat3,q9
 aesimc $dat3,$dat3
 aesd $dat4,q9
 aesimc $dat4,$dat4
 vld1.32 {q9},[$key_],#16
 b.gt .Loop5x_cbc_dec
1440
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat3,q8
 aesimc $dat3,$dat3
 aesd $dat4,q8
 aesimc $dat4,$dat4
 cmp $len,#0x40 // because .Lcbc_tail4x
 sub $len,$len,#0x50
1453
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat3,q9
 aesimc $dat3,$dat3
 aesd $dat4,q9
 aesimc $dat4,$dat4
 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
 mov $key_,$key
1466
 aesd $dat0,q10
 aesimc $dat0,$dat0
 aesd $dat1,q10
 aesimc $dat1,$dat1
 aesd $dat2,q10
 aesimc $dat2,$dat2
 aesd $dat3,q10
 aesimc $dat3,$dat3
 aesd $dat4,q10
 aesimc $dat4,$dat4
 add $inp,$inp,x6 // $inp is adjusted in such way that
 // at exit from the loop $dat1-$dat4
 // are loaded with last "words"
 add x6,$len,#0x60 // because .Lcbc_tail4x
1481
 aesd $dat0,q11
 aesimc $dat0,$dat0
 aesd $dat1,q11
 aesimc $dat1,$dat1
 aesd $dat2,q11
 aesimc $dat2,$dat2
 aesd $dat3,q11
 aesimc $dat3,$dat3
 aesd $dat4,q11
 aesimc $dat4,$dat4
1492
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 aesd $dat3,q12
 aesimc $dat3,$dat3
 aesd $dat4,q12
 aesimc $dat4,$dat4
1503
 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 aesd $dat3,q13
 aesimc $dat3,$dat3
 aesd $dat4,q13
 aesimc $dat4,$dat4
1514
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 aesd $dat3,q14
 aesimc $dat3,$dat3
 aesd $dat4,q14
 aesimc $dat4,$dat4
1525
 veor $tmp0,$ivec,$rndlast
 aesd $dat0,q15
 veor $tmp1,$in0,$rndlast
 vld1.8 {$in0},[$inp],#16
 aesd $dat1,q15
 veor $tmp2,$in1,$rndlast
 vld1.8 {$in1},[$inp],#16
 aesd $dat2,q15
 veor $tmp3,$in2,$rndlast
 vld1.8 {$in2},[$inp],#16
 aesd $dat3,q15
 veor $tmp4,$in3,$rndlast
 vld1.8 {$in3},[$inp],#16
 aesd $dat4,q15
 vorr $ivec,$in4,$in4
 vld1.8 {$in4},[$inp],#16
 cbz x6,.Lcbc_tail4x
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 veor $tmp0,$tmp0,$dat0
 vorr $dat0,$in0,$in0
 veor $tmp1,$tmp1,$dat1
 vorr $dat1,$in1,$in1
 veor $tmp2,$tmp2,$dat2
 vorr $dat2,$in2,$in2
 veor $tmp3,$tmp3,$dat3
 vorr $dat3,$in3,$in3
 veor $tmp4,$tmp4,$dat4
 vst1.8 {$tmp0},[$out],#16
 vorr $dat4,$in4,$in4
 vst1.8 {$tmp1},[$out],#16
 mov $cnt,$rounds
 vst1.8 {$tmp2},[$out],#16
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16
 b.hs .Loop5x_cbc_dec
1562
 add $len,$len,#0x50
 cbz $len,.Lcbc_done
1565
 add $cnt,$rounds,#2
 subs $len,$len,#0x30
 vorr $dat0,$in2,$in2
 vorr $in0,$in2,$in2
 vorr $dat1,$in3,$in3
 vorr $in1,$in3,$in3
 vorr $dat2,$in4,$in4
 vorr $in2,$in4,$in4
 b.lo .Lcbc_dec_tail
1575
 b .Loop3x_cbc_dec
1577
.align 4
.Lcbc_tail4x:
 veor $tmp1,$tmp0,$dat1
 veor $tmp2,$tmp2,$dat2
 veor $tmp3,$tmp3,$dat3
 veor $tmp4,$tmp4,$dat4
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16
1588
 b .Lcbc_done
.align 4
___
# 3x interleaved decrypt loop (both flavours) plus the 1-/2-block tail
# (.Lcbc_dec_tail / .Lcbc_dec_one) and the common .Lcbc_done exit that
# writes the chaining value back to [$ivp].
$code.=<<___;
.Loop3x_cbc_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_cbc_dec
1610
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 veor $tmp0,$ivec,$rndlast
 subs $len,$len,#0x30
 veor $tmp1,$in0,$rndlast
 mov.lo x6,$len // x6, $cnt, is zero at this point
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 veor $tmp2,$in1,$rndlast
 add $inp,$inp,x6 // $inp is adjusted in such way that
 // at exit from the loop $dat1-$dat2
 // are loaded with last "words"
 vorr $ivec,$in2,$in2
 mov $key_,$key
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 vld1.8 {$in0},[$inp],#16
 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aesd $dat0,q15
 aesd $dat1,q15
 aesd $dat2,q15
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 add $cnt,$rounds,#2
 veor $tmp0,$tmp0,$dat0
 veor $tmp1,$tmp1,$dat1
 veor $dat2,$dat2,$tmp2
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp0},[$out],#16
 vorr $dat0,$in0,$in0
 vst1.8 {$tmp1},[$out],#16
 vorr $dat1,$in1,$in1
 vst1.8 {$dat2},[$out],#16
 vorr $dat2,$in2,$in2
 b.hs .Loop3x_cbc_dec
1670
 cmn $len,#0x30
 b.eq .Lcbc_done
 nop
1674
.Lcbc_dec_tail:
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Lcbc_dec_tail
1687
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 cmn $len,#0x20
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 veor $tmp1,$ivec,$rndlast
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 veor $tmp2,$in1,$rndlast
 aesd $dat1,q15
 aesd $dat2,q15
 b.eq .Lcbc_dec_one
 veor $tmp1,$tmp1,$dat1
 veor $tmp2,$tmp2,$dat2
 vorr $ivec,$in2,$in2
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 b .Lcbc_done
1721
.Lcbc_dec_one:
 veor $tmp1,$tmp1,$dat2
 vorr $ivec,$in2,$in2
 vst1.8 {$tmp1},[$out],#16
1726
.Lcbc_done:
 vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
}
# AArch32 epilogue: restore d8-d15 and r4-r8, return via pc.
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r8,pc}
___
# AArch64 epilogue: pop the frame record and return.
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
 ret
___
# Emit the .size directive for the symbol.
$code.=<<___;
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
1744{{{
# AES-CTR entry point: ${prefix}_ctr32_encrypt_blocks(inp, out, len, key, ivp).
# x0-x4 carry the arguments; w8 holds the 32-bit big-endian counter taken
# from ivp[12..15]. $step aliases $tctr2 (both are x12/w12).
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2
1749
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1752
# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
1755
my ($dat,$tmp)=($dat0,$tmp0);
1757
### q8-q15 preloaded key schedule
1759
# Emit the function label (shared by both flavours).
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
___
# AArch64 prologue: save frame pointer and link register.
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
# AArch32 prologue: save callee-saved registers and d8-d15, load the
# 5th argument from the caller's stack.
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r10,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldr r4, [ip] @ load remaining arg
___
# Common setup: load counter block and key schedule, byte-swap the
# 32-bit counter word on little-endian.
$code.=<<___;
 ldr $rounds,[$key,#240]
1778
 ldr $ctr, [$ivp, #12]
#ifdef __ARMEB__
 vld1.8 {$dat0},[$ivp]
#else
 vld1.32 {$dat0},[$ivp]
#endif
 vld1.32 {q8-q9},[$key] // load key schedule...
 sub $rounds,$rounds,#4
 mov $step,#16
 cmp $len,#2
 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]
 add $key_,$key,#32
 mov $cnt,$rounds
 cclr $step,lo
#ifndef __ARMEB__
 rev $ctr, $ctr
#endif
___
# 64-bit counter-block setup: prepare ctr, ctr+1, ctr+2 in $dat0-$dat2.
$code.=<<___ if ($flavour =~ /64/);
 vorr $dat1,$dat0,$dat0
 add $tctr1, $ctr, #1
 vorr $dat2,$dat0,$dat0
 add $ctr, $ctr, #2
 vorr $ivec,$dat0,$dat0
 rev $tctr1, $tctr1
 vmov.32 ${dat1}[3],$tctr1
 b.ls .Lctr32_tail
 rev $tctr2, $ctr
 sub $len,$len,#3 // bias
 vmov.32 ${dat2}[3],$tctr2
___
# 32-bit counter-block setup (fewer registers; counters staged via $ivec).
$code.=<<___ if ($flavour !~ /64/);
 add $tctr1, $ctr, #1
 vorr $ivec,$dat0,$dat0
 rev $tctr1, $tctr1
 vmov.32 ${ivec}[3],$tctr1
 add $ctr, $ctr, #2
 vorr $dat1,$ivec,$ivec
 b.ls .Lctr32_tail
 rev $tctr2, $ctr
 vmov.32 ${ivec}[3],$tctr2
 sub $len,$len,#3 // bias
 vorr $dat2,$ivec,$ivec
___
# 64-bit only: 5x interleaved CTR loop (.Loop5x_ctr32) for >= 5 blocks.
$code.=<<___ if ($flavour =~ /64/);
 cmp $len,#32
 b.lo .Loop3x_ctr32
1830
 add w13,$ctr,#1
 add w14,$ctr,#2
 vorr $dat3,$dat0,$dat0
 rev w13,w13
 vorr $dat4,$dat0,$dat0
 rev w14,w14
 vmov.32 ${dat3}[3],w13
 sub $len,$len,#2 // bias
 vmov.32 ${dat4}[3],w14
 add $ctr,$ctr,#2
 b .Loop5x_ctr32
1842
.align 4
.Loop5x_ctr32:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat3,q8
 aesmc $dat3,$dat3
 aese $dat4,q8
 aesmc $dat4,$dat4
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat3,q9
 aesmc $dat3,$dat3
 aese $dat4,q9
 aesmc $dat4,$dat4
 vld1.32 {q9},[$key_],#16
 b.gt .Loop5x_ctr32
1869
 mov $key_,$key
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat3,q8
 aesmc $dat3,$dat3
 aese $dat4,q8
 aesmc $dat4,$dat4
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1882
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat3,q9
 aesmc $dat3,$dat3
 aese $dat4,q9
 aesmc $dat4,$dat4
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1894
 aese $dat0,q12
 aesmc $dat0,$dat0
 add $tctr0,$ctr,#1
 add $tctr1,$ctr,#2
 aese $dat1,q12
 aesmc $dat1,$dat1
 add $tctr2,$ctr,#3
 add w13,$ctr,#4
 aese $dat2,q12
 aesmc $dat2,$dat2
 add w14,$ctr,#5
 rev $tctr0,$tctr0
 aese $dat3,q12
 aesmc $dat3,$dat3
 rev $tctr1,$tctr1
 rev $tctr2,$tctr2
 aese $dat4,q12
 aesmc $dat4,$dat4
 rev w13,w13
 rev w14,w14
1915
 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 aese $dat2,q13
 aesmc $dat2,$dat2
 aese $dat3,q13
 aesmc $dat3,$dat3
 aese $dat4,q13
 aesmc $dat4,$dat4
1926
 aese $dat0,q14
 aesmc $dat0,$dat0
 vld1.8 {$in0},[$inp],#16
 aese $dat1,q14
 aesmc $dat1,$dat1
 vld1.8 {$in1},[$inp],#16
 aese $dat2,q14
 aesmc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aese $dat3,q14
 aesmc $dat3,$dat3
 vld1.8 {$in3},[$inp],#16
 aese $dat4,q14
 aesmc $dat4,$dat4
 vld1.8 {$in4},[$inp],#16
1942
 aese $dat0,q15
 veor $in0,$in0,$rndlast
 aese $dat1,q15
 veor $in1,$in1,$rndlast
 aese $dat2,q15
 veor $in2,$in2,$rndlast
 aese $dat3,q15
 veor $in3,$in3,$rndlast
 aese $dat4,q15
 veor $in4,$in4,$rndlast
1953
 veor $in0,$in0,$dat0
 vorr $dat0,$ivec,$ivec
 veor $in1,$in1,$dat1
 vorr $dat1,$ivec,$ivec
 veor $in2,$in2,$dat2
 vorr $dat2,$ivec,$ivec
 veor $in3,$in3,$dat3
 vorr $dat3,$ivec,$ivec
 veor $in4,$in4,$dat4
 vorr $dat4,$ivec,$ivec
1964
 vst1.8 {$in0},[$out],#16
 vmov.32 ${dat0}[3],$tctr0
 vst1.8 {$in1},[$out],#16
 vmov.32 ${dat1}[3],$tctr1
 vst1.8 {$in2},[$out],#16
 vmov.32 ${dat2}[3],$tctr2
 vst1.8 {$in3},[$out],#16
 vmov.32 ${dat3}[3],w13
 vst1.8 {$in4},[$out],#16
 vmov.32 ${dat4}[3],w14
1975
 mov $cnt,$rounds
 cbz $len,.Lctr32_done
1978
 add $ctr,$ctr,#5
 subs $len,$len,#5
 b.hs .Loop5x_ctr32
1982
 add $len,$len,#5
 sub $ctr,$ctr,#5
1985
 cmp $len,#2
 mov $step,#16
 cclr $step,lo
 b.ls .Lctr32_tail
1990
 sub $len,$len,#3 // bias
 add $ctr,$ctr,#3
___
# 3x interleaved CTR loop (both flavours) with flavour-conditional chunks
# interleaved below, followed by the 1-/2-block tail (.Lctr32_tail).
$code.=<<___;
 b .Loop3x_ctr32
1996
.align 4
.Loop3x_ctr32:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_ctr32
2015
 aese $dat0,q8
 aesmc $tmp0,$dat0
 aese $dat1,q8
 aesmc $tmp1,$dat1
 vld1.8 {$in0},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
 vorr $dat0,$ivec,$ivec
___
$code.=<<___ if ($flavour !~ /64/);
 add $tctr0,$ctr,#1
___
$code.=<<___;
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
 vorr $dat1,$ivec,$ivec
___
$code.=<<___ if ($flavour !~ /64/);
 rev $tctr0,$tctr0
___
$code.=<<___;
 aese $tmp0,q9
 aesmc $tmp0,$tmp0
 aese $tmp1,q9
 aesmc $tmp1,$tmp1
 vld1.8 {$in2},[$inp],#16
 mov $key_,$key
 aese $dat2,q9
 aesmc $tmp2,$dat2
___
$code.=<<___ if ($flavour =~ /64/);
 vorr $dat2,$ivec,$ivec
 add $tctr0,$ctr,#1
___
$code.=<<___;
 aese $tmp0,q12
 aesmc $tmp0,$tmp0
 aese $tmp1,q12
 aesmc $tmp1,$tmp1
 veor $in0,$in0,$rndlast
 add $tctr1,$ctr,#2
 aese $tmp2,q12
 aesmc $tmp2,$tmp2
 veor $in1,$in1,$rndlast
 add $ctr,$ctr,#3
 aese $tmp0,q13
 aesmc $tmp0,$tmp0
 aese $tmp1,q13
 aesmc $tmp1,$tmp1
 veor $in2,$in2,$rndlast
___
$code.=<<___ if ($flavour =~ /64/);
 rev $tctr0,$tctr0
 aese $tmp2,q13
 aesmc $tmp2,$tmp2
 vmov.32 ${dat0}[3], $tctr0
___
$code.=<<___ if ($flavour !~ /64/);
 vmov.32 ${ivec}[3], $tctr0
 aese $tmp2,q13
 aesmc $tmp2,$tmp2
 vorr $dat0,$ivec,$ivec
___
$code.=<<___;
 rev $tctr1,$tctr1
 aese $tmp0,q14
 aesmc $tmp0,$tmp0
___
$code.=<<___ if ($flavour !~ /64/);
 vmov.32 ${ivec}[3], $tctr1
 rev $tctr2,$ctr
___
$code.=<<___;
 aese $tmp1,q14
 aesmc $tmp1,$tmp1
___
$code.=<<___ if ($flavour =~ /64/);
 vmov.32 ${dat1}[3], $tctr1
 rev $tctr2,$ctr
 aese $tmp2,q14
 aesmc $tmp2,$tmp2
 vmov.32 ${dat2}[3], $tctr2
___
$code.=<<___ if ($flavour !~ /64/);
 vorr $dat1,$ivec,$ivec
 vmov.32 ${ivec}[3], $tctr2
 aese $tmp2,q14
 aesmc $tmp2,$tmp2
 vorr $dat2,$ivec,$ivec
___
$code.=<<___;
 subs $len,$len,#3
 aese $tmp0,q15
 aese $tmp1,q15
 aese $tmp2,q15
2114
 veor $in0,$in0,$tmp0
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 vst1.8 {$in0},[$out],#16
 veor $in1,$in1,$tmp1
 mov $cnt,$rounds
 vst1.8 {$in1},[$out],#16
 veor $in2,$in2,$tmp2
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$in2},[$out],#16
 b.hs .Loop3x_ctr32
2125
 adds $len,$len,#3
 b.eq .Lctr32_done
 cmp $len,#1
 mov $step,#16
 cclr $step,eq
2131
.Lctr32_tail:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 vld1.32 {q9},[$key_],#16
 b.gt .Lctr32_tail
2144
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 vld1.8 {$in0},[$inp],$step
 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat1,q12
 aesmc $dat1,$dat1
 vld1.8 {$in1},[$inp]
 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 veor $in0,$in0,$rndlast
 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat1,q14
 aesmc $dat1,$dat1
 veor $in1,$in1,$rndlast
 aese $dat0,q15
 aese $dat1,q15
2172
 cmp $len,#1
 veor $in0,$in0,$dat0
 veor $in1,$in1,$dat1
 vst1.8 {$in0},[$out],#16
 b.eq .Lctr32_done
 vst1.8 {$in1},[$out]
2179
.Lctr32_done:
___
# AArch32 epilogue: restore d8-d15 and r4-r10, return via pc.
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r10,pc}
___
# AArch64 epilogue: pop the frame record and return.
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
 ret
___
# Emit the .size directive for the symbol.
$code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
2194# Performance in cycles per byte.
2195# Measured for AES-XTS with different key sizes.
2196# The values before and after optimization are shown below:
2197# (before/after):
2198#
2199# AES-128-XTS AES-256-XTS
2200# Cortex-A57 3.36/1.09 4.02/1.37
2201# Cortex-A72 3.03/1.02 3.28/1.33
2202
2203# Optimization is implemented by loop unrolling and interleaving.
2204# Commonly, we choose the unrolling factor as 5, if the input
2205# data size smaller than 5 blocks, but not smaller than 3 blocks,
2206# choose 3 as the unrolling factor.
2207# If the input data size dsize >= 5*16 bytes, then take 5 blocks
2208# as one iteration; on every loop the remaining size lsize -= 5*16.
2209# If lsize < 5*16 bytes, treat them as the tail. Note: the remaining
2210# 4*16 bytes will be processed specially, which is integrated into the
2211# 5*16 bytes loop to improve the efficiency.
2212# There is one special case, if the original input data size dsize
2213# = 16 bytes, we will treat it separately to improve the
2214# performance: one independent code block without LR, FP load and
2215# store.
2216# Encryption will process the (length - tailcnt) bytes as mentioned
2217# previously, then encrypt the composite block as the second-to-last
2218# cipher block.
2219# Decryption will process the (length - tailcnt - 1) bytes as mentioned
2220# previously, then decrypt the second-to-last cipher block to get the
2221# last plain block (tail), and decrypt the composite block as the
2222# second-to-last plain-text block.
2223
2224{{{
2225my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2226my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2227my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2228my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2229my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2230my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2231my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2232my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2233my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2234
2235my ($tmpin)=("v26.16b");
2236my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2237
2238# q7 last round key
2239# q10-q15, q7 Last 7 round keys
2240# q8-q9 preloaded round keys except last 7 keys for big size
2241# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
2242
2243
2244my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2245
2246my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2247my ($dat4,$in4,$tmp4);
2248if ($flavour =~ /64/) {
2249 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2250}
2251
2252$code.=<<___ if ($flavour =~ /64/);
2253.globl ${prefix}_xts_encrypt
2254.type ${prefix}_xts_encrypt,%function
2255.align 5
2256${prefix}_xts_encrypt:
2257___
2258$code.=<<___ if ($flavour =~ /64/);
2259 cmp $len,#16
2260 // Original input data size bigger than 16, jump to big size processing.
2261 b.ne .Lxts_enc_big_size
2262 // Encrypt the iv with key2, as the first XEX iv.
2263 ldr $rounds,[$key2,#240]
2264 vld1.8 {$dat},[$key2],#16
2265 vld1.8 {$iv0},[$ivp]
2266 sub $rounds,$rounds,#2
2267 vld1.8 {$dat1},[$key2],#16
2268
2269.Loop_enc_iv_enc:
2270 aese $iv0,$dat
2271 aesmc $iv0,$iv0
2272 vld1.32 {$dat},[$key2],#16
2273 subs $rounds,$rounds,#2
2274 aese $iv0,$dat1
2275 aesmc $iv0,$iv0
2276 vld1.32 {$dat1},[$key2],#16
2277 b.gt .Loop_enc_iv_enc
2278
2279 aese $iv0,$dat
2280 aesmc $iv0,$iv0
2281 vld1.32 {$dat},[$key2]
2282 aese $iv0,$dat1
2283 veor $iv0,$iv0,$dat
2284
2285 vld1.8 {$dat0},[$inp]
2286 veor $dat0,$iv0,$dat0
2287
2288 ldr $rounds,[$key1,#240]
2289 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2290
2291 aese $dat0,q20
2292 aesmc $dat0,$dat0
2293 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2294 aese $dat0,q21
2295 aesmc $dat0,$dat0
2296 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2297 b.eq .Lxts_128_enc
2298.Lxts_enc_round_loop:
2299 aese $dat0,q8
2300 aesmc $dat0,$dat0
2301 vld1.32 {q8},[$key1],#16 // load key schedule...
2302 aese $dat0,q9
2303 aesmc $dat0,$dat0
2304 vld1.32 {q9},[$key1],#16 // load key schedule...
2305 subs $rounds,$rounds,#2 // bias
2306 b.gt .Lxts_enc_round_loop
2307.Lxts_128_enc:
2308 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2309 aese $dat0,q8
2310 aesmc $dat0,$dat0
2311 aese $dat0,q9
2312 aesmc $dat0,$dat0
2313 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2314 aese $dat0,q10
2315 aesmc $dat0,$dat0
2316 aese $dat0,q11
2317 aesmc $dat0,$dat0
2318 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2319 aese $dat0,q12
2320 aesmc $dat0,$dat0
2321 aese $dat0,q13
2322 aesmc $dat0,$dat0
2323 vld1.32 {$rndlast},[$key1]
2324 aese $dat0,q14
2325 aesmc $dat0,$dat0
2326 aese $dat0,q15
2327 veor $dat0,$dat0,$rndlast
2328 veor $dat0,$dat0,$iv0
2329 vst1.8 {$dat0},[$out]
2330 b .Lxts_enc_final_abort
2331
2332.align 4
2333.Lxts_enc_big_size:
2334___
2335$code.=<<___ if ($flavour =~ /64/);
2336 stp $constnumx,$tmpinp,[sp,#-64]!
2337 stp $tailcnt,$midnumx,[sp,#48]
2338 stp $ivd10,$ivd20,[sp,#32]
2339 stp $ivd30,$ivd40,[sp,#16]
2340
2341 // tailcnt store the tail value of length%16.
2342 and $tailcnt,$len,#0xf
2343 and $len,$len,#-16
2344 subs $len,$len,#16
2345 mov $step,#16
2346 b.lo .Lxts_abort
2347 csel $step,xzr,$step,eq
2348
2349 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2350 ldr $rounds,[$key2,#240]
2351 vld1.32 {$dat},[$key2],#16
2352 vld1.8 {$iv0},[$ivp]
2353 sub $rounds,$rounds,#2
2354 vld1.32 {$dat1},[$key2],#16
2355
2356.Loop_iv_enc:
2357 aese $iv0,$dat
2358 aesmc $iv0,$iv0
2359 vld1.32 {$dat},[$key2],#16
2360 subs $rounds,$rounds,#2
2361 aese $iv0,$dat1
2362 aesmc $iv0,$iv0
2363 vld1.32 {$dat1},[$key2],#16
2364 b.gt .Loop_iv_enc
2365
2366 aese $iv0,$dat
2367 aesmc $iv0,$iv0
2368 vld1.32 {$dat},[$key2]
2369 aese $iv0,$dat1
2370 veor $iv0,$iv0,$dat
2371
2372 // The iv for second block
2373 // $ivl- iv(low), $ivh - iv(high)
2374 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2375 fmov $ivl,$ivd00
2376 fmov $ivh,$ivd01
2377 mov $constnum,#0x87
2378 extr $midnumx,$ivh,$ivh,#32
2379 extr $ivh,$ivh,$ivl,#63
2380 and $tmpmw,$constnum,$midnum,asr#31
2381 eor $ivl,$tmpmx,$ivl,lsl#1
2382 fmov $ivd10,$ivl
2383 fmov $ivd11,$ivh
2384
2385 ldr $rounds0,[$key1,#240] // next starting point
2386 vld1.8 {$dat},[$inp],$step
2387
2388 vld1.32 {q8-q9},[$key1] // load key schedule...
2389 sub $rounds0,$rounds0,#6
2390 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2391 sub $rounds0,$rounds0,#2
2392 vld1.32 {q10-q11},[$key_],#32
2393 vld1.32 {q12-q13},[$key_],#32
2394 vld1.32 {q14-q15},[$key_],#32
2395 vld1.32 {$rndlast},[$key_]
2396
2397 add $key_,$key1,#32
2398 mov $rounds,$rounds0
2399
2400 // Encryption
2401.Lxts_enc:
2402 vld1.8 {$dat2},[$inp],#16
2403 subs $len,$len,#32 // bias
2404 add $rounds,$rounds0,#2
2405 vorr $in1,$dat,$dat
2406 vorr $dat1,$dat,$dat
2407 vorr $in3,$dat,$dat
2408 vorr $in2,$dat2,$dat2
2409 vorr $in4,$dat2,$dat2
2410 b.lo .Lxts_inner_enc_tail
2411 veor $dat,$dat,$iv0 // before encryption, xor with iv
2412 veor $dat2,$dat2,$iv1
2413
2414 // The iv for third block
2415 extr $midnumx,$ivh,$ivh,#32
2416 extr $ivh,$ivh,$ivl,#63
2417 and $tmpmw,$constnum,$midnum,asr#31
2418 eor $ivl,$tmpmx,$ivl,lsl#1
2419 fmov $ivd20,$ivl
2420 fmov $ivd21,$ivh
2421
2422
2423 vorr $dat1,$dat2,$dat2
2424 vld1.8 {$dat2},[$inp],#16
2425 vorr $in0,$dat,$dat
2426 vorr $in1,$dat1,$dat1
2427 veor $in2,$dat2,$iv2 // the third block
2428 veor $dat2,$dat2,$iv2
2429 cmp $len,#32
2430 b.lo .Lxts_outer_enc_tail
2431
2432 // The iv for fourth block
2433 extr $midnumx,$ivh,$ivh,#32
2434 extr $ivh,$ivh,$ivl,#63
2435 and $tmpmw,$constnum,$midnum,asr#31
2436 eor $ivl,$tmpmx,$ivl,lsl#1
2437 fmov $ivd30,$ivl
2438 fmov $ivd31,$ivh
2439
2440 vld1.8 {$dat3},[$inp],#16
2441 // The iv for fifth block
2442 extr $midnumx,$ivh,$ivh,#32
2443 extr $ivh,$ivh,$ivl,#63
2444 and $tmpmw,$constnum,$midnum,asr#31
2445 eor $ivl,$tmpmx,$ivl,lsl#1
2446 fmov $ivd40,$ivl
2447 fmov $ivd41,$ivh
2448
2449 vld1.8 {$dat4},[$inp],#16
2450 veor $dat3,$dat3,$iv3 // the fourth block
2451 veor $dat4,$dat4,$iv4
2452 sub $len,$len,#32 // bias
2453 mov $rounds,$rounds0
2454 b .Loop5x_xts_enc
2455
2456.align 4
2457.Loop5x_xts_enc:
2458 aese $dat0,q8
2459 aesmc $dat0,$dat0
2460 aese $dat1,q8
2461 aesmc $dat1,$dat1
2462 aese $dat2,q8
2463 aesmc $dat2,$dat2
2464 aese $dat3,q8
2465 aesmc $dat3,$dat3
2466 aese $dat4,q8
2467 aesmc $dat4,$dat4
2468 vld1.32 {q8},[$key_],#16
2469 subs $rounds,$rounds,#2
2470 aese $dat0,q9
2471 aesmc $dat0,$dat0
2472 aese $dat1,q9
2473 aesmc $dat1,$dat1
2474 aese $dat2,q9
2475 aesmc $dat2,$dat2
2476 aese $dat3,q9
2477 aesmc $dat3,$dat3
2478 aese $dat4,q9
2479 aesmc $dat4,$dat4
2480 vld1.32 {q9},[$key_],#16
2481 b.gt .Loop5x_xts_enc
2482
2483 aese $dat0,q8
2484 aesmc $dat0,$dat0
2485 aese $dat1,q8
2486 aesmc $dat1,$dat1
2487 aese $dat2,q8
2488 aesmc $dat2,$dat2
2489 aese $dat3,q8
2490 aesmc $dat3,$dat3
2491 aese $dat4,q8
2492 aesmc $dat4,$dat4
2493 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2494
2495 aese $dat0,q9
2496 aesmc $dat0,$dat0
2497 aese $dat1,q9
2498 aesmc $dat1,$dat1
2499 aese $dat2,q9
2500 aesmc $dat2,$dat2
2501 aese $dat3,q9
2502 aesmc $dat3,$dat3
2503 aese $dat4,q9
2504 aesmc $dat4,$dat4
2505 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2506 mov $key_,$key1
2507
2508 aese $dat0,q10
2509 aesmc $dat0,$dat0
2510 aese $dat1,q10
2511 aesmc $dat1,$dat1
2512 aese $dat2,q10
2513 aesmc $dat2,$dat2
2514 aese $dat3,q10
2515 aesmc $dat3,$dat3
2516 aese $dat4,q10
2517 aesmc $dat4,$dat4
2518 add $inp,$inp,$xoffset // x0 is adjusted in such way that
2519 // at exit from the loop v1.16b-v26.16b
2520 // are loaded with last "words"
2521 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
2522
2523 aese $dat0,q11
2524 aesmc $dat0,$dat0
2525 aese $dat1,q11
2526 aesmc $dat1,$dat1
2527 aese $dat2,q11
2528 aesmc $dat2,$dat2
2529 aese $dat3,q11
2530 aesmc $dat3,$dat3
2531 aese $dat4,q11
2532 aesmc $dat4,$dat4
2533
2534 aese $dat0,q12
2535 aesmc $dat0,$dat0
2536 aese $dat1,q12
2537 aesmc $dat1,$dat1
2538 aese $dat2,q12
2539 aesmc $dat2,$dat2
2540 aese $dat3,q12
2541 aesmc $dat3,$dat3
2542 aese $dat4,q12
2543 aesmc $dat4,$dat4
2544
2545 aese $dat0,q13
2546 aesmc $dat0,$dat0
2547 aese $dat1,q13
2548 aesmc $dat1,$dat1
2549 aese $dat2,q13
2550 aesmc $dat2,$dat2
2551 aese $dat3,q13
2552 aesmc $dat3,$dat3
2553 aese $dat4,q13
2554 aesmc $dat4,$dat4
2555
2556 aese $dat0,q14
2557 aesmc $dat0,$dat0
2558 aese $dat1,q14
2559 aesmc $dat1,$dat1
2560 aese $dat2,q14
2561 aesmc $dat2,$dat2
2562 aese $dat3,q14
2563 aesmc $dat3,$dat3
2564 aese $dat4,q14
2565 aesmc $dat4,$dat4
2566
2567 veor $tmp0,$rndlast,$iv0
2568 aese $dat0,q15
2569 // The iv for first block of one iteration
2570 extr $midnumx,$ivh,$ivh,#32
2571 extr $ivh,$ivh,$ivl,#63
2572 and $tmpmw,$constnum,$midnum,asr#31
2573 eor $ivl,$tmpmx,$ivl,lsl#1
2574 fmov $ivd00,$ivl
2575 fmov $ivd01,$ivh
2576 veor $tmp1,$rndlast,$iv1
2577 vld1.8 {$in0},[$inp],#16
2578 aese $dat1,q15
2579 // The iv for second block
2580 extr $midnumx,$ivh,$ivh,#32
2581 extr $ivh,$ivh,$ivl,#63
2582 and $tmpmw,$constnum,$midnum,asr#31
2583 eor $ivl,$tmpmx,$ivl,lsl#1
2584 fmov $ivd10,$ivl
2585 fmov $ivd11,$ivh
2586 veor $tmp2,$rndlast,$iv2
2587 vld1.8 {$in1},[$inp],#16
2588 aese $dat2,q15
2589 // The iv for third block
2590 extr $midnumx,$ivh,$ivh,#32
2591 extr $ivh,$ivh,$ivl,#63
2592 and $tmpmw,$constnum,$midnum,asr#31
2593 eor $ivl,$tmpmx,$ivl,lsl#1
2594 fmov $ivd20,$ivl
2595 fmov $ivd21,$ivh
2596 veor $tmp3,$rndlast,$iv3
2597 vld1.8 {$in2},[$inp],#16
2598 aese $dat3,q15
2599 // The iv for fourth block
2600 extr $midnumx,$ivh,$ivh,#32
2601 extr $ivh,$ivh,$ivl,#63
2602 and $tmpmw,$constnum,$midnum,asr#31
2603 eor $ivl,$tmpmx,$ivl,lsl#1
2604 fmov $ivd30,$ivl
2605 fmov $ivd31,$ivh
2606 veor $tmp4,$rndlast,$iv4
2607 vld1.8 {$in3},[$inp],#16
2608 aese $dat4,q15
2609
2610 // The iv for fifth block
2611 extr $midnumx,$ivh,$ivh,#32
2612 extr $ivh,$ivh,$ivl,#63
2613 and $tmpmw,$constnum,$midnum,asr #31
2614 eor $ivl,$tmpmx,$ivl,lsl #1
2615 fmov $ivd40,$ivl
2616 fmov $ivd41,$ivh
2617
2618 vld1.8 {$in4},[$inp],#16
2619 cbz $xoffset,.Lxts_enc_tail4x
2620 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2621 veor $tmp0,$tmp0,$dat0
2622 veor $dat0,$in0,$iv0
2623 veor $tmp1,$tmp1,$dat1
2624 veor $dat1,$in1,$iv1
2625 veor $tmp2,$tmp2,$dat2
2626 veor $dat2,$in2,$iv2
2627 veor $tmp3,$tmp3,$dat3
2628 veor $dat3,$in3,$iv3
2629 veor $tmp4,$tmp4,$dat4
2630 vst1.8 {$tmp0},[$out],#16
2631 veor $dat4,$in4,$iv4
2632 vst1.8 {$tmp1},[$out],#16
2633 mov $rounds,$rounds0
2634 vst1.8 {$tmp2},[$out],#16
2635 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2636 vst1.8 {$tmp3},[$out],#16
2637 vst1.8 {$tmp4},[$out],#16
2638 b.hs .Loop5x_xts_enc
2639
2640
2641 // If left 4 blocks, borrow the five block's processing.
2642 cmn $len,#0x10
2643 b.ne .Loop5x_enc_after
2644 vorr $iv4,$iv3,$iv3
2645 vorr $iv3,$iv2,$iv2
2646 vorr $iv2,$iv1,$iv1
2647 vorr $iv1,$iv0,$iv0
2648 fmov $ivl,$ivd40
2649 fmov $ivh,$ivd41
2650 veor $dat0,$iv0,$in0
2651 veor $dat1,$iv1,$in1
2652 veor $dat2,$in2,$iv2
2653 veor $dat3,$in3,$iv3
2654 veor $dat4,$in4,$iv4
2655 b.eq .Loop5x_xts_enc
2656
2657.Loop5x_enc_after:
2658 add $len,$len,#0x50
2659 cbz $len,.Lxts_enc_done
2660
2661 add $rounds,$rounds0,#2
2662 subs $len,$len,#0x30
2663 b.lo .Lxts_inner_enc_tail
2664
2665 veor $dat0,$iv0,$in2
2666 veor $dat1,$iv1,$in3
2667 veor $dat2,$in4,$iv2
2668 b .Lxts_outer_enc_tail
2669
2670.align 4
2671.Lxts_enc_tail4x:
2672 add $inp,$inp,#16
2673 veor $tmp1,$dat1,$tmp1
2674 vst1.8 {$tmp1},[$out],#16
2675 veor $tmp2,$dat2,$tmp2
2676 vst1.8 {$tmp2},[$out],#16
2677 veor $tmp3,$dat3,$tmp3
2678 veor $tmp4,$dat4,$tmp4
2679 vst1.8 {$tmp3-$tmp4},[$out],#32
2680
2681 b .Lxts_enc_done
2682.align 4
2683.Lxts_outer_enc_tail:
2684 aese $dat0,q8
2685 aesmc $dat0,$dat0
2686 aese $dat1,q8
2687 aesmc $dat1,$dat1
2688 aese $dat2,q8
2689 aesmc $dat2,$dat2
2690 vld1.32 {q8},[$key_],#16
2691 subs $rounds,$rounds,#2
2692 aese $dat0,q9
2693 aesmc $dat0,$dat0
2694 aese $dat1,q9
2695 aesmc $dat1,$dat1
2696 aese $dat2,q9
2697 aesmc $dat2,$dat2
2698 vld1.32 {q9},[$key_],#16
2699 b.gt .Lxts_outer_enc_tail
2700
2701 aese $dat0,q8
2702 aesmc $dat0,$dat0
2703 aese $dat1,q8
2704 aesmc $dat1,$dat1
2705 aese $dat2,q8
2706 aesmc $dat2,$dat2
2707 veor $tmp0,$iv0,$rndlast
2708 subs $len,$len,#0x30
2709 // The iv for first block
2710 fmov $ivl,$ivd20
2711 fmov $ivh,$ivd21
2712 //mov $constnum,#0x87
2713 extr $midnumx,$ivh,$ivh,#32
2714 extr $ivh,$ivh,$ivl,#63
2715 and $tmpmw,$constnum,$midnum,asr#31
2716 eor $ivl,$tmpmx,$ivl,lsl#1
2717 fmov $ivd00,$ivl
2718 fmov $ivd01,$ivh
2719 veor $tmp1,$iv1,$rndlast
2720 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
2721 aese $dat0,q9
2722 aesmc $dat0,$dat0
2723 aese $dat1,q9
2724 aesmc $dat1,$dat1
2725 aese $dat2,q9
2726 aesmc $dat2,$dat2
2727 veor $tmp2,$iv2,$rndlast
2728
2729 add $xoffset,$xoffset,#0x20
2730 add $inp,$inp,$xoffset
2731 mov $key_,$key1
2732
2733 aese $dat0,q12
2734 aesmc $dat0,$dat0
2735 aese $dat1,q12
2736 aesmc $dat1,$dat1
2737 aese $dat2,q12
2738 aesmc $dat2,$dat2
2739 aese $dat0,q13
2740 aesmc $dat0,$dat0
2741 aese $dat1,q13
2742 aesmc $dat1,$dat1
2743 aese $dat2,q13
2744 aesmc $dat2,$dat2
2745 aese $dat0,q14
2746 aesmc $dat0,$dat0
2747 aese $dat1,q14
2748 aesmc $dat1,$dat1
2749 aese $dat2,q14
2750 aesmc $dat2,$dat2
2751 aese $dat0,q15
2752 aese $dat1,q15
2753 aese $dat2,q15
2754 vld1.8 {$in2},[$inp],#16
2755 add $rounds,$rounds0,#2
2756 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2757 veor $tmp0,$tmp0,$dat0
2758 veor $tmp1,$tmp1,$dat1
2759 veor $dat2,$dat2,$tmp2
2760 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2761 vst1.8 {$tmp0},[$out],#16
2762 vst1.8 {$tmp1},[$out],#16
2763 vst1.8 {$dat2},[$out],#16
2764 cmn $len,#0x30
2765 b.eq .Lxts_enc_done
2766.Lxts_encxor_one:
2767 vorr $in3,$in1,$in1
2768 vorr $in4,$in2,$in2
2769 nop
2770
2771.Lxts_inner_enc_tail:
2772 cmn $len,#0x10
2773 veor $dat1,$in3,$iv0
2774 veor $dat2,$in4,$iv1
2775 b.eq .Lxts_enc_tail_loop
2776 veor $dat2,$in4,$iv0
2777.Lxts_enc_tail_loop:
2778 aese $dat1,q8
2779 aesmc $dat1,$dat1
2780 aese $dat2,q8
2781 aesmc $dat2,$dat2
2782 vld1.32 {q8},[$key_],#16
2783 subs $rounds,$rounds,#2
2784 aese $dat1,q9
2785 aesmc $dat1,$dat1
2786 aese $dat2,q9
2787 aesmc $dat2,$dat2
2788 vld1.32 {q9},[$key_],#16
2789 b.gt .Lxts_enc_tail_loop
2790
2791 aese $dat1,q8
2792 aesmc $dat1,$dat1
2793 aese $dat2,q8
2794 aesmc $dat2,$dat2
2795 aese $dat1,q9
2796 aesmc $dat1,$dat1
2797 aese $dat2,q9
2798 aesmc $dat2,$dat2
2799 aese $dat1,q12
2800 aesmc $dat1,$dat1
2801 aese $dat2,q12
2802 aesmc $dat2,$dat2
2803 cmn $len,#0x20
2804 aese $dat1,q13
2805 aesmc $dat1,$dat1
2806 aese $dat2,q13
2807 aesmc $dat2,$dat2
2808 veor $tmp1,$iv0,$rndlast
2809 aese $dat1,q14
2810 aesmc $dat1,$dat1
2811 aese $dat2,q14
2812 aesmc $dat2,$dat2
2813 veor $tmp2,$iv1,$rndlast
2814 aese $dat1,q15
2815 aese $dat2,q15
2816 b.eq .Lxts_enc_one
2817 veor $tmp1,$tmp1,$dat1
2818 vst1.8 {$tmp1},[$out],#16
2819 veor $tmp2,$tmp2,$dat2
2820 vorr $iv0,$iv1,$iv1
2821 vst1.8 {$tmp2},[$out],#16
2822 fmov $ivl,$ivd10
2823 fmov $ivh,$ivd11
2824 mov $constnum,#0x87
2825 extr $midnumx,$ivh,$ivh,#32
2826 extr $ivh,$ivh,$ivl,#63
2827 and $tmpmw,$constnum,$midnum,asr #31
2828 eor $ivl,$tmpmx,$ivl,lsl #1
2829 fmov $ivd00,$ivl
2830 fmov $ivd01,$ivh
2831 b .Lxts_enc_done
2832
2833.Lxts_enc_one:
2834 veor $tmp1,$tmp1,$dat2
2835 vorr $iv0,$iv0,$iv0
2836 vst1.8 {$tmp1},[$out],#16
2837 fmov $ivl,$ivd00
2838 fmov $ivh,$ivd01
2839 mov $constnum,#0x87
2840 extr $midnumx,$ivh,$ivh,#32
2841 extr $ivh,$ivh,$ivl,#63
2842 and $tmpmw,$constnum,$midnum,asr #31
2843 eor $ivl,$tmpmx,$ivl,lsl #1
2844 fmov $ivd00,$ivl
2845 fmov $ivd01,$ivh
2846 b .Lxts_enc_done
2847.align 5
2848.Lxts_enc_done:
2849 // Process the tail block with cipher stealing.
2850 tst $tailcnt,#0xf
2851 b.eq .Lxts_abort
2852
2853 mov $tmpinp,$inp
2854 mov $tmpoutp,$out
2855 sub $out,$out,#16
2856.composite_enc_loop:
2857 subs $tailcnt,$tailcnt,#1
2858 ldrb $l2outp,[$out,$tailcnt]
2859 ldrb $loutp,[$tmpinp,$tailcnt]
2860 strb $l2outp,[$tmpoutp,$tailcnt]
2861 strb $loutp,[$out,$tailcnt]
2862 b.gt .composite_enc_loop
2863.Lxts_enc_load_done:
2864 vld1.8 {$tmpin},[$out]
2865 veor $tmpin,$tmpin,$iv0
2866
2867 // Encrypt the composite block to get the last second encrypted text block
2868 ldr $rounds,[$key1,#240] // load key schedule...
2869 vld1.8 {$dat},[$key1],#16
2870 sub $rounds,$rounds,#2
2871 vld1.8 {$dat1},[$key1],#16 // load key schedule...
2872.Loop_final_enc:
2873 aese $tmpin,$dat0
2874 aesmc $tmpin,$tmpin
2875 vld1.32 {$dat0},[$key1],#16
2876 subs $rounds,$rounds,#2
2877 aese $tmpin,$dat1
2878 aesmc $tmpin,$tmpin
2879 vld1.32 {$dat1},[$key1],#16
2880 b.gt .Loop_final_enc
2881
2882 aese $tmpin,$dat0
2883 aesmc $tmpin,$tmpin
2884 vld1.32 {$dat0},[$key1]
2885 aese $tmpin,$dat1
2886 veor $tmpin,$tmpin,$dat0
2887 veor $tmpin,$tmpin,$iv0
2888 vst1.8 {$tmpin},[$out]
2889
2890.Lxts_abort:
2891 ldp $tailcnt,$midnumx,[sp,#48]
2892 ldp $ivd10,$ivd20,[sp,#32]
2893 ldp $ivd30,$ivd40,[sp,#16]
2894 ldp $constnumx,$tmpinp,[sp],#64
2895.Lxts_enc_final_abort:
2896 ret
2897.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
2898___
2899
2900}}}
2901{{{
# Register/operand name map for the XTS-decrypt code path below.
# (Restored: the scraped copy had Trac line numbers fused onto every line.)
# General-purpose arguments follow the AArch64 calling convention (x0-x5).
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
# NEON data registers: working blocks, inputs, scratch and last round key.
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# The five tweak values plus one scratch vector, and the 64-bit lane views
# (dN / vN.d[1]) used to update the tweaks with integer instructions.
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);	# aliases for the 1-block path
2913
# Round-key register usage for the decrypt path:
# q7		last round key
# q10-q15, q7	last 7 round keys
# q8-q9		preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte

{
# Extra working registers for the multi-block path; the 32-bit defaults
# are remapped to q16-q23 when generating 64-bit code, where more NEON
# registers are available.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);		# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
2927
# ${prefix}_xts_decrypt(const uchar *inp, uchar *out, size_t len,
#                       const AES_KEY *key1, const AES_KEY *key2,
#                       const uchar iv[16]) — 64-bit builds only.
# Prologue, followed by a dedicated fast path for exactly one block
# (len == 16): compute the XEX tweak with key2, then decrypt with key1.
$code.=<<___ if ($flavour =~ /64/);
.globl	${prefix}_xts_decrypt
.type	${prefix}_xts_decrypt,%function
.align	5
${prefix}_xts_decrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.8	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.8	{$dat1},[$key2],#16

.Loop_dec_small_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aesd	$dat0,q20
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aesd	$dat0,q21
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// bias
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$iv0,$dat0
	vst1.8	{$dat0},[$out]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
___
3009$code.=<<___ if ($flavour =~ /64/);
3010 stp $constnumx,$tmpinp,[sp,#-64]!
3011 stp $tailcnt,$midnumx,[sp,#48]
3012 stp $ivd10,$ivd20,[sp,#32]
3013 stp $ivd30,$ivd40,[sp,#16]
3014
3015 and $tailcnt,$len,#0xf
3016 and $len,$len,#-16
3017 subs $len,$len,#16
3018 mov $step,#16
3019 b.lo .Lxts_dec_abort
3020
3021 // Encrypt the iv with key2, as the first XEX iv
3022 ldr $rounds,[$key2,#240]
3023 vld1.8 {$dat},[$key2],#16
3024 vld1.8 {$iv0},[$ivp]
3025 sub $rounds,$rounds,#2
3026 vld1.8 {$dat1},[$key2],#16
3027
3028.Loop_dec_iv_enc:
3029 aese $iv0,$dat
3030 aesmc $iv0,$iv0
3031 vld1.32 {$dat},[$key2],#16
3032 subs $rounds,$rounds,#2
3033 aese $iv0,$dat1
3034 aesmc $iv0,$iv0
3035 vld1.32 {$dat1},[$key2],#16
3036 b.gt .Loop_dec_iv_enc
3037
3038 aese $iv0,$dat
3039 aesmc $iv0,$iv0
3040 vld1.32 {$dat},[$key2]
3041 aese $iv0,$dat1
3042 veor $iv0,$iv0,$dat
3043
3044 // The iv for second block
3045 // $ivl- iv(low), $ivh - iv(high)
3046 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
3047 fmov $ivl,$ivd00
3048 fmov $ivh,$ivd01
3049 mov $constnum,#0x87
3050 extr $midnumx,$ivh,$ivh,#32
3051 extr $ivh,$ivh,$ivl,#63
3052 and $tmpmw,$constnum,$midnum,asr #31
3053 eor $ivl,$tmpmx,$ivl,lsl #1
3054 fmov $ivd10,$ivl
3055 fmov $ivd11,$ivh
3056
3057 ldr $rounds0,[$key1,#240] // load rounds number
3058
3059 // The iv for third block
3060 extr $midnumx,$ivh,$ivh,#32
3061 extr $ivh,$ivh,$ivl,#63
3062 and $tmpmw,$constnum,$midnum,asr #31
3063 eor $ivl,$tmpmx,$ivl,lsl #1
3064 fmov $ivd20,$ivl
3065 fmov $ivd21,$ivh
3066
3067 vld1.32 {q8-q9},[$key1] // load key schedule...
3068 sub $rounds0,$rounds0,#6
3069 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3070 sub $rounds0,$rounds0,#2
3071 vld1.32 {q10-q11},[$key_],#32 // load key schedule...
3072 vld1.32 {q12-q13},[$key_],#32
3073 vld1.32 {q14-q15},[$key_],#32
3074 vld1.32 {$rndlast},[$key_]
3075
3076 // The iv for fourth block
3077 extr $midnumx,$ivh,$ivh,#32
3078 extr $ivh,$ivh,$ivl,#63
3079 and $tmpmw,$constnum,$midnum,asr #31
3080 eor $ivl,$tmpmx,$ivl,lsl #1
3081 fmov $ivd30,$ivl
3082 fmov $ivd31,$ivh
3083
3084 add $key_,$key1,#32
3085 mov $rounds,$rounds0
3086 b .Lxts_dec
3087
3088 // Decryption
3089.align 5
3090.Lxts_dec:
3091 tst $tailcnt,#0xf
3092 b.eq .Lxts_dec_begin
3093 subs $len,$len,#16
3094 csel $step,xzr,$step,eq
3095 vld1.8 {$dat},[$inp],#16
3096 b.lo .Lxts_done
3097 sub $inp,$inp,#16
3098.Lxts_dec_begin:
3099 vld1.8 {$dat},[$inp],$step
3100 subs $len,$len,#32 // bias
3101 add $rounds,$rounds0,#2
3102 vorr $in1,$dat,$dat
3103 vorr $dat1,$dat,$dat
3104 vorr $in3,$dat,$dat
3105 vld1.8 {$dat2},[$inp],#16
3106 vorr $in2,$dat2,$dat2
3107 vorr $in4,$dat2,$dat2
3108 b.lo .Lxts_inner_dec_tail
3109 veor $dat,$dat,$iv0 // before decrypt, xor with iv
3110 veor $dat2,$dat2,$iv1
3111
3112 vorr $dat1,$dat2,$dat2
3113 vld1.8 {$dat2},[$inp],#16
3114 vorr $in0,$dat,$dat
3115 vorr $in1,$dat1,$dat1
3116 veor $in2,$dat2,$iv2 // third block xor with third iv
3117 veor $dat2,$dat2,$iv2
3118 cmp $len,#32
3119 b.lo .Lxts_outer_dec_tail
3120
3121 vld1.8 {$dat3},[$inp],#16
3122
3123 // The iv for fifth block
3124 extr $midnumx,$ivh,$ivh,#32
3125 extr $ivh,$ivh,$ivl,#63
3126 and $tmpmw,$constnum,$midnum,asr #31
3127 eor $ivl,$tmpmx,$ivl,lsl #1
3128 fmov $ivd40,$ivl
3129 fmov $ivd41,$ivh
3130
3131 vld1.8 {$dat4},[$inp],#16
3132 veor $dat3,$dat3,$iv3 // the fourth block
3133 veor $dat4,$dat4,$iv4
3134 sub $len,$len,#32 // bias
3135 mov $rounds,$rounds0
3136 b .Loop5x_xts_dec
3137
3138.align 4
3139.Loop5x_xts_dec:
3140 aesd $dat0,q8
3141 aesimc $dat0,$dat0
3142 aesd $dat1,q8
3143 aesimc $dat1,$dat1
3144 aesd $dat2,q8
3145 aesimc $dat2,$dat2
3146 aesd $dat3,q8
3147 aesimc $dat3,$dat3
3148 aesd $dat4,q8
3149 aesimc $dat4,$dat4
3150 vld1.32 {q8},[$key_],#16 // load key schedule...
3151 subs $rounds,$rounds,#2
3152 aesd $dat0,q9
3153 aesimc $dat0,$dat0
3154 aesd $dat1,q9
3155 aesimc $dat1,$dat1
3156 aesd $dat2,q9
3157 aesimc $dat2,$dat2
3158 aesd $dat3,q9
3159 aesimc $dat3,$dat3
3160 aesd $dat4,q9
3161 aesimc $dat4,$dat4
3162 vld1.32 {q9},[$key_],#16 // load key schedule...
3163 b.gt .Loop5x_xts_dec
3164
3165 aesd $dat0,q8
3166 aesimc $dat0,$dat0
3167 aesd $dat1,q8
3168 aesimc $dat1,$dat1
3169 aesd $dat2,q8
3170 aesimc $dat2,$dat2
3171 aesd $dat3,q8
3172 aesimc $dat3,$dat3
3173 aesd $dat4,q8
3174 aesimc $dat4,$dat4
3175 subs $len,$len,#0x50 // because .Lxts_dec_tail4x
3176
3177 aesd $dat0,q9
3178 aesimc $dat0,$dat
3179 aesd $dat1,q9
3180 aesimc $dat1,$dat1
3181 aesd $dat2,q9
3182 aesimc $dat2,$dat2
3183 aesd $dat3,q9
3184 aesimc $dat3,$dat3
3185 aesd $dat4,q9
3186 aesimc $dat4,$dat4
3187 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3188 mov $key_,$key1
3189
3190 aesd $dat0,q10
3191 aesimc $dat0,$dat0
3192 aesd $dat1,q10
3193 aesimc $dat1,$dat1
3194 aesd $dat2,q10
3195 aesimc $dat2,$dat2
3196 aesd $dat3,q10
3197 aesimc $dat3,$dat3
3198 aesd $dat4,q10
3199 aesimc $dat4,$dat4
3200 add $inp,$inp,$xoffset // x0 is adjusted in such way that
3201 // at exit from the loop v1.16b-v26.16b
3202 // are loaded with last "words"
3203 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3204
3205 aesd $dat0,q11
3206 aesimc $dat0,$dat0
3207 aesd $dat1,q11
3208 aesimc $dat1,$dat1
3209 aesd $dat2,q11
3210 aesimc $dat2,$dat2
3211 aesd $dat3,q11
3212 aesimc $dat3,$dat3
3213 aesd $dat4,q11
3214 aesimc $dat4,$dat4
3215
3216 aesd $dat0,q12
3217 aesimc $dat0,$dat0
3218 aesd $dat1,q12
3219 aesimc $dat1,$dat1
3220 aesd $dat2,q12
3221 aesimc $dat2,$dat2
3222 aesd $dat3,q12
3223 aesimc $dat3,$dat3
3224 aesd $dat4,q12
3225 aesimc $dat4,$dat4
3226
3227 aesd $dat0,q13
3228 aesimc $dat0,$dat0
3229 aesd $dat1,q13
3230 aesimc $dat1,$dat1
3231 aesd $dat2,q13
3232 aesimc $dat2,$dat2
3233 aesd $dat3,q13
3234 aesimc $dat3,$dat3
3235 aesd $dat4,q13
3236 aesimc $dat4,$dat4
3237
3238 aesd $dat0,q14
3239 aesimc $dat0,$dat0
3240 aesd $dat1,q14
3241 aesimc $dat1,$dat1
3242 aesd $dat2,q14
3243 aesimc $dat2,$dat2
3244 aesd $dat3,q14
3245 aesimc $dat3,$dat3
3246 aesd $dat4,q14
3247 aesimc $dat4,$dat4
3248
3249 veor $tmp0,$rndlast,$iv0
3250 aesd $dat0,q15
3251 // The iv for first block of next iteration.
3252 extr $midnumx,$ivh,$ivh,#32
3253 extr $ivh,$ivh,$ivl,#63
3254 and $tmpmw,$constnum,$midnum,asr #31
3255 eor $ivl,$tmpmx,$ivl,lsl #1
3256 fmov $ivd00,$ivl
3257 fmov $ivd01,$ivh
3258 veor $tmp1,$rndlast,$iv1
3259 vld1.8 {$in0},[$inp],#16
3260 aesd $dat1,q15
3261 // The iv for second block
3262 extr $midnumx,$ivh,$ivh,#32
3263 extr $ivh,$ivh,$ivl,#63
3264 and $tmpmw,$constnum,$midnum,asr #31
3265 eor $ivl,$tmpmx,$ivl,lsl #1
3266 fmov $ivd10,$ivl
3267 fmov $ivd11,$ivh
3268 veor $tmp2,$rndlast,$iv2
3269 vld1.8 {$in1},[$inp],#16
3270 aesd $dat2,q15
3271 // The iv for third block
3272 extr $midnumx,$ivh,$ivh,#32
3273 extr $ivh,$ivh,$ivl,#63
3274 and $tmpmw,$constnum,$midnum,asr #31
3275 eor $ivl,$tmpmx,$ivl,lsl #1
3276 fmov $ivd20,$ivl
3277 fmov $ivd21,$ivh
3278 veor $tmp3,$rndlast,$iv3
3279 vld1.8 {$in2},[$inp],#16
3280 aesd $dat3,q15
3281 // The iv for fourth block
3282 extr $midnumx,$ivh,$ivh,#32
3283 extr $ivh,$ivh,$ivl,#63
3284 and $tmpmw,$constnum,$midnum,asr #31
3285 eor $ivl,$tmpmx,$ivl,lsl #1
3286 fmov $ivd30,$ivl
3287 fmov $ivd31,$ivh
3288 veor $tmp4,$rndlast,$iv4
3289 vld1.8 {$in3},[$inp],#16
3290 aesd $dat4,q15
3291
3292 // The iv for fifth block
3293 extr $midnumx,$ivh,$ivh,#32
3294 extr $ivh,$ivh,$ivl,#63
3295 and $tmpmw,$constnum,$midnum,asr #31
3296 eor $ivl,$tmpmx,$ivl,lsl #1
3297 fmov $ivd40,$ivl
3298 fmov $ivd41,$ivh
3299
3300 vld1.8 {$in4},[$inp],#16
3301 cbz $xoffset,.Lxts_dec_tail4x
3302 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3303 veor $tmp0,$tmp0,$dat0
3304 veor $dat0,$in0,$iv0
3305 veor $tmp1,$tmp1,$dat1
3306 veor $dat1,$in1,$iv1
3307 veor $tmp2,$tmp2,$dat2
3308 veor $dat2,$in2,$iv2
3309 veor $tmp3,$tmp3,$dat3
3310 veor $dat3,$in3,$iv3
3311 veor $tmp4,$tmp4,$dat4
3312 vst1.8 {$tmp0},[$out],#16
3313 veor $dat4,$in4,$iv4
3314 vst1.8 {$tmp1},[$out],#16
3315 mov $rounds,$rounds0
3316 vst1.8 {$tmp2},[$out],#16
3317 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3318 vst1.8 {$tmp3},[$out],#16
3319 vst1.8 {$tmp4},[$out],#16
3320 b.hs .Loop5x_xts_dec
3321
3322 cmn $len,#0x10
3323 b.ne .Loop5x_dec_after
3324 // If x2($len) equal to -0x10, the left blocks is 4.
3325 // After specially processing, utilize the five blocks processing again.
3326 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
3327 vorr $iv4,$iv3,$iv3
3328 vorr $iv3,$iv2,$iv2
3329 vorr $iv2,$iv1,$iv1
3330 vorr $iv1,$iv0,$iv0
3331 fmov $ivl,$ivd40
3332 fmov $ivh,$ivd41
3333 veor $dat0,$iv0,$in0
3334 veor $dat1,$iv1,$in1
3335 veor $dat2,$in2,$iv2
3336 veor $dat3,$in3,$iv3
3337 veor $dat4,$in4,$iv4
3338 b.eq .Loop5x_xts_dec
3339
3340.Loop5x_dec_after:
3341 add $len,$len,#0x50
3342 cbz $len,.Lxts_done
3343
3344 add $rounds,$rounds0,#2
3345 subs $len,$len,#0x30
3346 b.lo .Lxts_inner_dec_tail
3347
3348 veor $dat0,$iv0,$in2
3349 veor $dat1,$iv1,$in3
3350 veor $dat2,$in4,$iv2
3351 b .Lxts_outer_dec_tail
3352
3353.align 4
3354.Lxts_dec_tail4x:
3355 add $inp,$inp,#16
3356 vld1.32 {$dat0},[$inp],#16
3357 veor $tmp1,$dat1,$tmp0
3358 vst1.8 {$tmp1},[$out],#16
3359 veor $tmp2,$dat2,$tmp2
3360 vst1.8 {$tmp2},[$out],#16
3361 veor $tmp3,$dat3,$tmp3
3362 veor $tmp4,$dat4,$tmp4
3363 vst1.8 {$tmp3-$tmp4},[$out],#32
3364
3365 b .Lxts_done
3366.align 4
3367.Lxts_outer_dec_tail:
3368 aesd $dat0,q8
3369 aesimc $dat0,$dat0
3370 aesd $dat1,q8
3371 aesimc $dat1,$dat1
3372 aesd $dat2,q8
3373 aesimc $dat2,$dat2
3374 vld1.32 {q8},[$key_],#16
3375 subs $rounds,$rounds,#2
3376 aesd $dat0,q9
3377 aesimc $dat0,$dat0
3378 aesd $dat1,q9
3379 aesimc $dat1,$dat1
3380 aesd $dat2,q9
3381 aesimc $dat2,$dat2
3382 vld1.32 {q9},[$key_],#16
3383 b.gt .Lxts_outer_dec_tail
3384
3385 aesd $dat0,q8
3386 aesimc $dat0,$dat0
3387 aesd $dat1,q8
3388 aesimc $dat1,$dat1
3389 aesd $dat2,q8
3390 aesimc $dat2,$dat2
3391 veor $tmp0,$iv0,$rndlast
3392 subs $len,$len,#0x30
3393 // The iv for first block
3394 fmov $ivl,$ivd20
3395 fmov $ivh,$ivd21
3396 mov $constnum,#0x87
3397 extr $midnumx,$ivh,$ivh,#32
3398 extr $ivh,$ivh,$ivl,#63
3399 and $tmpmw,$constnum,$midnum,asr #31
3400 eor $ivl,$tmpmx,$ivl,lsl #1
3401 fmov $ivd00,$ivl
3402 fmov $ivd01,$ivh
3403 veor $tmp1,$iv1,$rndlast
3404 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
3405 aesd $dat0,q9
3406 aesimc $dat0,$dat0
3407 aesd $dat1,q9
3408 aesimc $dat1,$dat1
3409 aesd $dat2,q9
3410 aesimc $dat2,$dat2
3411 veor $tmp2,$iv2,$rndlast
3412 // The iv for second block
3413 extr $midnumx,$ivh,$ivh,#32
3414 extr $ivh,$ivh,$ivl,#63
3415 and $tmpmw,$constnum,$midnum,asr #31
3416 eor $ivl,$tmpmx,$ivl,lsl #1
3417 fmov $ivd10,$ivl
3418 fmov $ivd11,$ivh
3419
3420 add $xoffset,$xoffset,#0x20
3421 add $inp,$inp,$xoffset // $inp is adjusted to the last data
3422
3423 mov $key_,$key1
3424
3425 // The iv for third block
3426 extr $midnumx,$ivh,$ivh,#32
3427 extr $ivh,$ivh,$ivl,#63
3428 and $tmpmw,$constnum,$midnum,asr #31
3429 eor $ivl,$tmpmx,$ivl,lsl #1
3430 fmov $ivd20,$ivl
3431 fmov $ivd21,$ivh
3432
3433 aesd $dat0,q12
3434 aesimc $dat0,$dat0
3435 aesd $dat1,q12
3436 aesimc $dat1,$dat1
3437 aesd $dat2,q12
3438 aesimc $dat2,$dat2
3439 aesd $dat0,q13
3440 aesimc $dat0,$dat0
3441 aesd $dat1,q13
3442 aesimc $dat1,$dat1
3443 aesd $dat2,q13
3444 aesimc $dat2,$dat2
3445 aesd $dat0,q14
3446 aesimc $dat0,$dat0
3447 aesd $dat1,q14
3448 aesimc $dat1,$dat1
3449 aesd $dat2,q14
3450 aesimc $dat2,$dat2
3451 vld1.8 {$in2},[$inp],#16
3452 aesd $dat0,q15
3453 aesd $dat1,q15
3454 aesd $dat2,q15
3455 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3456 add $rounds,$rounds0,#2
3457 veor $tmp0,$tmp0,$dat0
3458 veor $tmp1,$tmp1,$dat1
3459 veor $dat2,$dat2,$tmp2
3460 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3461 vst1.8 {$tmp0},[$out],#16
3462 vst1.8 {$tmp1},[$out],#16
3463 vst1.8 {$dat2},[$out],#16
3464
3465 cmn $len,#0x30
3466 add $len,$len,#0x30
3467 b.eq .Lxts_done
3468 sub $len,$len,#0x30
3469 vorr $in3,$in1,$in1
3470 vorr $in4,$in2,$in2
3471 nop
3472
3473.Lxts_inner_dec_tail:
3474 // $len == -0x10 means two blocks left.
3475 cmn $len,#0x10
3476 veor $dat1,$in3,$iv0
3477 veor $dat2,$in4,$iv1
3478 b.eq .Lxts_dec_tail_loop
3479 veor $dat2,$in4,$iv0
3480.Lxts_dec_tail_loop:
3481 aesd $dat1,q8
3482 aesimc $dat1,$dat1
3483 aesd $dat2,q8
3484 aesimc $dat2,$dat2
3485 vld1.32 {q8},[$key_],#16
3486 subs $rounds,$rounds,#2
3487 aesd $dat1,q9
3488 aesimc $dat1,$dat1
3489 aesd $dat2,q9
3490 aesimc $dat2,$dat2
3491 vld1.32 {q9},[$key_],#16
3492 b.gt .Lxts_dec_tail_loop
3493
3494 aesd $dat1,q8
3495 aesimc $dat1,$dat1
3496 aesd $dat2,q8
3497 aesimc $dat2,$dat2
3498 aesd $dat1,q9
3499 aesimc $dat1,$dat1
3500 aesd $dat2,q9
3501 aesimc $dat2,$dat2
3502 aesd $dat1,q12
3503 aesimc $dat1,$dat1
3504 aesd $dat2,q12
3505 aesimc $dat2,$dat2
3506 cmn $len,#0x20
3507 aesd $dat1,q13
3508 aesimc $dat1,$dat1
3509 aesd $dat2,q13
3510 aesimc $dat2,$dat2
3511 veor $tmp1,$iv0,$rndlast
3512 aesd $dat1,q14
3513 aesimc $dat1,$dat1
3514 aesd $dat2,q14
3515 aesimc $dat2,$dat2
3516 veor $tmp2,$iv1,$rndlast
3517 aesd $dat1,q15
3518 aesd $dat2,q15
3519 b.eq .Lxts_dec_one
3520 veor $tmp1,$tmp1,$dat1
3521 veor $tmp2,$tmp2,$dat2
3522 vorr $iv0,$iv2,$iv2
3523 vorr $iv1,$iv3,$iv3
3524 vst1.8 {$tmp1},[$out],#16
3525 vst1.8 {$tmp2},[$out],#16
3526 add $len,$len,#16
3527 b .Lxts_done
3528
3529.Lxts_dec_one:
3530 veor $tmp1,$tmp1,$dat2
3531 vorr $iv0,$iv1,$iv1
3532 vorr $iv1,$iv2,$iv2
3533 vst1.8 {$tmp1},[$out],#16
3534 add $len,$len,#32
3535
3536.Lxts_done:
3537 tst $tailcnt,#0xf
3538 b.eq .Lxts_dec_abort
3539 // Processing the last two blocks with cipher stealing.
3540 mov x7,x3
3541 cbnz x2,.Lxts_dec_1st_done
3542 vld1.32 {$dat0},[$inp],#16
3543
3544 // Decrypt the second-to-last block to get the last plain text block
3545.Lxts_dec_1st_done:
3546 eor $tmpin,$dat0,$iv1
3547 ldr $rounds,[$key1,#240]
3548 vld1.32 {$dat0},[$key1],#16
3549 sub $rounds,$rounds,#2
3550 vld1.32 {$dat1},[$key1],#16
3551.Loop_final_2nd_dec:
3552 aesd $tmpin,$dat0
3553 aesimc $tmpin,$tmpin
3554 vld1.32 {$dat0},[$key1],#16 // load key schedule...
3555 subs $rounds,$rounds,#2
3556 aesd $tmpin,$dat1
3557 aesimc $tmpin,$tmpin
3558 vld1.32 {$dat1},[$key1],#16 // load key schedule...
3559 b.gt .Loop_final_2nd_dec
3560
3561 aesd $tmpin,$dat0
3562 aesimc $tmpin,$tmpin
3563 vld1.32 {$dat0},[$key1]
3564 aesd $tmpin,$dat1
3565 veor $tmpin,$tmpin,$dat0
3566 veor $tmpin,$tmpin,$iv1
3567 vst1.8 {$tmpin},[$out]
3568
3569 mov $tmpinp,$inp
3570 add $tmpoutp,$out,#16
3571
3572 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
3573 // to get the last encrypted block.
3574.composite_dec_loop:
3575 subs $tailcnt,$tailcnt,#1
3576 ldrb $l2outp,[$out,$tailcnt]
3577 ldrb $loutp,[$tmpinp,$tailcnt]
3578 strb $l2outp,[$tmpoutp,$tailcnt]
3579 strb $loutp,[$out,$tailcnt]
3580 b.gt .composite_dec_loop
3581.Lxts_dec_load_done:
3582 vld1.8 {$tmpin},[$out]
3583 veor $tmpin,$tmpin,$iv0
3584
3585 // Decrypt the composite block to get the last second plain text block
3586 ldr $rounds,[$key_,#240]
3587 vld1.8 {$dat},[$key_],#16
3588 sub $rounds,$rounds,#2
3589 vld1.8 {$dat1},[$key_],#16
3590.Loop_final_dec:
3591 aesd $tmpin,$dat0
3592 aesimc $tmpin,$tmpin
3593 vld1.32 {$dat0},[$key_],#16 // load key schedule...
3594 subs $rounds,$rounds,#2
3595 aesd $tmpin,$dat1
3596 aesimc $tmpin,$tmpin
3597 vld1.32 {$dat1},[$key_],#16 // load key schedule...
3598 b.gt .Loop_final_dec
3599
3600 aesd $tmpin,$dat0
3601 aesimc $tmpin,$tmpin
3602 vld1.32 {$dat0},[$key_]
3603 aesd $tmpin,$dat1
3604 veor $tmpin,$tmpin,$dat0
3605 veor $tmpin,$tmpin,$iv0
3606 vst1.8 {$tmpin},[$out]
3607
3608.Lxts_dec_abort:
3609 ldp $tailcnt,$midnumx,[sp,#48]
3610 ldp $ivd10,$ivd20,[sp,#32]
3611 ldp $ivd30,$ivd40,[sp,#16]
3612 ldp $constnumx,$tmpinp,[sp],#64
3613
3614.Lxts_dec_final_abort:
3615 ret
3616.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
3617___
3618}
3619}}}
# Close the #if __ARM_MAX_ARCH__>=7 guard opened earlier in the file.
$code.=<<___;
#endif
___
########################################
# Post-process $code: the body above is written in a unified dialect;
# translate it line by line into either AArch64 (64-bit) or ARMv7 NEON
# (32-bit) assembler syntax and print the result to STDOUT.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word, for assemblers
    # that do not know the crypto-extension mnemonics.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Emit an AES instruction as raw bytes via the INST() macro.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Lower a q-register vtbl into the pair of d-register vtbl.8 ops.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Translate a lane-indexed vdup from q-register to d-register form.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Translate a lane-indexed vmov from q-register to d-register form.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	# Thumb-2 requires an IT block before a conditional mov.
	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "\tit\t$2\n";
	}

	print $_,"\n";
    }
}
3733
# Flush and close STDOUT, failing loudly so build errors are not masked.
close STDOUT or die "error closing STDOUT: $!";
# NOTE: the following lines are web-page footer residue captured during
# extraction (Trac browser / Oracle site footer), not part of the script:
# 注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器
# © 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette