VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.7/crypto/aes/asm/aesv8-armx.pl@ 97371

最後變更 在這個檔案從97371是 94082,由 vboxsync 提交於 3 年 前

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

  • 屬性 svn:executable 設為 *
檔案大小: 79.5 KB
 
1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. As does it support both 32- and 64-bit modes
20# of operation. Latter is achieved by limiting amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# April 2019
31#
32# Key to performance of parallelize-able modes is round instruction
33# interleaving. But which factor to use? There is optimal one for
34# each combination of instruction latency and issue rate, beyond
35# which increasing interleave factor doesn't pay off. While on cons
36# side we have code size increase and resource waste on platforms for
37# which interleave factor is too high. In other words you want it to
38# be just right. So far interleave factor of 3x was serving well all
39# platforms. But for ThunderX2 optimal interleave factor was measured
40# to be 5x...
41#
42# Performance in cycles per byte processed with 128-bit key:
43#
44# CBC enc CBC dec CTR
45# Apple A7 2.39 1.20 1.20
46# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48# Cortex-A72 1.33 0.85/0.88 0.92/0.96
49# Denver 1.96 0.65/0.86 0.76/0.80
50# Mongoose 1.33 1.23/1.20 1.30/1.20
51# Kryo 1.26 0.87/0.94 1.00/1.00
52# ThunderX2 5.95 1.25 1.30
53#
54# (*) original 3.64/1.34/1.32 results were for r0p0 revision
55# and are still same even for updated module;
56# (**) numbers after slash are for 32-bit code, which is 3x-
57# interleaved;
58
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm translator next to this script or in the sibling
# perlasm/ directory.  NOTE(review): if $0 carries no path separator the
# match fails and $dir keeps a stale value; the -f probes below then fall
# through to die, which is the intended failure mode.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through arm-xlate.pl, which transliterates the
# flavour-neutral mnemonics below into real 32- or 64-bit assembly.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Symbol prefix for all exported entry points (aes_v8_set_encrypt_key etc.).
$prefix="aes_v8";

# Windows (armasm) flavours spell the raw-byte directive DCB, not .byte.
$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
# Common file header: guarded by __ARM_MAX_ARCH__ so the module compiles
# away cleanly on pre-v7 targets.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
# 64-bit flavour: just enable the crypto extension and open .text.
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
# 32-bit flavour: claim armv7-a for old binutils and define INST() so the
# AES opcodes (not known to old assemblers) can be emitted as raw bytes,
# with the Thumb-2 variant adjusting the third byte (d|0xc) and byte order.
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,d|0xc,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,d
#endif

.text
___
97
98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
102#
{{{
# --- Key schedule setup: ${prefix}_set_encrypt_key / _set_decrypt_key ---
# Integer operands; arm-xlate.pl maps the x/w names for the 32-bit flavour.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON working set: q0-q6 in 64-bit mode, q0-q3/q8-q10 in 32-bit mode, so
# the same code fits both ABIs' callee-save rules.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table (.Lrcon) and the rotate-n-splat byte-shuffle mask,
# followed by the set_encrypt_key entry point.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# 64-bit only: minimal frame so .Lenc_key can be bl'ed from set_decrypt_key.
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument validation (-1 = NULL pointer, -2 = bad bit length), then the
# three expansion paths for 128/192/256-bit keys.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
# 64-bit prologue: sign the return address (paciasp) and set up a frame.
$code.=<<___ if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# 32-bit prologue: save r4 and lr.
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Decrypt schedule = encrypt schedule reversed, with InvMixColumns applied
# to all but the first and last round keys (swap from both ends inward).
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
# 64-bit epilogue: restore frame and authenticate LR (autiasp).
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block($dir) -- emit the single-block routine ${prefix}_encrypt or
# ${prefix}_decrypt for $dir eq "en" / "de".  Appends assembly to $code.
#
# Fix: the original declared `sub gen_block ()` -- an empty prototype on a
# sub that takes an argument.  That only compiled because the call sites
# used `&gen_block(...)`, which bypasses prototype checking.  Prototypes in
# Perl are a parser feature, not argument validation, and `&sub(...)` calls
# are discouraged; drop the prototype and call the sub plainly.  Generated
# assembly is unchanged.
sub gen_block {
my $dir = shift;				# "en" or "de"
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));		# AES block in/out, key schedule
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

# Two round keys stay in flight while the next pair streams in; the loop
# counter starts at rounds-2 because the final two rounds are peeled off
# (the last round has no MixColumns and is folded into the veor).
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
392
393# Performance in cycles per byte.
394# Processed with AES-ECB different key size.
395# It shows the value before and after optimization as below:
396# (before/after):
397#
398# AES-128-ECB AES-192-ECB AES-256-ECB
399# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
400# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
401
402# Optimization is implemented by loop unrolling and interleaving.
403# Commonly, we choose the unrolling factor as 5, if the input
404# data size smaller than 5 blocks, but not smaller than 3 blocks,
405# choose 3 as the unrolling factor.
406# If the input data size dsize >= 5*16 bytes, then take 5 blocks
407# as one iteration, every loop the left size lsize -= 5*16.
408# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
409# every loop lsize -=3*16.
410# If lsize < 3*16 bytes, treat them as the tail, interleave the
411# two blocks AES instructions.
412# There is one special case, if the original input data size dsize
# = 16 bytes, we will treat it separately to improve the
414# performance: one independent code block without LR, FP load and
415# store, just looks like what the original ECB implementation does.
416
{{{
# --- ${prefix}_ecb_encrypt: bulk ECB en/decryption ---
# x0..x3 = inp/out/len/key, w4 = enc flag, w5 = rounds, w6 = round counter,
# x7 = rotating round-key pointer, x8 = post-increment step.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7	last round key
### q10-q15	q7 Last 7 round keys
### q8-q9	preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9	preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
# 64-bit mode has 32 NEON registers, so the 5x-interleaved path gets its
# own block registers q16-q23; 32-bit mode stays with the 3x path.
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
# 64-bit only: dedicated single-block (exactly 16 bytes) fast path with no
# frame setup; both encrypt and decrypt variants, with a shortcut for the
# 10-round (AES-128) schedule.
$code.=<<___ if ($flavour =~ /64/);
	subs	$len,$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32	// load key schedule...

	b.eq	.Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
# Multi-block path prologues: 64-bit sets a frame; 32-bit saves the
# callee-saved integer and d8-d15 NEON registers and fetches stack args.
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
	subs	$len,$len,#16
___
# Shared setup: preload first two and last seven round keys, then branch
# to the decrypt side if $enc==0; otherwise fall into the encrypt side.
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved encrypt main loop (optimal for ThunderX2 and
# friends); falls back to the 3x loop for <5 remaining blocks.
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# Both flavours: 3x-interleaved encrypt loop plus the <=2-block tail.
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

# Decrypt side: same structure as the encrypt side with aesd/aesimc.
$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved decrypt main loop.
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# Both flavours: 3x-interleaved decrypt loop plus the <=2-block tail.
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
# Epilogues: restore saved registers per flavour, then the shared .size.
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___ if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
1222{{{
# ${prefix}_cbc_encrypt(const unsigned char *inp, unsigned char *out,
#			size_t len, const AES_KEY *key,
#			unsigned char ivec[16], const int enc);
# Arguments arrive in x0-x4 (r0-r4 on 32-bit) with the en-/decrypt
# selector in w5 (r5); the register maps below name them.
# NOTE(review): prototype inferred from the register usage below -
# confirm against the C caller.
1223my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1224my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1225my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1226
1227my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
1228my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1229
1230### q8-q15 preloaded key schedule
1231
# Symbol/alignment preamble shared by both flavours.
1232$code.=<<___;
1233.globl	${prefix}_cbc_encrypt
1234.type	${prefix}_cbc_encrypt,%function
1235.align	5
1236${prefix}_cbc_encrypt:
1237___
# AArch64 prologue: minimal FP/LR frame.
1238$code.=<<___ if ($flavour =~ /64/);
1239	stp	x29,x30,[sp,#-16]!
1240	add	x29,sp,#0
1241___
# AArch32 prologue: save r4-r8,lr and d8-d15 (per the AAPCS), then
# fetch the two stack-passed arguments (ivec, enc) into r4-r5.
1242$code.=<<___ if ($flavour !~ /64/);
1243	mov	ip,sp
1244	stmdb	sp!,{r4-r8,lr}
1245	vstmdb	sp!,{d8-d15}            @ ABI specification says so
1246	ldmia	ip,{r4-r5}		@ load remaining args
1247___
# Common setup: reject len<16, load the IV, the first input block and
# the whole key schedule (q8-q15 plus $rndlast), then branch to the
# decrypt path or fall through to encryption.  $rounds aliases
# w5/$enc, which is why x5 can be used below to locate the last 7
# round keys.
1248$code.=<<___;
1249	subs	$len,$len,#16
1250	mov	$step,#16
1251	b.lo	.Lcbc_abort
1252	cclr	$step,eq
1253
1254	cmp	$enc,#0			// en- or decrypting?
1255	ldr	$rounds,[$key,#240]
1256	and	$len,$len,#-16
1257	vld1.8	{$ivec},[$ivp]
1258	vld1.8	{$dat},[$inp],$step
1259
1260	vld1.32	{q8-q9},[$key]		// load key schedule...
1261	sub	$rounds,$rounds,#6
1262	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
1263	sub	$rounds,$rounds,#2
1264	vld1.32	{q10-q11},[$key_],#32
1265	vld1.32	{q12-q13},[$key_],#32
1266	vld1.32	{q14-q15},[$key_],#32
1267	vld1.32	{$rndlast},[$key_]
1268
1269	add	$key_,$key,#32
1270	mov	$cnt,$rounds
1271	b.eq	.Lcbc_dec
1272
1273	cmp	$rounds,#2
1274	veor	$dat,$dat,$ivec
1275	veor	$rndzero_n_last,q8,$rndlast
1276	b.eq	.Lcbc_enc128
1277
1278	vld1.32	{$in0-$in1},[$key_]
1279	add	$key_,$key,#16
1280	add	$key4,$key,#16*4
1281	add	$key5,$key,#16*5
1282	aese	$dat,q8
1283	aesmc	$dat,$dat
1284	add	$key6,$key,#16*6
1285	add	$key7,$key,#16*7
1286	b	.Lenter_cbc_enc
1287
1288.align	4
1289.Loop_cbc_enc:
1290	aese	$dat,q8
1291	aesmc	$dat,$dat
1292	vst1.8	{$ivec},[$out],#16
1293.Lenter_cbc_enc:
1294	aese	$dat,q9
1295	aesmc	$dat,$dat
1296	aese	$dat,$in0
1297	aesmc	$dat,$dat
1298	vld1.32	{q8},[$key4]
1299	cmp	$rounds,#4
1300	aese	$dat,$in1
1301	aesmc	$dat,$dat
1302	vld1.32	{q9},[$key5]
1303	b.eq	.Lcbc_enc192
1304
1305	aese	$dat,q8
1306	aesmc	$dat,$dat
1307	vld1.32	{q8},[$key6]
1308	aese	$dat,q9
1309	aesmc	$dat,$dat
1310	vld1.32	{q9},[$key7]
1311	nop
1312
1313.Lcbc_enc192:
1314	aese	$dat,q8
1315	aesmc	$dat,$dat
1316	subs	$len,$len,#16
1317	aese	$dat,q9
1318	aesmc	$dat,$dat
1319	cclr	$step,eq
1320	aese	$dat,q10
1321	aesmc	$dat,$dat
1322	aese	$dat,q11
1323	aesmc	$dat,$dat
1324	vld1.8	{q8},[$inp],$step
1325	aese	$dat,q12
1326	aesmc	$dat,$dat
1327	veor	q8,q8,$rndzero_n_last
1328	aese	$dat,q13
1329	aesmc	$dat,$dat
1330	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
1331	aese	$dat,q14
1332	aesmc	$dat,$dat
1333	aese	$dat,q15
1334	veor	$ivec,$dat,$rndlast
1335	b.hs	.Loop_cbc_enc
1336
1337	vst1.8	{$ivec},[$out],#16
1338	b	.Lcbc_done
1339
1340.align	5
1341.Lcbc_enc128:
1342	vld1.32	{$in0-$in1},[$key_]
1343	aese	$dat,q8
1344	aesmc	$dat,$dat
1345	b	.Lenter_cbc_enc128
1346.Loop_cbc_enc128:
1347	aese	$dat,q8
1348	aesmc	$dat,$dat
1349	vst1.8	{$ivec},[$out],#16
1350.Lenter_cbc_enc128:
1351	aese	$dat,q9
1352	aesmc	$dat,$dat
1353	subs	$len,$len,#16
1354	aese	$dat,$in0
1355	aesmc	$dat,$dat
1356	cclr	$step,eq
1357	aese	$dat,$in1
1358	aesmc	$dat,$dat
1359	aese	$dat,q10
1360	aesmc	$dat,$dat
1361	aese	$dat,q11
1362	aesmc	$dat,$dat
1363	vld1.8	{q8},[$inp],$step
1364	aese	$dat,q12
1365	aesmc	$dat,$dat
1366	aese	$dat,q13
1367	aesmc	$dat,$dat
1368	aese	$dat,q14
1369	aesmc	$dat,$dat
1370	veor	q8,q8,$rndzero_n_last
1371	aese	$dat,q15
1372	veor	$ivec,$dat,$rndlast
1373	b.hs	.Loop_cbc_enc128
1374
1375	vst1.8	{$ivec},[$out],#16
1376	b	.Lcbc_done
1377___
1378{
# Decrypt path: processes 3 blocks at a time (5 at a time on 64-bit,
# where the extra q16-q23 registers are available) to interleave
# independent aesd/aesimc chains.
1379my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1380
1381my ($dat3,$in3,$tmp3);		# used only in 64-bit mode
1382my ($dat4,$in4,$tmp4);
1383if ($flavour =~ /64/) {
1384    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1385}
1386
# Load up to 3 blocks and fall into the tail for short inputs.
1387$code.=<<___;
1388.align	5
1389.Lcbc_dec:
1390	vld1.8	{$dat2},[$inp],#16
1391	subs	$len,$len,#32		// bias
1392	add	$cnt,$rounds,#2
1393	vorr	$in1,$dat,$dat
1394	vorr	$dat1,$dat,$dat
1395	vorr	$in2,$dat2,$dat2
1396	b.lo	.Lcbc_dec_tail
1397
1398	vorr	$dat1,$dat2,$dat2
1399	vld1.8	{$dat2},[$inp],#16
1400	vorr	$in0,$dat,$dat
1401	vorr	$in1,$dat1,$dat1
1402	vorr	$in2,$dat2,$dat2
1403___
# 64-bit only: 5x interleaved decrypt loop, with the 4-blocks-left
# case (.Lcbc_tail4x) folded into the same code path.
1404$code.=<<___ if ($flavour =~ /64/);
1405	cmp	$len,#32
1406	b.lo	.Loop3x_cbc_dec
1407
1408	vld1.8	{$dat3},[$inp],#16
1409	vld1.8	{$dat4},[$inp],#16
1410	sub	$len,$len,#32		// bias
1411	mov	$cnt,$rounds
1412	vorr	$in3,$dat3,$dat3
1413	vorr	$in4,$dat4,$dat4

1415.Loop5x_cbc_dec:
1416	aesd	$dat0,q8
1417	aesimc	$dat0,$dat0
1418	aesd	$dat1,q8
1419	aesimc	$dat1,$dat1
1420	aesd	$dat2,q8
1421	aesimc	$dat2,$dat2
1422	aesd	$dat3,q8
1423	aesimc	$dat3,$dat3
1424	aesd	$dat4,q8
1425	aesimc	$dat4,$dat4
1426	vld1.32	{q8},[$key_],#16
1427	subs	$cnt,$cnt,#2
1428	aesd	$dat0,q9
1429	aesimc	$dat0,$dat0
1430	aesd	$dat1,q9
1431	aesimc	$dat1,$dat1
1432	aesd	$dat2,q9
1433	aesimc	$dat2,$dat2
1434	aesd	$dat3,q9
1435	aesimc	$dat3,$dat3
1436	aesd	$dat4,q9
1437	aesimc	$dat4,$dat4
1438	vld1.32	{q9},[$key_],#16
1439	b.gt	.Loop5x_cbc_dec

1441	aesd	$dat0,q8
1442	aesimc	$dat0,$dat0
1443	aesd	$dat1,q8
1444	aesimc	$dat1,$dat1
1445	aesd	$dat2,q8
1446	aesimc	$dat2,$dat2
1447	aesd	$dat3,q8
1448	aesimc	$dat3,$dat3
1449	aesd	$dat4,q8
1450	aesimc	$dat4,$dat4
1451	cmp	$len,#0x40		// because .Lcbc_tail4x
1452	sub	$len,$len,#0x50

1454	aesd	$dat0,q9
1455	aesimc	$dat0,$dat0
1456	aesd	$dat1,q9
1457	aesimc	$dat1,$dat1
1458	aesd	$dat2,q9
1459	aesimc	$dat2,$dat2
1460	aesd	$dat3,q9
1461	aesimc	$dat3,$dat3
1462	aesd	$dat4,q9
1463	aesimc	$dat4,$dat4
1464	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
1465	mov	$key_,$key

1467	aesd	$dat0,q10
1468	aesimc	$dat0,$dat0
1469	aesd	$dat1,q10
1470	aesimc	$dat1,$dat1
1471	aesd	$dat2,q10
1472	aesimc	$dat2,$dat2
1473	aesd	$dat3,q10
1474	aesimc	$dat3,$dat3
1475	aesd	$dat4,q10
1476	aesimc	$dat4,$dat4
1477	add	$inp,$inp,x6		// $inp is adjusted in such way that
1478					// at exit from the loop $dat1-$dat4
1479					// are loaded with last "words"
1480	add	x6,$len,#0x60		// because .Lcbc_tail4x

1482	aesd	$dat0,q11
1483	aesimc	$dat0,$dat0
1484	aesd	$dat1,q11
1485	aesimc	$dat1,$dat1
1486	aesd	$dat2,q11
1487	aesimc	$dat2,$dat2
1488	aesd	$dat3,q11
1489	aesimc	$dat3,$dat3
1490	aesd	$dat4,q11
1491	aesimc	$dat4,$dat4

1493	aesd	$dat0,q12
1494	aesimc	$dat0,$dat0
1495	aesd	$dat1,q12
1496	aesimc	$dat1,$dat1
1497	aesd	$dat2,q12
1498	aesimc	$dat2,$dat2
1499	aesd	$dat3,q12
1500	aesimc	$dat3,$dat3
1501	aesd	$dat4,q12
1502	aesimc	$dat4,$dat4

1504	aesd	$dat0,q13
1505	aesimc	$dat0,$dat0
1506	aesd	$dat1,q13
1507	aesimc	$dat1,$dat1
1508	aesd	$dat2,q13
1509	aesimc	$dat2,$dat2
1510	aesd	$dat3,q13
1511	aesimc	$dat3,$dat3
1512	aesd	$dat4,q13
1513	aesimc	$dat4,$dat4

1515	aesd	$dat0,q14
1516	aesimc	$dat0,$dat0
1517	aesd	$dat1,q14
1518	aesimc	$dat1,$dat1
1519	aesd	$dat2,q14
1520	aesimc	$dat2,$dat2
1521	aesd	$dat3,q14
1522	aesimc	$dat3,$dat3
1523	aesd	$dat4,q14
1524	aesimc	$dat4,$dat4

1526	veor	$tmp0,$ivec,$rndlast
1527	aesd	$dat0,q15
1528	veor	$tmp1,$in0,$rndlast
1529	vld1.8	{$in0},[$inp],#16
1530	aesd	$dat1,q15
1531	veor	$tmp2,$in1,$rndlast
1532	vld1.8	{$in1},[$inp],#16
1533	aesd	$dat2,q15
1534	veor	$tmp3,$in2,$rndlast
1535	vld1.8	{$in2},[$inp],#16
1536	aesd	$dat3,q15
1537	veor	$tmp4,$in3,$rndlast
1538	vld1.8	{$in3},[$inp],#16
1539	aesd	$dat4,q15
1540	vorr	$ivec,$in4,$in4
1541	vld1.8	{$in4},[$inp],#16
1542	cbz	x6,.Lcbc_tail4x
1543	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
1544	veor	$tmp0,$tmp0,$dat0
1545	vorr	$dat0,$in0,$in0
1546	veor	$tmp1,$tmp1,$dat1
1547	vorr	$dat1,$in1,$in1
1548	veor	$tmp2,$tmp2,$dat2
1549	vorr	$dat2,$in2,$in2
1550	veor	$tmp3,$tmp3,$dat3
1551	vorr	$dat3,$in3,$in3
1552	veor	$tmp4,$tmp4,$dat4
1553	vst1.8	{$tmp0},[$out],#16
1554	vorr	$dat4,$in4,$in4
1555	vst1.8	{$tmp1},[$out],#16
1556	mov	$cnt,$rounds
1557	vst1.8	{$tmp2},[$out],#16
1558	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
1559	vst1.8	{$tmp3},[$out],#16
1560	vst1.8	{$tmp4},[$out],#16
1561	b.hs	.Loop5x_cbc_dec

1563	add	$len,$len,#0x50
1564	cbz	$len,.Lcbc_done

1566	add	$cnt,$rounds,#2
1567	subs	$len,$len,#0x30
1568	vorr	$dat0,$in2,$in2
1569	vorr	$in0,$in2,$in2
1570	vorr	$dat1,$in3,$in3
1571	vorr	$in1,$in3,$in3
1572	vorr	$dat2,$in4,$in4
1573	vorr	$in2,$in4,$in4
1574	b.lo	.Lcbc_dec_tail

1576	b	.Loop3x_cbc_dec

1578.align	4
1579.Lcbc_tail4x:
1580	veor	$tmp1,$tmp0,$dat1
1581	veor	$tmp2,$tmp2,$dat2
1582	veor	$tmp3,$tmp3,$dat3
1583	veor	$tmp4,$tmp4,$dat4
1584	vst1.8	{$tmp1},[$out],#16
1585	vst1.8	{$tmp2},[$out],#16
1586	vst1.8	{$tmp3},[$out],#16
1587	vst1.8	{$tmp4},[$out],#16

1589	b	.Lcbc_done
1590.align	4
1591___
# 3x interleaved decrypt loop (both flavours), followed by the 1-2
# block tail.
1592$code.=<<___;
1593.Loop3x_cbc_dec:
1594	aesd	$dat0,q8
1595	aesimc	$dat0,$dat0
1596	aesd	$dat1,q8
1597	aesimc	$dat1,$dat1
1598	aesd	$dat2,q8
1599	aesimc	$dat2,$dat2
1600	vld1.32	{q8},[$key_],#16
1601	subs	$cnt,$cnt,#2
1602	aesd	$dat0,q9
1603	aesimc	$dat0,$dat0
1604	aesd	$dat1,q9
1605	aesimc	$dat1,$dat1
1606	aesd	$dat2,q9
1607	aesimc	$dat2,$dat2
1608	vld1.32	{q9},[$key_],#16
1609	b.gt	.Loop3x_cbc_dec

1611	aesd	$dat0,q8
1612	aesimc	$dat0,$dat0
1613	aesd	$dat1,q8
1614	aesimc	$dat1,$dat1
1615	aesd	$dat2,q8
1616	aesimc	$dat2,$dat2
1617	veor	$tmp0,$ivec,$rndlast
1618	subs	$len,$len,#0x30
1619	veor	$tmp1,$in0,$rndlast
1620	mov.lo	x6,$len			// x6, $cnt, is zero at this point
1621	aesd	$dat0,q9
1622	aesimc	$dat0,$dat0
1623	aesd	$dat1,q9
1624	aesimc	$dat1,$dat1
1625	aesd	$dat2,q9
1626	aesimc	$dat2,$dat2
1627	veor	$tmp2,$in1,$rndlast
1628	add	$inp,$inp,x6		// $inp is adjusted in such way that
1629					// at exit from the loop $dat1-$dat2
1630					// are loaded with last "words"
1631	vorr	$ivec,$in2,$in2
1632	mov	$key_,$key
1633	aesd	$dat0,q12
1634	aesimc	$dat0,$dat0
1635	aesd	$dat1,q12
1636	aesimc	$dat1,$dat1
1637	aesd	$dat2,q12
1638	aesimc	$dat2,$dat2
1639	vld1.8	{$in0},[$inp],#16
1640	aesd	$dat0,q13
1641	aesimc	$dat0,$dat0
1642	aesd	$dat1,q13
1643	aesimc	$dat1,$dat1
1644	aesd	$dat2,q13
1645	aesimc	$dat2,$dat2
1646	vld1.8	{$in1},[$inp],#16
1647	aesd	$dat0,q14
1648	aesimc	$dat0,$dat0
1649	aesd	$dat1,q14
1650	aesimc	$dat1,$dat1
1651	aesd	$dat2,q14
1652	aesimc	$dat2,$dat2
1653	vld1.8	{$in2},[$inp],#16
1654	aesd	$dat0,q15
1655	aesd	$dat1,q15
1656	aesd	$dat2,q15
1657	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
1658	add	$cnt,$rounds,#2
1659	veor	$tmp0,$tmp0,$dat0
1660	veor	$tmp1,$tmp1,$dat1
1661	veor	$dat2,$dat2,$tmp2
1662	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
1663	vst1.8	{$tmp0},[$out],#16
1664	vorr	$dat0,$in0,$in0
1665	vst1.8	{$tmp1},[$out],#16
1666	vorr	$dat1,$in1,$in1
1667	vst1.8	{$dat2},[$out],#16
1668	vorr	$dat2,$in2,$in2
1669	b.hs	.Loop3x_cbc_dec

1671	cmn	$len,#0x30
1672	b.eq	.Lcbc_done
1673	nop

1675.Lcbc_dec_tail:
1676	aesd	$dat1,q8
1677	aesimc	$dat1,$dat1
1678	aesd	$dat2,q8
1679	aesimc	$dat2,$dat2
1680	vld1.32	{q8},[$key_],#16
1681	subs	$cnt,$cnt,#2
1682	aesd	$dat1,q9
1683	aesimc	$dat1,$dat1
1684	aesd	$dat2,q9
1685	aesimc	$dat2,$dat2
1686	vld1.32	{q9},[$key_],#16
1687	b.gt	.Lcbc_dec_tail

1689	aesd	$dat1,q8
1690	aesimc	$dat1,$dat1
1691	aesd	$dat2,q8
1692	aesimc	$dat2,$dat2
1693	aesd	$dat1,q9
1694	aesimc	$dat1,$dat1
1695	aesd	$dat2,q9
1696	aesimc	$dat2,$dat2
1697	aesd	$dat1,q12
1698	aesimc	$dat1,$dat1
1699	aesd	$dat2,q12
1700	aesimc	$dat2,$dat2
1701	cmn	$len,#0x20
1702	aesd	$dat1,q13
1703	aesimc	$dat1,$dat1
1704	aesd	$dat2,q13
1705	aesimc	$dat2,$dat2
1706	veor	$tmp1,$ivec,$rndlast
1707	aesd	$dat1,q14
1708	aesimc	$dat1,$dat1
1709	aesd	$dat2,q14
1710	aesimc	$dat2,$dat2
1711	veor	$tmp2,$in1,$rndlast
1712	aesd	$dat1,q15
1713	aesd	$dat2,q15
1714	b.eq	.Lcbc_dec_one
1715	veor	$tmp1,$tmp1,$dat1
1716	veor	$tmp2,$tmp2,$dat2
1717	vorr	$ivec,$in2,$in2
1718	vst1.8	{$tmp1},[$out],#16
1719	vst1.8	{$tmp2},[$out],#16
1720	b	.Lcbc_done

1722.Lcbc_dec_one:
1723	veor	$tmp1,$tmp1,$dat2
1724	vorr	$ivec,$in2,$in2
1725	vst1.8	{$tmp1},[$out],#16

1727.Lcbc_done:
1728	vst1.8	{$ivec},[$ivp]
1729.Lcbc_abort:
1730___
1731}
# AArch32 epilogue: restore d8-d15 and return by popping pc.
1732$code.=<<___ if ($flavour !~ /64/);
1733	vldmia	sp!,{d8-d15}
1734	ldmia	sp!,{r4-r8,pc}
1735___
# AArch64 epilogue.
1736$code.=<<___ if ($flavour =~ /64/);
1737	ldr	x29,[sp],#16
1738	ret
1739___
1740$code.=<<___;
1741.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1742___
1743}}}
1744{{{
# ${prefix}_ctr32_encrypt_blocks(const unsigned char *inp,
#			unsigned char *out, size_t len,
#			const AES_KEY *key, const unsigned char ivec[16]);
# CTR mode with a 32-bit big-endian counter in the last word of ivec;
# $len is counted in 16-byte blocks (see the #5/#3 arithmetic below).
# NOTE(review): prototype inferred from register usage - confirm
# against the C caller.
1745my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1746my ($rounds,$cnt,$key_)=("w5","w6","x7");
1747my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1748my $step="x12";		# aliases with $tctr2
1749
1750my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1751my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1752
1753# used only in 64-bit mode...
1754my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
1755
1756my ($dat,$tmp)=($dat0,$tmp0);
1757
1758### q8-q15 preloaded key schedule
1759
# Symbol/alignment preamble shared by both flavours.
1760$code.=<<___;
1761.globl	${prefix}_ctr32_encrypt_blocks
1762.type	${prefix}_ctr32_encrypt_blocks,%function
1763.align	5
1764${prefix}_ctr32_encrypt_blocks:
1765___
# AArch64 prologue: minimal FP/LR frame.
1766$code.=<<___ if ($flavour =~ /64/);
1767	stp	x29,x30,[sp,#-16]!
1768	add	x29,sp,#0
1769___
# AArch32 prologue: save callee-saved GPRs and d8-d15, then fetch the
# stack-passed 5th argument (ivec) into r4.
1770$code.=<<___ if ($flavour !~ /64/);
1771	mov	ip,sp
1772	stmdb	sp!,{r4-r10,lr}
1773	vstmdb	sp!,{d8-d15}            @ ABI specification says so
1774	ldr	r4, [ip]		@ load remaining arg
1775___
# Common setup: load the counter word from ivec[12] (byte-reversed on
# little-endian), the IV block and the key schedule, and prepare up to
# three counter blocks.  $rounds lives in w5, hence the x5-based
# indexing of the last 5 round keys below.
1776$code.=<<___;
1777	ldr	$rounds,[$key,#240]

1779	ldr	$ctr, [$ivp, #12]
1780#ifdef __ARMEB__
1781	vld1.8	{$dat0},[$ivp]
1782#else
1783	vld1.32	{$dat0},[$ivp]
1784#endif
1785	vld1.32	{q8-q9},[$key]		// load key schedule...
1786	sub	$rounds,$rounds,#4
1787	mov	$step,#16
1788	cmp	$len,#2
1789	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
1790	sub	$rounds,$rounds,#2
1791	vld1.32	{q12-q13},[$key_],#32
1792	vld1.32	{q14-q15},[$key_],#32
1793	vld1.32	{$rndlast},[$key_]
1794	add	$key_,$key,#32
1795	mov	$cnt,$rounds
1796	cclr	$step,lo
1797#ifndef __ARMEB__
1798	rev	$ctr, $ctr
1799#endif
1800	add	$tctr1, $ctr, #1
1801	vorr	$ivec,$dat0,$dat0
1802	rev	$tctr1, $tctr1
1803	vmov.32	${ivec}[3],$tctr1
1804	add	$ctr, $ctr, #2
1805	vorr	$dat1,$ivec,$ivec
1806	b.ls	.Lctr32_tail
1807	rev	$tctr2, $ctr
1808	vmov.32	${ivec}[3],$tctr2
1809	sub	$len,$len,#3		// bias
1810	vorr	$dat2,$ivec,$ivec
1811___
# 64-bit only: 5x interleaved encrypt loop; drops back to the 3x loop
# (or the tail) once fewer than 5 blocks remain.
1812$code.=<<___ if ($flavour =~ /64/);
1813	cmp	$len,#2
1814	b.lo	.Loop3x_ctr32

1816	add	w13,$ctr,#1
1817	add	w14,$ctr,#2
1818	vorr	$dat3,$dat0,$dat0
1819	rev	w13,w13
1820	vorr	$dat4,$dat0,$dat0
1821	rev	w14,w14
1822	vmov.32	${dat3}[3],w13
1823	sub	$len,$len,#2		// bias
1824	vmov.32	${dat4}[3],w14
1825	add	$ctr,$ctr,#2
1826	b	.Loop5x_ctr32

1828.align	4
1829.Loop5x_ctr32:
1830	aese	$dat0,q8
1831	aesmc	$dat0,$dat0
1832	aese	$dat1,q8
1833	aesmc	$dat1,$dat1
1834	aese	$dat2,q8
1835	aesmc	$dat2,$dat2
1836	aese	$dat3,q8
1837	aesmc	$dat3,$dat3
1838	aese	$dat4,q8
1839	aesmc	$dat4,$dat4
1840	vld1.32	{q8},[$key_],#16
1841	subs	$cnt,$cnt,#2
1842	aese	$dat0,q9
1843	aesmc	$dat0,$dat0
1844	aese	$dat1,q9
1845	aesmc	$dat1,$dat1
1846	aese	$dat2,q9
1847	aesmc	$dat2,$dat2
1848	aese	$dat3,q9
1849	aesmc	$dat3,$dat3
1850	aese	$dat4,q9
1851	aesmc	$dat4,$dat4
1852	vld1.32	{q9},[$key_],#16
1853	b.gt	.Loop5x_ctr32

1855	mov	$key_,$key
1856	aese	$dat0,q8
1857	aesmc	$dat0,$dat0
1858	aese	$dat1,q8
1859	aesmc	$dat1,$dat1
1860	aese	$dat2,q8
1861	aesmc	$dat2,$dat2
1862	aese	$dat3,q8
1863	aesmc	$dat3,$dat3
1864	aese	$dat4,q8
1865	aesmc	$dat4,$dat4
1866	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]

1868	aese	$dat0,q9
1869	aesmc	$dat0,$dat0
1870	aese	$dat1,q9
1871	aesmc	$dat1,$dat1
1872	aese	$dat2,q9
1873	aesmc	$dat2,$dat2
1874	aese	$dat3,q9
1875	aesmc	$dat3,$dat3
1876	aese	$dat4,q9
1877	aesmc	$dat4,$dat4
1878	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]

1880	aese	$dat0,q12
1881	aesmc	$dat0,$dat0
1882	add	$tctr0,$ctr,#1
1883	add	$tctr1,$ctr,#2
1884	aese	$dat1,q12
1885	aesmc	$dat1,$dat1
1886	add	$tctr2,$ctr,#3
1887	add	w13,$ctr,#4
1888	aese	$dat2,q12
1889	aesmc	$dat2,$dat2
1890	add	w14,$ctr,#5
1891	rev	$tctr0,$tctr0
1892	aese	$dat3,q12
1893	aesmc	$dat3,$dat3
1894	rev	$tctr1,$tctr1
1895	rev	$tctr2,$tctr2
1896	aese	$dat4,q12
1897	aesmc	$dat4,$dat4
1898	rev	w13,w13
1899	rev	w14,w14

1901	aese	$dat0,q13
1902	aesmc	$dat0,$dat0
1903	aese	$dat1,q13
1904	aesmc	$dat1,$dat1
1905	aese	$dat2,q13
1906	aesmc	$dat2,$dat2
1907	aese	$dat3,q13
1908	aesmc	$dat3,$dat3
1909	aese	$dat4,q13
1910	aesmc	$dat4,$dat4

1912	aese	$dat0,q14
1913	aesmc	$dat0,$dat0
1914	vld1.8	{$in0},[$inp],#16
1915	aese	$dat1,q14
1916	aesmc	$dat1,$dat1
1917	vld1.8	{$in1},[$inp],#16
1918	aese	$dat2,q14
1919	aesmc	$dat2,$dat2
1920	vld1.8	{$in2},[$inp],#16
1921	aese	$dat3,q14
1922	aesmc	$dat3,$dat3
1923	vld1.8	{$in3},[$inp],#16
1924	aese	$dat4,q14
1925	aesmc	$dat4,$dat4
1926	vld1.8	{$in4},[$inp],#16

1928	aese	$dat0,q15
1929	veor	$in0,$in0,$rndlast
1930	aese	$dat1,q15
1931	veor	$in1,$in1,$rndlast
1932	aese	$dat2,q15
1933	veor	$in2,$in2,$rndlast
1934	aese	$dat3,q15
1935	veor	$in3,$in3,$rndlast
1936	aese	$dat4,q15
1937	veor	$in4,$in4,$rndlast

1939	veor	$in0,$in0,$dat0
1940	vorr	$dat0,$ivec,$ivec
1941	veor	$in1,$in1,$dat1
1942	vorr	$dat1,$ivec,$ivec
1943	veor	$in2,$in2,$dat2
1944	vorr	$dat2,$ivec,$ivec
1945	veor	$in3,$in3,$dat3
1946	vorr	$dat3,$ivec,$ivec
1947	veor	$in4,$in4,$dat4
1948	vorr	$dat4,$ivec,$ivec

1950	vst1.8	{$in0},[$out],#16
1951	vmov.32	${dat0}[3],$tctr0
1952	vst1.8	{$in1},[$out],#16
1953	vmov.32	${dat1}[3],$tctr1
1954	vst1.8	{$in2},[$out],#16
1955	vmov.32	${dat2}[3],$tctr2
1956	vst1.8	{$in3},[$out],#16
1957	vmov.32	${dat3}[3],w13
1958	vst1.8	{$in4},[$out],#16
1959	vmov.32	${dat4}[3],w14

1961	mov	$cnt,$rounds
1962	cbz	$len,.Lctr32_done

1964	add	$ctr,$ctr,#5
1965	subs	$len,$len,#5
1966	b.hs	.Loop5x_ctr32

1968	add	$len,$len,#5
1969	sub	$ctr,$ctr,#5

1971	cmp	$len,#2
1972	mov	$step,#16
1973	cclr	$step,lo
1974	b.ls	.Lctr32_tail

1976	sub	$len,$len,#3		// bias
1977	add	$ctr,$ctr,#3
1978___
# 3x interleaved loop (both flavours), plus the 1-2 block tail.
1979$code.=<<___;
1980	b	.Loop3x_ctr32

1982.align	4
1983.Loop3x_ctr32:
1984	aese	$dat0,q8
1985	aesmc	$dat0,$dat0
1986	aese	$dat1,q8
1987	aesmc	$dat1,$dat1
1988	aese	$dat2,q8
1989	aesmc	$dat2,$dat2
1990	vld1.32	{q8},[$key_],#16
1991	subs	$cnt,$cnt,#2
1992	aese	$dat0,q9
1993	aesmc	$dat0,$dat0
1994	aese	$dat1,q9
1995	aesmc	$dat1,$dat1
1996	aese	$dat2,q9
1997	aesmc	$dat2,$dat2
1998	vld1.32	{q9},[$key_],#16
1999	b.gt	.Loop3x_ctr32

2001	aese	$dat0,q8
2002	aesmc	$tmp0,$dat0
2003	aese	$dat1,q8
2004	aesmc	$tmp1,$dat1
2005	vld1.8	{$in0},[$inp],#16
2006	add	$tctr0,$ctr,#1
2007	aese	$dat2,q8
2008	aesmc	$dat2,$dat2
2009	vld1.8	{$in1},[$inp],#16
2010	rev	$tctr0,$tctr0
2011	aese	$tmp0,q9
2012	aesmc	$tmp0,$tmp0
2013	aese	$tmp1,q9
2014	aesmc	$tmp1,$tmp1
2015	vld1.8	{$in2},[$inp],#16
2016	mov	$key_,$key
2017	aese	$dat2,q9
2018	aesmc	$tmp2,$dat2
2019	aese	$tmp0,q12
2020	aesmc	$tmp0,$tmp0
2021	aese	$tmp1,q12
2022	aesmc	$tmp1,$tmp1
2023	veor	$in0,$in0,$rndlast
2024	add	$tctr1,$ctr,#2
2025	aese	$tmp2,q12
2026	aesmc	$tmp2,$tmp2
2027	veor	$in1,$in1,$rndlast
2028	add	$ctr,$ctr,#3
2029	aese	$tmp0,q13
2030	aesmc	$tmp0,$tmp0
2031	aese	$tmp1,q13
2032	aesmc	$tmp1,$tmp1
2033	veor	$in2,$in2,$rndlast
2034	vmov.32	${ivec}[3], $tctr0
2035	aese	$tmp2,q13
2036	aesmc	$tmp2,$tmp2
2037	vorr	$dat0,$ivec,$ivec
2038	rev	$tctr1,$tctr1
2039	aese	$tmp0,q14
2040	aesmc	$tmp0,$tmp0
2041	vmov.32	${ivec}[3], $tctr1
2042	rev	$tctr2,$ctr
2043	aese	$tmp1,q14
2044	aesmc	$tmp1,$tmp1
2045	vorr	$dat1,$ivec,$ivec
2046	vmov.32	${ivec}[3], $tctr2
2047	aese	$tmp2,q14
2048	aesmc	$tmp2,$tmp2
2049	vorr	$dat2,$ivec,$ivec
2050	subs	$len,$len,#3
2051	aese	$tmp0,q15
2052	aese	$tmp1,q15
2053	aese	$tmp2,q15

2055	veor	$in0,$in0,$tmp0
2056	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
2057	vst1.8	{$in0},[$out],#16
2058	veor	$in1,$in1,$tmp1
2059	mov	$cnt,$rounds
2060	vst1.8	{$in1},[$out],#16
2061	veor	$in2,$in2,$tmp2
2062	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
2063	vst1.8	{$in2},[$out],#16
2064	b.hs	.Loop3x_ctr32

2066	adds	$len,$len,#3
2067	b.eq	.Lctr32_done
2068	cmp	$len,#1
2069	mov	$step,#16
2070	cclr	$step,eq

2072.Lctr32_tail:
2073	aese	$dat0,q8
2074	aesmc	$dat0,$dat0
2075	aese	$dat1,q8
2076	aesmc	$dat1,$dat1
2077	vld1.32	{q8},[$key_],#16
2078	subs	$cnt,$cnt,#2
2079	aese	$dat0,q9
2080	aesmc	$dat0,$dat0
2081	aese	$dat1,q9
2082	aesmc	$dat1,$dat1
2083	vld1.32	{q9},[$key_],#16
2084	b.gt	.Lctr32_tail

2086	aese	$dat0,q8
2087	aesmc	$dat0,$dat0
2088	aese	$dat1,q8
2089	aesmc	$dat1,$dat1
2090	aese	$dat0,q9
2091	aesmc	$dat0,$dat0
2092	aese	$dat1,q9
2093	aesmc	$dat1,$dat1
2094	vld1.8	{$in0},[$inp],$step
2095	aese	$dat0,q12
2096	aesmc	$dat0,$dat0
2097	aese	$dat1,q12
2098	aesmc	$dat1,$dat1
2099	vld1.8	{$in1},[$inp]
2100	aese	$dat0,q13
2101	aesmc	$dat0,$dat0
2102	aese	$dat1,q13
2103	aesmc	$dat1,$dat1
2104	veor	$in0,$in0,$rndlast
2105	aese	$dat0,q14
2106	aesmc	$dat0,$dat0
2107	aese	$dat1,q14
2108	aesmc	$dat1,$dat1
2109	veor	$in1,$in1,$rndlast
2110	aese	$dat0,q15
2111	aese	$dat1,q15

2113	cmp	$len,#1
2114	veor	$in0,$in0,$dat0
2115	veor	$in1,$in1,$dat1
2116	vst1.8	{$in0},[$out],#16
2117	b.eq	.Lctr32_done
2118	vst1.8	{$in1},[$out]

2120.Lctr32_done:
2121___
# AArch32 epilogue: restore d8-d15 and return by popping pc.
2122$code.=<<___ if ($flavour !~ /64/);
2123	vldmia	sp!,{d8-d15}
2124	ldmia	sp!,{r4-r10,pc}
2125___
# AArch64 epilogue.
2126$code.=<<___ if ($flavour =~ /64/);
2127	ldr	x29,[sp],#16
2128	ret
2129___
2130$code.=<<___;
2131.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2132___
2133}}}
2134# Performance in cycles per byte.
2135# Processed with AES-XTS different key size.
2136# It shows the value before and after optimization as below:
2137# (before/after):
2138#
2139# AES-128-XTS AES-256-XTS
2140# Cortex-A57 3.36/1.09 4.02/1.37
2141# Cortex-A72 3.03/1.02 3.28/1.33
2142
2143# Optimization is implemented by loop unrolling and interleaving.
2144# Commonly we choose 5 as the unrolling factor; if the input
2145# data size is smaller than 5 blocks, but not smaller than 3 blocks,
2146# we choose 3 as the unrolling factor.
2147# If the input data size dsize >= 5*16 bytes, then take 5 blocks
2148# as one iteration, every loop the left size lsize -= 5*16.
2149# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2150# will be processed specially, which be integrated into the 5*16 bytes
2151# loop to improve the efficiency.
2152# There is one special case, if the original input data size dsize
2153# = 16 bytes, we will treat it separately to improve the
2154# performance: one independent code block without LR, FP load and
2155# store.
2156# Encryption will process the (length - tailcnt) bytes as mentioned
2157# previously, then encrypt the composite block as the second-to-last
2158# cipher block.
2159# Decryption will process the (length - tailcnt - 1) bytes as mentioned
2160# previously, then decrypt the second-to-last cipher block to get the
2161# last plain block (the tail), and decrypt the composite block as the
2162# second-to-last plain-text block.
2163
2164{{{
2165my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2166my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2167my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2168my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2169my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2170my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2171my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2172my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2173my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2174
2175my ($tmpin)=("v26.16b");
2176my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2177
2178# q7 last round key
2179# q10-q15, q7 Last 7 round keys
2180# q8-q9 preloaded round keys except last 7 keys for big size
2181# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
2182
2183
2184my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2185
2186my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2187my ($dat4,$in4,$tmp4);
2188if ($flavour =~ /64/) {
2189 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2190}
2191
2192$code.=<<___ if ($flavour =~ /64/);
2193.globl ${prefix}_xts_encrypt
2194.type ${prefix}_xts_encrypt,%function
2195.align 5
2196${prefix}_xts_encrypt:
2197___
2198$code.=<<___ if ($flavour =~ /64/);
2199 cmp $len,#16
2200 // Original input data size bigger than 16, jump to big size processing.
2201 b.ne .Lxts_enc_big_size
2202 // Encrypt the iv with key2, as the first XEX iv.
2203 ldr $rounds,[$key2,#240]
2204 vld1.8 {$dat},[$key2],#16
2205 vld1.8 {$iv0},[$ivp]
2206 sub $rounds,$rounds,#2
2207 vld1.8 {$dat1},[$key2],#16
2208
2209.Loop_enc_iv_enc:
2210 aese $iv0,$dat
2211 aesmc $iv0,$iv0
2212 vld1.32 {$dat},[$key2],#16
2213 subs $rounds,$rounds,#2
2214 aese $iv0,$dat1
2215 aesmc $iv0,$iv0
2216 vld1.32 {$dat1},[$key2],#16
2217 b.gt .Loop_enc_iv_enc
2218
2219 aese $iv0,$dat
2220 aesmc $iv0,$iv0
2221 vld1.32 {$dat},[$key2]
2222 aese $iv0,$dat1
2223 veor $iv0,$iv0,$dat
2224
2225 vld1.8 {$dat0},[$inp]
2226 veor $dat0,$iv0,$dat0
2227
2228 ldr $rounds,[$key1,#240]
2229 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2230
2231 aese $dat0,q20
2232 aesmc $dat0,$dat0
2233 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2234 aese $dat0,q21
2235 aesmc $dat0,$dat0
2236 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2237 b.eq .Lxts_128_enc
2238.Lxts_enc_round_loop:
2239 aese $dat0,q8
2240 aesmc $dat0,$dat0
2241 vld1.32 {q8},[$key1],#16 // load key schedule...
2242 aese $dat0,q9
2243 aesmc $dat0,$dat0
2244 vld1.32 {q9},[$key1],#16 // load key schedule...
2245 subs $rounds,$rounds,#2 // bias
2246 b.gt .Lxts_enc_round_loop
2247.Lxts_128_enc:
2248 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2249 aese $dat0,q8
2250 aesmc $dat0,$dat0
2251 aese $dat0,q9
2252 aesmc $dat0,$dat0
2253 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2254 aese $dat0,q10
2255 aesmc $dat0,$dat0
2256 aese $dat0,q11
2257 aesmc $dat0,$dat0
2258 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2259 aese $dat0,q12
2260 aesmc $dat0,$dat0
2261 aese $dat0,q13
2262 aesmc $dat0,$dat0
2263 vld1.32 {$rndlast},[$key1]
2264 aese $dat0,q14
2265 aesmc $dat0,$dat0
2266 aese $dat0,q15
2267 veor $dat0,$dat0,$rndlast
2268 veor $dat0,$dat0,$iv0
2269 vst1.8 {$dat0},[$out]
2270 b .Lxts_enc_final_abort
2271
2272.align 4
2273.Lxts_enc_big_size:
2274___
2275$code.=<<___ if ($flavour =~ /64/);
2276 stp $constnumx,$tmpinp,[sp,#-64]!
2277 stp $tailcnt,$midnumx,[sp,#48]
2278 stp $ivd10,$ivd20,[sp,#32]
2279 stp $ivd30,$ivd40,[sp,#16]
2280
2281 // tailcnt store the tail value of length%16.
2282 and $tailcnt,$len,#0xf
2283 and $len,$len,#-16
2284 subs $len,$len,#16
2285 mov $step,#16
2286 b.lo .Lxts_abort
2287 csel $step,xzr,$step,eq
2288
2289 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2290 ldr $rounds,[$key2,#240]
2291 vld1.32 {$dat},[$key2],#16
2292 vld1.8 {$iv0},[$ivp]
2293 sub $rounds,$rounds,#2
2294 vld1.32 {$dat1},[$key2],#16
2295
2296.Loop_iv_enc:
2297 aese $iv0,$dat
2298 aesmc $iv0,$iv0
2299 vld1.32 {$dat},[$key2],#16
2300 subs $rounds,$rounds,#2
2301 aese $iv0,$dat1
2302 aesmc $iv0,$iv0
2303 vld1.32 {$dat1},[$key2],#16
2304 b.gt .Loop_iv_enc
2305
2306 aese $iv0,$dat
2307 aesmc $iv0,$iv0
2308 vld1.32 {$dat},[$key2]
2309 aese $iv0,$dat1
2310 veor $iv0,$iv0,$dat
2311
2312 // The iv for second block
2313 // $ivl- iv(low), $ivh - iv(high)
2314 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2315 fmov $ivl,$ivd00
2316 fmov $ivh,$ivd01
2317 mov $constnum,#0x87
2318 extr $midnumx,$ivh,$ivh,#32
2319 extr $ivh,$ivh,$ivl,#63
2320 and $tmpmw,$constnum,$midnum,asr#31
2321 eor $ivl,$tmpmx,$ivl,lsl#1
2322 fmov $ivd10,$ivl
2323 fmov $ivd11,$ivh
2324
2325 ldr $rounds0,[$key1,#240] // next starting point
2326 vld1.8 {$dat},[$inp],$step
2327
2328 vld1.32 {q8-q9},[$key1] // load key schedule...
2329 sub $rounds0,$rounds0,#6
2330 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2331 sub $rounds0,$rounds0,#2
2332 vld1.32 {q10-q11},[$key_],#32
2333 vld1.32 {q12-q13},[$key_],#32
2334 vld1.32 {q14-q15},[$key_],#32
2335 vld1.32 {$rndlast},[$key_]
2336
2337 add $key_,$key1,#32
2338 mov $rounds,$rounds0
2339
2340 // Encryption
2341.Lxts_enc:
2342 vld1.8 {$dat2},[$inp],#16
2343 subs $len,$len,#32 // bias
2344 add $rounds,$rounds0,#2
2345 vorr $in1,$dat,$dat
2346 vorr $dat1,$dat,$dat
2347 vorr $in3,$dat,$dat
2348 vorr $in2,$dat2,$dat2
2349 vorr $in4,$dat2,$dat2
2350 b.lo .Lxts_inner_enc_tail
2351 veor $dat,$dat,$iv0 // before encryption, xor with iv
2352 veor $dat2,$dat2,$iv1
2353
2354 // The iv for third block
2355 extr $midnumx,$ivh,$ivh,#32
2356 extr $ivh,$ivh,$ivl,#63
2357 and $tmpmw,$constnum,$midnum,asr#31
2358 eor $ivl,$tmpmx,$ivl,lsl#1
2359 fmov $ivd20,$ivl
2360 fmov $ivd21,$ivh
2361
2362
2363 vorr $dat1,$dat2,$dat2
2364 vld1.8 {$dat2},[$inp],#16
2365 vorr $in0,$dat,$dat
2366 vorr $in1,$dat1,$dat1
2367 veor $in2,$dat2,$iv2 // the third block
2368 veor $dat2,$dat2,$iv2
2369 cmp $len,#32
2370 b.lo .Lxts_outer_enc_tail
2371
2372 // The iv for fourth block
2373 extr $midnumx,$ivh,$ivh,#32
2374 extr $ivh,$ivh,$ivl,#63
2375 and $tmpmw,$constnum,$midnum,asr#31
2376 eor $ivl,$tmpmx,$ivl,lsl#1
2377 fmov $ivd30,$ivl
2378 fmov $ivd31,$ivh
2379
2380 vld1.8 {$dat3},[$inp],#16
2381 // The iv for fifth block
2382 extr $midnumx,$ivh,$ivh,#32
2383 extr $ivh,$ivh,$ivl,#63
2384 and $tmpmw,$constnum,$midnum,asr#31
2385 eor $ivl,$tmpmx,$ivl,lsl#1
2386 fmov $ivd40,$ivl
2387 fmov $ivd41,$ivh
2388
2389 vld1.8 {$dat4},[$inp],#16
2390 veor $dat3,$dat3,$iv3 // the fourth block
2391 veor $dat4,$dat4,$iv4
2392 sub $len,$len,#32 // bias
2393 mov $rounds,$rounds0
2394 b .Loop5x_xts_enc
2395
2396.align 4
2397.Loop5x_xts_enc:
2398 aese $dat0,q8
2399 aesmc $dat0,$dat0
2400 aese $dat1,q8
2401 aesmc $dat1,$dat1
2402 aese $dat2,q8
2403 aesmc $dat2,$dat2
2404 aese $dat3,q8
2405 aesmc $dat3,$dat3
2406 aese $dat4,q8
2407 aesmc $dat4,$dat4
2408 vld1.32 {q8},[$key_],#16
2409 subs $rounds,$rounds,#2
2410 aese $dat0,q9
2411 aesmc $dat0,$dat0
2412 aese $dat1,q9
2413 aesmc $dat1,$dat1
2414 aese $dat2,q9
2415 aesmc $dat2,$dat2
2416 aese $dat3,q9
2417 aesmc $dat3,$dat3
2418 aese $dat4,q9
2419 aesmc $dat4,$dat4
2420 vld1.32 {q9},[$key_],#16
2421 b.gt .Loop5x_xts_enc
2422
2423 aese $dat0,q8
2424 aesmc $dat0,$dat0
2425 aese $dat1,q8
2426 aesmc $dat1,$dat1
2427 aese $dat2,q8
2428 aesmc $dat2,$dat2
2429 aese $dat3,q8
2430 aesmc $dat3,$dat3
2431 aese $dat4,q8
2432 aesmc $dat4,$dat4
2433 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2434
2435 aese $dat0,q9
2436 aesmc $dat0,$dat0
2437 aese $dat1,q9
2438 aesmc $dat1,$dat1
2439 aese $dat2,q9
2440 aesmc $dat2,$dat2
2441 aese $dat3,q9
2442 aesmc $dat3,$dat3
2443 aese $dat4,q9
2444 aesmc $dat4,$dat4
2445 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2446 mov $key_,$key1
2447
2448 aese $dat0,q10
2449 aesmc $dat0,$dat0
2450 aese $dat1,q10
2451 aesmc $dat1,$dat1
2452 aese $dat2,q10
2453 aesmc $dat2,$dat2
2454 aese $dat3,q10
2455 aesmc $dat3,$dat3
2456 aese $dat4,q10
2457 aesmc $dat4,$dat4
2458 add $inp,$inp,$xoffset // x0 is adjusted in such way that
2459 // at exit from the loop v1.16b-v26.16b
2460 // are loaded with last "words"
2461 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
2462
2463 aese $dat0,q11
2464 aesmc $dat0,$dat0
2465 aese $dat1,q11
2466 aesmc $dat1,$dat1
2467 aese $dat2,q11
2468 aesmc $dat2,$dat2
2469 aese $dat3,q11
2470 aesmc $dat3,$dat3
2471 aese $dat4,q11
2472 aesmc $dat4,$dat4
2473
2474 aese $dat0,q12
2475 aesmc $dat0,$dat0
2476 aese $dat1,q12
2477 aesmc $dat1,$dat1
2478 aese $dat2,q12
2479 aesmc $dat2,$dat2
2480 aese $dat3,q12
2481 aesmc $dat3,$dat3
2482 aese $dat4,q12
2483 aesmc $dat4,$dat4
2484
2485 aese $dat0,q13
2486 aesmc $dat0,$dat0
2487 aese $dat1,q13
2488 aesmc $dat1,$dat1
2489 aese $dat2,q13
2490 aesmc $dat2,$dat2
2491 aese $dat3,q13
2492 aesmc $dat3,$dat3
2493 aese $dat4,q13
2494 aesmc $dat4,$dat4
2495
2496 aese $dat0,q14
2497 aesmc $dat0,$dat0
2498 aese $dat1,q14
2499 aesmc $dat1,$dat1
2500 aese $dat2,q14
2501 aesmc $dat2,$dat2
2502 aese $dat3,q14
2503 aesmc $dat3,$dat3
2504 aese $dat4,q14
2505 aesmc $dat4,$dat4
2506
2507 veor $tmp0,$rndlast,$iv0
2508 aese $dat0,q15
2509 // The iv for first block of one iteration
2510 extr $midnumx,$ivh,$ivh,#32
2511 extr $ivh,$ivh,$ivl,#63
2512 and $tmpmw,$constnum,$midnum,asr#31
2513 eor $ivl,$tmpmx,$ivl,lsl#1
2514 fmov $ivd00,$ivl
2515 fmov $ivd01,$ivh
2516 veor $tmp1,$rndlast,$iv1
2517 vld1.8 {$in0},[$inp],#16
2518 aese $dat1,q15
2519 // The iv for second block
2520 extr $midnumx,$ivh,$ivh,#32
2521 extr $ivh,$ivh,$ivl,#63
2522 and $tmpmw,$constnum,$midnum,asr#31
2523 eor $ivl,$tmpmx,$ivl,lsl#1
2524 fmov $ivd10,$ivl
2525 fmov $ivd11,$ivh
2526 veor $tmp2,$rndlast,$iv2
2527 vld1.8 {$in1},[$inp],#16
2528 aese $dat2,q15
2529 // The iv for third block
2530 extr $midnumx,$ivh,$ivh,#32
2531 extr $ivh,$ivh,$ivl,#63
2532 and $tmpmw,$constnum,$midnum,asr#31
2533 eor $ivl,$tmpmx,$ivl,lsl#1
2534 fmov $ivd20,$ivl
2535 fmov $ivd21,$ivh
2536 veor $tmp3,$rndlast,$iv3
2537 vld1.8 {$in2},[$inp],#16
2538 aese $dat3,q15
2539 // The iv for fourth block
2540 extr $midnumx,$ivh,$ivh,#32
2541 extr $ivh,$ivh,$ivl,#63
2542 and $tmpmw,$constnum,$midnum,asr#31
2543 eor $ivl,$tmpmx,$ivl,lsl#1
2544 fmov $ivd30,$ivl
2545 fmov $ivd31,$ivh
2546 veor $tmp4,$rndlast,$iv4
2547 vld1.8 {$in3},[$inp],#16
2548 aese $dat4,q15
2549
2550 // The iv for fifth block
2551 extr $midnumx,$ivh,$ivh,#32
2552 extr $ivh,$ivh,$ivl,#63
2553 and $tmpmw,$constnum,$midnum,asr #31
2554 eor $ivl,$tmpmx,$ivl,lsl #1
2555 fmov $ivd40,$ivl
2556 fmov $ivd41,$ivh
2557
2558 vld1.8 {$in4},[$inp],#16
2559 cbz $xoffset,.Lxts_enc_tail4x
2560 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2561 veor $tmp0,$tmp0,$dat0
2562 veor $dat0,$in0,$iv0
2563 veor $tmp1,$tmp1,$dat1
2564 veor $dat1,$in1,$iv1
2565 veor $tmp2,$tmp2,$dat2
2566 veor $dat2,$in2,$iv2
2567 veor $tmp3,$tmp3,$dat3
2568 veor $dat3,$in3,$iv3
2569 veor $tmp4,$tmp4,$dat4
2570 vst1.8 {$tmp0},[$out],#16
2571 veor $dat4,$in4,$iv4
2572 vst1.8 {$tmp1},[$out],#16
2573 mov $rounds,$rounds0
2574 vst1.8 {$tmp2},[$out],#16
2575 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2576 vst1.8 {$tmp3},[$out],#16
2577 vst1.8 {$tmp4},[$out],#16
2578 b.hs .Loop5x_xts_enc
2579
2580
2581 // If 4 blocks are left, reuse the five-block processing path.
2582 cmn $len,#0x10
2583 b.ne .Loop5x_enc_after
2584 vorr $iv4,$iv3,$iv3
2585 vorr $iv3,$iv2,$iv2
2586 vorr $iv2,$iv1,$iv1
2587 vorr $iv1,$iv0,$iv0
2588 fmov $ivl,$ivd40
2589 fmov $ivh,$ivd41
2590 veor $dat0,$iv0,$in0
2591 veor $dat1,$iv1,$in1
2592 veor $dat2,$in2,$iv2
2593 veor $dat3,$in3,$iv3
2594 veor $dat4,$in4,$iv4
2595 b.eq .Loop5x_xts_enc
2596
2597.Loop5x_enc_after:
2598 add $len,$len,#0x50
2599 cbz $len,.Lxts_enc_done
2600
2601 add $rounds,$rounds0,#2
2602 subs $len,$len,#0x30
2603 b.lo .Lxts_inner_enc_tail
2604
2605 veor $dat0,$iv0,$in2
2606 veor $dat1,$iv1,$in3
2607 veor $dat2,$in4,$iv2
2608 b .Lxts_outer_enc_tail
2609
2610.align 4
2611.Lxts_enc_tail4x:
2612 add $inp,$inp,#16
2613 veor $tmp1,$dat1,$tmp1
2614 vst1.8 {$tmp1},[$out],#16
2615 veor $tmp2,$dat2,$tmp2
2616 vst1.8 {$tmp2},[$out],#16
2617 veor $tmp3,$dat3,$tmp3
2618 veor $tmp4,$dat4,$tmp4
2619 vst1.8 {$tmp3-$tmp4},[$out],#32
2620
2621 b .Lxts_enc_done
2622.align 4
2623.Lxts_outer_enc_tail:
2624 aese $dat0,q8
2625 aesmc $dat0,$dat0
2626 aese $dat1,q8
2627 aesmc $dat1,$dat1
2628 aese $dat2,q8
2629 aesmc $dat2,$dat2
2630 vld1.32 {q8},[$key_],#16
2631 subs $rounds,$rounds,#2
2632 aese $dat0,q9
2633 aesmc $dat0,$dat0
2634 aese $dat1,q9
2635 aesmc $dat1,$dat1
2636 aese $dat2,q9
2637 aesmc $dat2,$dat2
2638 vld1.32 {q9},[$key_],#16
2639 b.gt .Lxts_outer_enc_tail
2640
2641 aese $dat0,q8
2642 aesmc $dat0,$dat0
2643 aese $dat1,q8
2644 aesmc $dat1,$dat1
2645 aese $dat2,q8
2646 aesmc $dat2,$dat2
2647 veor $tmp0,$iv0,$rndlast
2648 subs $len,$len,#0x30
2649 // The iv for first block
2650 fmov $ivl,$ivd20
2651 fmov $ivh,$ivd21
2652 //mov $constnum,#0x87
2653 extr $midnumx,$ivh,$ivh,#32
2654 extr $ivh,$ivh,$ivl,#63
2655 and $tmpmw,$constnum,$midnum,asr#31
2656 eor $ivl,$tmpmx,$ivl,lsl#1
2657 fmov $ivd00,$ivl
2658 fmov $ivd01,$ivh
2659 veor $tmp1,$iv1,$rndlast
2660 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
2661 aese $dat0,q9
2662 aesmc $dat0,$dat0
2663 aese $dat1,q9
2664 aesmc $dat1,$dat1
2665 aese $dat2,q9
2666 aesmc $dat2,$dat2
2667 veor $tmp2,$iv2,$rndlast
2668
2669 add $xoffset,$xoffset,#0x20
2670 add $inp,$inp,$xoffset
2671 mov $key_,$key1
2672
2673 aese $dat0,q12
2674 aesmc $dat0,$dat0
2675 aese $dat1,q12
2676 aesmc $dat1,$dat1
2677 aese $dat2,q12
2678 aesmc $dat2,$dat2
2679 aese $dat0,q13
2680 aesmc $dat0,$dat0
2681 aese $dat1,q13
2682 aesmc $dat1,$dat1
2683 aese $dat2,q13
2684 aesmc $dat2,$dat2
2685 aese $dat0,q14
2686 aesmc $dat0,$dat0
2687 aese $dat1,q14
2688 aesmc $dat1,$dat1
2689 aese $dat2,q14
2690 aesmc $dat2,$dat2
2691 aese $dat0,q15
2692 aese $dat1,q15
2693 aese $dat2,q15
2694 vld1.8 {$in2},[$inp],#16
2695 add $rounds,$rounds0,#2
2696 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2697 veor $tmp0,$tmp0,$dat0
2698 veor $tmp1,$tmp1,$dat1
2699 veor $dat2,$dat2,$tmp2
2700 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2701 vst1.8 {$tmp0},[$out],#16
2702 vst1.8 {$tmp1},[$out],#16
2703 vst1.8 {$dat2},[$out],#16
2704 cmn $len,#0x30
2705 b.eq .Lxts_enc_done
2706.Lxts_encxor_one:
2707 vorr $in3,$in1,$in1
2708 vorr $in4,$in2,$in2
2709 nop
2710
2711.Lxts_inner_enc_tail:
2712 cmn $len,#0x10
2713 veor $dat1,$in3,$iv0
2714 veor $dat2,$in4,$iv1
2715 b.eq .Lxts_enc_tail_loop
2716 veor $dat2,$in4,$iv0
2717.Lxts_enc_tail_loop:
2718 aese $dat1,q8
2719 aesmc $dat1,$dat1
2720 aese $dat2,q8
2721 aesmc $dat2,$dat2
2722 vld1.32 {q8},[$key_],#16
2723 subs $rounds,$rounds,#2
2724 aese $dat1,q9
2725 aesmc $dat1,$dat1
2726 aese $dat2,q9
2727 aesmc $dat2,$dat2
2728 vld1.32 {q9},[$key_],#16
2729 b.gt .Lxts_enc_tail_loop
2730
2731 aese $dat1,q8
2732 aesmc $dat1,$dat1
2733 aese $dat2,q8
2734 aesmc $dat2,$dat2
2735 aese $dat1,q9
2736 aesmc $dat1,$dat1
2737 aese $dat2,q9
2738 aesmc $dat2,$dat2
2739 aese $dat1,q12
2740 aesmc $dat1,$dat1
2741 aese $dat2,q12
2742 aesmc $dat2,$dat2
2743 cmn $len,#0x20
2744 aese $dat1,q13
2745 aesmc $dat1,$dat1
2746 aese $dat2,q13
2747 aesmc $dat2,$dat2
2748 veor $tmp1,$iv0,$rndlast
2749 aese $dat1,q14
2750 aesmc $dat1,$dat1
2751 aese $dat2,q14
2752 aesmc $dat2,$dat2
2753 veor $tmp2,$iv1,$rndlast
2754 aese $dat1,q15
2755 aese $dat2,q15
2756 b.eq .Lxts_enc_one
2757 veor $tmp1,$tmp1,$dat1
2758 vst1.8 {$tmp1},[$out],#16
2759 veor $tmp2,$tmp2,$dat2
2760 vorr $iv0,$iv1,$iv1
2761 vst1.8 {$tmp2},[$out],#16
2762 fmov $ivl,$ivd10
2763 fmov $ivh,$ivd11
2764 mov $constnum,#0x87
2765 extr $midnumx,$ivh,$ivh,#32
2766 extr $ivh,$ivh,$ivl,#63
2767 and $tmpmw,$constnum,$midnum,asr #31
2768 eor $ivl,$tmpmx,$ivl,lsl #1
2769 fmov $ivd00,$ivl
2770 fmov $ivd01,$ivh
2771 b .Lxts_enc_done
2772
2773.Lxts_enc_one:
2774 veor $tmp1,$tmp1,$dat2
2775 vorr $iv0,$iv0,$iv0
2776 vst1.8 {$tmp1},[$out],#16
2777 fmov $ivl,$ivd00
2778 fmov $ivh,$ivd01
2779 mov $constnum,#0x87
2780 extr $midnumx,$ivh,$ivh,#32
2781 extr $ivh,$ivh,$ivl,#63
2782 and $tmpmw,$constnum,$midnum,asr #31
2783 eor $ivl,$tmpmx,$ivl,lsl #1
2784 fmov $ivd00,$ivl
2785 fmov $ivd01,$ivh
2786 b .Lxts_enc_done
2787.align 5
2788.Lxts_enc_done:
2789 // Process the tail block with cipher stealing.
2790 tst $tailcnt,#0xf
2791 b.eq .Lxts_abort
2792
2793 mov $tmpinp,$inp
2794 mov $tmpoutp,$out
2795 sub $out,$out,#16
2796.composite_enc_loop:
2797 subs $tailcnt,$tailcnt,#1
2798 ldrb $l2outp,[$out,$tailcnt]
2799 ldrb $loutp,[$tmpinp,$tailcnt]
2800 strb $l2outp,[$tmpoutp,$tailcnt]
2801 strb $loutp,[$out,$tailcnt]
2802 b.gt .composite_enc_loop
2803.Lxts_enc_load_done:
2804 vld1.8 {$tmpin},[$out]
2805 veor $tmpin,$tmpin,$iv0
2806
2807 // Encrypt the composite block to get the second-to-last encrypted text block
2808 ldr $rounds,[$key1,#240] // load key schedule...
2809 vld1.8 {$dat},[$key1],#16
2810 sub $rounds,$rounds,#2
2811 vld1.8 {$dat1},[$key1],#16 // load key schedule...
2812.Loop_final_enc:
2813 aese $tmpin,$dat0
2814 aesmc $tmpin,$tmpin
2815 vld1.32 {$dat0},[$key1],#16
2816 subs $rounds,$rounds,#2
2817 aese $tmpin,$dat1
2818 aesmc $tmpin,$tmpin
2819 vld1.32 {$dat1},[$key1],#16
2820 b.gt .Loop_final_enc
2821
2822 aese $tmpin,$dat0
2823 aesmc $tmpin,$tmpin
2824 vld1.32 {$dat0},[$key1]
2825 aese $tmpin,$dat1
2826 veor $tmpin,$tmpin,$dat0
2827 veor $tmpin,$tmpin,$iv0
2828 vst1.8 {$tmpin},[$out]
2829
2830.Lxts_abort:
2831 ldp $tailcnt,$midnumx,[sp,#48]
2832 ldp $ivd10,$ivd20,[sp,#32]
2833 ldp $ivd30,$ivd40,[sp,#16]
2834 ldp $constnumx,$tmpinp,[sp],#64
2835.Lxts_enc_final_abort:
2836 ret
2837.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
2838___
2839
2840}}}
2841{{{
# Register assignments for the XTS-decrypt code path below.
# General-purpose argument registers (AAPCS64): input, output, byte length,
# data key, tweak key, and the initial tweak/IV pointer.
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# Round counters, key-schedule pointer, load step, and the 128-bit tweak
# split across two 64-bit GPRs ($ivl = low half, $ivh = high half).
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
# Scratch registers used by the ciphertext-stealing tail code.
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
# $tailcnt = number of trailing bytes (len mod 16); $constnum holds the
# GF(2^128) reduction constant 0x87 used when stepping the tweak.
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
# NEON registers: data blocks, loaded input, temporaries, last round key.
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Five tweak values (one per block of the 5-way interleaved loop) plus a
# scratch vector used for the composite (stolen) block.
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
# 64-bit doubleword views (low/high) of the tweak vector registers.
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

# Aliases used by the single-block (exactly-16-byte input) fast path.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2853
2854# q7 last round key
2855# q10-q15, q7 Last 7 round keys
2856# q8-q9 preloaded round keys except last 7 keys for big size
2857 # q20, q21, q8-q9 preloaded round keys except last 7 keys, for the 16-byte-only case
2858
2859{
# Third data block's registers; on 32-bit these overlap q9-q11.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    # 64-bit mode: remap blocks 3-5 onto q16-q23, which only exist in
    # the AArch64 register file, enabling the 5-way interleaved loop.
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
2867
# Emit the ${prefix}_xts_decrypt entry point (label, symbol type,
# alignment); XTS is implemented for the 64-bit flavour only.
$code.=<<___ if ($flavour =~ /64/);
.globl ${prefix}_xts_decrypt
.type ${prefix}_xts_decrypt,%function
.align 5
${prefix}_xts_decrypt:
___
2874$code.=<<___ if ($flavour =~ /64/);
2875 cmp $len,#16
2876 // Original input data size bigger than 16, jump to big size processing.
2877 b.ne .Lxts_dec_big_size
2878 // Encrypt the iv with key2, as the first XEX iv.
2879 ldr $rounds,[$key2,#240]
2880 vld1.8 {$dat},[$key2],#16
2881 vld1.8 {$iv0},[$ivp]
2882 sub $rounds,$rounds,#2
2883 vld1.8 {$dat1},[$key2],#16
2884
2885.Loop_dec_small_iv_enc:
2886 aese $iv0,$dat
2887 aesmc $iv0,$iv0
2888 vld1.32 {$dat},[$key2],#16
2889 subs $rounds,$rounds,#2
2890 aese $iv0,$dat1
2891 aesmc $iv0,$iv0
2892 vld1.32 {$dat1},[$key2],#16
2893 b.gt .Loop_dec_small_iv_enc
2894
2895 aese $iv0,$dat
2896 aesmc $iv0,$iv0
2897 vld1.32 {$dat},[$key2]
2898 aese $iv0,$dat1
2899 veor $iv0,$iv0,$dat
2900
2901 vld1.8 {$dat0},[$inp]
2902 veor $dat0,$iv0,$dat0
2903
2904 ldr $rounds,[$key1,#240]
2905 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2906
2907 aesd $dat0,q20
2908 aesimc $dat0,$dat0
2909 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2910 aesd $dat0,q21
2911 aesimc $dat0,$dat0
2912 subs $rounds,$rounds,#10 // bias
2913 b.eq .Lxts_128_dec
2914.Lxts_dec_round_loop:
2915 aesd $dat0,q8
2916 aesimc $dat0,$dat0
2917 vld1.32 {q8},[$key1],#16 // load key schedule...
2918 aesd $dat0,q9
2919 aesimc $dat0,$dat0
2920 vld1.32 {q9},[$key1],#16 // load key schedule...
2921 subs $rounds,$rounds,#2 // bias
2922 b.gt .Lxts_dec_round_loop
2923.Lxts_128_dec:
2924 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2925 aesd $dat0,q8
2926 aesimc $dat0,$dat0
2927 aesd $dat0,q9
2928 aesimc $dat0,$dat0
2929 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2930 aesd $dat0,q10
2931 aesimc $dat0,$dat0
2932 aesd $dat0,q11
2933 aesimc $dat0,$dat0
2934 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2935 aesd $dat0,q12
2936 aesimc $dat0,$dat0
2937 aesd $dat0,q13
2938 aesimc $dat0,$dat0
2939 vld1.32 {$rndlast},[$key1]
2940 aesd $dat0,q14
2941 aesimc $dat0,$dat0
2942 aesd $dat0,q15
2943 veor $dat0,$dat0,$rndlast
2944 veor $dat0,$iv0,$dat0
2945 vst1.8 {$dat0},[$out]
2946 b .Lxts_dec_final_abort
2947.Lxts_dec_big_size:
2948___
2949$code.=<<___ if ($flavour =~ /64/);
2950 stp $constnumx,$tmpinp,[sp,#-64]!
2951 stp $tailcnt,$midnumx,[sp,#48]
2952 stp $ivd10,$ivd20,[sp,#32]
2953 stp $ivd30,$ivd40,[sp,#16]
2954
2955 and $tailcnt,$len,#0xf
2956 and $len,$len,#-16
2957 subs $len,$len,#16
2958 mov $step,#16
2959 b.lo .Lxts_dec_abort
2960
2961 // Encrypt the iv with key2, as the first XEX iv
2962 ldr $rounds,[$key2,#240]
2963 vld1.8 {$dat},[$key2],#16
2964 vld1.8 {$iv0},[$ivp]
2965 sub $rounds,$rounds,#2
2966 vld1.8 {$dat1},[$key2],#16
2967
2968.Loop_dec_iv_enc:
2969 aese $iv0,$dat
2970 aesmc $iv0,$iv0
2971 vld1.32 {$dat},[$key2],#16
2972 subs $rounds,$rounds,#2
2973 aese $iv0,$dat1
2974 aesmc $iv0,$iv0
2975 vld1.32 {$dat1},[$key2],#16
2976 b.gt .Loop_dec_iv_enc
2977
2978 aese $iv0,$dat
2979 aesmc $iv0,$iv0
2980 vld1.32 {$dat},[$key2]
2981 aese $iv0,$dat1
2982 veor $iv0,$iv0,$dat
2983
2984 // The iv for second block
2985 // $ivl- iv(low), $ivh - iv(high)
2986 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2987 fmov $ivl,$ivd00
2988 fmov $ivh,$ivd01
2989 mov $constnum,#0x87
2990 extr $midnumx,$ivh,$ivh,#32
2991 extr $ivh,$ivh,$ivl,#63
2992 and $tmpmw,$constnum,$midnum,asr #31
2993 eor $ivl,$tmpmx,$ivl,lsl #1
2994 fmov $ivd10,$ivl
2995 fmov $ivd11,$ivh
2996
2997 ldr $rounds0,[$key1,#240] // load rounds number
2998
2999 // The iv for third block
3000 extr $midnumx,$ivh,$ivh,#32
3001 extr $ivh,$ivh,$ivl,#63
3002 and $tmpmw,$constnum,$midnum,asr #31
3003 eor $ivl,$tmpmx,$ivl,lsl #1
3004 fmov $ivd20,$ivl
3005 fmov $ivd21,$ivh
3006
3007 vld1.32 {q8-q9},[$key1] // load key schedule...
3008 sub $rounds0,$rounds0,#6
3009 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3010 sub $rounds0,$rounds0,#2
3011 vld1.32 {q10-q11},[$key_],#32 // load key schedule...
3012 vld1.32 {q12-q13},[$key_],#32
3013 vld1.32 {q14-q15},[$key_],#32
3014 vld1.32 {$rndlast},[$key_]
3015
3016 // The iv for fourth block
3017 extr $midnumx,$ivh,$ivh,#32
3018 extr $ivh,$ivh,$ivl,#63
3019 and $tmpmw,$constnum,$midnum,asr #31
3020 eor $ivl,$tmpmx,$ivl,lsl #1
3021 fmov $ivd30,$ivl
3022 fmov $ivd31,$ivh
3023
3024 add $key_,$key1,#32
3025 mov $rounds,$rounds0
3026 b .Lxts_dec
3027
3028 // Decryption
3029.align 5
3030.Lxts_dec:
3031 tst $tailcnt,#0xf
3032 b.eq .Lxts_dec_begin
3033 subs $len,$len,#16
3034 csel $step,xzr,$step,eq
3035 vld1.8 {$dat},[$inp],#16
3036 b.lo .Lxts_done
3037 sub $inp,$inp,#16
3038.Lxts_dec_begin:
3039 vld1.8 {$dat},[$inp],$step
3040 subs $len,$len,#32 // bias
3041 add $rounds,$rounds0,#2
3042 vorr $in1,$dat,$dat
3043 vorr $dat1,$dat,$dat
3044 vorr $in3,$dat,$dat
3045 vld1.8 {$dat2},[$inp],#16
3046 vorr $in2,$dat2,$dat2
3047 vorr $in4,$dat2,$dat2
3048 b.lo .Lxts_inner_dec_tail
3049 veor $dat,$dat,$iv0 // before decrypt, xor with iv
3050 veor $dat2,$dat2,$iv1
3051
3052 vorr $dat1,$dat2,$dat2
3053 vld1.8 {$dat2},[$inp],#16
3054 vorr $in0,$dat,$dat
3055 vorr $in1,$dat1,$dat1
3056 veor $in2,$dat2,$iv2 // third block xor with third iv
3057 veor $dat2,$dat2,$iv2
3058 cmp $len,#32
3059 b.lo .Lxts_outer_dec_tail
3060
3061 vld1.8 {$dat3},[$inp],#16
3062
3063 // The iv for fifth block
3064 extr $midnumx,$ivh,$ivh,#32
3065 extr $ivh,$ivh,$ivl,#63
3066 and $tmpmw,$constnum,$midnum,asr #31
3067 eor $ivl,$tmpmx,$ivl,lsl #1
3068 fmov $ivd40,$ivl
3069 fmov $ivd41,$ivh
3070
3071 vld1.8 {$dat4},[$inp],#16
3072 veor $dat3,$dat3,$iv3 // the fourth block
3073 veor $dat4,$dat4,$iv4
3074 sub $len,$len,#32 // bias
3075 mov $rounds,$rounds0
3076 b .Loop5x_xts_dec
3077
3078.align 4
3079.Loop5x_xts_dec:
3080 aesd $dat0,q8
3081 aesimc $dat0,$dat0
3082 aesd $dat1,q8
3083 aesimc $dat1,$dat1
3084 aesd $dat2,q8
3085 aesimc $dat2,$dat2
3086 aesd $dat3,q8
3087 aesimc $dat3,$dat3
3088 aesd $dat4,q8
3089 aesimc $dat4,$dat4
3090 vld1.32 {q8},[$key_],#16 // load key schedule...
3091 subs $rounds,$rounds,#2
3092 aesd $dat0,q9
3093 aesimc $dat0,$dat0
3094 aesd $dat1,q9
3095 aesimc $dat1,$dat1
3096 aesd $dat2,q9
3097 aesimc $dat2,$dat2
3098 aesd $dat3,q9
3099 aesimc $dat3,$dat3
3100 aesd $dat4,q9
3101 aesimc $dat4,$dat4
3102 vld1.32 {q9},[$key_],#16 // load key schedule...
3103 b.gt .Loop5x_xts_dec
3104
3105 aesd $dat0,q8
3106 aesimc $dat0,$dat0
3107 aesd $dat1,q8
3108 aesimc $dat1,$dat1
3109 aesd $dat2,q8
3110 aesimc $dat2,$dat2
3111 aesd $dat3,q8
3112 aesimc $dat3,$dat3
3113 aesd $dat4,q8
3114 aesimc $dat4,$dat4
3115 subs $len,$len,#0x50 // because .Lxts_dec_tail4x
3116
3117 aesd $dat0,q9
3118 aesimc $dat0,$dat
3119 aesd $dat1,q9
3120 aesimc $dat1,$dat1
3121 aesd $dat2,q9
3122 aesimc $dat2,$dat2
3123 aesd $dat3,q9
3124 aesimc $dat3,$dat3
3125 aesd $dat4,q9
3126 aesimc $dat4,$dat4
3127 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3128 mov $key_,$key1
3129
3130 aesd $dat0,q10
3131 aesimc $dat0,$dat0
3132 aesd $dat1,q10
3133 aesimc $dat1,$dat1
3134 aesd $dat2,q10
3135 aesimc $dat2,$dat2
3136 aesd $dat3,q10
3137 aesimc $dat3,$dat3
3138 aesd $dat4,q10
3139 aesimc $dat4,$dat4
3140 add $inp,$inp,$xoffset // x0 is adjusted in such way that
3141 // at exit from the loop v1.16b-v26.16b
3142 // are loaded with last "words"
3143 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3144
3145 aesd $dat0,q11
3146 aesimc $dat0,$dat0
3147 aesd $dat1,q11
3148 aesimc $dat1,$dat1
3149 aesd $dat2,q11
3150 aesimc $dat2,$dat2
3151 aesd $dat3,q11
3152 aesimc $dat3,$dat3
3153 aesd $dat4,q11
3154 aesimc $dat4,$dat4
3155
3156 aesd $dat0,q12
3157 aesimc $dat0,$dat0
3158 aesd $dat1,q12
3159 aesimc $dat1,$dat1
3160 aesd $dat2,q12
3161 aesimc $dat2,$dat2
3162 aesd $dat3,q12
3163 aesimc $dat3,$dat3
3164 aesd $dat4,q12
3165 aesimc $dat4,$dat4
3166
3167 aesd $dat0,q13
3168 aesimc $dat0,$dat0
3169 aesd $dat1,q13
3170 aesimc $dat1,$dat1
3171 aesd $dat2,q13
3172 aesimc $dat2,$dat2
3173 aesd $dat3,q13
3174 aesimc $dat3,$dat3
3175 aesd $dat4,q13
3176 aesimc $dat4,$dat4
3177
3178 aesd $dat0,q14
3179 aesimc $dat0,$dat0
3180 aesd $dat1,q14
3181 aesimc $dat1,$dat1
3182 aesd $dat2,q14
3183 aesimc $dat2,$dat2
3184 aesd $dat3,q14
3185 aesimc $dat3,$dat3
3186 aesd $dat4,q14
3187 aesimc $dat4,$dat4
3188
3189 veor $tmp0,$rndlast,$iv0
3190 aesd $dat0,q15
3191 // The iv for first block of next iteration.
3192 extr $midnumx,$ivh,$ivh,#32
3193 extr $ivh,$ivh,$ivl,#63
3194 and $tmpmw,$constnum,$midnum,asr #31
3195 eor $ivl,$tmpmx,$ivl,lsl #1
3196 fmov $ivd00,$ivl
3197 fmov $ivd01,$ivh
3198 veor $tmp1,$rndlast,$iv1
3199 vld1.8 {$in0},[$inp],#16
3200 aesd $dat1,q15
3201 // The iv for second block
3202 extr $midnumx,$ivh,$ivh,#32
3203 extr $ivh,$ivh,$ivl,#63
3204 and $tmpmw,$constnum,$midnum,asr #31
3205 eor $ivl,$tmpmx,$ivl,lsl #1
3206 fmov $ivd10,$ivl
3207 fmov $ivd11,$ivh
3208 veor $tmp2,$rndlast,$iv2
3209 vld1.8 {$in1},[$inp],#16
3210 aesd $dat2,q15
3211 // The iv for third block
3212 extr $midnumx,$ivh,$ivh,#32
3213 extr $ivh,$ivh,$ivl,#63
3214 and $tmpmw,$constnum,$midnum,asr #31
3215 eor $ivl,$tmpmx,$ivl,lsl #1
3216 fmov $ivd20,$ivl
3217 fmov $ivd21,$ivh
3218 veor $tmp3,$rndlast,$iv3
3219 vld1.8 {$in2},[$inp],#16
3220 aesd $dat3,q15
3221 // The iv for fourth block
3222 extr $midnumx,$ivh,$ivh,#32
3223 extr $ivh,$ivh,$ivl,#63
3224 and $tmpmw,$constnum,$midnum,asr #31
3225 eor $ivl,$tmpmx,$ivl,lsl #1
3226 fmov $ivd30,$ivl
3227 fmov $ivd31,$ivh
3228 veor $tmp4,$rndlast,$iv4
3229 vld1.8 {$in3},[$inp],#16
3230 aesd $dat4,q15
3231
3232 // The iv for fifth block
3233 extr $midnumx,$ivh,$ivh,#32
3234 extr $ivh,$ivh,$ivl,#63
3235 and $tmpmw,$constnum,$midnum,asr #31
3236 eor $ivl,$tmpmx,$ivl,lsl #1
3237 fmov $ivd40,$ivl
3238 fmov $ivd41,$ivh
3239
3240 vld1.8 {$in4},[$inp],#16
3241 cbz $xoffset,.Lxts_dec_tail4x
3242 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3243 veor $tmp0,$tmp0,$dat0
3244 veor $dat0,$in0,$iv0
3245 veor $tmp1,$tmp1,$dat1
3246 veor $dat1,$in1,$iv1
3247 veor $tmp2,$tmp2,$dat2
3248 veor $dat2,$in2,$iv2
3249 veor $tmp3,$tmp3,$dat3
3250 veor $dat3,$in3,$iv3
3251 veor $tmp4,$tmp4,$dat4
3252 vst1.8 {$tmp0},[$out],#16
3253 veor $dat4,$in4,$iv4
3254 vst1.8 {$tmp1},[$out],#16
3255 mov $rounds,$rounds0
3256 vst1.8 {$tmp2},[$out],#16
3257 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3258 vst1.8 {$tmp3},[$out],#16
3259 vst1.8 {$tmp4},[$out],#16
3260 b.hs .Loop5x_xts_dec
3261
3262 cmn $len,#0x10
3263 b.ne .Loop5x_dec_after
3264 // If x2($len) equal to -0x10, the left blocks is 4.
3265 // After specially processing, utilize the five blocks processing again.
3266 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
3267 vorr $iv4,$iv3,$iv3
3268 vorr $iv3,$iv2,$iv2
3269 vorr $iv2,$iv1,$iv1
3270 vorr $iv1,$iv0,$iv0
3271 fmov $ivl,$ivd40
3272 fmov $ivh,$ivd41
3273 veor $dat0,$iv0,$in0
3274 veor $dat1,$iv1,$in1
3275 veor $dat2,$in2,$iv2
3276 veor $dat3,$in3,$iv3
3277 veor $dat4,$in4,$iv4
3278 b.eq .Loop5x_xts_dec
3279
3280.Loop5x_dec_after:
3281 add $len,$len,#0x50
3282 cbz $len,.Lxts_done
3283
3284 add $rounds,$rounds0,#2
3285 subs $len,$len,#0x30
3286 b.lo .Lxts_inner_dec_tail
3287
3288 veor $dat0,$iv0,$in2
3289 veor $dat1,$iv1,$in3
3290 veor $dat2,$in4,$iv2
3291 b .Lxts_outer_dec_tail
3292
3293.align 4
3294.Lxts_dec_tail4x:
3295 add $inp,$inp,#16
3296 vld1.32 {$dat0},[$inp],#16
3297 veor $tmp1,$dat1,$tmp0
3298 vst1.8 {$tmp1},[$out],#16
3299 veor $tmp2,$dat2,$tmp2
3300 vst1.8 {$tmp2},[$out],#16
3301 veor $tmp3,$dat3,$tmp3
3302 veor $tmp4,$dat4,$tmp4
3303 vst1.8 {$tmp3-$tmp4},[$out],#32
3304
3305 b .Lxts_done
3306.align 4
3307.Lxts_outer_dec_tail:
3308 aesd $dat0,q8
3309 aesimc $dat0,$dat0
3310 aesd $dat1,q8
3311 aesimc $dat1,$dat1
3312 aesd $dat2,q8
3313 aesimc $dat2,$dat2
3314 vld1.32 {q8},[$key_],#16
3315 subs $rounds,$rounds,#2
3316 aesd $dat0,q9
3317 aesimc $dat0,$dat0
3318 aesd $dat1,q9
3319 aesimc $dat1,$dat1
3320 aesd $dat2,q9
3321 aesimc $dat2,$dat2
3322 vld1.32 {q9},[$key_],#16
3323 b.gt .Lxts_outer_dec_tail
3324
3325 aesd $dat0,q8
3326 aesimc $dat0,$dat0
3327 aesd $dat1,q8
3328 aesimc $dat1,$dat1
3329 aesd $dat2,q8
3330 aesimc $dat2,$dat2
3331 veor $tmp0,$iv0,$rndlast
3332 subs $len,$len,#0x30
3333 // The iv for first block
3334 fmov $ivl,$ivd20
3335 fmov $ivh,$ivd21
3336 mov $constnum,#0x87
3337 extr $midnumx,$ivh,$ivh,#32
3338 extr $ivh,$ivh,$ivl,#63
3339 and $tmpmw,$constnum,$midnum,asr #31
3340 eor $ivl,$tmpmx,$ivl,lsl #1
3341 fmov $ivd00,$ivl
3342 fmov $ivd01,$ivh
3343 veor $tmp1,$iv1,$rndlast
3344 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
3345 aesd $dat0,q9
3346 aesimc $dat0,$dat0
3347 aesd $dat1,q9
3348 aesimc $dat1,$dat1
3349 aesd $dat2,q9
3350 aesimc $dat2,$dat2
3351 veor $tmp2,$iv2,$rndlast
3352 // The iv for second block
3353 extr $midnumx,$ivh,$ivh,#32
3354 extr $ivh,$ivh,$ivl,#63
3355 and $tmpmw,$constnum,$midnum,asr #31
3356 eor $ivl,$tmpmx,$ivl,lsl #1
3357 fmov $ivd10,$ivl
3358 fmov $ivd11,$ivh
3359
3360 add $xoffset,$xoffset,#0x20
3361 add $inp,$inp,$xoffset // $inp is adjusted to the last data
3362
3363 mov $key_,$key1
3364
3365 // The iv for third block
3366 extr $midnumx,$ivh,$ivh,#32
3367 extr $ivh,$ivh,$ivl,#63
3368 and $tmpmw,$constnum,$midnum,asr #31
3369 eor $ivl,$tmpmx,$ivl,lsl #1
3370 fmov $ivd20,$ivl
3371 fmov $ivd21,$ivh
3372
3373 aesd $dat0,q12
3374 aesimc $dat0,$dat0
3375 aesd $dat1,q12
3376 aesimc $dat1,$dat1
3377 aesd $dat2,q12
3378 aesimc $dat2,$dat2
3379 aesd $dat0,q13
3380 aesimc $dat0,$dat0
3381 aesd $dat1,q13
3382 aesimc $dat1,$dat1
3383 aesd $dat2,q13
3384 aesimc $dat2,$dat2
3385 aesd $dat0,q14
3386 aesimc $dat0,$dat0
3387 aesd $dat1,q14
3388 aesimc $dat1,$dat1
3389 aesd $dat2,q14
3390 aesimc $dat2,$dat2
3391 vld1.8 {$in2},[$inp],#16
3392 aesd $dat0,q15
3393 aesd $dat1,q15
3394 aesd $dat2,q15
3395 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3396 add $rounds,$rounds0,#2
3397 veor $tmp0,$tmp0,$dat0
3398 veor $tmp1,$tmp1,$dat1
3399 veor $dat2,$dat2,$tmp2
3400 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3401 vst1.8 {$tmp0},[$out],#16
3402 vst1.8 {$tmp1},[$out],#16
3403 vst1.8 {$dat2},[$out],#16
3404
3405 cmn $len,#0x30
3406 add $len,$len,#0x30
3407 b.eq .Lxts_done
3408 sub $len,$len,#0x30
3409 vorr $in3,$in1,$in1
3410 vorr $in4,$in2,$in2
3411 nop
3412
3413.Lxts_inner_dec_tail:
3414 // $len == -0x10 means two blocks left.
3415 cmn $len,#0x10
3416 veor $dat1,$in3,$iv0
3417 veor $dat2,$in4,$iv1
3418 b.eq .Lxts_dec_tail_loop
3419 veor $dat2,$in4,$iv0
3420.Lxts_dec_tail_loop:
3421 aesd $dat1,q8
3422 aesimc $dat1,$dat1
3423 aesd $dat2,q8
3424 aesimc $dat2,$dat2
3425 vld1.32 {q8},[$key_],#16
3426 subs $rounds,$rounds,#2
3427 aesd $dat1,q9
3428 aesimc $dat1,$dat1
3429 aesd $dat2,q9
3430 aesimc $dat2,$dat2
3431 vld1.32 {q9},[$key_],#16
3432 b.gt .Lxts_dec_tail_loop
3433
3434 aesd $dat1,q8
3435 aesimc $dat1,$dat1
3436 aesd $dat2,q8
3437 aesimc $dat2,$dat2
3438 aesd $dat1,q9
3439 aesimc $dat1,$dat1
3440 aesd $dat2,q9
3441 aesimc $dat2,$dat2
3442 aesd $dat1,q12
3443 aesimc $dat1,$dat1
3444 aesd $dat2,q12
3445 aesimc $dat2,$dat2
3446 cmn $len,#0x20
3447 aesd $dat1,q13
3448 aesimc $dat1,$dat1
3449 aesd $dat2,q13
3450 aesimc $dat2,$dat2
3451 veor $tmp1,$iv0,$rndlast
3452 aesd $dat1,q14
3453 aesimc $dat1,$dat1
3454 aesd $dat2,q14
3455 aesimc $dat2,$dat2
3456 veor $tmp2,$iv1,$rndlast
3457 aesd $dat1,q15
3458 aesd $dat2,q15
3459 b.eq .Lxts_dec_one
3460 veor $tmp1,$tmp1,$dat1
3461 veor $tmp2,$tmp2,$dat2
3462 vorr $iv0,$iv2,$iv2
3463 vorr $iv1,$iv3,$iv3
3464 vst1.8 {$tmp1},[$out],#16
3465 vst1.8 {$tmp2},[$out],#16
3466 add $len,$len,#16
3467 b .Lxts_done
3468
3469.Lxts_dec_one:
3470 veor $tmp1,$tmp1,$dat2
3471 vorr $iv0,$iv1,$iv1
3472 vorr $iv1,$iv2,$iv2
3473 vst1.8 {$tmp1},[$out],#16
3474 add $len,$len,#32
3475
3476.Lxts_done:
3477 tst $tailcnt,#0xf
3478 b.eq .Lxts_dec_abort
3479 // Processing the last two blocks with cipher stealing.
3480 mov x7,x3
3481 cbnz x2,.Lxts_dec_1st_done
3482 vld1.32 {$dat0},[$inp],#16
3483
3484 // Decrypt the second-to-last block to get the last plain-text block
3485.Lxts_dec_1st_done:
3486 eor $tmpin,$dat0,$iv1
3487 ldr $rounds,[$key1,#240]
3488 vld1.32 {$dat0},[$key1],#16
3489 sub $rounds,$rounds,#2
3490 vld1.32 {$dat1},[$key1],#16
3491.Loop_final_2nd_dec:
3492 aesd $tmpin,$dat0
3493 aesimc $tmpin,$tmpin
3494 vld1.32 {$dat0},[$key1],#16 // load key schedule...
3495 subs $rounds,$rounds,#2
3496 aesd $tmpin,$dat1
3497 aesimc $tmpin,$tmpin
3498 vld1.32 {$dat1},[$key1],#16 // load key schedule...
3499 b.gt .Loop_final_2nd_dec
3500
3501 aesd $tmpin,$dat0
3502 aesimc $tmpin,$tmpin
3503 vld1.32 {$dat0},[$key1]
3504 aesd $tmpin,$dat1
3505 veor $tmpin,$tmpin,$dat0
3506 veor $tmpin,$tmpin,$iv1
3507 vst1.8 {$tmpin},[$out]
3508
3509 mov $tmpinp,$inp
3510 add $tmpoutp,$out,#16
3511
3512 // Combine the $tailcnt trailing (unaligned) bytes with the second-to-last
3513 // plain-text block to form the last composite block.
3514.composite_dec_loop:
3515 subs $tailcnt,$tailcnt,#1
3516 ldrb $l2outp,[$out,$tailcnt]
3517 ldrb $loutp,[$tmpinp,$tailcnt]
3518 strb $l2outp,[$tmpoutp,$tailcnt]
3519 strb $loutp,[$out,$tailcnt]
3520 b.gt .composite_dec_loop
3521.Lxts_dec_load_done:
3522 vld1.8 {$tmpin},[$out]
3523 veor $tmpin,$tmpin,$iv0
3524
3525 // Decrypt the composite block to get the last second plain text block
3526 ldr $rounds,[$key_,#240]
3527 vld1.8 {$dat},[$key_],#16
3528 sub $rounds,$rounds,#2
3529 vld1.8 {$dat1},[$key_],#16
3530.Loop_final_dec:
3531 aesd $tmpin,$dat0
3532 aesimc $tmpin,$tmpin
3533 vld1.32 {$dat0},[$key_],#16 // load key schedule...
3534 subs $rounds,$rounds,#2
3535 aesd $tmpin,$dat1
3536 aesimc $tmpin,$tmpin
3537 vld1.32 {$dat1},[$key_],#16 // load key schedule...
3538 b.gt .Loop_final_dec
3539
3540 aesd $tmpin,$dat0
3541 aesimc $tmpin,$tmpin
3542 vld1.32 {$dat0},[$key_]
3543 aesd $tmpin,$dat1
3544 veor $tmpin,$tmpin,$dat0
3545 veor $tmpin,$tmpin,$iv0
3546 vst1.8 {$tmpin},[$out]
3547
3548.Lxts_dec_abort:
3549 ldp $tailcnt,$midnumx,[sp,#48]
3550 ldp $ivd10,$ivd20,[sp,#32]
3551 ldp $ivd30,$ivd40,[sp,#16]
3552 ldp $constnumx,$tmpinp,[sp],#64
3553
3554.Lxts_dec_final_abort:
3555 ret
3556.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
3557___
3558}
3559}}}
# Close the preprocessor conditional opened earlier in the file
# (outside this chunk) around the hardware-AES code.
$code.=<<___;
#endif
___
3563########################################
########################################
# Post-process the accumulated $code template into real assembler syntax
# for the requested flavour, then print the result to STDOUT.
# NOTE: the order of the substitutions below is significant — later rules
# assume earlier rewrites have already been applied.
if ($flavour =~ /64/) {			######## 64-bit code
    # Raw AArch64 encodings of the AES instructions; only consumed by the
    # unaes() fallback below, whose substitution is currently commented out.
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES mnemonic with a "qN,qM"/"vN,vM" operand pair as a raw
    # .inst word, for assemblers lacking the crypto-extension mnemonics.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
		$opcode{$mnemonic}|$1|($2<<5),
		$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	# Expand `...` constructs by evaluating them as Perl.
	s/\`([^\`]*)\`/eval($1)/geo;

	# q0-q7 map straight to v0-v7; q8 and above are shifted up by 8.
	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # NEON-coprocessor encodings of the ARMv7 AES instructions.
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Emit an AES instruction as an INST(...) byte-list macro; see the
    # comment inside for why a raw .inst word is not used here.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Split a 128-bit (q-register) vtbl into the two 64-bit vtbl.8 halves
    # that the ARMv7 instruction set provides.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
		"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Rewrite a q-register lane duplicate as vdup.32 from the correct
    # d-register half and lane.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Rewrite a 32-bit lane move into a q register as a d-register vmov.32.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	# Expand `...` constructs by evaluating them as Perl.
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	# Predicated mov: emit the matching "it <cond>" instruction first
	# (NOTE(review): presumably required for Thumb-2 assembly — confirm).
	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print " it $2\n";
	}

	print $_,"\n";
    }
}
3673
3674close STDOUT or die "error closing STDOUT: $!";
# (web-page footer residue removed — not part of the original source file)