VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.7/crypto/aes/asm/aesv8-armx.pl@ 97371

最後變更 在這個檔案從97371是 94082,由 vboxsync 提交於 3 年 前

libs/openssl-3.0.1: started applying and adjusting our OpenSSL changes to 3.0.1. bugref:10128

  • 屬性 svn:executable 設為 *
檔案大小: 79.5 KB
 
1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. As does it support both 32- and 64-bit modes
20# of operation. Latter is achieved by limiting amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# April 2019
31#
32# Key to performance of parallelize-able modes is round instruction
33# interleaving. But which factor to use? There is optimal one for
34# each combination of instruction latency and issue rate, beyond
35# which increasing interleave factor doesn't pay off. While on cons
36# side we have code size increase and resource waste on platforms for
37# which interleave factor is too high. In other words you want it to
38# be just right. So far interleave factor of 3x was serving well all
39# platforms. But for ThunderX2 optimal interleave factor was measured
40# to be 5x...
41#
42# Performance in cycles per byte processed with 128-bit key:
43#
44# CBC enc CBC dec CTR
45# Apple A7 2.39 1.20 1.20
46# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48# Cortex-A72 1.33 0.85/0.88 0.92/0.96
49# Denver 1.96 0.65/0.86 0.76/0.80
50# Mongoose 1.33 1.23/1.20 1.30/1.20
51# Kryo 1.26 0.87/0.94 1.00/1.00
52# ThunderX2 5.95 1.25 1.30
53#
54# (*) original 3.64/1.34/1.32 results were for r0p0 revision
55# and are still same even for updated module;
56# (**) numbers after slash are for 32-bit code, which is 3x-
57# interleaved;
58
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm translator next to this script or in the sibling
# perlasm/ directory.  NOTE(review): if $0 carries no path separator the
# match fails and $dir keeps a stale value; the -f probes below then fall
# through to die, which is the intended failure mode.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through arm-xlate.pl, which transliterates the
# flavour-neutral mnemonics below into real 32- or 64-bit assembly.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Symbol prefix for all exported entry points (aes_v8_set_encrypt_key etc.).
$prefix="aes_v8";

# Windows (armasm) flavours spell the raw-byte directive DCB, not .byte.
$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
# Common file header: guarded by __ARM_MAX_ARCH__ so the module compiles
# away cleanly on pre-v7 targets.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
# 64-bit flavour: just enable the crypto extension and open .text.
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
# 32-bit flavour: claim armv7-a for old binutils and define INST() so the
# AES opcodes (not known to old assemblers) can be emitted as raw bytes,
# with the Thumb-2 variant adjusting the third byte (d|0xc) and byte order.
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,d|0xc,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,d
#endif

.text
___
97
98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
102#
{{{
# --- Key schedule setup: ${prefix}_set_encrypt_key / _set_decrypt_key ---
# Integer operands; arm-xlate.pl maps the x/w names for the 32-bit flavour.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON working set: q0-q6 in 64-bit mode, q0-q3/q8-q10 in 32-bit mode, so
# the same code fits both ABIs' callee-save rules.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table (.Lrcon) and the rotate-n-splat byte-shuffle mask,
# followed by the set_encrypt_key entry point.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# 64-bit only: minimal frame so .Lenc_key can be bl'ed from set_decrypt_key.
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument validation (-1 = NULL pointer, -2 = bad bit length), then the
# three expansion paths for 128/192/256-bit keys.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
# 64-bit prologue: sign the return address (paciasp) and set up a frame.
$code.=<<___ if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# 32-bit prologue: save r4 and lr.
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Decrypt schedule = encrypt schedule reversed, with InvMixColumns applied
# to all but the first and last round keys (swap from both ends inward).
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
# 64-bit epilogue: restore frame and authenticate LR (autiasp).
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block($dir) -- emit the single-block routine ${prefix}_encrypt or
# ${prefix}_decrypt for $dir eq "en" / "de".  Appends assembly to $code.
#
# Fix: the original declared `sub gen_block ()` -- an empty prototype on a
# sub that takes an argument.  That only compiled because the call sites
# used `&gen_block(...)`, which bypasses prototype checking.  Prototypes in
# Perl are a parser feature, not argument validation, and `&sub(...)` calls
# are discouraged; drop the prototype and call the sub plainly.  Generated
# assembly is unchanged.
sub gen_block {
my $dir = shift;				# "en" or "de"
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));		# AES block in/out, key schedule
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

# Two round keys stay in flight while the next pair streams in; the loop
# counter starts at rounds-2 because the final two rounds are peeled off
# (the last round has no MixColumns and is folded into the veor).
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
392
393# Performance in cycles per byte.
394# Processed with AES-ECB different key size.
395# It shows the value before and after optimization as below:
396# (before/after):
397#
398# AES-128-ECB AES-192-ECB AES-256-ECB
399# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
400# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
401
402# Optimization is implemented by loop unrolling and interleaving.
403# Commonly, we choose the unrolling factor as 5, if the input
404# data size smaller than 5 blocks, but not smaller than 3 blocks,
405# choose 3 as the unrolling factor.
406# If the input data size dsize >= 5*16 bytes, then take 5 blocks
407# as one iteration, every loop the left size lsize -= 5*16.
408# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
409# every loop lsize -=3*16.
410# If lsize < 3*16 bytes, treat them as the tail, interleave the
411# two blocks AES instructions.
412# There is one special case, if the original input data size dsize
# = 16 bytes, we will treat it separately to improve the
414# performance: one independent code block without LR, FP load and
415# store, just looks like what the original ECB implementation does.
416
{{{
# --- ${prefix}_ecb_encrypt: bulk ECB en/decryption ---
# x0..x3 = inp/out/len/key, w4 = enc flag, w5 = rounds, w6 = round counter,
# x7 = rotating round-key pointer, x8 = post-increment step.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7	last round key
### q10-q15	q7 Last 7 round keys
### q8-q9	preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9	preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
# 64-bit mode has 32 NEON registers, so the 5x-interleaved path gets its
# own block registers q16-q23; 32-bit mode stays with the 3x path.
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
# 64-bit only: dedicated single-block (exactly 16 bytes) fast path with no
# frame setup; both encrypt and decrypt variants, with a shortcut for the
# 10-round (AES-128) schedule.
$code.=<<___ if ($flavour =~ /64/);
	subs	$len,$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32	// load key schedule...

	b.eq	.Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
# Multi-block path prologues: 64-bit sets a frame; 32-bit saves the
# callee-saved integer and d8-d15 NEON registers and fetches stack args.
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
	subs	$len,$len,#16
___
# Shared setup: preload first two and last seven round keys, then branch
# to the decrypt side if $enc==0; otherwise fall into the encrypt side.
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved encrypt main loop (optimal for ThunderX2 and
# friends); falls back to the 3x loop for <5 remaining blocks.
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# Both flavours: 3x-interleaved encrypt loop plus the <=2-block tail.
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

# Decrypt side: same structure as the encrypt side with aesd/aesimc.
$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved decrypt main loop.
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# Both flavours: 3x-interleaved decrypt loop plus the <=2-block tail.
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
# Epilogues: restore saved registers per flavour, then the shared .size.
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___ if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
1222{{{
# ${prefix}_cbc_encrypt(const unsigned char *inp, unsigned char *out,
#			size_t len, const AES_KEY *key,
#			unsigned char ivec[16], const int enc);
# Arguments arrive in x0-x4 (r0-r4 on 32-bit) with the en-/decrypt
# selector in w5 (r5); the register maps below name them.
# NOTE(review): prototype inferred from the register usage below -
# confirm against the C caller.
1223my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1224my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1225my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1226
1227my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
1228my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1229
1230### q8-q15 preloaded key schedule
1231
# Symbol/alignment preamble shared by both flavours.
1232$code.=<<___;
1233.globl	${prefix}_cbc_encrypt
1234.type	${prefix}_cbc_encrypt,%function
1235.align	5
1236${prefix}_cbc_encrypt:
1237___
# AArch64 prologue: minimal FP/LR frame.
1238$code.=<<___ if ($flavour =~ /64/);
1239	stp	x29,x30,[sp,#-16]!
1240	add	x29,sp,#0
1241___
# AArch32 prologue: save r4-r8,lr and d8-d15 (per the AAPCS), then
# fetch the two stack-passed arguments (ivec, enc) into r4-r5.
1242$code.=<<___ if ($flavour !~ /64/);
1243	mov	ip,sp
1244	stmdb	sp!,{r4-r8,lr}
1245	vstmdb	sp!,{d8-d15}            @ ABI specification says so
1246	ldmia	ip,{r4-r5}		@ load remaining args
1247___
# Common setup: reject len<16, load the IV, the first input block and
# the whole key schedule (q8-q15 plus $rndlast), then branch to the
# decrypt path or fall through to encryption.  $rounds aliases
# w5/$enc, which is why x5 can be used below to locate the last 7
# round keys.
1248$code.=<<___;
1249	subs	$len,$len,#16
1250	mov	$step,#16
1251	b.lo	.Lcbc_abort
1252	cclr	$step,eq
1253
1254	cmp	$enc,#0			// en- or decrypting?
1255	ldr	$rounds,[$key,#240]
1256	and	$len,$len,#-16
1257	vld1.8	{$ivec},[$ivp]
1258	vld1.8	{$dat},[$inp],$step
1259
1260	vld1.32	{q8-q9},[$key]		// load key schedule...
1261	sub	$rounds,$rounds,#6
1262	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
1263	sub	$rounds,$rounds,#2
1264	vld1.32	{q10-q11},[$key_],#32
1265	vld1.32	{q12-q13},[$key_],#32
1266	vld1.32	{q14-q15},[$key_],#32
1267	vld1.32	{$rndlast},[$key_]
1268
1269	add	$key_,$key,#32
1270	mov	$cnt,$rounds
1271	b.eq	.Lcbc_dec
1272
1273	cmp	$rounds,#2
1274	veor	$dat,$dat,$ivec
1275	veor	$rndzero_n_last,q8,$rndlast
1276	b.eq	.Lcbc_enc128
1277
1278	vld1.32	{$in0-$in1},[$key_]
1279	add	$key_,$key,#16
1280	add	$key4,$key,#16*4
1281	add	$key5,$key,#16*5
1282	aese	$dat,q8
1283	aesmc	$dat,$dat
1284	add	$key6,$key,#16*6
1285	add	$key7,$key,#16*7
1286	b	.Lenter_cbc_enc
1287
1288.align	4
1289.Loop_cbc_enc:
1290	aese	$dat,q8
1291	aesmc	$dat,$dat
1292	vst1.8	{$ivec},[$out],#16
1293.Lenter_cbc_enc:
1294	aese	$dat,q9
1295	aesmc	$dat,$dat
1296	aese	$dat,$in0
1297	aesmc	$dat,$dat
1298	vld1.32	{q8},[$key4]
1299	cmp	$rounds,#4
1300	aese	$dat,$in1
1301	aesmc	$dat,$dat
1302	vld1.32	{q9},[$key5]
1303	b.eq	.Lcbc_enc192
1304
1305	aese	$dat,q8
1306	aesmc	$dat,$dat
1307	vld1.32	{q8},[$key6]
1308	aese	$dat,q9
1309	aesmc	$dat,$dat
1310	vld1.32	{q9},[$key7]
1311	nop
1312
1313.Lcbc_enc192:
1314	aese	$dat,q8
1315	aesmc	$dat,$dat
1316	subs	$len,$len,#16
1317	aese	$dat,q9
1318	aesmc	$dat,$dat
1319	cclr	$step,eq
1320	aese	$dat,q10
1321	aesmc	$dat,$dat
1322	aese	$dat,q11
1323	aesmc	$dat,$dat
1324	vld1.8	{q8},[$inp],$step
1325	aese	$dat,q12
1326	aesmc	$dat,$dat
1327	veor	q8,q8,$rndzero_n_last
1328	aese	$dat,q13
1329	aesmc	$dat,$dat
1330	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
1331	aese	$dat,q14
1332	aesmc	$dat,$dat
1333	aese	$dat,q15
1334	veor	$ivec,$dat,$rndlast
1335	b.hs	.Loop_cbc_enc
1336
1337	vst1.8	{$ivec},[$out],#16
1338	b	.Lcbc_done
1339
1340.align	5
1341.Lcbc_enc128:
1342	vld1.32	{$in0-$in1},[$key_]
1343	aese	$dat,q8
1344	aesmc	$dat,$dat
1345	b	.Lenter_cbc_enc128
1346.Loop_cbc_enc128:
1347	aese	$dat,q8
1348	aesmc	$dat,$dat
1349	vst1.8	{$ivec},[$out],#16
1350.Lenter_cbc_enc128:
1351	aese	$dat,q9
1352	aesmc	$dat,$dat
1353	subs	$len,$len,#16
1354	aese	$dat,$in0
1355	aesmc	$dat,$dat
1356	cclr	$step,eq
1357	aese	$dat,$in1
1358	aesmc	$dat,$dat
1359	aese	$dat,q10
1360	aesmc	$dat,$dat
1361	aese	$dat,q11
1362	aesmc	$dat,$dat
1363	vld1.8	{q8},[$inp],$step
1364	aese	$dat,q12
1365	aesmc	$dat,$dat
1366	aese	$dat,q13
1367	aesmc	$dat,$dat
1368	aese	$dat,q14
1369	aesmc	$dat,$dat
1370	veor	q8,q8,$rndzero_n_last
1371	aese	$dat,q15
1372	veor	$ivec,$dat,$rndlast
1373	b.hs	.Loop_cbc_enc128
1374
1375	vst1.8	{$ivec},[$out],#16
1376	b	.Lcbc_done
1377___
1378{
# Decrypt path: processes 3 blocks at a time (5 at a time on 64-bit,
# where the extra q16-q23 registers are available) to interleave
# independent aesd/aesimc chains.
1379my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1380
1381my ($dat3,$in3,$tmp3);		# used only in 64-bit mode
1382my ($dat4,$in4,$tmp4);
1383if ($flavour =~ /64/) {
1384    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1385}
1386
# Load up to 3 blocks and fall into the tail for short inputs.
1387$code.=<<___;
1388.align	5
1389.Lcbc_dec:
1390	vld1.8	{$dat2},[$inp],#16
1391	subs	$len,$len,#32		// bias
1392	add	$cnt,$rounds,#2
1393	vorr	$in1,$dat,$dat
1394	vorr	$dat1,$dat,$dat
1395	vorr	$in2,$dat2,$dat2
1396	b.lo	.Lcbc_dec_tail
1397
1398	vorr	$dat1,$dat2,$dat2
1399	vld1.8	{$dat2},[$inp],#16
1400	vorr	$in0,$dat,$dat
1401	vorr	$in1,$dat1,$dat1
1402	vorr	$in2,$dat2,$dat2
1403___
# 64-bit only: 5x interleaved decrypt loop, with the 4-blocks-left
# case (.Lcbc_tail4x) folded into the same code path.
1404$code.=<<___ if ($flavour =~ /64/);
1405	cmp	$len,#32
1406	b.lo	.Loop3x_cbc_dec
1407
1408	vld1.8	{$dat3},[$inp],#16
1409	vld1.8	{$dat4},[$inp],#16
1410	sub	$len,$len,#32		// bias
1411	mov	$cnt,$rounds
1412	vorr	$in3,$dat3,$dat3
1413	vorr	$in4,$dat4,$dat4

1415.Loop5x_cbc_dec:
1416	aesd	$dat0,q8
1417	aesimc	$dat0,$dat0
1418	aesd	$dat1,q8
1419	aesimc	$dat1,$dat1
1420	aesd	$dat2,q8
1421	aesimc	$dat2,$dat2
1422	aesd	$dat3,q8
1423	aesimc	$dat3,$dat3
1424	aesd	$dat4,q8
1425	aesimc	$dat4,$dat4
1426	vld1.32	{q8},[$key_],#16
1427	subs	$cnt,$cnt,#2
1428	aesd	$dat0,q9
1429	aesimc	$dat0,$dat0
1430	aesd	$dat1,q9
1431	aesimc	$dat1,$dat1
1432	aesd	$dat2,q9
1433	aesimc	$dat2,$dat2
1434	aesd	$dat3,q9
1435	aesimc	$dat3,$dat3
1436	aesd	$dat4,q9
1437	aesimc	$dat4,$dat4
1438	vld1.32	{q9},[$key_],#16
1439	b.gt	.Loop5x_cbc_dec

1441	aesd	$dat0,q8
1442	aesimc	$dat0,$dat0
1443	aesd	$dat1,q8
1444	aesimc	$dat1,$dat1
1445	aesd	$dat2,q8
1446	aesimc	$dat2,$dat2
1447	aesd	$dat3,q8
1448	aesimc	$dat3,$dat3
1449	aesd	$dat4,q8
1450	aesimc	$dat4,$dat4
1451	cmp	$len,#0x40		// because .Lcbc_tail4x
1452	sub	$len,$len,#0x50

1454	aesd	$dat0,q9
1455	aesimc	$dat0,$dat0
1456	aesd	$dat1,q9
1457	aesimc	$dat1,$dat1
1458	aesd	$dat2,q9
1459	aesimc	$dat2,$dat2
1460	aesd	$dat3,q9
1461	aesimc	$dat3,$dat3
1462	aesd	$dat4,q9
1463	aesimc	$dat4,$dat4
1464	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
1465	mov	$key_,$key

1467	aesd	$dat0,q10
1468	aesimc	$dat0,$dat0
1469	aesd	$dat1,q10
1470	aesimc	$dat1,$dat1
1471	aesd	$dat2,q10
1472	aesimc	$dat2,$dat2
1473	aesd	$dat3,q10
1474	aesimc	$dat3,$dat3
1475	aesd	$dat4,q10
1476	aesimc	$dat4,$dat4
1477	add	$inp,$inp,x6		// $inp is adjusted in such way that
1478					// at exit from the loop $dat1-$dat4
1479					// are loaded with last "words"
1480	add	x6,$len,#0x60		// because .Lcbc_tail4x

1482	aesd	$dat0,q11
1483	aesimc	$dat0,$dat0
1484	aesd	$dat1,q11
1485	aesimc	$dat1,$dat1
1486	aesd	$dat2,q11
1487	aesimc	$dat2,$dat2
1488	aesd	$dat3,q11
1489	aesimc	$dat3,$dat3
1490	aesd	$dat4,q11
1491	aesimc	$dat4,$dat4

1493	aesd	$dat0,q12
1494	aesimc	$dat0,$dat0
1495	aesd	$dat1,q12
1496	aesimc	$dat1,$dat1
1497	aesd	$dat2,q12
1498	aesimc	$dat2,$dat2
1499	aesd	$dat3,q12
1500	aesimc	$dat3,$dat3
1501	aesd	$dat4,q12
1502	aesimc	$dat4,$dat4

1504	aesd	$dat0,q13
1505	aesimc	$dat0,$dat0
1506	aesd	$dat1,q13
1507	aesimc	$dat1,$dat1
1508	aesd	$dat2,q13
1509	aesimc	$dat2,$dat2
1510	aesd	$dat3,q13
1511	aesimc	$dat3,$dat3
1512	aesd	$dat4,q13
1513	aesimc	$dat4,$dat4

1515	aesd	$dat0,q14
1516	aesimc	$dat0,$dat0
1517	aesd	$dat1,q14
1518	aesimc	$dat1,$dat1
1519	aesd	$dat2,q14
1520	aesimc	$dat2,$dat2
1521	aesd	$dat3,q14
1522	aesimc	$dat3,$dat3
1523	aesd	$dat4,q14
1524	aesimc	$dat4,$dat4

1526	veor	$tmp0,$ivec,$rndlast
1527	aesd	$dat0,q15
1528	veor	$tmp1,$in0,$rndlast
1529	vld1.8	{$in0},[$inp],#16
1530	aesd	$dat1,q15
1531	veor	$tmp2,$in1,$rndlast
1532	vld1.8	{$in1},[$inp],#16
1533	aesd	$dat2,q15
1534	veor	$tmp3,$in2,$rndlast
1535	vld1.8	{$in2},[$inp],#16
1536	aesd	$dat3,q15
1537	veor	$tmp4,$in3,$rndlast
1538	vld1.8	{$in3},[$inp],#16
1539	aesd	$dat4,q15
1540	vorr	$ivec,$in4,$in4
1541	vld1.8	{$in4},[$inp],#16
1542	cbz	x6,.Lcbc_tail4x
1543	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
1544	veor	$tmp0,$tmp0,$dat0
1545	vorr	$dat0,$in0,$in0
1546	veor	$tmp1,$tmp1,$dat1
1547	vorr	$dat1,$in1,$in1
1548	veor	$tmp2,$tmp2,$dat2
1549	vorr	$dat2,$in2,$in2
1550	veor	$tmp3,$tmp3,$dat3
1551	vorr	$dat3,$in3,$in3
1552	veor	$tmp4,$tmp4,$dat4
1553	vst1.8	{$tmp0},[$out],#16
1554	vorr	$dat4,$in4,$in4
1555	vst1.8	{$tmp1},[$out],#16
1556	mov	$cnt,$rounds
1557	vst1.8	{$tmp2},[$out],#16
1558	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
1559	vst1.8	{$tmp3},[$out],#16
1560	vst1.8	{$tmp4},[$out],#16
1561	b.hs	.Loop5x_cbc_dec

1563	add	$len,$len,#0x50
1564	cbz	$len,.Lcbc_done

1566	add	$cnt,$rounds,#2
1567	subs	$len,$len,#0x30
1568	vorr	$dat0,$in2,$in2
1569	vorr	$in0,$in2,$in2
1570	vorr	$dat1,$in3,$in3
1571	vorr	$in1,$in3,$in3
1572	vorr	$dat2,$in4,$in4
1573	vorr	$in2,$in4,$in4
1574	b.lo	.Lcbc_dec_tail

1576	b	.Loop3x_cbc_dec

1578.align	4
1579.Lcbc_tail4x:
1580	veor	$tmp1,$tmp0,$dat1
1581	veor	$tmp2,$tmp2,$dat2
1582	veor	$tmp3,$tmp3,$dat3
1583	veor	$tmp4,$tmp4,$dat4
1584	vst1.8	{$tmp1},[$out],#16
1585	vst1.8	{$tmp2},[$out],#16
1586	vst1.8	{$tmp3},[$out],#16
1587	vst1.8	{$tmp4},[$out],#16

1589	b	.Lcbc_done
1590.align	4
1591___
# 3x interleaved decrypt loop (both flavours), followed by the 1-2
# block tail.
1592$code.=<<___;
1593.Loop3x_cbc_dec:
1594	aesd	$dat0,q8
1595	aesimc	$dat0,$dat0
1596	aesd	$dat1,q8
1597	aesimc	$dat1,$dat1
1598	aesd	$dat2,q8
1599	aesimc	$dat2,$dat2
1600	vld1.32	{q8},[$key_],#16
1601	subs	$cnt,$cnt,#2
1602	aesd	$dat0,q9
1603	aesimc	$dat0,$dat0
1604	aesd	$dat1,q9
1605	aesimc	$dat1,$dat1
1606	aesd	$dat2,q9
1607	aesimc	$dat2,$dat2
1608	vld1.32	{q9},[$key_],#16
1609	b.gt	.Loop3x_cbc_dec

1611	aesd	$dat0,q8
1612	aesimc	$dat0,$dat0
1613	aesd	$dat1,q8
1614	aesimc	$dat1,$dat1
1615	aesd	$dat2,q8
1616	aesimc	$dat2,$dat2
1617	veor	$tmp0,$ivec,$rndlast
1618	subs	$len,$len,#0x30
1619	veor	$tmp1,$in0,$rndlast
1620	mov.lo	x6,$len			// x6, $cnt, is zero at this point
1621	aesd	$dat0,q9
1622	aesimc	$dat0,$dat0
1623	aesd	$dat1,q9
1624	aesimc	$dat1,$dat1
1625	aesd	$dat2,q9
1626	aesimc	$dat2,$dat2
1627	veor	$tmp2,$in1,$rndlast
1628	add	$inp,$inp,x6		// $inp is adjusted in such way that
1629					// at exit from the loop $dat1-$dat2
1630					// are loaded with last "words"
1631	vorr	$ivec,$in2,$in2
1632	mov	$key_,$key
1633	aesd	$dat0,q12
1634	aesimc	$dat0,$dat0
1635	aesd	$dat1,q12
1636	aesimc	$dat1,$dat1
1637	aesd	$dat2,q12
1638	aesimc	$dat2,$dat2
1639	vld1.8	{$in0},[$inp],#16
1640	aesd	$dat0,q13
1641	aesimc	$dat0,$dat0
1642	aesd	$dat1,q13
1643	aesimc	$dat1,$dat1
1644	aesd	$dat2,q13
1645	aesimc	$dat2,$dat2
1646	vld1.8	{$in1},[$inp],#16
1647	aesd	$dat0,q14
1648	aesimc	$dat0,$dat0
1649	aesd	$dat1,q14
1650	aesimc	$dat1,$dat1
1651	aesd	$dat2,q14
1652	aesimc	$dat2,$dat2
1653	vld1.8	{$in2},[$inp],#16
1654	aesd	$dat0,q15
1655	aesd	$dat1,q15
1656	aesd	$dat2,q15
1657	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
1658	add	$cnt,$rounds,#2
1659	veor	$tmp0,$tmp0,$dat0
1660	veor	$tmp1,$tmp1,$dat1
1661	veor	$dat2,$dat2,$tmp2
1662	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
1663	vst1.8	{$tmp0},[$out],#16
1664	vorr	$dat0,$in0,$in0
1665	vst1.8	{$tmp1},[$out],#16
1666	vorr	$dat1,$in1,$in1
1667	vst1.8	{$dat2},[$out],#16
1668	vorr	$dat2,$in2,$in2
1669	b.hs	.Loop3x_cbc_dec

1671	cmn	$len,#0x30
1672	b.eq	.Lcbc_done
1673	nop

1675.Lcbc_dec_tail:
1676	aesd	$dat1,q8
1677	aesimc	$dat1,$dat1
1678	aesd	$dat2,q8
1679	aesimc	$dat2,$dat2
1680	vld1.32	{q8},[$key_],#16
1681	subs	$cnt,$cnt,#2
1682	aesd	$dat1,q9
1683	aesimc	$dat1,$dat1
1684	aesd	$dat2,q9
1685	aesimc	$dat2,$dat2
1686	vld1.32	{q9},[$key_],#16
1687	b.gt	.Lcbc_dec_tail

1689	aesd	$dat1,q8
1690	aesimc	$dat1,$dat1
1691	aesd	$dat2,q8
1692	aesimc	$dat2,$dat2
1693	aesd	$dat1,q9
1694	aesimc	$dat1,$dat1
1695	aesd	$dat2,q9
1696	aesimc	$dat2,$dat2
1697	aesd	$dat1,q12
1698	aesimc	$dat1,$dat1
1699	aesd	$dat2,q12
1700	aesimc	$dat2,$dat2
1701	cmn	$len,#0x20
1702	aesd	$dat1,q13
1703	aesimc	$dat1,$dat1
1704	aesd	$dat2,q13
1705	aesimc	$dat2,$dat2
1706	veor	$tmp1,$ivec,$rndlast
1707	aesd	$dat1,q14
1708	aesimc	$dat1,$dat1
1709	aesd	$dat2,q14
1710	aesimc	$dat2,$dat2
1711	veor	$tmp2,$in1,$rndlast
1712	aesd	$dat1,q15
1713	aesd	$dat2,q15
1714	b.eq	.Lcbc_dec_one
1715	veor	$tmp1,$tmp1,$dat1
1716	veor	$tmp2,$tmp2,$dat2
1717	vorr	$ivec,$in2,$in2
1718	vst1.8	{$tmp1},[$out],#16
1719	vst1.8	{$tmp2},[$out],#16
1720	b	.Lcbc_done

1722.Lcbc_dec_one:
1723	veor	$tmp1,$tmp1,$dat2
1724	vorr	$ivec,$in2,$in2
1725	vst1.8	{$tmp1},[$out],#16

1727.Lcbc_done:
1728	vst1.8	{$ivec},[$ivp]
1729.Lcbc_abort:
1730___
1731}
# AArch32 epilogue: restore d8-d15 and return by popping pc.
1732$code.=<<___ if ($flavour !~ /64/);
1733	vldmia	sp!,{d8-d15}
1734	ldmia	sp!,{r4-r8,pc}
1735___
# AArch64 epilogue.
1736$code.=<<___ if ($flavour =~ /64/);
1737	ldr	x29,[sp],#16
1738	ret
1739___
1740$code.=<<___;
1741.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1742___
1743}}}
1744{{{
# ${prefix}_ctr32_encrypt_blocks(const unsigned char *inp,
#			unsigned char *out, size_t len,
#			const AES_KEY *key, const unsigned char ivec[16]);
# CTR mode with a 32-bit big-endian counter in the last word of ivec;
# $len is counted in 16-byte blocks (see the #5/#3 arithmetic below).
# NOTE(review): prototype inferred from register usage - confirm
# against the C caller.
1745my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1746my ($rounds,$cnt,$key_)=("w5","w6","x7");
1747my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1748my $step="x12";		# aliases with $tctr2
1749
1750my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1751my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1752
1753# used only in 64-bit mode...
1754my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
1755
1756my ($dat,$tmp)=($dat0,$tmp0);
1757
1758### q8-q15 preloaded key schedule
1759
# Symbol/alignment preamble shared by both flavours.
1760$code.=<<___;
1761.globl	${prefix}_ctr32_encrypt_blocks
1762.type	${prefix}_ctr32_encrypt_blocks,%function
1763.align	5
1764${prefix}_ctr32_encrypt_blocks:
1765___
# AArch64 prologue: minimal FP/LR frame.
1766$code.=<<___ if ($flavour =~ /64/);
1767	stp	x29,x30,[sp,#-16]!
1768	add	x29,sp,#0
1769___
# AArch32 prologue: save callee-saved GPRs and d8-d15, then fetch the
# stack-passed 5th argument (ivec) into r4.
1770$code.=<<___ if ($flavour !~ /64/);
1771	mov	ip,sp
1772	stmdb	sp!,{r4-r10,lr}
1773	vstmdb	sp!,{d8-d15}            @ ABI specification says so
1774	ldr	r4, [ip]		@ load remaining arg
1775___
# Common setup: load the counter word from ivec[12] (byte-reversed on
# little-endian), the IV block and the key schedule, and prepare up to
# three counter blocks.  $rounds lives in w5, hence the x5-based
# indexing of the last 5 round keys below.
1776$code.=<<___;
1777	ldr	$rounds,[$key,#240]

1779	ldr	$ctr, [$ivp, #12]
1780#ifdef __ARMEB__
1781	vld1.8	{$dat0},[$ivp]
1782#else
1783	vld1.32	{$dat0},[$ivp]
1784#endif
1785	vld1.32	{q8-q9},[$key]		// load key schedule...
1786	sub	$rounds,$rounds,#4
1787	mov	$step,#16
1788	cmp	$len,#2
1789	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
1790	sub	$rounds,$rounds,#2
1791	vld1.32	{q12-q13},[$key_],#32
1792	vld1.32	{q14-q15},[$key_],#32
1793	vld1.32	{$rndlast},[$key_]
1794	add	$key_,$key,#32
1795	mov	$cnt,$rounds
1796	cclr	$step,lo
1797#ifndef __ARMEB__
1798	rev	$ctr, $ctr
1799#endif
1800	add	$tctr1, $ctr, #1
1801	vorr	$ivec,$dat0,$dat0
1802	rev	$tctr1, $tctr1
1803	vmov.32	${ivec}[3],$tctr1
1804	add	$ctr, $ctr, #2
1805	vorr	$dat1,$ivec,$ivec
1806	b.ls	.Lctr32_tail
1807	rev	$tctr2, $ctr
1808	vmov.32	${ivec}[3],$tctr2
1809	sub	$len,$len,#3		// bias
1810	vorr	$dat2,$ivec,$ivec
1811___
# 64-bit only: 5x interleaved encrypt loop; drops back to the 3x loop
# (or the tail) once fewer than 5 blocks remain.
1812$code.=<<___ if ($flavour =~ /64/);
1813	cmp	$len,#2
1814	b.lo	.Loop3x_ctr32

1816	add	w13,$ctr,#1
1817	add	w14,$ctr,#2
1818	vorr	$dat3,$dat0,$dat0
1819	rev	w13,w13
1820	vorr	$dat4,$dat0,$dat0
1821	rev	w14,w14
1822	vmov.32	${dat3}[3],w13
1823	sub	$len,$len,#2		// bias
1824	vmov.32	${dat4}[3],w14
1825	add	$ctr,$ctr,#2
1826	b	.Loop5x_ctr32

1828.align	4
1829.Loop5x_ctr32:
1830	aese	$dat0,q8
1831	aesmc	$dat0,$dat0
1832	aese	$dat1,q8
1833	aesmc	$dat1,$dat1
1834	aese	$dat2,q8
1835	aesmc	$dat2,$dat2
1836	aese	$dat3,q8
1837	aesmc	$dat3,$dat3
1838	aese	$dat4,q8
1839	aesmc	$dat4,$dat4
1840	vld1.32	{q8},[$key_],#16
1841	subs	$cnt,$cnt,#2
1842	aese	$dat0,q9
1843	aesmc	$dat0,$dat0
1844	aese	$dat1,q9
1845	aesmc	$dat1,$dat1
1846	aese	$dat2,q9
1847	aesmc	$dat2,$dat2
1848	aese	$dat3,q9
1849	aesmc	$dat3,$dat3
1850	aese	$dat4,q9
1851	aesmc	$dat4,$dat4
1852	vld1.32	{q9},[$key_],#16
1853	b.gt	.Loop5x_ctr32

1855	mov	$key_,$key
1856	aese	$dat0,q8
1857	aesmc	$dat0,$dat0
1858	aese	$dat1,q8
1859	aesmc	$dat1,$dat1
1860	aese	$dat2,q8
1861	aesmc	$dat2,$dat2
1862	aese	$dat3,q8
1863	aesmc	$dat3,$dat3
1864	aese	$dat4,q8
1865	aesmc	$dat4,$dat4
1866	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]

1868	aese	$dat0,q9
1869	aesmc	$dat0,$dat0
1870	aese	$dat1,q9
1871	aesmc	$dat1,$dat1
1872	aese	$dat2,q9
1873	aesmc	$dat2,$dat2
1874	aese	$dat3,q9
1875	aesmc	$dat3,$dat3
1876	aese	$dat4,q9
1877	aesmc	$dat4,$dat4
1878	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]

1880	aese	$dat0,q12
1881	aesmc	$dat0,$dat0
1882	add	$tctr0,$ctr,#1
1883	add	$tctr1,$ctr,#2
1884	aese	$dat1,q12
1885	aesmc	$dat1,$dat1
1886	add	$tctr2,$ctr,#3
1887	add	w13,$ctr,#4
1888	aese	$dat2,q12
1889	aesmc	$dat2,$dat2
1890	add	w14,$ctr,#5
1891	rev	$tctr0,$tctr0
1892	aese	$dat3,q12
1893	aesmc	$dat3,$dat3
1894	rev	$tctr1,$tctr1
1895	rev	$tctr2,$tctr2
1896	aese	$dat4,q12
1897	aesmc	$dat4,$dat4
1898	rev	w13,w13
1899	rev	w14,w14

1901	aese	$dat0,q13
1902	aesmc	$dat0,$dat0
1903	aese	$dat1,q13
1904	aesmc	$dat1,$dat1
1905	aese	$dat2,q13
1906	aesmc	$dat2,$dat2
1907	aese	$dat3,q13
1908	aesmc	$dat3,$dat3
1909	aese	$dat4,q13
1910	aesmc	$dat4,$dat4

1912	aese	$dat0,q14
1913	aesmc	$dat0,$dat0
1914	vld1.8	{$in0},[$inp],#16
1915	aese	$dat1,q14
1916	aesmc	$dat1,$dat1
1917	vld1.8	{$in1},[$inp],#16
1918	aese	$dat2,q14
1919	aesmc	$dat2,$dat2
1920	vld1.8	{$in2},[$inp],#16
1921	aese	$dat3,q14
1922	aesmc	$dat3,$dat3
1923	vld1.8	{$in3},[$inp],#16
1924	aese	$dat4,q14
1925	aesmc	$dat4,$dat4
1926	vld1.8	{$in4},[$inp],#16

1928	aese	$dat0,q15
1929	veor	$in0,$in0,$rndlast
1930	aese	$dat1,q15
1931	veor	$in1,$in1,$rndlast
1932	aese	$dat2,q15
1933	veor	$in2,$in2,$rndlast
1934	aese	$dat3,q15
1935	veor	$in3,$in3,$rndlast
1936	aese	$dat4,q15
1937	veor	$in4,$in4,$rndlast

1939	veor	$in0,$in0,$dat0
1940	vorr	$dat0,$ivec,$ivec
1941	veor	$in1,$in1,$dat1
1942	vorr	$dat1,$ivec,$ivec
1943	veor	$in2,$in2,$dat2
1944	vorr	$dat2,$ivec,$ivec
1945	veor	$in3,$in3,$dat3
1946	vorr	$dat3,$ivec,$ivec
1947	veor	$in4,$in4,$dat4
1948	vorr	$dat4,$ivec,$ivec

1950	vst1.8	{$in0},[$out],#16
1951	vmov.32	${dat0}[3],$tctr0
1952	vst1.8	{$in1},[$out],#16
1953	vmov.32	${dat1}[3],$tctr1
1954	vst1.8	{$in2},[$out],#16
1955	vmov.32	${dat2}[3],$tctr2
1956	vst1.8	{$in3},[$out],#16
1957	vmov.32	${dat3}[3],w13
1958	vst1.8	{$in4},[$out],#16
1959	vmov.32	${dat4}[3],w14

1961	mov	$cnt,$rounds
1962	cbz	$len,.Lctr32_done

1964	add	$ctr,$ctr,#5
1965	subs	$len,$len,#5
1966	b.hs	.Loop5x_ctr32

1968	add	$len,$len,#5
1969	sub	$ctr,$ctr,#5

1971	cmp	$len,#2
1972	mov	$step,#16
1973	cclr	$step,lo
1974	b.ls	.Lctr32_tail

1976	sub	$len,$len,#3		// bias
1977	add	$ctr,$ctr,#3
1978___
# 3x interleaved loop (both flavours), plus the 1-2 block tail.
1979$code.=<<___;
1980	b	.Loop3x_ctr32

1982.align	4
1983.Loop3x_ctr32:
1984	aese	$dat0,q8
1985	aesmc	$dat0,$dat0
1986	aese	$dat1,q8
1987	aesmc	$dat1,$dat1
1988	aese	$dat2,q8
1989	aesmc	$dat2,$dat2
1990	vld1.32	{q8},[$key_],#16
1991	subs	$cnt,$cnt,#2
1992	aese	$dat0,q9
1993	aesmc	$dat0,$dat0
1994	aese	$dat1,q9
1995	aesmc	$dat1,$dat1
1996	aese	$dat2,q9
1997	aesmc	$dat2,$dat2
1998	vld1.32	{q9},[$key_],#16
1999	b.gt	.Loop3x_ctr32

2001	aese	$dat0,q8
2002	aesmc	$tmp0,$dat0
2003	aese	$dat1,q8
2004	aesmc	$tmp1,$dat1
2005	vld1.8	{$in0},[$inp],#16
2006	add	$tctr0,$ctr,#1
2007	aese	$dat2,q8
2008	aesmc	$dat2,$dat2
2009	vld1.8	{$in1},[$inp],#16
2010	rev	$tctr0,$tctr0
2011	aese	$tmp0,q9
2012	aesmc	$tmp0,$tmp0
2013	aese	$tmp1,q9
2014	aesmc	$tmp1,$tmp1
2015	vld1.8	{$in2},[$inp],#16
2016	mov	$key_,$key
2017	aese	$dat2,q9
2018	aesmc	$tmp2,$dat2
2019	aese	$tmp0,q12
2020	aesmc	$tmp0,$tmp0
2021	aese	$tmp1,q12
2022	aesmc	$tmp1,$tmp1
2023	veor	$in0,$in0,$rndlast
2024	add	$tctr1,$ctr,#2
2025	aese	$tmp2,q12
2026	aesmc	$tmp2,$tmp2
2027	veor	$in1,$in1,$rndlast
2028	add	$ctr,$ctr,#3
2029	aese	$tmp0,q13
2030	aesmc	$tmp0,$tmp0
2031	aese	$tmp1,q13
2032	aesmc	$tmp1,$tmp1
2033	veor	$in2,$in2,$rndlast
2034	vmov.32	${ivec}[3], $tctr0
2035	aese	$tmp2,q13
2036	aesmc	$tmp2,$tmp2
2037	vorr	$dat0,$ivec,$ivec
2038	rev	$tctr1,$tctr1
2039	aese	$tmp0,q14
2040	aesmc	$tmp0,$tmp0
2041	vmov.32	${ivec}[3], $tctr1
2042	rev	$tctr2,$ctr
2043	aese	$tmp1,q14
2044	aesmc	$tmp1,$tmp1
2045	vorr	$dat1,$ivec,$ivec
2046	vmov.32	${ivec}[3], $tctr2
2047	aese	$tmp2,q14
2048	aesmc	$tmp2,$tmp2
2049	vorr	$dat2,$ivec,$ivec
2050	subs	$len,$len,#3
2051	aese	$tmp0,q15
2052	aese	$tmp1,q15
2053	aese	$tmp2,q15

2055	veor	$in0,$in0,$tmp0
2056	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
2057	vst1.8	{$in0},[$out],#16
2058	veor	$in1,$in1,$tmp1
2059	mov	$cnt,$rounds
2060	vst1.8	{$in1},[$out],#16
2061	veor	$in2,$in2,$tmp2
2062	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
2063	vst1.8	{$in2},[$out],#16
2064	b.hs	.Loop3x_ctr32

2066	adds	$len,$len,#3
2067	b.eq	.Lctr32_done
2068	cmp	$len,#1
2069	mov	$step,#16
2070	cclr	$step,eq

2072.Lctr32_tail:
2073	aese	$dat0,q8
2074	aesmc	$dat0,$dat0
2075	aese	$dat1,q8
2076	aesmc	$dat1,$dat1
2077	vld1.32	{q8},[$key_],#16
2078	subs	$cnt,$cnt,#2
2079	aese	$dat0,q9
2080	aesmc	$dat0,$dat0
2081	aese	$dat1,q9
2082	aesmc	$dat1,$dat1
2083	vld1.32	{q9},[$key_],#16
2084	b.gt	.Lctr32_tail

2086	aese	$dat0,q8
2087	aesmc	$dat0,$dat0
2088	aese	$dat1,q8
2089	aesmc	$dat1,$dat1
2090	aese	$dat0,q9
2091	aesmc	$dat0,$dat0
2092	aese	$dat1,q9
2093	aesmc	$dat1,$dat1
2094	vld1.8	{$in0},[$inp],$step
2095	aese	$dat0,q12
2096	aesmc	$dat0,$dat0
2097	aese	$dat1,q12
2098	aesmc	$dat1,$dat1
2099	vld1.8	{$in1},[$inp]
2100	aese	$dat0,q13
2101	aesmc	$dat0,$dat0
2102	aese	$dat1,q13
2103	aesmc	$dat1,$dat1
2104	veor	$in0,$in0,$rndlast
2105	aese	$dat0,q14
2106	aesmc	$dat0,$dat0
2107	aese	$dat1,q14
2108	aesmc	$dat1,$dat1
2109	veor	$in1,$in1,$rndlast
2110	aese	$dat0,q15
2111	aese	$dat1,q15

2113	cmp	$len,#1
2114	veor	$in0,$in0,$dat0
2115	veor	$in1,$in1,$dat1
2116	vst1.8	{$in0},[$out],#16
2117	b.eq	.Lctr32_done
2118	vst1.8	{$in1},[$out]

2120.Lctr32_done:
2121___
# AArch32 epilogue: restore d8-d15 and return by popping pc.
2122$code.=<<___ if ($flavour !~ /64/);
2123	vldmia	sp!,{d8-d15}
2124	ldmia	sp!,{r4-r10,pc}
2125___
# AArch64 epilogue.
2126$code.=<<___ if ($flavour =~ /64/);
2127	ldr	x29,[sp],#16
2128	ret
2129___
2130$code.=<<___;
2131.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2132___
2133}}}
2134# Performance in cycles per byte.
2135# Processed with AES-XTS different key size.
2136# It shows the value before and after optimization as below:
2137# (before/after):
2138#
2139# AES-128-XTS AES-256-XTS
2140# Cortex-A57 3.36/1.09 4.02/1.37
2141# Cortex-A72 3.03/1.02 3.28/1.33
2142
2143# Optimization is implemented by loop unrolling and interleaving.
2144# Commonly we choose 5 as the unrolling factor; if the input
2145# data size is smaller than 5 blocks, but not smaller than 3 blocks,
2146# we choose 3 as the unrolling factor.
2147# If the input data size dsize >= 5*16 bytes, then take 5 blocks
2148# as one iteration, every loop the left size lsize -= 5*16.
2149# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2150# will be processed specially, which be integrated into the 5*16 bytes
2151# loop to improve the efficiency.
2152# There is one special case, if the original input data size dsize
2153# = 16 bytes, we will treat it separately to improve the
2154# performance: one independent code block without LR, FP load and
2155# store.
2156# Encryption will process the (length - tailcnt) bytes as mentioned
2157# previously, then encrypt the composite block as the second-to-last
2158# cipher block.
2159# Decryption will process the (length - tailcnt - 1) bytes as mentioned
2160# previously, then decrypt the second-to-last cipher block to get the
2161# last plain block (the tail), and decrypt the composite block as the
2162# second-to-last plain-text block.
2163
2164{{{
2165my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2166my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2167my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2168my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2169my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2170my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2171my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2172my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2173my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2174
2175my ($tmpin)=("v26.16b");
2176my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2177
2178# q7 last round key
2179# q10-q15, q7 Last 7 round keys
2180# q8-q9 preloaded round keys except last 7 keys for big size
2181# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
2182
2183
2184my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2185
2186my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2187my ($dat4,$in4,$tmp4);
2188if ($flavour =~ /64/) {
2189 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2190}
2191
2192$code.=<<___ if ($flavour =~ /64/);
2193.globl ${prefix}_xts_encrypt
2194.type ${prefix}_xts_encrypt,%function
2195.align 5
2196${prefix}_xts_encrypt:
2197___
2198$code.=<<___ if ($flavour =~ /64/);
2199 cmp $len,#16
2200 // Original input data size bigger than 16, jump to big size processing.
2201 b.ne .Lxts_enc_big_size
2202 // Encrypt the iv with key2, as the first XEX iv.
2203 ldr $rounds,[$key2,#240]
2204 vld1.8 {$dat},[$key2],#16
2205 vld1.8 {$iv0},[$ivp]
2206 sub $rounds,$rounds,#2
2207 vld1.8 {$dat1},[$key2],#16
2208
2209.Loop_enc_iv_enc:
2210 aese $iv0,$dat
2211 aesmc $iv0,$iv0
2212 vld1.32 {$dat},[$key2],#16
2213 subs $rounds,$rounds,#2
2214 aese $iv0,$dat1
2215 aesmc $iv0,$iv0
2216 vld1.32 {$dat1},[$key2],#16
2217 b.gt .Loop_enc_iv_enc
2218
2219 aese $iv0,$dat
2220 aesmc $iv0,$iv0
2221 vld1.32 {$dat},[$key2]
2222 aese $iv0,$dat1
2223 veor $iv0,$iv0,$dat
2224
2225 vld1.8 {$dat0},[$inp]
2226 veor $dat0,$iv0,$dat0
2227
2228 ldr $rounds,[$key1,#240]
2229 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2230
2231 aese $dat0,q20
2232 aesmc $dat0,$dat0
2233 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2234 aese $dat0,q21
2235 aesmc $dat0,$dat0
2236 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2237 b.eq .Lxts_128_enc
2238.Lxts_enc_round_loop:
2239 aese $dat0,q8
2240 aesmc $dat0,$dat0
2241 vld1.32 {q8},[$key1],#16 // load key schedule...
2242 aese $dat0,q9
2243 aesmc $dat0,$dat0
2244 vld1.32 {q9},[$key1],#16 // load key schedule...
2245 subs $rounds,$rounds,#2 // bias
2246 b.gt .Lxts_enc_round_loop
2247.Lxts_128_enc:
2248 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2249 aese $dat0,q8
2250 aesmc $dat0,$dat0
2251 aese $dat0,q9
2252 aesmc $dat0,$dat0
2253 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2254 aese $dat0,q10
2255 aesmc $dat0,$dat0
2256 aese $dat0,q11
2257 aesmc $dat0,$dat0
2258 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2259 aese $dat0,q12
2260 aesmc $dat0,$dat0
2261 aese $dat0,q13
2262 aesmc $dat0,$dat0
2263 vld1.32 {$rndlast},[$key1]
2264 aese $dat0,q14
2265 aesmc $dat0,$dat0
2266 aese $dat0,q15
2267 veor $dat0,$dat0,$rndlast
2268 veor $dat0,$dat0,$iv0
2269 vst1.8 {$dat0},[$out]
2270 b .Lxts_enc_final_abort
2271
2272.align 4
2273.Lxts_enc_big_size:
2274___
2275$code.=<<___ if ($flavour =~ /64/);
2276 stp $constnumx,$tmpinp,[sp,#-64]!
2277 stp $tailcnt,$midnumx,[sp,#48]
2278 stp $ivd10,$ivd20,[sp,#32]
2279 stp $ivd30,$ivd40,[sp,#16]
2280
2281 // tailcnt store the tail value of length%16.
2282 and $tailcnt,$len,#0xf
2283 and $len,$len,#-16
2284 subs $len,$len,#16
2285 mov $step,#16
2286 b.lo .Lxts_abort
2287 csel $step,xzr,$step,eq
2288
2289 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2290 ldr $rounds,[$key2,#240]
2291 vld1.32 {$dat},[$key2],#16
2292 vld1.8 {$iv0},[$ivp]
2293 sub $rounds,$rounds,#2
2294 vld1.32 {$dat1},[$key2],#16
2295
2296.Loop_iv_enc:
2297 aese $iv0,$dat
2298 aesmc $iv0,$iv0
2299 vld1.32 {$dat},[$key2],#16
2300 subs $rounds,$rounds,#2
2301 aese $iv0,$dat1
2302 aesmc $iv0,$iv0
2303 vld1.32 {$dat1},[$key2],#16
2304 b.gt .Loop_iv_enc
2305
2306 aese $iv0,$dat
2307 aesmc $iv0,$iv0
2308 vld1.32 {$dat},[$key2]
2309 aese $iv0,$dat1
2310 veor $iv0,$iv0,$dat
2311
2312 // The iv for second block
2313 // $ivl- iv(low), $ivh - iv(high)
2314 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2315 fmov $ivl,$ivd00
2316 fmov $ivh,$ivd01
2317 mov $constnum,#0x87
2318 extr $midnumx,$ivh,$ivh,#32
2319 extr $ivh,$ivh,$ivl,#63
2320 and $tmpmw,$constnum,$midnum,asr#31
2321 eor $ivl,$tmpmx,$ivl,lsl#1
2322 fmov $ivd10,$ivl
2323 fmov $ivd11,$ivh
2324
2325 ldr $rounds0,[$key1,#240] // next starting point
2326 vld1.8 {$dat},[$inp],$step
2327
2328 vld1.32 {q8-q9},[$key1] // load key schedule...
2329 sub $rounds0,$rounds0,#6
2330 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2331 sub $rounds0,$rounds0,#2
2332 vld1.32 {q10-q11},[$key_],#32
2333 vld1.32 {q12-q13},[$key_],#32
2334 vld1.32 {q14-q15},[$key_],#32
2335 vld1.32 {$rndlast},[$key_]
2336
2337 add $key_,$key1,#32
2338 mov $rounds,$rounds0
2339
2340 // Encryption
2341.Lxts_enc:
2342 vld1.8 {$dat2},[$inp],#16
2343 subs $len,$len,#32 // bias
2344 add $rounds,$rounds0,#2
2345 vorr $in1,$dat,$dat
2346 vorr $dat1,$dat,$dat
2347 vorr $in3,$dat,$dat
2348 vorr $in2,$dat2,$dat2
2349 vorr $in4,$dat2,$dat2
2350 b.lo .Lxts_inner_enc_tail
2351 veor $dat,$dat,$iv0 // before encryption, xor with iv
2352 veor $dat2,$dat2,$iv1
2353
2354 // The iv for third block
2355 extr $midnumx,$ivh,$ivh,#32
2356 extr $ivh,$ivh,$ivl,#63
2357 and $tmpmw,$constnum,$midnum,asr#31
2358 eor $ivl,$tmpmx,$ivl,lsl#1
2359 fmov $ivd20,$ivl
2360 fmov $ivd21,$ivh
2361
2362
2363 vorr $dat1,$dat2,$dat2
2364 vld1.8 {$dat2},[$inp],#16
2365 vorr $in0,$dat,$dat
2366 vorr $in1,$dat1,$dat1
2367 veor $in2,$dat2,$iv2 // the third block
2368 veor $dat2,$dat2,$iv2
2369 cmp $len,#32
2370 b.lo .Lxts_outer_enc_tail
2371
2372 // The iv for fourth block
2373 extr $midnumx,$ivh,$ivh,#32
2374 extr $ivh,$ivh,$ivl,#63
2375 and $tmpmw,$constnum,$midnum,asr#31
2376 eor $ivl,$tmpmx,$ivl,lsl#1
2377 fmov $ivd30,$ivl
2378 fmov $ivd31,$ivh
2379
2380 vld1.8 {$dat3},[$inp],#16
2381 // The iv for fifth block
2382 extr $midnumx,$ivh,$ivh,#32
2383 extr $ivh,$ivh,$ivl,#63
2384 and $tmpmw,$constnum,$midnum,asr#31
2385 eor $ivl,$tmpmx,$ivl,lsl#1
2386 fmov $ivd40,$ivl
2387 fmov $ivd41,$ivh
2388
2389 vld1.8 {$dat4},[$inp],#16
2390 veor $dat3,$dat3,$iv3 // the fourth block
2391 veor $dat4,$dat4,$iv4
2392 sub $len,$len,#32 // bias
2393 mov $rounds,$rounds0
2394 b .Loop5x_xts_enc
2395
2396.align 4
2397.Loop5x_xts_enc:
2398 aese $dat0,q8
2399 aesmc $dat0,$dat0
2400 aese $dat1,q8
2401 aesmc $dat1,$dat1
2402 aese $dat2,q8
2403 aesmc $dat2,$dat2
2404 aese $dat3,q8
2405 aesmc $dat3,$dat3
2406 aese $dat4,q8
2407 aesmc $dat4,$dat4
2408 vld1.32 {q8},[$key_],#16
2409 subs $rounds,$rounds,#2
2410 aese $dat0,q9
2411 aesmc $dat0,$dat0
2412 aese $dat1,q9
2413 aesmc $dat1,$dat1
2414 aese $dat2,q9
2415 aesmc $dat2,$dat2
2416 aese $dat3,q9
2417 aesmc $dat3,$dat3
2418 aese $dat4,q9
2419 aesmc $dat4,$dat4
2420 vld1.32 {q9},[$key_],#16
2421 b.gt .Loop5x_xts_enc
2422
2423 aese $dat0,q8
2424 aesmc $dat0,$dat0
2425 aese $dat1,q8
2426 aesmc $dat1,$dat1
2427 aese $dat2,q8
2428 aesmc $dat2,$dat2
2429 aese $dat3,q8
2430 aesmc $dat3,$dat3
2431 aese $dat4,q8
2432 aesmc $dat4,$dat4
2433 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2434
2435 aese $dat0,q9
2436 aesmc $dat0,$dat0
2437 aese $dat1,q9
2438 aesmc $dat1,$dat1
2439 aese $dat2,q9
2440 aesmc $dat2,$dat2
2441 aese $dat3,q9
2442 aesmc $dat3,$dat3
2443 aese $dat4,q9
2444 aesmc $dat4,$dat4
2445 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2446 mov $key_,$key1
2447
2448 aese $dat0,q10
2449 aesmc $dat0,$dat0
2450 aese $dat1,q10
2451 aesmc $dat1,$dat1
2452 aese $dat2,q10
2453 aesmc $dat2,$dat2
2454 aese $dat3,q10
2455 aesmc $dat3,$dat3
2456 aese $dat4,q10
2457 aesmc $dat4,$dat4
2458 add $inp,$inp,$xoffset // x0 is adjusted in such way that
2459 // at exit from the loop v1.16b-v26.16b
2460 // are loaded with last "words"
2461 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
2462
2463 aese $dat0,q11
2464 aesmc $dat0,$dat0
2465 aese $dat1,q11
2466 aesmc $dat1,$dat1
2467 aese $dat2,q11
2468 aesmc $dat2,$dat2
2469 aese $dat3,q11
2470 aesmc $dat3,$dat3
2471 aese $dat4,q11
2472 aesmc $dat4,$dat4
2473
2474 aese $dat0,q12
2475 aesmc $dat0,$dat0
2476 aese $dat1,q12
2477 aesmc $dat1,$dat1
2478 aese $dat2,q12
2479 aesmc $dat2,$dat2
2480 aese $dat3,q12
2481 aesmc $dat3,$dat3
2482 aese $dat4,q12
2483 aesmc $dat4,$dat4
2484
2485 aese $dat0,q13
2486 aesmc $dat0,$dat0
2487 aese $dat1,q13
2488 aesmc $dat1,$dat1
2489 aese $dat2,q13
2490 aesmc $dat2,$dat2
2491 aese $dat3,q13
2492 aesmc $dat3,$dat3
2493 aese $dat4,q13
2494 aesmc $dat4,$dat4
2495
2496 aese $dat0,q14
2497 aesmc $dat0,$dat0
2498 aese $dat1,q14
2499 aesmc $dat1,$dat1
2500 aese $dat2,q14
2501 aesmc $dat2,$dat2
2502 aese $dat3,q14
2503 aesmc $dat3,$dat3
2504 aese $dat4,q14
2505 aesmc $dat4,$dat4
2506
2507 veor $tmp0,$rndlast,$iv0
2508 aese $dat0,q15
2509 // The iv for first block of one iteration
2510 extr $midnumx,$ivh,$ivh,#32
2511 extr $ivh,$ivh,$ivl,#63
2512 and $tmpmw,$constnum,$midnum,asr#31
2513 eor $ivl,$tmpmx,$ivl,lsl#1
2514 fmov $ivd00,$ivl
2515 fmov $ivd01,$ivh
2516 veor $tmp1,$rndlast,$iv1
2517 vld1.8 {$in0},[$inp],#16
2518 aese $dat1,q15
2519 // The iv for second block
2520 extr $midnumx,$ivh,$ivh,#32
2521 extr $ivh,$ivh,$ivl,#63
2522 and $tmpmw,$constnum,$midnum,asr#31
2523 eor $ivl,$tmpmx,$ivl,lsl#1
2524 fmov $ivd10,$ivl
2525 fmov $ivd11,$ivh
2526 veor $tmp2,$rndlast,$iv2
2527 vld1.8 {$in1},[$inp],#16
2528 aese $dat2,q15
2529 // The iv for third block
2530 extr $midnumx,$ivh,$ivh,#32
2531 extr $ivh,$ivh,$ivl,#63
2532 and $tmpmw,$constnum,$midnum,asr#31
2533 eor $ivl,$tmpmx,$ivl,lsl#1
2534 fmov $ivd20,$ivl
2535 fmov $ivd21,$ivh
2536 veor $tmp3,$rndlast,$iv3
2537 vld1.8 {$in2},[$inp],#16
2538 aese $dat3,q15
2539 // The iv for fourth block
2540 extr $midnumx,$ivh,$ivh,#32
2541 extr $ivh,$ivh,$ivl,#63
2542 and $tmpmw,$constnum,$midnum,asr#31
2543 eor $ivl,$tmpmx,$ivl,lsl#1
2544 fmov $ivd30,$ivl
2545 fmov $ivd31,$ivh
2546 veor $tmp4,$rndlast,$iv4
2547 vld1.8 {$in3},[$inp],#16
2548 aese $dat4,q15
2549
2550 // The iv for fifth block
2551 extr $midnumx,$ivh,$ivh,#32
2552 extr $ivh,$ivh,$ivl,#63
2553 and $tmpmw,$constnum,$midnum,asr #31
2554 eor $ivl,$tmpmx,$ivl,lsl #1
2555 fmov $ivd40,$ivl
2556 fmov $ivd41,$ivh
2557
2558 vld1.8 {$in4},[$inp],#16
2559 cbz $xoffset,.Lxts_enc_tail4x
2560 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2561 veor $tmp0,$tmp0,$dat0
2562 veor $dat0,$in0,$iv0
2563 veor $tmp1,$tmp1,$dat1
2564 veor $dat1,$in1,$iv1
2565 veor $tmp2,$tmp2,$dat2
2566 veor $dat2,$in2,$iv2
2567 veor $tmp3,$tmp3,$dat3
2568 veor $dat3,$in3,$iv3
2569 veor $tmp4,$tmp4,$dat4
2570 vst1.8 {$tmp0},[$out],#16
2571 veor $dat4,$in4,$iv4
2572 vst1.8 {$tmp1},[$out],#16
2573 mov $rounds,$rounds0
2574 vst1.8 {$tmp2},[$out],#16
2575 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2576 vst1.8 {$tmp3},[$out],#16
2577 vst1.8 {$tmp4},[$out],#16
2578 b.hs .Loop5x_xts_enc
2579
2580
2581 // If 4 blocks are left, reuse the five-block processing path.
2582 cmn $len,#0x10
2583 b.ne .Loop5x_enc_after
2584 vorr $iv4,$iv3,$iv3
2585 vorr $iv3,$iv2,$iv2
2586 vorr $iv2,$iv1,$iv1
2587 vorr $iv1,$iv0,$iv0
2588 fmov $ivl,$ivd40
2589 fmov $ivh,$ivd41
2590 veor $dat0,$iv0,$in0
2591 veor $dat1,$iv1,$in1
2592 veor $dat2,$in2,$iv2
2593 veor $dat3,$in3,$iv3
2594 veor $dat4,$in4,$iv4
2595 b.eq .Loop5x_xts_enc
2596
2597.Loop5x_enc_after:
2598 add $len,$len,#0x50
2599 cbz $len,.Lxts_enc_done
2600
2601 add $rounds,$rounds0,#2
2602 subs $len,$len,#0x30
2603 b.lo .Lxts_inner_enc_tail
2604
2605 veor $dat0,$iv0,$in2
2606 veor $dat1,$iv1,$in3
2607 veor $dat2,$in4,$iv2
2608 b .Lxts_outer_enc_tail
2609
2610.align 4
2611.Lxts_enc_tail4x:
2612 add $inp,$inp,#16
2613 veor $tmp1,$dat1,$tmp1
2614 vst1.8 {$tmp1},[$out],#16
2615 veor $tmp2,$dat2,$tmp2
2616 vst1.8 {$tmp2},[$out],#16
2617 veor $tmp3,$dat3,$tmp3
2618 veor $tmp4,$dat4,$tmp4
2619 vst1.8 {$tmp3-$tmp4},[$out],#32
2620
2621 b .Lxts_enc_done
2622.align 4
2623.Lxts_outer_enc_tail:
2624 aese $dat0,q8
2625 aesmc $dat0,$dat0
2626 aese $dat1,q8
2627 aesmc $dat1,$dat1
2628 aese $dat2,q8
2629 aesmc $dat2,$dat2
2630 vld1.32 {q8},[$key_],#16
2631 subs $rounds,$rounds,#2
2632 aese $dat0,q9
2633 aesmc $dat0,$dat0
2634 aese $dat1,q9
2635 aesmc $dat1,$dat1
2636 aese $dat2,q9
2637 aesmc $dat2,$dat2
2638 vld1.32 {q9},[$key_],#16
2639 b.gt .Lxts_outer_enc_tail
2640
2641 aese $dat0,q8
2642 aesmc $dat0,$dat0
2643 aese $dat1,q8
2644 aesmc $dat1,$dat1
2645 aese $dat2,q8
2646 aesmc $dat2,$dat2
2647 veor $tmp0,$iv0,$rndlast
2648 subs $len,$len,#0x30
2649 // The iv for first block
2650 fmov $ivl,$ivd20
2651 fmov $ivh,$ivd21
2652 //mov $constnum,#0x87
2653 extr $midnumx,$ivh,$ivh,#32
2654 extr $ivh,$ivh,$ivl,#63
2655 and $tmpmw,$constnum,$midnum,asr#31
2656 eor $ivl,$tmpmx,$ivl,lsl#1
2657 fmov $ivd00,$ivl
2658 fmov $ivd01,$ivh
2659 veor $tmp1,$iv1,$rndlast
2660 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
2661 aese $dat0,q9
2662 aesmc $dat0,$dat0
2663 aese $dat1,q9
2664 aesmc $dat1,$dat1
2665 aese $dat2,q9
2666 aesmc $dat2,$dat2
2667 veor $tmp2,$iv2,$rndlast
2668
2669 add $xoffset,$xoffset,#0x20
2670 add $inp,$inp,$xoffset
2671 mov $key_,$key1
2672
2673 aese $dat0,q12
2674 aesmc $dat0,$dat0
2675 aese $dat1,q12
2676 aesmc $dat1,$dat1
2677 aese $dat2,q12
2678 aesmc $dat2,$dat2
2679 aese $dat0,q13
2680 aesmc $dat0,$dat0
2681 aese $dat1,q13
2682 aesmc $dat1,$dat1
2683 aese $dat2,q13
2684 aesmc $dat2,$dat2
2685 aese $dat0,q14
2686 aesmc $dat0,$dat0
2687 aese $dat1,q14
2688 aesmc $dat1,$dat1
2689 aese $dat2,q14
2690 aesmc $dat2,$dat2
2691 aese $dat0,q15
2692 aese $dat1,q15
2693 aese $dat2,q15
2694 vld1.8 {$in2},[$inp],#16
2695 add $rounds,$rounds0,#2
2696 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2697 veor $tmp0,$tmp0,$dat0
2698 veor $tmp1,$tmp1,$dat1
2699 veor $dat2,$dat2,$tmp2
2700 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2701 vst1.8 {$tmp0},[$out],#16
2702 vst1.8 {$tmp1},[$out],#16
2703 vst1.8 {$dat2},[$out],#16
2704 cmn $len,#0x30
2705 b.eq .Lxts_enc_done
2706.Lxts_encxor_one:
2707 vorr $in3,$in1,$in1
2708 vorr $in4,$in2,$in2
2709 nop
2710
2711.Lxts_inner_enc_tail:
2712 cmn $len,#0x10
2713 veor $dat1,$in3,$iv0
2714 veor $dat2,$in4,$iv1
2715 b.eq .Lxts_enc_tail_loop
2716 veor $dat2,$in4,$iv0
2717.Lxts_enc_tail_loop:
2718 aese $dat1,q8
2719 aesmc $dat1,$dat1
2720 aese $dat2,q8
2721 aesmc $dat2,$dat2
2722 vld1.32 {q8},[$key_],#16
2723 subs $rounds,$rounds,#2
2724 aese $dat1,q9
2725 aesmc $dat1,$dat1
2726 aese $dat2,q9
2727 aesmc $dat2,$dat2
2728 vld1.32 {q9},[$key_],#16
2729 b.gt .Lxts_enc_tail_loop
2730
2731 aese $dat1,q8
2732 aesmc $dat1,$dat1
2733 aese $dat2,q8
2734 aesmc $dat2,$dat2
2735 aese $dat1,q9
2736 aesmc $dat1,$dat1
2737 aese $dat2,q9
2738 aesmc $dat2,$dat2
2739 aese $dat1,q12
2740 aesmc $dat1,$dat1
2741 aese $dat2,q12
2742 aesmc $dat2,$dat2
2743 cmn $len,#0x20
2744 aese $dat1,q13
2745 aesmc $dat1,$dat1
2746 aese $dat2,q13
2747 aesmc $dat2,$dat2
2748 veor $tmp1,$iv0,$rndlast
2749 aese $dat1,q14
2750 aesmc $dat1,$dat1
2751 aese $dat2,q14
2752 aesmc $dat2,$dat2
2753 veor $tmp2,$iv1,$rndlast
2754 aese $dat1,q15
2755 aese $dat2,q15
2756 b.eq .Lxts_enc_one
2757 veor $tmp1,$tmp1,$dat1
2758 vst1.8 {$tmp1},[$out],#16
2759 veor $tmp2,$tmp2,$dat2
2760 vorr $iv0,$iv1,$iv1
2761 vst1.8 {$tmp2},[$out],#16
2762 fmov $ivl,$ivd10
2763 fmov $ivh,$ivd11
2764 mov $constnum,#0x87
2765 extr $midnumx,$ivh,$ivh,#32
2766 extr $ivh,$ivh,$ivl,#63
2767 and $tmpmw,$constnum,$midnum,asr #31
2768 eor $ivl,$tmpmx,$ivl,lsl #1
2769 fmov $ivd00,$ivl
2770 fmov $ivd01,$ivh
2771 b .Lxts_enc_done
2772
2773.Lxts_enc_one:
2774 veor $tmp1,$tmp1,$dat2
2775 vorr $iv0,$iv0,$iv0
2776 vst1.8 {$tmp1},[$out],#16
2777 fmov $ivl,$ivd00
2778 fmov $ivh,$ivd01
2779 mov $constnum,#0x87
2780 extr $midnumx,$ivh,$ivh,#32
2781 extr $ivh,$ivh,$ivl,#63
2782 and $tmpmw,$constnum,$midnum,asr #31
2783 eor $ivl,$tmpmx,$ivl,lsl #1
2784 fmov $ivd00,$ivl
2785 fmov $ivd01,$ivh
2786 b .Lxts_enc_done
2787.align 5
2788.Lxts_enc_done:
2789 // Process the tail block with cipher stealing.
2790 tst $tailcnt,#0xf
2791 b.eq .Lxts_abort
2792
2793 mov $tmpinp,$inp
2794 mov $tmpoutp,$out
2795 sub $out,$out,#16
2796.composite_enc_loop:
2797 subs $tailcnt,$tailcnt,#1
2798 ldrb $l2outp,[$out,$tailcnt]
2799 ldrb $loutp,[$tmpinp,$tailcnt]
2800 strb $l2outp,[$tmpoutp,$tailcnt]
2801 strb $loutp,[$out,$tailcnt]
2802 b.gt .composite_enc_loop
2803.Lxts_enc_load_done:
2804 vld1.8 {$tmpin},[$out]
2805 veor $tmpin,$tmpin,$iv0
2806
2807 // Encrypt the composite block to get the second-to-last encrypted text block
2808 ldr $rounds,[$key1,#240] // load key schedule...
2809 vld1.8 {$dat},[$key1],#16
2810 sub $rounds,$rounds,#2
2811 vld1.8 {$dat1},[$key1],#16 // load key schedule...
2812.Loop_final_enc:
2813 aese $tmpin,$dat0
2814 aesmc $tmpin,$tmpin
2815 vld1.32 {$dat0},[$key1],#16
2816 subs $rounds,$rounds,#2
2817 aese $tmpin,$dat1
2818 aesmc $tmpin,$tmpin
2819 vld1.32 {$dat1},[$key1],#16
2820 b.gt .Loop_final_enc
2821
2822 aese $tmpin,$dat0
2823 aesmc $tmpin,$tmpin
2824 vld1.32 {$dat0},[$key1]
2825 aese $tmpin,$dat1
2826 veor $tmpin,$tmpin,$dat0
2827 veor $tmpin,$tmpin,$iv0
2828 vst1.8 {$tmpin},[$out]
2829
2830.Lxts_abort:
2831 ldp $tailcnt,$midnumx,[sp,#48]
2832 ldp $ivd10,$ivd20,[sp,#32]
2833 ldp $ivd30,$ivd40,[sp,#16]
2834 ldp $constnumx,$tmpinp,[sp],#64
2835.Lxts_enc_final_abort:
2836 ret
2837.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
2838___
2839
2840}}}
2841{{{
# Register assignments for the XTS-decrypt code path below.
# General-purpose argument registers (AAPCS64): input, output, byte length,
# data key, tweak key, and the initial tweak/IV pointer.
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# Round counters, key-schedule pointer, load step, and the 128-bit tweak
# split across two 64-bit GPRs ($ivl = low half, $ivh = high half).
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
# Scratch registers used by the ciphertext-stealing tail code.
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
# $tailcnt = number of trailing bytes (len mod 16); $constnum holds the
# GF(2^128) reduction constant 0x87 used when stepping the tweak.
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
# NEON registers: data blocks, loaded input, temporaries, last round key.
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Five tweak values (one per block of the 5-way interleaved loop) plus a
# scratch vector used for the composite (stolen) block.
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
# 64-bit doubleword views (low/high) of the tweak vector registers.
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

# Aliases used by the single-block (exactly-16-byte input) fast path.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2853
2854# q7 last round key
2855# q10-q15, q7 Last 7 round keys
2856# q8-q9 preloaded round keys except last 7 keys for big size
2857 # q20, q21, q8-q9 preloaded round keys except last 7 keys, for the 16-byte-only case
2858
2859{
# Third data block's registers; on 32-bit these overlap q9-q11.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    # 64-bit mode: remap blocks 3-5 onto q16-q23, which only exist in
    # the AArch64 register file, enabling the 5-way interleaved loop.
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
2867
# Emit the ${prefix}_xts_decrypt entry point (label, symbol type,
# alignment); XTS is implemented for the 64-bit flavour only.
$code.=<<___ if ($flavour =~ /64/);
.globl ${prefix}_xts_decrypt
.type ${prefix}_xts_decrypt,%function
.align 5
${prefix}_xts_decrypt:
___
2874$code.=<<___ if ($flavour =~ /64/);
2875 cmp $len,#16
2876 // Original input data size bigger than 16, jump to big size processing.
2877 b.ne .Lxts_dec_big_size
2878 // Encrypt the iv with key2, as the first XEX iv.
2879 ldr $rounds,[$key2,#240]
2880 vld1.8 {$dat},[$key2],#16
2881 vld1.8 {$iv0},[$ivp]
2882 sub $rounds,$rounds,#2
2883 vld1.8 {$dat1},[$key2],#16
2884
2885.Loop_dec_small_iv_enc:
2886 aese $iv0,$dat
2887 aesmc $iv0,$iv0
2888 vld1.32 {$dat},[$key2],#16
2889 subs $rounds,$rounds,#2
2890 aese $iv0,$dat1
2891 aesmc $iv0,$iv0
2892 vld1.32 {$dat1},[$key2],#16
2893 b.gt .Loop_dec_small_iv_enc
2894
2895 aese $iv0,$dat
2896 aesmc $iv0,$iv0
2897 vld1.32 {$dat},[$key2]
2898 aese $iv0,$dat1
2899 veor $iv0,$iv0,$dat
2900
2901 vld1.8 {$dat0},[$inp]
2902 veor $dat0,$iv0,$dat0
2903
2904 ldr $rounds,[$key1,#240]
2905 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2906
2907 aesd $dat0,q20
2908 aesimc $dat0,$dat0
2909 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2910 aesd $dat0,q21
2911 aesimc $dat0,$dat0
2912 subs $rounds,$rounds,#10 // bias
2913 b.eq .Lxts_128_dec
2914.Lxts_dec_round_loop:
2915 aesd $dat0,q8
2916 aesimc $dat0,$dat0
2917 vld1.32 {q8},[$key1],#16 // load key schedule...
2918 aesd $dat0,q9
2919 aesimc $dat0,$dat0
2920 vld1.32 {q9},[$key1],#16 // load key schedule...
2921 subs $rounds,$rounds,#2 // bias
2922 b.gt .Lxts_dec_round_loop
2923.Lxts_128_dec:
2924 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2925 aesd $dat0,q8
2926 aesimc $dat0,$dat0
2927 aesd $dat0,q9
2928 aesimc $dat0,$dat0
2929 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2930 aesd $dat0,q10
2931 aesimc $dat0,$dat0
2932 aesd $dat0,q11
2933 aesimc $dat0,$dat0
2934 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2935 aesd $dat0,q12
2936 aesimc $dat0,$dat0
2937 aesd $dat0,q13
2938 aesimc $dat0,$dat0
2939 vld1.32 {$rndlast},[$key1]
2940 aesd $dat0,q14
2941 aesimc $dat0,$dat0
2942 aesd $dat0,q15
2943 veor $dat0,$dat0,$rndlast
2944 veor $dat0,$iv0,$dat0
2945 vst1.8 {$dat0},[$out]
2946 b .Lxts_dec_final_abort
2947.Lxts_dec_big_size:
2948___
2949$code.=<<___ if ($flavour =~ /64/);
2950 stp $constnumx,$tmpinp,[sp,#-64]!
2951 stp $tailcnt,$midnumx,[sp,#48]
2952 stp $ivd10,$ivd20,[sp,#32]
2953 stp $ivd30,$ivd40,[sp,#16]
2954
2955 and $tailcnt,$len,#0xf
2956 and $len,$len,#-16
2957 subs $len,$len,#16
2958 mov $step,#16
2959 b.lo .Lxts_dec_abort
2960
2961 // Encrypt the iv with key2, as the first XEX iv
2962 ldr $rounds,[$key2,#240]
2963 vld1.8 {$dat},[$key2],#16
2964 vld1.8 {$iv0},[$ivp]
2965 sub $rounds,$rounds,#2
2966 vld1.8 {$dat1},[$key2],#16
2967
2968.Loop_dec_iv_enc:
2969 aese $iv0,$dat
2970 aesmc $iv0,$iv0
2971 vld1.32 {$dat},[$key2],#16
2972 subs $rounds,$rounds,#2
2973 aese $iv0,$dat1
2974 aesmc $iv0,$iv0
2975 vld1.32 {$dat1},[$key2],#16
2976 b.gt .Loop_dec_iv_enc
2977
2978 aese $iv0,$dat
2979 aesmc $iv0,$iv0
2980 vld1.32 {$dat},[$key2]
2981 aese $iv0,$dat1
2982 veor $iv0,$iv0,$dat
2983
2984 // The iv for second block
2985 // $ivl- iv(low), $ivh - iv(high)
2986 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2987 fmov $ivl,$ivd00
2988 fmov $ivh,$ivd01
2989 mov $constnum,#0x87
2990 extr $midnumx,$ivh,$ivh,#32
2991 extr $ivh,$ivh,$ivl,#63
2992 and $tmpmw,$constnum,$midnum,asr #31
2993 eor $ivl,$tmpmx,$ivl,lsl #1
2994 fmov $ivd10,$ivl
2995 fmov $ivd11,$ivh
2996
2997 ldr $rounds0,[$key1,#240] // load rounds number
2998
2999 // The iv for third block
3000 extr $midnumx,$ivh,$ivh,#32
3001 extr $ivh,$ivh,$ivl,#63
3002 and $tmpmw,$constnum,$midnum,asr #31
3003 eor $ivl,$tmpmx,$ivl,lsl #1
3004 fmov $ivd20,$ivl
3005 fmov $ivd21,$ivh
3006
3007 vld1.32 {q8-q9},[$key1] // load key schedule...
3008 sub $rounds0,$rounds0,#6
3009 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3010 sub $rounds0,$rounds0,#2
3011 vld1.32 {q10-q11},[$key_],#32 // load key schedule...
3012 vld1.32 {q12-q13},[$key_],#32
3013 vld1.32 {q14-q15},[$key_],#32
3014 vld1.32 {$rndlast},[$key_]
3015
3016 // The iv for fourth block
3017 extr $midnumx,$ivh,$ivh,#32
3018 extr $ivh,$ivh,$ivl,#63
3019 and $tmpmw,$constnum,$midnum,asr #31
3020 eor $ivl,$tmpmx,$ivl,lsl #1
3021 fmov $ivd30,$ivl
3022 fmov $ivd31,$ivh
3023
3024 add $key_,$key1,#32
3025 mov $rounds,$rounds0
3026 b .Lxts_dec
3027
3028 // Decryption
3029.align 5
3030.Lxts_dec:
3031 tst $tailcnt,#0xf
3032 b.eq .Lxts_dec_begin
3033 subs $len,$len,#16
3034 csel $step,xzr,$step,eq
3035 vld1.8 {$dat},[$inp],#16
3036 b.lo .Lxts_done
3037 sub $inp,$inp,#16
3038.Lxts_dec_begin:
3039 vld1.8 {$dat},[$inp],$step
3040 subs $len,$len,#32 // bias
3041 add $rounds,$rounds0,#2
3042 vorr $in1,$dat,$dat
3043 vorr $dat1,$dat,$dat
3044 vorr $in3,$dat,$dat
3045 vld1.8 {$dat2},[$inp],#16
3046 vorr $in2,$dat2,$dat2
3047 vorr $in4,$dat2,$dat2
3048 b.lo .Lxts_inner_dec_tail
3049 veor $dat,$dat,$iv0 // before decrypt, xor with iv
3050 veor $dat2,$dat2,$iv1
3051
3052 vorr $dat1,$dat2,$dat2
3053 vld1.8 {$dat2},[$inp],#16
3054 vorr $in0,$dat,$dat
3055 vorr $in1,$dat1,$dat1
3056 veor $in2,$dat2,$iv2 // third block xor with third iv
3057 veor $dat2,$dat2,$iv2
3058 cmp $len,#32
3059 b.lo .Lxts_outer_dec_tail
3060
3061 vld1.8 {$dat3},[$inp],#16
3062
3063 // The iv for fifth block
3064 extr $midnumx,$ivh,$ivh,#32
3065 extr $ivh,$ivh,$ivl,#63
3066 and $tmpmw,$constnum,$midnum,asr #31
3067 eor $ivl,$tmpmx,$ivl,lsl #1
3068 fmov $ivd40,$ivl
3069 fmov $ivd41,$ivh
3070
3071 vld1.8 {$dat4},[$inp],#16
3072 veor $dat3,$dat3,$iv3 // the fourth block
3073 veor $dat4,$dat4,$iv4
3074 sub $len,$len,#32 // bias
3075 mov $rounds,$rounds0
3076 b .Loop5x_xts_dec
3077
3078.align 4
3079.Loop5x_xts_dec:
3080 aesd $dat0,q8
3081 aesimc $dat0,$dat0
3082 aesd $dat1,q8
3083 aesimc $dat1,$dat1
3084 aesd $dat2,q8
3085 aesimc $dat2,$dat2
3086 aesd $dat3,q8
3087 aesimc $dat3,$dat3
3088 aesd $dat4,q8
3089 aesimc $dat4,$dat4
3090 vld1.32 {q8},[$key_],#16 // load key schedule...
3091 subs $rounds,$rounds,#2
3092 aesd $dat0,q9
3093 aesimc $dat0,$dat0
3094 aesd $dat1,q9
3095 aesimc $dat1,$dat1
3096 aesd $dat2,q9
3097 aesimc $dat2,$dat2
3098 aesd $dat3,q9
3099 aesimc $dat3,$dat3
3100 aesd $dat4,q9
3101 aesimc $dat4,$dat4
3102 vld1.32 {q9},[$key_],#16 // load key schedule...
3103 b.gt .Loop5x_xts_dec
3104
3105 aesd $dat0,q8
3106 aesimc $dat0,$dat0
3107 aesd $dat1,q8
3108 aesimc $dat1,$dat1
3109 aesd $dat2,q8
3110 aesimc $dat2,$dat2
3111 aesd $dat3,q8
3112 aesimc $dat3,$dat3
3113 aesd $dat4,q8
3114 aesimc $dat4,$dat4
3115 subs $len,$len,#0x50 // because .Lxts_dec_tail4x
3116
3117 aesd $dat0,q9
3118 aesimc $dat0,$dat
3119 aesd $dat1,q9
3120 aesimc $dat1,$dat1
3121 aesd $dat2,q9
3122 aesimc $dat2,$dat2
3123 aesd $dat3,q9
3124 aesimc $dat3,$dat3
3125 aesd $dat4,q9
3126 aesimc $dat4,$dat4
3127 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3128 mov $key_,$key1
3129
3130 aesd $dat0,q10
3131 aesimc $dat0,$dat0
3132 aesd $dat1,q10
3133 aesimc $dat1,$dat1
3134 aesd $dat2,q10
3135 aesimc $dat2,$dat2
3136 aesd $dat3,q10
3137 aesimc $dat3,$dat3
3138 aesd $dat4,q10
3139 aesimc $dat4,$dat4
3140 add $inp,$inp,$xoffset // x0 is adjusted in such way that
3141 // at exit from the loop v1.16b-v26.16b
3142 // are loaded with last "words"
3143 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3144
3145 aesd $dat0,q11
3146 aesimc $dat0,$dat0
3147 aesd $dat1,q11
3148 aesimc $dat1,$dat1
3149 aesd $dat2,q11
3150 aesimc $dat2,$dat2
3151 aesd $dat3,q11
3152 aesimc $dat3,$dat3
3153 aesd $dat4,q11
3154 aesimc $dat4,$dat4
3155
3156 aesd $dat0,q12
3157 aesimc $dat0,$dat0
3158 aesd $dat1,q12
3159 aesimc $dat1,$dat1
3160 aesd $dat2,q12
3161 aesimc $dat2,$dat2
3162 aesd $dat3,q12
3163 aesimc $dat3,$dat3
3164 aesd $dat4,q12
3165 aesimc $dat4,$dat4
3166
3167 aesd $dat0,q13
3168 aesimc $dat0,$dat0
3169 aesd $dat1,q13
3170 aesimc $dat1,$dat1
3171 aesd $dat2,q13
3172 aesimc $dat2,$dat2
3173 aesd $dat3,q13
3174 aesimc $dat3,$dat3
3175 aesd $dat4,q13
3176 aesimc $dat4,$dat4
3177
3178 aesd $dat0,q14
3179 aesimc $dat0,$dat0
3180 aesd $dat1,q14
3181 aesimc $dat1,$dat1
3182 aesd $dat2,q14
3183 aesimc $dat2,$dat2
3184 aesd $dat3,q14
3185 aesimc $dat3,$dat3
3186 aesd $dat4,q14
3187 aesimc $dat4,$dat4
3188
3189 veor $tmp0,$rndlast,$iv0
3190 aesd $dat0,q15
3191 // The iv for first block of next iteration.
3192 extr $midnumx,$ivh,$ivh,#32
3193 extr $ivh,$ivh,$ivl,#63
3194 and $tmpmw,$constnum,$midnum,asr #31
3195 eor $ivl,$tmpmx,$ivl,lsl #1
3196 fmov $ivd00,$ivl
3197 fmov $ivd01,$ivh
3198 veor $tmp1,$rndlast,$iv1
3199 vld1.8 {$in0},[$inp],#16
3200 aesd $dat1,q15
3201 // The iv for second block
3202 extr $midnumx,$ivh,$ivh,#32
3203 extr $ivh,$ivh,$ivl,#63
3204 and $tmpmw,$constnum,$midnum,asr #31
3205 eor $ivl,$tmpmx,$ivl,lsl #1
3206 fmov $ivd10,$ivl
3207 fmov $ivd11,$ivh
3208 veor $tmp2,$rndlast,$iv2
3209 vld1.8 {$in1},[$inp],#16
3210 aesd $dat2,q15
3211 // The iv for third block
3212 extr $midnumx,$ivh,$ivh,#32
3213 extr $ivh,$ivh,$ivl,#63
3214 and $tmpmw,$constnum,$midnum,asr #31
3215 eor $ivl,$tmpmx,$ivl,lsl #1
3216 fmov $ivd20,$ivl
3217 fmov $ivd21,$ivh
3218 veor $tmp3,$rndlast,$iv3
3219 vld1.8 {$in2},[$inp],#16
3220 aesd $dat3,q15
3221 // The iv for fourth block
3222 extr $midnumx,$ivh,$ivh,#32
3223 extr $ivh,$ivh,$ivl,#63
3224 and $tmpmw,$constnum,$midnum,asr #31
3225 eor $ivl,$tmpmx,$ivl,lsl #1
3226 fmov $ivd30,$ivl
3227 fmov $ivd31,$ivh
3228 veor $tmp4,$rndlast,$iv4
3229 vld1.8 {$in3},[$inp],#16
3230 aesd $dat4,q15
3231
3232 // The iv for fifth block
3233 extr $midnumx,$ivh,$ivh,#32
3234 extr $ivh,$ivh,$ivl,#63
3235 and $tmpmw,$constnum,$midnum,asr #31
3236 eor $ivl,$tmpmx,$ivl,lsl #1
3237 fmov $ivd40,$ivl
3238 fmov $ivd41,$ivh
3239
3240 vld1.8 {$in4},[$inp],#16
3241 cbz $xoffset,.Lxts_dec_tail4x
3242 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3243 veor $tmp0,$tmp0,$dat0
3244 veor $dat0,$in0,$iv0
3245 veor $tmp1,$tmp1,$dat1
3246 veor $dat1,$in1,$iv1
3247 veor $tmp2,$tmp2,$dat2
3248 veor $dat2,$in2,$iv2
3249 veor $tmp3,$tmp3,$dat3
3250 veor $dat3,$in3,$iv3
3251 veor $tmp4,$tmp4,$dat4
3252 vst1.8 {$tmp0},[$out],#16
3253 veor $dat4,$in4,$iv4
3254 vst1.8 {$tmp1},[$out],#16
3255 mov $rounds,$rounds0
3256 vst1.8 {$tmp2},[$out],#16
3257 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3258 vst1.8 {$tmp3},[$out],#16
3259 vst1.8 {$tmp4},[$out],#16
3260 b.hs .Loop5x_xts_dec
3261
3262 cmn $len,#0x10
3263 b.ne .Loop5x_dec_after
3264 // If x2($len) equal to -0x10, the left blocks is 4.
3265 // After specially processing, utilize the five blocks processing again.
3266 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
3267 vorr $iv4,$iv3,$iv3
3268 vorr $iv3,$iv2,$iv2
3269 vorr $iv2,$iv1,$iv1
3270 vorr $iv1,$iv0,$iv0
3271 fmov $ivl,$ivd40
3272 fmov $ivh,$ivd41
3273 veor $dat0,$iv0,$in0
3274 veor $dat1,$iv1,$in1
3275 veor $dat2,$in2,$iv2
3276 veor $dat3,$in3,$iv3
3277 veor $dat4,$in4,$iv4
3278 b.eq .Loop5x_xts_dec
3279
3280.Loop5x_dec_after:
3281 add $len,$len,#0x50
3282 cbz $len,.Lxts_done
3283
3284 add $rounds,$rounds0,#2
3285 subs $len,$len,#0x30
3286 b.lo .Lxts_inner_dec_tail
3287
3288 veor $dat0,$iv0,$in2
3289 veor $dat1,$iv1,$in3
3290 veor $dat2,$in4,$iv2
3291 b .Lxts_outer_dec_tail
3292
3293.align 4
3294.Lxts_dec_tail4x:
3295 add $inp,$inp,#16
3296 vld1.32 {$dat0},[$inp],#16
3297 veor $tmp1,$dat1,$tmp0
3298 vst1.8 {$tmp1},[$out],#16
3299 veor $tmp2,$dat2,$tmp2
3300 vst1.8 {$tmp2},[$out],#16
3301 veor $tmp3,$dat3,$tmp3
3302 veor $tmp4,$dat4,$tmp4
3303 vst1.8 {$tmp3-$tmp4},[$out],#32
3304
3305 b .Lxts_done
3306.align 4
3307.Lxts_outer_dec_tail:
3308 aesd $dat0,q8
3309 aesimc $dat0,$dat0
3310 aesd $dat1,q8
3311 aesimc $dat1,$dat1
3312 aesd $dat2,q8
3313 aesimc $dat2,$dat2
3314 vld1.32 {q8},[$key_],#16
3315 subs $rounds,$rounds,#2
3316 aesd $dat0,q9
3317 aesimc $dat0,$dat0
3318 aesd $dat1,q9
3319 aesimc $dat1,$dat1
3320 aesd $dat2,q9
3321 aesimc $dat2,$dat2
3322 vld1.32 {q9},[$key_],#16
3323 b.gt .Lxts_outer_dec_tail
3324
3325 aesd $dat0,q8
3326 aesimc $dat0,$dat0
3327 aesd $dat1,q8
3328 aesimc $dat1,$dat1
3329 aesd $dat2,q8
3330 aesimc $dat2,$dat2
3331 veor $tmp0,$iv0,$rndlast
3332 subs $len,$len,#0x30
3333 // The iv for first block
3334 fmov $ivl,$ivd20
3335 fmov $ivh,$ivd21
3336 mov $constnum,#0x87
3337 extr $midnumx,$ivh,$ivh,#32
3338 extr $ivh,$ivh,$ivl,#63
3339 and $tmpmw,$constnum,$midnum,asr #31
3340 eor $ivl,$tmpmx,$ivl,lsl #1
3341 fmov $ivd00,$ivl
3342 fmov $ivd01,$ivh
3343 veor $tmp1,$iv1,$rndlast
3344 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
3345 aesd $dat0,q9
3346 aesimc $dat0,$dat0
3347 aesd $dat1,q9
3348 aesimc $dat1,$dat1
3349 aesd $dat2,q9
3350 aesimc $dat2,$dat2
3351 veor $tmp2,$iv2,$rndlast
3352 // The iv for second block
3353 extr $midnumx,$ivh,$ivh,#32
3354 extr $ivh,$ivh,$ivl,#63
3355 and $tmpmw,$constnum,$midnum,asr #31
3356 eor $ivl,$tmpmx,$ivl,lsl #1
3357 fmov $ivd10,$ivl
3358 fmov $ivd11,$ivh
3359
3360 add $xoffset,$xoffset,#0x20
3361 add $inp,$inp,$xoffset // $inp is adjusted to the last data
3362
3363 mov $key_,$key1
3364
3365 // The iv for third block
3366 extr $midnumx,$ivh,$ivh,#32
3367 extr $ivh,$ivh,$ivl,#63
3368 and $tmpmw,$constnum,$midnum,asr #31
3369 eor $ivl,$tmpmx,$ivl,lsl #1
3370 fmov $ivd20,$ivl
3371 fmov $ivd21,$ivh
3372
3373 aesd $dat0,q12
3374 aesimc $dat0,$dat0
3375 aesd $dat1,q12
3376 aesimc $dat1,$dat1
3377 aesd $dat2,q12
3378 aesimc $dat2,$dat2
3379 aesd $dat0,q13
3380 aesimc $dat0,$dat0
3381 aesd $dat1,q13
3382 aesimc $dat1,$dat1
3383 aesd $dat2,q13
3384 aesimc $dat2,$dat2
3385 aesd $dat0,q14
3386 aesimc $dat0,$dat0
3387 aesd $dat1,q14
3388 aesimc $dat1,$dat1
3389 aesd $dat2,q14
3390 aesimc $dat2,$dat2
3391 vld1.8 {$in2},[$inp],#16
3392 aesd $dat0,q15
3393 aesd $dat1,q15
3394 aesd $dat2,q15
3395 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3396 add $rounds,$rounds0,#2
3397 veor $tmp0,$tmp0,$dat0
3398 veor $tmp1,$tmp1,$dat1
3399 veor $dat2,$dat2,$tmp2
3400 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3401 vst1.8 {$tmp0},[$out],#16
3402 vst1.8 {$tmp1},[$out],#16
3403 vst1.8 {$dat2},[$out],#16
3404
3405 cmn $len,#0x30
3406 add $len,$len,#0x30
3407 b.eq .Lxts_done
3408 sub $len,$len,#0x30
3409 vorr $in3,$in1,$in1
3410 vorr $in4,$in2,$in2
3411 nop
3412
3413.Lxts_inner_dec_tail:
3414 // $len == -0x10 means two blocks left.
3415 cmn $len,#0x10
3416 veor $dat1,$in3,$iv0
3417 veor $dat2,$in4,$iv1
3418 b.eq .Lxts_dec_tail_loop
3419 veor $dat2,$in4,$iv0
3420.Lxts_dec_tail_loop:
3421 aesd $dat1,q8
3422 aesimc $dat1,$dat1
3423 aesd $dat2,q8
3424 aesimc $dat2,$dat2
3425 vld1.32 {q8},[$key_],#16
3426 subs $rounds,$rounds,#2
3427 aesd $dat1,q9
3428 aesimc $dat1,$dat1
3429 aesd $dat2,q9
3430 aesimc $dat2,$dat2
3431 vld1.32 {q9},[$key_],#16
3432 b.gt .Lxts_dec_tail_loop
3433
3434 aesd $dat1,q8
3435 aesimc $dat1,$dat1
3436 aesd $dat2,q8
3437 aesimc $dat2,$dat2
3438 aesd $dat1,q9
3439 aesimc $dat1,$dat1
3440 aesd $dat2,q9
3441 aesimc $dat2,$dat2
3442 aesd $dat1,q12
3443 aesimc $dat1,$dat1
3444 aesd $dat2,q12
3445 aesimc $dat2,$dat2
3446 cmn $len,#0x20
3447 aesd $dat1,q13
3448 aesimc $dat1,$dat1
3449 aesd $dat2,q13
3450 aesimc $dat2,$dat2
3451 veor $tmp1,$iv0,$rndlast
3452 aesd $dat1,q14
3453 aesimc $dat1,$dat1
3454 aesd $dat2,q14
3455 aesimc $dat2,$dat2
3456 veor $tmp2,$iv1,$rndlast
3457 aesd $dat1,q15
3458 aesd $dat2,q15
3459 b.eq .Lxts_dec_one
3460 veor $tmp1,$tmp1,$dat1
3461 veor $tmp2,$tmp2,$dat2
3462 vorr $iv0,$iv2,$iv2
3463 vorr $iv1,$iv3,$iv3
3464 vst1.8 {$tmp1},[$out],#16
3465 vst1.8 {$tmp2},[$out],#16
3466 add $len,$len,#16
3467 b .Lxts_done
3468
3469.Lxts_dec_one:
3470 veor $tmp1,$tmp1,$dat2
3471 vorr $iv0,$iv1,$iv1
3472 vorr $iv1,$iv2,$iv2
3473 vst1.8 {$tmp1},[$out],#16
3474 add $len,$len,#32
3475
3476.Lxts_done:
3477 tst $tailcnt,#0xf
3478 b.eq .Lxts_dec_abort
3479 // Processing the last two blocks with cipher stealing.
3480 mov x7,x3
3481 cbnz x2,.Lxts_dec_1st_done
3482 vld1.32 {$dat0},[$inp],#16
3483
3484 // Decrypt the second-to-last block to get the last plain-text block
3485.Lxts_dec_1st_done:
3486 eor $tmpin,$dat0,$iv1
3487 ldr $rounds,[$key1,#240]
3488 vld1.32 {$dat0},[$key1],#16
3489 sub $rounds,$rounds,#2
3490 vld1.32 {$dat1},[$key1],#16
3491.Loop_final_2nd_dec:
3492 aesd $tmpin,$dat0
3493 aesimc $tmpin,$tmpin
3494 vld1.32 {$dat0},[$key1],#16 // load key schedule...
3495 subs $rounds,$rounds,#2
3496 aesd $tmpin,$dat1
3497 aesimc $tmpin,$tmpin
3498 vld1.32 {$dat1},[$key1],#16 // load key schedule...
3499 b.gt .Loop_final_2nd_dec
3500
3501 aesd $tmpin,$dat0
3502 aesimc $tmpin,$tmpin
3503 vld1.32 {$dat0},[$key1]
3504 aesd $tmpin,$dat1
3505 veor $tmpin,$tmpin,$dat0
3506 veor $tmpin,$tmpin,$iv1
3507 vst1.8 {$tmpin},[$out]
3508
3509 mov $tmpinp,$inp
3510 add $tmpoutp,$out,#16
3511
3512 // Combine the $tailcnt trailing (unaligned) bytes with the second-to-last
3513 // plain-text block to form the last composite block.
3514.composite_dec_loop:
3515 subs $tailcnt,$tailcnt,#1
3516 ldrb $l2outp,[$out,$tailcnt]
3517 ldrb $loutp,[$tmpinp,$tailcnt]
3518 strb $l2outp,[$tmpoutp,$tailcnt]
3519 strb $loutp,[$out,$tailcnt]
3520 b.gt .composite_dec_loop
3521.Lxts_dec_load_done:
3522 vld1.8 {$tmpin},[$out]
3523 veor $tmpin,$tmpin,$iv0
3524
3525 // Decrypt the composite block to get the last second plain text block
3526 ldr $rounds,[$key_,#240]
3527 vld1.8 {$dat},[$key_],#16
3528 sub $rounds,$rounds,#2
3529 vld1.8 {$dat1},[$key_],#16
3530.Loop_final_dec:
3531 aesd $tmpin,$dat0
3532 aesimc $tmpin,$tmpin
3533 vld1.32 {$dat0},[$key_],#16 // load key schedule...
3534 subs $rounds,$rounds,#2
3535 aesd $tmpin,$dat1
3536 aesimc $tmpin,$tmpin
3537 vld1.32 {$dat1},[$key_],#16 // load key schedule...
3538 b.gt .Loop_final_dec
3539
3540 aesd $tmpin,$dat0
3541 aesimc $tmpin,$tmpin
3542 vld1.32 {$dat0},[$key_]
3543 aesd $tmpin,$dat1
3544 veor $tmpin,$tmpin,$dat0
3545 veor $tmpin,$tmpin,$iv0
3546 vst1.8 {$tmpin},[$out]
3547
3548.Lxts_dec_abort:
3549 ldp $tailcnt,$midnumx,[sp,#48]
3550 ldp $ivd10,$ivd20,[sp,#32]
3551 ldp $ivd30,$ivd40,[sp,#16]
3552 ldp $constnumx,$tmpinp,[sp],#64
3553
3554.Lxts_dec_final_abort:
3555 ret
3556.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
3557___
3558}
3559}}}
# Close the preprocessor conditional opened earlier in the file
# (outside this chunk) around the hardware-AES code.
$code.=<<___;
#endif
___
3563########################################
########################################
# Post-process the accumulated $code template into real assembler syntax
# for the requested flavour, then print the result to STDOUT.
# NOTE: the order of the substitutions below is significant — later rules
# assume earlier rewrites have already been applied.
if ($flavour =~ /64/) {			######## 64-bit code
    # Raw AArch64 encodings of the AES instructions; only consumed by the
    # unaes() fallback below, whose substitution is currently commented out.
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES mnemonic with a "qN,qM"/"vN,vM" operand pair as a raw
    # .inst word, for assemblers lacking the crypto-extension mnemonics.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
		$opcode{$mnemonic}|$1|($2<<5),
		$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	# Expand `...` constructs by evaluating them as Perl.
	s/\`([^\`]*)\`/eval($1)/geo;

	# q0-q7 map straight to v0-v7; q8 and above are shifted up by 8.
	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # NEON-coprocessor encodings of the ARMv7 AES instructions.
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Emit an AES instruction as an INST(...) byte-list macro; see the
    # comment inside for why a raw .inst word is not used here.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Split a 128-bit (q-register) vtbl into the two 64-bit vtbl.8 halves
    # that the ARMv7 instruction set provides.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
		"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Rewrite a q-register lane duplicate as vdup.32 from the correct
    # d-register half and lane.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Rewrite a 32-bit lane move into a q register as a d-register vmov.32.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	# Expand `...` constructs by evaluating them as Perl.
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	# Predicated mov: emit the matching "it <cond>" instruction first
	# (NOTE(review): presumably required for Thumb-2 assembly — confirm).
	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print " it $2\n";
	}

	print $_,"\n";
    }
}
3673
3674close STDOUT or die "error closing STDOUT: $!";
# (web-page footer residue removed — not part of the original source file)