VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.7/crypto/aes/asm/aesv8-armx.pl@ 99507

最後變更 在這個檔案從99507是 97372,由 vboxsync 提交於 2 年 前

libs: Switch to openssl-3.0.7, bugref:10317

  • 屬性 svn:executable 設為 *
檔案大小: 80.7 KB
 
1#! /usr/bin/env perl
2# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. As does it support both 32- and 64-bit modes
20# of operation. Latter is achieved by limiting amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# April 2019
31#
32# Key to performance of parallelize-able modes is round instruction
33# interleaving. But which factor to use? There is optimal one for
34# each combination of instruction latency and issue rate, beyond
35# which increasing interleave factor doesn't pay off. While on cons
36# side we have code size increase and resource waste on platforms for
37# which interleave factor is too high. In other words you want it to
38# be just right. So far interleave factor of 3x was serving well all
39# platforms. But for ThunderX2 optimal interleave factor was measured
40# to be 5x...
41#
42# Performance in cycles per byte processed with 128-bit key:
43#
44# CBC enc CBC dec CTR
45# Apple A7 2.39 1.20 1.20
46# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48# Cortex-A72 1.33 0.85/0.88 0.92/0.96
49# Denver 1.96 0.65/0.86 0.76/0.80
50# Mongoose 1.33 1.23/1.20 1.30/1.20
51# Kryo 1.26 0.87/0.94 1.00/1.00
52# ThunderX2 5.95 1.25 1.30
53#
54# (*) original 3.64/1.34/1.32 results were for r0p0 revision
55# and are still same even for updated module;
56# (**) numbers after slash are for 32-bit code, which is 3x-
57# interleaved;
58
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm ARM translator next to this script first, then in the
# canonical ../../perlasm location of the OpenSSL tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything this script prints through arm-xlate.pl, which turns the
# mixed 32/64-bit mnemonics below into real assembler for $flavour.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Public symbols are emitted as aes_v8_* (e.g. aes_v8_set_encrypt_key).
$prefix="aes_v8";

# Windows ARMASM spells the byte directive DCB; GNU as uses .byte.
$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
76
# Common file header: the whole module is compiled out unless the target
# supports ARMv7+ NEON (arm_arch.h provides __ARM_MAX_ARCH__).
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
# 64-bit flavour: just enable the crypto extension for the assembler.
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
# 32-bit flavour: claim armv7-a (older binutils choke on armv8) and define
# INST() so AES instructions can be emitted as raw bytes; the Thumb-2
# encoding differs from the ARM one, hence the two variants.
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,d|0xc,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,d
#endif

.text
___
97
98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100# maintain both 32- and 64-bit codes within single module and
101# transliterate common code to either flavour with regex voodoo.
102#
{{{
# Key-schedule generators: aes_v8_set_encrypt_key / aes_v8_set_decrypt_key.
# Register map: $inp/$bits/$out are the C arguments (userKey, bits, key),
# $ptr/$rounds are scratch.  The vector registers are chosen so that the
# same names work in both flavours; in 32-bit mode q4-q7 (d8-d15) are
# callee-saved, hence the q8-q10 substitutes.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
    $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# .Lrcon holds the splatted round constant 0x01, a byte-permutation mask
# implementing SubWord+RotWord via vtbl, and the 0x1b constant used once
# the rcon sequence overflows GF(2^8) (for AES-128 rounds 9/10).
$code.=<<___;
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b

.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
# Argument validation returns -1 for NULL pointers and -2 for a bit count
# outside {128,192,256}.  The key expansion itself leans on the aese
# instruction with an all-zero round key to get SubBytes; vtbl with $mask
# performs the rotate-and-splat of the previous word.
$code.=<<___;
 mov $ptr,#-1
 cmp $inp,#0
 b.eq .Lenc_key_abort
 cmp $out,#0
 b.eq .Lenc_key_abort
 mov $ptr,#-2
 cmp $bits,#128
 b.lt .Lenc_key_abort
 cmp $bits,#256
 b.gt .Lenc_key_abort
 tst $bits,#0x3f
 b.ne .Lenc_key_abort

 adr $ptr,.Lrcon
 cmp $bits,#192

 veor $zero,$zero,$zero
 vld1.8 {$in0},[$inp],#16
 mov $bits,#8  // reuse $bits
 vld1.32 {$rcon,$mask},[$ptr],#32

 b.lt .Loop128
 b.eq .L192
 b .L256

.align 4
.Loop128:
 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key
 b.ne .Loop128

 vld1.32 {$rcon},[$ptr]

 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key

 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 veor $in0,$in0,$key
 vst1.32 {$in0},[$out]
 add $out,$out,#0x50

 mov $rounds,#10
 b .Ldone

.align 4
.L192:
 vld1.8 {$in1},[$inp],#8
 vmov.i8 $key,#8   // borrow $key
 vst1.32 {$in0},[$out],#16
 vsub.i8 $mask,$mask,$key // adjust the mask

.Loop192:
 vtbl.8 $key,{$in1},$mask
 vext.8 $tmp,$zero,$in0,#12
#ifdef __ARMEB__
 vst1.32 {$in1},[$out],#16
 sub $out,$out,#8
#else
 vst1.32 {$in1},[$out],#8
#endif
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp

 vdup.32 $tmp,${in0}[3]
 veor $tmp,$tmp,$in1
 veor $key,$key,$rcon
 vext.8 $in1,$zero,$in1,#12
 vshl.u8 $rcon,$rcon,#1
 veor $in1,$in1,$tmp
 veor $in0,$in0,$key
 veor $in1,$in1,$key
 vst1.32 {$in0},[$out],#16
 b.ne .Loop192

 mov $rounds,#12
 add $out,$out,#0x20
 b .Ldone

.align 4
.L256:
 vld1.8 {$in1},[$inp]
 mov $bits,#7
 mov $rounds,#14
 vst1.32 {$in0},[$out],#16

.Loop256:
 vtbl.8 $key,{$in1},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in1},[$out],#16
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key
 vst1.32 {$in0},[$out],#16
 b.eq .Ldone

 vdup.32 $key,${in0}[3]  // just splat
 vext.8 $tmp,$zero,$in1,#12
 aese $key,$zero

 veor $in1,$in1,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in1,$in1,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in1,$in1,$tmp

 veor $in1,$in1,$key
 b .Loop256

.Ldone:
 str $rounds,[$out]
 mov $ptr,#0

.Lenc_key_abort:
 mov x0,$ptr   // return value
 `"ldr x29,[sp],#16"  if ($flavour =~ /64/)`
 ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
 .inst 0xd503233f  // paciasp
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 stmdb sp!,{r4,lr}
___
# Decrypt schedule = encrypt schedule reversed, with aesimc (InvMixColumns)
# applied to every round key except the first and last.  The loop swaps
# entries from both ends ($out walks up, $inp walks down via x4=-16) until
# the pointers cross, then fixes up the middle entry.
$code.=<<___;
 bl .Lenc_key

 cmp x0,#0
 b.ne .Ldec_key_abort

 sub $out,$out,#240  // restore original $out
 mov x4,#-16
 add $inp,$out,x12,lsl#4 // end of key schedule

 vld1.32 {v0.16b},[$out]
 vld1.32 {v1.16b},[$inp]
 vst1.32 {v0.16b},[$inp],x4
 vst1.32 {v1.16b},[$out],#16

.Loop_imc:
 vld1.32 {v0.16b},[$out]
 vld1.32 {v1.16b},[$inp]
 aesimc v0.16b,v0.16b
 aesimc v1.16b,v1.16b
 vst1.32 {v0.16b},[$inp],x4
 vst1.32 {v1.16b},[$out],#16
 cmp $inp,$out
 b.hi .Loop_imc

 vld1.32 {v0.16b},[$out]
 aesimc v0.16b,v0.16b
 vst1.32 {v0.16b},[$inp]

 eor x0,x0,x0   // return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
 ldmia sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldp x29,x30,[sp],#16
 .inst 0xd50323bf  // autiasp
 ret
___
$code.=<<___;
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit a single-block ECB primitive: aes_v8_encrypt or aes_v8_decrypt.
# $dir is "en" or "de"; it selects the aese/aesmc vs aesd/aesimc mnemonic
# pair, everything else is shared.  The loop peels rounds two at a time
# ($rounds-2 iterations' worth), with the final two rounds done outside
# the loop because the last round has no MixColumns and folds in the last
# round key with a plain veor.
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
 ldr $rounds,[$key,#240]
 vld1.32 {$rndkey0},[$key],#16
 vld1.8 {$inout},[$inp]
 sub $rounds,$rounds,#2
 vld1.32 {$rndkey1},[$key],#16

.Loop_${dir}c:
 aes$e $inout,$rndkey0
 aes$mc $inout,$inout
 vld1.32 {$rndkey0},[$key],#16
 subs $rounds,$rounds,#2
 aes$e $inout,$rndkey1
 aes$mc $inout,$inout
 vld1.32 {$rndkey1},[$key],#16
 b.gt .Loop_${dir}c

 aes$e $inout,$rndkey0
 aes$mc $inout,$inout
 vld1.32 {$rndkey0},[$key]
 aes$e $inout,$rndkey1
 veor $inout,$inout,$rndkey0

 vst1.8 {$inout},[$out]
 ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
392
393# Performance in cycles per byte.
394# Processed with AES-ECB different key size.
395# It shows the value before and after optimization as below:
396# (before/after):
397#
398# AES-128-ECB AES-192-ECB AES-256-ECB
399# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
400# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
401
402# Optimization is implemented by loop unrolling and interleaving.
403# Commonly, we choose the unrolling factor as 5, if the input
404# data size smaller than 5 blocks, but not smaller than 3 blocks,
405# choose 3 as the unrolling factor.
406# If the input data size dsize >= 5*16 bytes, then take 5 blocks
407# as one iteration, every loop the left size lsize -= 5*16.
408# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
409# every loop lsize -=3*16.
410# If lsize < 3*16 bytes, treat them as the tail, interleave the
411# two blocks AES instructions.
412# There is one special case, if the original input data size dsize
413# = 16 bytes, we will treat it separately to improve the
414# performance: one independent code block without LR, FP load and
415# store, just looks like what the original ECB implementation does.
416
{{{
# ECB generator: aes_v8_ecb_encrypt(inp, out, len, key, enc).
# Strategy (see the comment block above): a dedicated 16-byte fast path
# (64-bit only), then a 5x-interleaved main loop (64-bit only), a 3x loop,
# and a 1/2-block tail, for both encrypt and decrypt directions.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7 last round key
### q10-q15 q7 Last 7 round keys
### q8-q9 preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# The 4th/5th interleave lanes exist only in 64-bit mode, where q16-q23
# are available; in 32-bit mode only the 3x path is emitted.
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
___
# 64-bit-only single-block fast path: no frame is set up, keys are loaded
# on the fly, and a short loop covers the extra rounds of AES-192/256.
$code.=<<___ if ($flavour =~ /64/);
 subs $len,$len,#16
 // Original input data size bigger than 16, jump to big size processing.
 b.ne .Lecb_big_size
 vld1.8 {$dat0},[$inp]
 cmp $enc,#0   // en- or decrypting?
 ldr $rounds,[$key,#240]
 vld1.32 {q5-q6},[$key],#32  // load key schedule...

 b.eq .Lecb_small_dec
 aese $dat0,q5
 aesmc $dat0,$dat0
 vld1.32 {q8-q9},[$key],#32  // load key schedule...
 aese $dat0,q6
 aesmc $dat0,$dat0
 subs $rounds,$rounds,#10  // if rounds==10, jump to aes-128-ecb processing
 b.eq .Lecb_128_enc
.Lecb_round_loop:
 aese $dat0,q8
 aesmc $dat0,$dat0
 vld1.32 {q8},[$key],#16  // load key schedule...
 aese $dat0,q9
 aesmc $dat0,$dat0
 vld1.32 {q9},[$key],#16  // load key schedule...
 subs $rounds,$rounds,#2  // bias
 b.gt .Lecb_round_loop
.Lecb_128_enc:
 vld1.32 {q10-q11},[$key],#32 // load key schedule...
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat0,q9
 aesmc $dat0,$dat0
 vld1.32 {q12-q13},[$key],#32 // load key schedule...
 aese $dat0,q10
 aesmc $dat0,$dat0
 aese $dat0,q11
 aesmc $dat0,$dat0
 vld1.32 {q14-q15},[$key],#32 // load key schedule...
 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat0,q13
 aesmc $dat0,$dat0
 vld1.32 {$rndlast},[$key]
 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat0,q15
 veor $dat0,$dat0,$rndlast
 vst1.8 {$dat0},[$out]
 b .Lecb_Final_abort
.Lecb_small_dec:
 aesd $dat0,q5
 aesimc $dat0,$dat0
 vld1.32 {q8-q9},[$key],#32  // load key schedule...
 aesd $dat0,q6
 aesimc $dat0,$dat0
 subs $rounds,$rounds,#10  // bias
 b.eq .Lecb_128_dec
.Lecb_dec_round_loop:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 vld1.32 {q8},[$key],#16  // load key schedule...
 aesd $dat0,q9
 aesimc $dat0,$dat0
 vld1.32 {q9},[$key],#16  // load key schedule...
 subs $rounds,$rounds,#2  // bias
 b.gt .Lecb_dec_round_loop
.Lecb_128_dec:
 vld1.32 {q10-q11},[$key],#32 // load key schedule...
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat0,q9
 aesimc $dat0,$dat0
 vld1.32 {q12-q13},[$key],#32 // load key schedule...
 aesd $dat0,q10
 aesimc $dat0,$dat0
 aesd $dat0,q11
 aesimc $dat0,$dat0
 vld1.32 {q14-q15},[$key],#32 // load key schedule...
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat0,q13
 aesimc $dat0,$dat0
 vld1.32 {$rndlast},[$key]
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat0,q15
 veor $dat0,$dat0,$rndlast
 vst1.8 {$dat0},[$out]
 b .Lecb_Final_abort
.Lecb_big_size:
___
# Multi-block entry: set up a frame (64-bit: frame record only; 32-bit:
# save r4-r8 plus d8-d15 as the AAPCS requires) and pick up the 5th C
# argument in 32-bit mode from the caller's stack.
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r8,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldmia ip,{r4-r5} @ load remaining args
 subs $len,$len,#16
___
# Preload the last 7 round keys into q10-q15/$rndlast once; q8-q9 are
# streamed through the remaining keys inside the loops.  $step is zeroed
# when exactly one block remains so the final load doesn't walk past $inp.
$code.=<<___;
 mov $step,#16
 b.lo .Lecb_done
 cclr $step,eq

 cmp $enc,#0   // en- or decrypting?
 ldr $rounds,[$key,#240]
 and $len,$len,#-16
 vld1.8 {$dat},[$inp],$step

 vld1.32 {q8-q9},[$key]  // load key schedule...
 sub $rounds,$rounds,#6
 add $key_,$key,x5,lsl#4  // pointer to last 7 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q10-q11},[$key_],#32
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]

 add $key_,$key,#32
 mov $cnt,$rounds
 b.eq .Lecb_dec

 vld1.8 {$dat1},[$inp],#16
 subs $len,$len,#32  // bias
 add $cnt,$rounds,#2
 vorr $in1,$dat1,$dat1
 vorr $dat2,$dat1,$dat1
 vorr $dat1,$dat,$dat
 b.lo .Lecb_enc_tail

 vorr $dat1,$in1,$in1
 vld1.8 {$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved encrypt main loop.  x6 is used both as the
# residual-length fixup for $inp at loop exit and as the "last iteration"
# flag tested by cbz.
$code.=<<___ if ($flavour =~ /64/);
 cmp $len,#32
 b.lo .Loop3x_ecb_enc

 vld1.8 {$dat3},[$inp],#16
 vld1.8 {$dat4},[$inp],#16
 sub $len,$len,#32  // bias
 mov $cnt,$rounds

.Loop5x_ecb_enc:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat3,q8
 aesmc $dat3,$dat3
 aese $dat4,q8
 aesmc $dat4,$dat4
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat3,q9
 aesmc $dat3,$dat3
 aese $dat4,q9
 aesmc $dat4,$dat4
 vld1.32 {q9},[$key_],#16
 b.gt .Loop5x_ecb_enc

 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat3,q8
 aesmc $dat3,$dat3
 aese $dat4,q8
 aesmc $dat4,$dat4
 cmp $len,#0x40   // because .Lecb_enc_tail4x
 sub $len,$len,#0x50

 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat3,q9
 aesmc $dat3,$dat3
 aese $dat4,q9
 aesmc $dat4,$dat4
 csel x6,xzr,$len,gt  // borrow x6, $cnt, "gt" is not typo
 mov $key_,$key

 aese $dat0,q10
 aesmc $dat0,$dat0
 aese $dat1,q10
 aesmc $dat1,$dat1
 aese $dat2,q10
 aesmc $dat2,$dat2
 aese $dat3,q10
 aesmc $dat3,$dat3
 aese $dat4,q10
 aesmc $dat4,$dat4
 add $inp,$inp,x6  // $inp is adjusted in such way that
     // at exit from the loop $dat1-$dat4
     // are loaded with last "words"
 add x6,$len,#0x60  // because .Lecb_enc_tail4x

 aese $dat0,q11
 aesmc $dat0,$dat0
 aese $dat1,q11
 aesmc $dat1,$dat1
 aese $dat2,q11
 aesmc $dat2,$dat2
 aese $dat3,q11
 aesmc $dat3,$dat3
 aese $dat4,q11
 aesmc $dat4,$dat4

 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat1,q12
 aesmc $dat1,$dat1
 aese $dat2,q12
 aesmc $dat2,$dat2
 aese $dat3,q12
 aesmc $dat3,$dat3
 aese $dat4,q12
 aesmc $dat4,$dat4

 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 aese $dat2,q13
 aesmc $dat2,$dat2
 aese $dat3,q13
 aesmc $dat3,$dat3
 aese $dat4,q13
 aesmc $dat4,$dat4

 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat1,q14
 aesmc $dat1,$dat1
 aese $dat2,q14
 aesmc $dat2,$dat2
 aese $dat3,q14
 aesmc $dat3,$dat3
 aese $dat4,q14
 aesmc $dat4,$dat4

 aese $dat0,q15
 vld1.8 {$in0},[$inp],#16
 aese $dat1,q15
 vld1.8 {$in1},[$inp],#16
 aese $dat2,q15
 vld1.8 {$in2},[$inp],#16
 aese $dat3,q15
 vld1.8 {$in3},[$inp],#16
 aese $dat4,q15
 vld1.8 {$in4},[$inp],#16
 cbz x6,.Lecb_enc_tail4x
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 veor $tmp0,$rndlast,$dat0
 vorr $dat0,$in0,$in0
 veor $tmp1,$rndlast,$dat1
 vorr $dat1,$in1,$in1
 veor $tmp2,$rndlast,$dat2
 vorr $dat2,$in2,$in2
 veor $tmp3,$rndlast,$dat3
 vorr $dat3,$in3,$in3
 veor $tmp4,$rndlast,$dat4
 vst1.8 {$tmp0},[$out],#16
 vorr $dat4,$in4,$in4
 vst1.8 {$tmp1},[$out],#16
 mov $cnt,$rounds
 vst1.8 {$tmp2},[$out],#16
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16
 b.hs .Loop5x_ecb_enc

 add $len,$len,#0x50
 cbz $len,.Lecb_done

 add $cnt,$rounds,#2
 subs $len,$len,#0x30
 vorr $dat0,$in2,$in2
 vorr $dat1,$in3,$in3
 vorr $dat2,$in4,$in4
 b.lo .Lecb_enc_tail

 b .Loop3x_ecb_enc

.align 4
.Lecb_enc_tail4x:
 veor $tmp1,$rndlast,$dat1
 veor $tmp2,$rndlast,$dat2
 veor $tmp3,$rndlast,$dat3
 veor $tmp4,$rndlast,$dat4
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16

 b .Lecb_done
.align 4
___
# 3x-interleaved encrypt loop (both flavours) plus the 1/2-block tail.
$code.=<<___;
.Loop3x_ecb_enc:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_ecb_enc

 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 subs $len,$len,#0x30
 mov.lo x6,$len   // x6, $cnt, is zero at this point
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 add $inp,$inp,x6  // $inp is adjusted in such way that
     // at exit from the loop $dat1-$dat2
     // are loaded with last "words"
 mov $key_,$key
 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat1,q12
 aesmc $dat1,$dat1
 aese $dat2,q12
 aesmc $dat2,$dat2
 vld1.8 {$in0},[$inp],#16
 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 aese $dat2,q13
 aesmc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat1,q14
 aesmc $dat1,$dat1
 aese $dat2,q14
 aesmc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aese $dat0,q15
 aese $dat1,q15
 aese $dat2,q15
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 add $cnt,$rounds,#2
 veor $tmp0,$rndlast,$dat0
 veor $tmp1,$rndlast,$dat1
 veor $dat2,$dat2,$rndlast
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp0},[$out],#16
 vorr $dat0,$in0,$in0
 vst1.8 {$tmp1},[$out],#16
 vorr $dat1,$in1,$in1
 vst1.8 {$dat2},[$out],#16
 vorr $dat2,$in2,$in2
 b.hs .Loop3x_ecb_enc

 cmn $len,#0x30
 b.eq .Lecb_done
 nop

.Lecb_enc_tail:
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Lecb_enc_tail

 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat1,q12
 aesmc $dat1,$dat1
 aese $dat2,q12
 aesmc $dat2,$dat2
 cmn $len,#0x20
 aese $dat1,q13
 aesmc $dat1,$dat1
 aese $dat2,q13
 aesmc $dat2,$dat2
 aese $dat1,q14
 aesmc $dat1,$dat1
 aese $dat2,q14
 aesmc $dat2,$dat2
 aese $dat1,q15
 aese $dat2,q15
 b.eq .Lecb_enc_one
 veor $tmp1,$rndlast,$dat1
 veor $tmp2,$rndlast,$dat2
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 b .Lecb_done

.Lecb_enc_one:
 veor $tmp1,$rndlast,$dat2
 vst1.8 {$tmp1},[$out],#16
 b .Lecb_done
___

# Decrypt side: same structure as the encrypt side (5x loop is 64-bit
# only, then 3x loop and tail) with aesd/aesimc substituted.
$code.=<<___;
.align 5
.Lecb_dec:
 vld1.8 {$dat1},[$inp],#16
 subs $len,$len,#32  // bias
 add $cnt,$rounds,#2
 vorr $in1,$dat1,$dat1
 vorr $dat2,$dat1,$dat1
 vorr $dat1,$dat,$dat
 b.lo .Lecb_dec_tail

 vorr $dat1,$in1,$in1
 vld1.8 {$dat2},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
 cmp $len,#32
 b.lo .Loop3x_ecb_dec

 vld1.8 {$dat3},[$inp],#16
 vld1.8 {$dat4},[$inp],#16
 sub $len,$len,#32  // bias
 mov $cnt,$rounds

.Loop5x_ecb_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat3,q8
 aesimc $dat3,$dat3
 aesd $dat4,q8
 aesimc $dat4,$dat4
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat3,q9
 aesimc $dat3,$dat3
 aesd $dat4,q9
 aesimc $dat4,$dat4
 vld1.32 {q9},[$key_],#16
 b.gt .Loop5x_ecb_dec

 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat3,q8
 aesimc $dat3,$dat3
 aesd $dat4,q8
 aesimc $dat4,$dat4
 cmp $len,#0x40   // because .Lecb_tail4x
 sub $len,$len,#0x50

 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat3,q9
 aesimc $dat3,$dat3
 aesd $dat4,q9
 aesimc $dat4,$dat4
 csel x6,xzr,$len,gt  // borrow x6, $cnt, "gt" is not typo
 mov $key_,$key

 aesd $dat0,q10
 aesimc $dat0,$dat0
 aesd $dat1,q10
 aesimc $dat1,$dat1
 aesd $dat2,q10
 aesimc $dat2,$dat2
 aesd $dat3,q10
 aesimc $dat3,$dat3
 aesd $dat4,q10
 aesimc $dat4,$dat4
 add $inp,$inp,x6  // $inp is adjusted in such way that
     // at exit from the loop $dat1-$dat4
     // are loaded with last "words"
 add x6,$len,#0x60  // because .Lecb_tail4x

 aesd $dat0,q11
 aesimc $dat0,$dat0
 aesd $dat1,q11
 aesimc $dat1,$dat1
 aesd $dat2,q11
 aesimc $dat2,$dat2
 aesd $dat3,q11
 aesimc $dat3,$dat3
 aesd $dat4,q11
 aesimc $dat4,$dat4

 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 aesd $dat3,q12
 aesimc $dat3,$dat3
 aesd $dat4,q12
 aesimc $dat4,$dat4

 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 aesd $dat3,q13
 aesimc $dat3,$dat3
 aesd $dat4,q13
 aesimc $dat4,$dat4

 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 aesd $dat3,q14
 aesimc $dat3,$dat3
 aesd $dat4,q14
 aesimc $dat4,$dat4

 aesd $dat0,q15
 vld1.8 {$in0},[$inp],#16
 aesd $dat1,q15
 vld1.8 {$in1},[$inp],#16
 aesd $dat2,q15
 vld1.8 {$in2},[$inp],#16
 aesd $dat3,q15
 vld1.8 {$in3},[$inp],#16
 aesd $dat4,q15
 vld1.8 {$in4},[$inp],#16
 cbz x6,.Lecb_tail4x
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 veor $tmp0,$rndlast,$dat0
 vorr $dat0,$in0,$in0
 veor $tmp1,$rndlast,$dat1
 vorr $dat1,$in1,$in1
 veor $tmp2,$rndlast,$dat2
 vorr $dat2,$in2,$in2
 veor $tmp3,$rndlast,$dat3
 vorr $dat3,$in3,$in3
 veor $tmp4,$rndlast,$dat4
 vst1.8 {$tmp0},[$out],#16
 vorr $dat4,$in4,$in4
 vst1.8 {$tmp1},[$out],#16
 mov $cnt,$rounds
 vst1.8 {$tmp2},[$out],#16
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16
 b.hs .Loop5x_ecb_dec

 add $len,$len,#0x50
 cbz $len,.Lecb_done

 add $cnt,$rounds,#2
 subs $len,$len,#0x30
 vorr $dat0,$in2,$in2
 vorr $dat1,$in3,$in3
 vorr $dat2,$in4,$in4
 b.lo .Lecb_dec_tail

 b .Loop3x_ecb_dec

.align 4
.Lecb_tail4x:
 veor $tmp1,$rndlast,$dat1
 veor $tmp2,$rndlast,$dat2
 veor $tmp3,$rndlast,$dat3
 veor $tmp4,$rndlast,$dat4
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16

 b .Lecb_done
.align 4
___
$code.=<<___;
.Loop3x_ecb_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_ecb_dec

 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 subs $len,$len,#0x30
 mov.lo x6,$len   // x6, $cnt, is zero at this point
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 add $inp,$inp,x6  // $inp is adjusted in such way that
     // at exit from the loop $dat1-$dat2
     // are loaded with last "words"
 mov $key_,$key
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 vld1.8 {$in0},[$inp],#16
 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aesd $dat0,q15
 aesd $dat1,q15
 aesd $dat2,q15
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 add $cnt,$rounds,#2
 veor $tmp0,$rndlast,$dat0
 veor $tmp1,$rndlast,$dat1
 veor $dat2,$dat2,$rndlast
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp0},[$out],#16
 vorr $dat0,$in0,$in0
 vst1.8 {$tmp1},[$out],#16
 vorr $dat1,$in1,$in1
 vst1.8 {$dat2},[$out],#16
 vorr $dat2,$in2,$in2
 b.hs .Loop3x_ecb_dec

 cmn $len,#0x30
 b.eq .Lecb_done
 nop

.Lecb_dec_tail:
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Lecb_dec_tail

 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 cmn $len,#0x20
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 aesd $dat1,q15
 aesd $dat2,q15
 b.eq .Lecb_dec_one
 veor $tmp1,$rndlast,$dat1
 veor $tmp2,$rndlast,$dat2
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 b .Lecb_done

.Lecb_dec_one:
 veor $tmp1,$rndlast,$dat2
 vst1.8 {$tmp1},[$out],#16

.Lecb_done:
___
}
# Epilogue: restore the 32-bit callee-saved registers, or pop the 64-bit
# frame record; .Lecb_Final_abort is the frameless fast-path exit.
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
___
$code.=<<___ if ($flavour =~ /64/);
.Lecb_Final_abort:
 ret
___
$code.=<<___;
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
1222{{{
# AES-CBC entry point: ${prefix}_cbc_encrypt(inp, out, len, key, ivp, enc).
# x0-x4 carry the first five arguments; w5 is the en-/decrypt selector and
# is later reused as the round counter ($rounds aliases $enc below).
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1226
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1229
### q8-q15 preloaded key schedule
1231
# Emit the function label (shared by both flavours).
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
___
# AArch64 prologue: save frame pointer and link register.
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
# AArch32 prologue: save callee-saved core registers and d8-d15 (per ABI),
# then fetch the 5th/6th arguments from the caller's stack.
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r8,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldmia ip,{r4-r5} @ load remaining args
___
# Common setup (load IV, first block, round keys) plus the encrypt paths:
# a generic path and a dedicated 128-bit key path (.Lcbc_enc128).
$code.=<<___;
 subs $len,$len,#16
 mov $step,#16
 b.lo .Lcbc_abort
 cclr $step,eq
1253
 cmp $enc,#0 // en- or decrypting?
 ldr $rounds,[$key,#240]
 and $len,$len,#-16
 vld1.8 {$ivec},[$ivp]
 vld1.8 {$dat},[$inp],$step
1259
 vld1.32 {q8-q9},[$key] // load key schedule...
 sub $rounds,$rounds,#6
 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q10-q11},[$key_],#32
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]
1268
 add $key_,$key,#32
 mov $cnt,$rounds
 b.eq .Lcbc_dec
1272
 cmp $rounds,#2
 veor $dat,$dat,$ivec
 veor $rndzero_n_last,q8,$rndlast
 b.eq .Lcbc_enc128
1277
 vld1.32 {$in0-$in1},[$key_]
 add $key_,$key,#16
 add $key4,$key,#16*4
 add $key5,$key,#16*5
 aese $dat,q8
 aesmc $dat,$dat
 add $key6,$key,#16*6
 add $key7,$key,#16*7
 b .Lenter_cbc_enc
1287
.align 4
.Loop_cbc_enc:
 aese $dat,q8
 aesmc $dat,$dat
 vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc:
 aese $dat,q9
 aesmc $dat,$dat
 aese $dat,$in0
 aesmc $dat,$dat
 vld1.32 {q8},[$key4]
 cmp $rounds,#4
 aese $dat,$in1
 aesmc $dat,$dat
 vld1.32 {q9},[$key5]
 b.eq .Lcbc_enc192
1304
 aese $dat,q8
 aesmc $dat,$dat
 vld1.32 {q8},[$key6]
 aese $dat,q9
 aesmc $dat,$dat
 vld1.32 {q9},[$key7]
 nop
1312
.Lcbc_enc192:
 aese $dat,q8
 aesmc $dat,$dat
 subs $len,$len,#16
 aese $dat,q9
 aesmc $dat,$dat
 cclr $step,eq
 aese $dat,q10
 aesmc $dat,$dat
 aese $dat,q11
 aesmc $dat,$dat
 vld1.8 {q8},[$inp],$step
 aese $dat,q12
 aesmc $dat,$dat
 veor q8,q8,$rndzero_n_last
 aese $dat,q13
 aesmc $dat,$dat
 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
 aese $dat,q14
 aesmc $dat,$dat
 aese $dat,q15
 veor $ivec,$dat,$rndlast
 b.hs .Loop_cbc_enc
1336
 vst1.8 {$ivec},[$out],#16
 b .Lcbc_done
1339
.align 5
.Lcbc_enc128:
 vld1.32 {$in0-$in1},[$key_]
 aese $dat,q8
 aesmc $dat,$dat
 b .Lenter_cbc_enc128
.Loop_cbc_enc128:
 aese $dat,q8
 aesmc $dat,$dat
 vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
 aese $dat,q9
 aesmc $dat,$dat
 subs $len,$len,#16
 aese $dat,$in0
 aesmc $dat,$dat
 cclr $step,eq
 aese $dat,$in1
 aesmc $dat,$dat
 aese $dat,q10
 aesmc $dat,$dat
 aese $dat,q11
 aesmc $dat,$dat
 vld1.8 {q8},[$inp],$step
 aese $dat,q12
 aesmc $dat,$dat
 aese $dat,q13
 aesmc $dat,$dat
 aese $dat,q14
 aesmc $dat,$dat
 veor q8,q8,$rndzero_n_last
 aese $dat,q15
 veor $ivec,$dat,$rndlast
 b.hs .Loop_cbc_enc128
1374
 vst1.8 {$ivec},[$out],#16
 b .Lcbc_done
___
# Decrypt path. Decryption parallelizes across blocks, so extra data
# registers are allocated; on 64-bit, q16-q23 enable a 5x interleaved loop.
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1380
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
1386
# Decrypt setup and (64-bit only) the 5x interleaved loop with its 4-block
# tail (.Lcbc_tail4x).
$code.=<<___;
.align 5
.Lcbc_dec:
 vld1.8 {$dat2},[$inp],#16
 subs $len,$len,#32 // bias
 add $cnt,$rounds,#2
 vorr $in1,$dat,$dat
 vorr $dat1,$dat,$dat
 vorr $in2,$dat2,$dat2
 b.lo .Lcbc_dec_tail
1397
 vorr $dat1,$dat2,$dat2
 vld1.8 {$dat2},[$inp],#16
 vorr $in0,$dat,$dat
 vorr $in1,$dat1,$dat1
 vorr $in2,$dat2,$dat2
___
$code.=<<___ if ($flavour =~ /64/);
 cmp $len,#32
 b.lo .Loop3x_cbc_dec
1407
 vld1.8 {$dat3},[$inp],#16
 vld1.8 {$dat4},[$inp],#16
 sub $len,$len,#32 // bias
 mov $cnt,$rounds
 vorr $in3,$dat3,$dat3
 vorr $in4,$dat4,$dat4
1414
.Loop5x_cbc_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat3,q8
 aesimc $dat3,$dat3
 aesd $dat4,q8
 aesimc $dat4,$dat4
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat3,q9
 aesimc $dat3,$dat3
 aesd $dat4,q9
 aesimc $dat4,$dat4
 vld1.32 {q9},[$key_],#16
 b.gt .Loop5x_cbc_dec
1440
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat3,q8
 aesimc $dat3,$dat3
 aesd $dat4,q8
 aesimc $dat4,$dat4
 cmp $len,#0x40 // because .Lcbc_tail4x
 sub $len,$len,#0x50
1453
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat3,q9
 aesimc $dat3,$dat3
 aesd $dat4,q9
 aesimc $dat4,$dat4
 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
 mov $key_,$key
1466
 aesd $dat0,q10
 aesimc $dat0,$dat0
 aesd $dat1,q10
 aesimc $dat1,$dat1
 aesd $dat2,q10
 aesimc $dat2,$dat2
 aesd $dat3,q10
 aesimc $dat3,$dat3
 aesd $dat4,q10
 aesimc $dat4,$dat4
 add $inp,$inp,x6 // $inp is adjusted in such way that
 // at exit from the loop $dat1-$dat4
 // are loaded with last "words"
 add x6,$len,#0x60 // because .Lcbc_tail4x
1481
 aesd $dat0,q11
 aesimc $dat0,$dat0
 aesd $dat1,q11
 aesimc $dat1,$dat1
 aesd $dat2,q11
 aesimc $dat2,$dat2
 aesd $dat3,q11
 aesimc $dat3,$dat3
 aesd $dat4,q11
 aesimc $dat4,$dat4
1492
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 aesd $dat3,q12
 aesimc $dat3,$dat3
 aesd $dat4,q12
 aesimc $dat4,$dat4
1503
 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 aesd $dat3,q13
 aesimc $dat3,$dat3
 aesd $dat4,q13
 aesimc $dat4,$dat4
1514
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 aesd $dat3,q14
 aesimc $dat3,$dat3
 aesd $dat4,q14
 aesimc $dat4,$dat4
1525
 veor $tmp0,$ivec,$rndlast
 aesd $dat0,q15
 veor $tmp1,$in0,$rndlast
 vld1.8 {$in0},[$inp],#16
 aesd $dat1,q15
 veor $tmp2,$in1,$rndlast
 vld1.8 {$in1},[$inp],#16
 aesd $dat2,q15
 veor $tmp3,$in2,$rndlast
 vld1.8 {$in2},[$inp],#16
 aesd $dat3,q15
 veor $tmp4,$in3,$rndlast
 vld1.8 {$in3},[$inp],#16
 aesd $dat4,q15
 vorr $ivec,$in4,$in4
 vld1.8 {$in4},[$inp],#16
 cbz x6,.Lcbc_tail4x
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 veor $tmp0,$tmp0,$dat0
 vorr $dat0,$in0,$in0
 veor $tmp1,$tmp1,$dat1
 vorr $dat1,$in1,$in1
 veor $tmp2,$tmp2,$dat2
 vorr $dat2,$in2,$in2
 veor $tmp3,$tmp3,$dat3
 vorr $dat3,$in3,$in3
 veor $tmp4,$tmp4,$dat4
 vst1.8 {$tmp0},[$out],#16
 vorr $dat4,$in4,$in4
 vst1.8 {$tmp1},[$out],#16
 mov $cnt,$rounds
 vst1.8 {$tmp2},[$out],#16
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16
 b.hs .Loop5x_cbc_dec
1562
 add $len,$len,#0x50
 cbz $len,.Lcbc_done
1565
 add $cnt,$rounds,#2
 subs $len,$len,#0x30
 vorr $dat0,$in2,$in2
 vorr $in0,$in2,$in2
 vorr $dat1,$in3,$in3
 vorr $in1,$in3,$in3
 vorr $dat2,$in4,$in4
 vorr $in2,$in4,$in4
 b.lo .Lcbc_dec_tail
1575
 b .Loop3x_cbc_dec
1577
.align 4
.Lcbc_tail4x:
 veor $tmp1,$tmp0,$dat1
 veor $tmp2,$tmp2,$dat2
 veor $tmp3,$tmp3,$dat3
 veor $tmp4,$tmp4,$dat4
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 vst1.8 {$tmp3},[$out],#16
 vst1.8 {$tmp4},[$out],#16
1588
 b .Lcbc_done
.align 4
___
# 3x interleaved decrypt loop (both flavours) plus the 1-/2-block tail
# (.Lcbc_dec_tail / .Lcbc_dec_one) and the common .Lcbc_done exit that
# writes the chaining value back to [$ivp].
$code.=<<___;
.Loop3x_cbc_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_cbc_dec
1610
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 veor $tmp0,$ivec,$rndlast
 subs $len,$len,#0x30
 veor $tmp1,$in0,$rndlast
 mov.lo x6,$len // x6, $cnt, is zero at this point
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 veor $tmp2,$in1,$rndlast
 add $inp,$inp,x6 // $inp is adjusted in such way that
 // at exit from the loop $dat1-$dat2
 // are loaded with last "words"
 vorr $ivec,$in2,$in2
 mov $key_,$key
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 vld1.8 {$in0},[$inp],#16
 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aesd $dat0,q15
 aesd $dat1,q15
 aesd $dat2,q15
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 add $cnt,$rounds,#2
 veor $tmp0,$tmp0,$dat0
 veor $tmp1,$tmp1,$dat1
 veor $dat2,$dat2,$tmp2
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp0},[$out],#16
 vorr $dat0,$in0,$in0
 vst1.8 {$tmp1},[$out],#16
 vorr $dat1,$in1,$in1
 vst1.8 {$dat2},[$out],#16
 vorr $dat2,$in2,$in2
 b.hs .Loop3x_cbc_dec
1670
 cmn $len,#0x30
 b.eq .Lcbc_done
 nop
1674
.Lcbc_dec_tail:
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Lcbc_dec_tail
1687
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 cmn $len,#0x20
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 veor $tmp1,$ivec,$rndlast
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 veor $tmp2,$in1,$rndlast
 aesd $dat1,q15
 aesd $dat2,q15
 b.eq .Lcbc_dec_one
 veor $tmp1,$tmp1,$dat1
 veor $tmp2,$tmp2,$dat2
 vorr $ivec,$in2,$in2
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 b .Lcbc_done
1721
.Lcbc_dec_one:
 veor $tmp1,$tmp1,$dat2
 vorr $ivec,$in2,$in2
 vst1.8 {$tmp1},[$out],#16
1726
.Lcbc_done:
 vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
}
# AArch32 epilogue: restore d8-d15 and r4-r8, return via pc.
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r8,pc}
___
# AArch64 epilogue: pop the frame record and return.
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
 ret
___
# Emit the .size directive for the symbol.
$code.=<<___;
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
1744{{{
# AES-CTR entry point: ${prefix}_ctr32_encrypt_blocks(inp, out, len, key, ivp).
# x0-x4 carry the arguments; w8 holds the 32-bit big-endian counter taken
# from ivp[12..15]. $step aliases $tctr2 (both are x12/w12).
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2
1749
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1752
# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
1755
my ($dat,$tmp)=($dat0,$tmp0);
1757
### q8-q15 preloaded key schedule
1759
# Emit the function label (shared by both flavours).
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
___
# AArch64 prologue: save frame pointer and link register.
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
# AArch32 prologue: save callee-saved registers and d8-d15, load the
# 5th argument from the caller's stack.
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r10,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldr r4, [ip] @ load remaining arg
___
# Common setup: load counter block and key schedule, byte-swap the
# 32-bit counter word on little-endian.
$code.=<<___;
 ldr $rounds,[$key,#240]
1778
 ldr $ctr, [$ivp, #12]
#ifdef __ARMEB__
 vld1.8 {$dat0},[$ivp]
#else
 vld1.32 {$dat0},[$ivp]
#endif
 vld1.32 {q8-q9},[$key] // load key schedule...
 sub $rounds,$rounds,#4
 mov $step,#16
 cmp $len,#2
 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]
 add $key_,$key,#32
 mov $cnt,$rounds
 cclr $step,lo
#ifndef __ARMEB__
 rev $ctr, $ctr
#endif
___
# 64-bit counter-block setup: prepare ctr, ctr+1, ctr+2 in $dat0-$dat2.
$code.=<<___ if ($flavour =~ /64/);
 vorr $dat1,$dat0,$dat0
 add $tctr1, $ctr, #1
 vorr $dat2,$dat0,$dat0
 add $ctr, $ctr, #2
 vorr $ivec,$dat0,$dat0
 rev $tctr1, $tctr1
 vmov.32 ${dat1}[3],$tctr1
 b.ls .Lctr32_tail
 rev $tctr2, $ctr
 sub $len,$len,#3 // bias
 vmov.32 ${dat2}[3],$tctr2
___
# 32-bit counter-block setup (fewer registers; counters staged via $ivec).
$code.=<<___ if ($flavour !~ /64/);
 add $tctr1, $ctr, #1
 vorr $ivec,$dat0,$dat0
 rev $tctr1, $tctr1
 vmov.32 ${ivec}[3],$tctr1
 add $ctr, $ctr, #2
 vorr $dat1,$ivec,$ivec
 b.ls .Lctr32_tail
 rev $tctr2, $ctr
 vmov.32 ${ivec}[3],$tctr2
 sub $len,$len,#3 // bias
 vorr $dat2,$ivec,$ivec
___
# 64-bit only: 5x interleaved CTR loop (.Loop5x_ctr32) for >= 5 blocks.
$code.=<<___ if ($flavour =~ /64/);
 cmp $len,#32
 b.lo .Loop3x_ctr32
1830
 add w13,$ctr,#1
 add w14,$ctr,#2
 vorr $dat3,$dat0,$dat0
 rev w13,w13
 vorr $dat4,$dat0,$dat0
 rev w14,w14
 vmov.32 ${dat3}[3],w13
 sub $len,$len,#2 // bias
 vmov.32 ${dat4}[3],w14
 add $ctr,$ctr,#2
 b .Loop5x_ctr32
1842
.align 4
.Loop5x_ctr32:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat3,q8
 aesmc $dat3,$dat3
 aese $dat4,q8
 aesmc $dat4,$dat4
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat3,q9
 aesmc $dat3,$dat3
 aese $dat4,q9
 aesmc $dat4,$dat4
 vld1.32 {q9},[$key_],#16
 b.gt .Loop5x_ctr32
1869
 mov $key_,$key
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 aese $dat3,q8
 aesmc $dat3,$dat3
 aese $dat4,q8
 aesmc $dat4,$dat4
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1882
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 aese $dat3,q9
 aesmc $dat3,$dat3
 aese $dat4,q9
 aesmc $dat4,$dat4
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1894
 aese $dat0,q12
 aesmc $dat0,$dat0
 add $tctr0,$ctr,#1
 add $tctr1,$ctr,#2
 aese $dat1,q12
 aesmc $dat1,$dat1
 add $tctr2,$ctr,#3
 add w13,$ctr,#4
 aese $dat2,q12
 aesmc $dat2,$dat2
 add w14,$ctr,#5
 rev $tctr0,$tctr0
 aese $dat3,q12
 aesmc $dat3,$dat3
 rev $tctr1,$tctr1
 rev $tctr2,$tctr2
 aese $dat4,q12
 aesmc $dat4,$dat4
 rev w13,w13
 rev w14,w14
1915
 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 aese $dat2,q13
 aesmc $dat2,$dat2
 aese $dat3,q13
 aesmc $dat3,$dat3
 aese $dat4,q13
 aesmc $dat4,$dat4
1926
 aese $dat0,q14
 aesmc $dat0,$dat0
 vld1.8 {$in0},[$inp],#16
 aese $dat1,q14
 aesmc $dat1,$dat1
 vld1.8 {$in1},[$inp],#16
 aese $dat2,q14
 aesmc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aese $dat3,q14
 aesmc $dat3,$dat3
 vld1.8 {$in3},[$inp],#16
 aese $dat4,q14
 aesmc $dat4,$dat4
 vld1.8 {$in4},[$inp],#16
1942
 aese $dat0,q15
 veor $in0,$in0,$rndlast
 aese $dat1,q15
 veor $in1,$in1,$rndlast
 aese $dat2,q15
 veor $in2,$in2,$rndlast
 aese $dat3,q15
 veor $in3,$in3,$rndlast
 aese $dat4,q15
 veor $in4,$in4,$rndlast
1953
 veor $in0,$in0,$dat0
 vorr $dat0,$ivec,$ivec
 veor $in1,$in1,$dat1
 vorr $dat1,$ivec,$ivec
 veor $in2,$in2,$dat2
 vorr $dat2,$ivec,$ivec
 veor $in3,$in3,$dat3
 vorr $dat3,$ivec,$ivec
 veor $in4,$in4,$dat4
 vorr $dat4,$ivec,$ivec
1964
 vst1.8 {$in0},[$out],#16
 vmov.32 ${dat0}[3],$tctr0
 vst1.8 {$in1},[$out],#16
 vmov.32 ${dat1}[3],$tctr1
 vst1.8 {$in2},[$out],#16
 vmov.32 ${dat2}[3],$tctr2
 vst1.8 {$in3},[$out],#16
 vmov.32 ${dat3}[3],w13
 vst1.8 {$in4},[$out],#16
 vmov.32 ${dat4}[3],w14
1975
 mov $cnt,$rounds
 cbz $len,.Lctr32_done
1978
 add $ctr,$ctr,#5
 subs $len,$len,#5
 b.hs .Loop5x_ctr32
1982
 add $len,$len,#5
 sub $ctr,$ctr,#5
1985
 cmp $len,#2
 mov $step,#16
 cclr $step,lo
 b.ls .Lctr32_tail
1990
 sub $len,$len,#3 // bias
 add $ctr,$ctr,#3
___
# 3x interleaved CTR loop (both flavours) with flavour-conditional chunks
# interleaved below, followed by the 1-/2-block tail (.Lctr32_tail).
$code.=<<___;
 b .Loop3x_ctr32
1996
.align 4
.Loop3x_ctr32:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_ctr32
2015
 aese $dat0,q8
 aesmc $tmp0,$dat0
 aese $dat1,q8
 aesmc $tmp1,$dat1
 vld1.8 {$in0},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
 vorr $dat0,$ivec,$ivec
___
$code.=<<___ if ($flavour !~ /64/);
 add $tctr0,$ctr,#1
___
$code.=<<___;
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
 vorr $dat1,$ivec,$ivec
___
$code.=<<___ if ($flavour !~ /64/);
 rev $tctr0,$tctr0
___
$code.=<<___;
 aese $tmp0,q9
 aesmc $tmp0,$tmp0
 aese $tmp1,q9
 aesmc $tmp1,$tmp1
 vld1.8 {$in2},[$inp],#16
 mov $key_,$key
 aese $dat2,q9
 aesmc $tmp2,$dat2
___
$code.=<<___ if ($flavour =~ /64/);
 vorr $dat2,$ivec,$ivec
 add $tctr0,$ctr,#1
___
$code.=<<___;
 aese $tmp0,q12
 aesmc $tmp0,$tmp0
 aese $tmp1,q12
 aesmc $tmp1,$tmp1
 veor $in0,$in0,$rndlast
 add $tctr1,$ctr,#2
 aese $tmp2,q12
 aesmc $tmp2,$tmp2
 veor $in1,$in1,$rndlast
 add $ctr,$ctr,#3
 aese $tmp0,q13
 aesmc $tmp0,$tmp0
 aese $tmp1,q13
 aesmc $tmp1,$tmp1
 veor $in2,$in2,$rndlast
___
$code.=<<___ if ($flavour =~ /64/);
 rev $tctr0,$tctr0
 aese $tmp2,q13
 aesmc $tmp2,$tmp2
 vmov.32 ${dat0}[3], $tctr0
___
$code.=<<___ if ($flavour !~ /64/);
 vmov.32 ${ivec}[3], $tctr0
 aese $tmp2,q13
 aesmc $tmp2,$tmp2
 vorr $dat0,$ivec,$ivec
___
$code.=<<___;
 rev $tctr1,$tctr1
 aese $tmp0,q14
 aesmc $tmp0,$tmp0
___
$code.=<<___ if ($flavour !~ /64/);
 vmov.32 ${ivec}[3], $tctr1
 rev $tctr2,$ctr
___
$code.=<<___;
 aese $tmp1,q14
 aesmc $tmp1,$tmp1
___
$code.=<<___ if ($flavour =~ /64/);
 vmov.32 ${dat1}[3], $tctr1
 rev $tctr2,$ctr
 aese $tmp2,q14
 aesmc $tmp2,$tmp2
 vmov.32 ${dat2}[3], $tctr2
___
$code.=<<___ if ($flavour !~ /64/);
 vorr $dat1,$ivec,$ivec
 vmov.32 ${ivec}[3], $tctr2
 aese $tmp2,q14
 aesmc $tmp2,$tmp2
 vorr $dat2,$ivec,$ivec
___
$code.=<<___;
 subs $len,$len,#3
 aese $tmp0,q15
 aese $tmp1,q15
 aese $tmp2,q15
2114
 veor $in0,$in0,$tmp0
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 vst1.8 {$in0},[$out],#16
 veor $in1,$in1,$tmp1
 mov $cnt,$rounds
 vst1.8 {$in1},[$out],#16
 veor $in2,$in2,$tmp2
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$in2},[$out],#16
 b.hs .Loop3x_ctr32
2125
 adds $len,$len,#3
 b.eq .Lctr32_done
 cmp $len,#1
 mov $step,#16
 cclr $step,eq
2131
.Lctr32_tail:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 vld1.32 {q9},[$key_],#16
 b.gt .Lctr32_tail
2144
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 vld1.8 {$in0},[$inp],$step
 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat1,q12
 aesmc $dat1,$dat1
 vld1.8 {$in1},[$inp]
 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 veor $in0,$in0,$rndlast
 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat1,q14
 aesmc $dat1,$dat1
 veor $in1,$in1,$rndlast
 aese $dat0,q15
 aese $dat1,q15
2172
 cmp $len,#1
 veor $in0,$in0,$dat0
 veor $in1,$in1,$dat1
 vst1.8 {$in0},[$out],#16
 b.eq .Lctr32_done
 vst1.8 {$in1},[$out]
2179
.Lctr32_done:
___
# AArch32 epilogue: restore d8-d15 and r4-r10, return via pc.
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r10,pc}
___
# AArch64 epilogue: pop the frame record and return.
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
 ret
___
# Emit the .size directive for the symbol.
$code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
2194# Performance in cycles per byte.
2195# Measured for AES-XTS with different key sizes.
2196# The values before and after optimization are shown below:
2197# (before/after):
2198#
2199# AES-128-XTS AES-256-XTS
2200# Cortex-A57 3.36/1.09 4.02/1.37
2201# Cortex-A72 3.03/1.02 3.28/1.33
2202
2203# Optimization is implemented by loop unrolling and interleaving.
2204# Commonly, we choose the unrolling factor as 5, if the input
2205# data size smaller than 5 blocks, but not smaller than 3 blocks,
2206# choose 3 as the unrolling factor.
2207# If the input data size dsize >= 5*16 bytes, then take 5 blocks
2208# as one iteration; on every loop the remaining size lsize -= 5*16.
2209# If lsize < 5*16 bytes, treat them as the tail. Note: the remaining
2210# 4*16 bytes will be processed specially, which is integrated into the
2211# 5*16 bytes loop to improve the efficiency.
2212# There is one special case, if the original input data size dsize
2213# = 16 bytes, we will treat it separately to improve the
2214# performance: one independent code block without LR, FP load and
2215# store.
2216# Encryption will process the (length - tailcnt) bytes as mentioned
2217# previously, then encrypt the composite block as the second-to-last
2218# cipher block.
2219# Decryption will process the (length - tailcnt - 1) bytes as mentioned
2220# previously, then decrypt the second-to-last cipher block to get the
2221# last plain block (tail), and decrypt the composite block as the
2222# second-to-last plain-text block.
2223
2224{{{
2225my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2226my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2227my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2228my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2229my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2230my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2231my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2232my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2233my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2234
2235my ($tmpin)=("v26.16b");
2236my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2237
2238# q7 last round key
2239# q10-q15, q7 Last 7 round keys
2240# q8-q9 preloaded round keys except last 7 keys for big size
2241# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
2242
2243
2244my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2245
2246my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2247my ($dat4,$in4,$tmp4);
2248if ($flavour =~ /64/) {
2249 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2250}
2251
2252$code.=<<___ if ($flavour =~ /64/);
2253.globl ${prefix}_xts_encrypt
2254.type ${prefix}_xts_encrypt,%function
2255.align 5
2256${prefix}_xts_encrypt:
2257___
2258$code.=<<___ if ($flavour =~ /64/);
2259 cmp $len,#16
2260 // Original input data size bigger than 16, jump to big size processing.
2261 b.ne .Lxts_enc_big_size
2262 // Encrypt the iv with key2, as the first XEX iv.
2263 ldr $rounds,[$key2,#240]
2264 vld1.8 {$dat},[$key2],#16
2265 vld1.8 {$iv0},[$ivp]
2266 sub $rounds,$rounds,#2
2267 vld1.8 {$dat1},[$key2],#16
2268
2269.Loop_enc_iv_enc:
2270 aese $iv0,$dat
2271 aesmc $iv0,$iv0
2272 vld1.32 {$dat},[$key2],#16
2273 subs $rounds,$rounds,#2
2274 aese $iv0,$dat1
2275 aesmc $iv0,$iv0
2276 vld1.32 {$dat1},[$key2],#16
2277 b.gt .Loop_enc_iv_enc
2278
2279 aese $iv0,$dat
2280 aesmc $iv0,$iv0
2281 vld1.32 {$dat},[$key2]
2282 aese $iv0,$dat1
2283 veor $iv0,$iv0,$dat
2284
2285 vld1.8 {$dat0},[$inp]
2286 veor $dat0,$iv0,$dat0
2287
2288 ldr $rounds,[$key1,#240]
2289 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2290
2291 aese $dat0,q20
2292 aesmc $dat0,$dat0
2293 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2294 aese $dat0,q21
2295 aesmc $dat0,$dat0
2296 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2297 b.eq .Lxts_128_enc
2298.Lxts_enc_round_loop:
2299 aese $dat0,q8
2300 aesmc $dat0,$dat0
2301 vld1.32 {q8},[$key1],#16 // load key schedule...
2302 aese $dat0,q9
2303 aesmc $dat0,$dat0
2304 vld1.32 {q9},[$key1],#16 // load key schedule...
2305 subs $rounds,$rounds,#2 // bias
2306 b.gt .Lxts_enc_round_loop
2307.Lxts_128_enc:
2308 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2309 aese $dat0,q8
2310 aesmc $dat0,$dat0
2311 aese $dat0,q9
2312 aesmc $dat0,$dat0
2313 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2314 aese $dat0,q10
2315 aesmc $dat0,$dat0
2316 aese $dat0,q11
2317 aesmc $dat0,$dat0
2318 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2319 aese $dat0,q12
2320 aesmc $dat0,$dat0
2321 aese $dat0,q13
2322 aesmc $dat0,$dat0
2323 vld1.32 {$rndlast},[$key1]
2324 aese $dat0,q14
2325 aesmc $dat0,$dat0
2326 aese $dat0,q15
2327 veor $dat0,$dat0,$rndlast
2328 veor $dat0,$dat0,$iv0
2329 vst1.8 {$dat0},[$out]
2330 b .Lxts_enc_final_abort
2331
2332.align 4
2333.Lxts_enc_big_size:
2334___
2335$code.=<<___ if ($flavour =~ /64/);
2336 stp $constnumx,$tmpinp,[sp,#-64]!
2337 stp $tailcnt,$midnumx,[sp,#48]
2338 stp $ivd10,$ivd20,[sp,#32]
2339 stp $ivd30,$ivd40,[sp,#16]
2340
2341 // tailcnt store the tail value of length%16.
2342 and $tailcnt,$len,#0xf
2343 and $len,$len,#-16
2344 subs $len,$len,#16
2345 mov $step,#16
2346 b.lo .Lxts_abort
2347 csel $step,xzr,$step,eq
2348
2349 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2350 ldr $rounds,[$key2,#240]
2351 vld1.32 {$dat},[$key2],#16
2352 vld1.8 {$iv0},[$ivp]
2353 sub $rounds,$rounds,#2
2354 vld1.32 {$dat1},[$key2],#16
2355
2356.Loop_iv_enc:
2357 aese $iv0,$dat
2358 aesmc $iv0,$iv0
2359 vld1.32 {$dat},[$key2],#16
2360 subs $rounds,$rounds,#2
2361 aese $iv0,$dat1
2362 aesmc $iv0,$iv0
2363 vld1.32 {$dat1},[$key2],#16
2364 b.gt .Loop_iv_enc
2365
2366 aese $iv0,$dat
2367 aesmc $iv0,$iv0
2368 vld1.32 {$dat},[$key2]
2369 aese $iv0,$dat1
2370 veor $iv0,$iv0,$dat
2371
2372 // The iv for second block
2373 // $ivl- iv(low), $ivh - iv(high)
2374 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2375 fmov $ivl,$ivd00
2376 fmov $ivh,$ivd01
2377 mov $constnum,#0x87
2378 extr $midnumx,$ivh,$ivh,#32
2379 extr $ivh,$ivh,$ivl,#63
2380 and $tmpmw,$constnum,$midnum,asr#31
2381 eor $ivl,$tmpmx,$ivl,lsl#1
2382 fmov $ivd10,$ivl
2383 fmov $ivd11,$ivh
2384
2385 ldr $rounds0,[$key1,#240] // next starting point
2386 vld1.8 {$dat},[$inp],$step
2387
2388 vld1.32 {q8-q9},[$key1] // load key schedule...
2389 sub $rounds0,$rounds0,#6
2390 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2391 sub $rounds0,$rounds0,#2
2392 vld1.32 {q10-q11},[$key_],#32
2393 vld1.32 {q12-q13},[$key_],#32
2394 vld1.32 {q14-q15},[$key_],#32
2395 vld1.32 {$rndlast},[$key_]
2396
2397 add $key_,$key1,#32
2398 mov $rounds,$rounds0
2399
2400 // Encryption
2401.Lxts_enc:
2402 vld1.8 {$dat2},[$inp],#16
2403 subs $len,$len,#32 // bias
2404 add $rounds,$rounds0,#2
2405 vorr $in1,$dat,$dat
2406 vorr $dat1,$dat,$dat
2407 vorr $in3,$dat,$dat
2408 vorr $in2,$dat2,$dat2
2409 vorr $in4,$dat2,$dat2
2410 b.lo .Lxts_inner_enc_tail
2411 veor $dat,$dat,$iv0 // before encryption, xor with iv
2412 veor $dat2,$dat2,$iv1
2413
2414 // The iv for third block
2415 extr $midnumx,$ivh,$ivh,#32
2416 extr $ivh,$ivh,$ivl,#63
2417 and $tmpmw,$constnum,$midnum,asr#31
2418 eor $ivl,$tmpmx,$ivl,lsl#1
2419 fmov $ivd20,$ivl
2420 fmov $ivd21,$ivh
2421
2422
2423 vorr $dat1,$dat2,$dat2
2424 vld1.8 {$dat2},[$inp],#16
2425 vorr $in0,$dat,$dat
2426 vorr $in1,$dat1,$dat1
2427 veor $in2,$dat2,$iv2 // the third block
2428 veor $dat2,$dat2,$iv2
2429 cmp $len,#32
2430 b.lo .Lxts_outer_enc_tail
2431
2432 // The iv for fourth block
2433 extr $midnumx,$ivh,$ivh,#32
2434 extr $ivh,$ivh,$ivl,#63
2435 and $tmpmw,$constnum,$midnum,asr#31
2436 eor $ivl,$tmpmx,$ivl,lsl#1
2437 fmov $ivd30,$ivl
2438 fmov $ivd31,$ivh
2439
2440 vld1.8 {$dat3},[$inp],#16
2441 // The iv for fifth block
2442 extr $midnumx,$ivh,$ivh,#32
2443 extr $ivh,$ivh,$ivl,#63
2444 and $tmpmw,$constnum,$midnum,asr#31
2445 eor $ivl,$tmpmx,$ivl,lsl#1
2446 fmov $ivd40,$ivl
2447 fmov $ivd41,$ivh
2448
2449 vld1.8 {$dat4},[$inp],#16
2450 veor $dat3,$dat3,$iv3 // the fourth block
2451 veor $dat4,$dat4,$iv4
2452 sub $len,$len,#32 // bias
2453 mov $rounds,$rounds0
2454 b .Loop5x_xts_enc
2455
2456.align 4
2457.Loop5x_xts_enc:
2458 aese $dat0,q8
2459 aesmc $dat0,$dat0
2460 aese $dat1,q8
2461 aesmc $dat1,$dat1
2462 aese $dat2,q8
2463 aesmc $dat2,$dat2
2464 aese $dat3,q8
2465 aesmc $dat3,$dat3
2466 aese $dat4,q8
2467 aesmc $dat4,$dat4
2468 vld1.32 {q8},[$key_],#16
2469 subs $rounds,$rounds,#2
2470 aese $dat0,q9
2471 aesmc $dat0,$dat0
2472 aese $dat1,q9
2473 aesmc $dat1,$dat1
2474 aese $dat2,q9
2475 aesmc $dat2,$dat2
2476 aese $dat3,q9
2477 aesmc $dat3,$dat3
2478 aese $dat4,q9
2479 aesmc $dat4,$dat4
2480 vld1.32 {q9},[$key_],#16
2481 b.gt .Loop5x_xts_enc
2482
2483 aese $dat0,q8
2484 aesmc $dat0,$dat0
2485 aese $dat1,q8
2486 aesmc $dat1,$dat1
2487 aese $dat2,q8
2488 aesmc $dat2,$dat2
2489 aese $dat3,q8
2490 aesmc $dat3,$dat3
2491 aese $dat4,q8
2492 aesmc $dat4,$dat4
2493 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2494
2495 aese $dat0,q9
2496 aesmc $dat0,$dat0
2497 aese $dat1,q9
2498 aesmc $dat1,$dat1
2499 aese $dat2,q9
2500 aesmc $dat2,$dat2
2501 aese $dat3,q9
2502 aesmc $dat3,$dat3
2503 aese $dat4,q9
2504 aesmc $dat4,$dat4
2505 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2506 mov $key_,$key1
2507
2508 aese $dat0,q10
2509 aesmc $dat0,$dat0
2510 aese $dat1,q10
2511 aesmc $dat1,$dat1
2512 aese $dat2,q10
2513 aesmc $dat2,$dat2
2514 aese $dat3,q10
2515 aesmc $dat3,$dat3
2516 aese $dat4,q10
2517 aesmc $dat4,$dat4
2518 add $inp,$inp,$xoffset // x0 is adjusted in such way that
2519 // at exit from the loop v1.16b-v26.16b
2520 // are loaded with last "words"
2521 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
2522
2523 aese $dat0,q11
2524 aesmc $dat0,$dat0
2525 aese $dat1,q11
2526 aesmc $dat1,$dat1
2527 aese $dat2,q11
2528 aesmc $dat2,$dat2
2529 aese $dat3,q11
2530 aesmc $dat3,$dat3
2531 aese $dat4,q11
2532 aesmc $dat4,$dat4
2533
2534 aese $dat0,q12
2535 aesmc $dat0,$dat0
2536 aese $dat1,q12
2537 aesmc $dat1,$dat1
2538 aese $dat2,q12
2539 aesmc $dat2,$dat2
2540 aese $dat3,q12
2541 aesmc $dat3,$dat3
2542 aese $dat4,q12
2543 aesmc $dat4,$dat4
2544
2545 aese $dat0,q13
2546 aesmc $dat0,$dat0
2547 aese $dat1,q13
2548 aesmc $dat1,$dat1
2549 aese $dat2,q13
2550 aesmc $dat2,$dat2
2551 aese $dat3,q13
2552 aesmc $dat3,$dat3
2553 aese $dat4,q13
2554 aesmc $dat4,$dat4
2555
2556 aese $dat0,q14
2557 aesmc $dat0,$dat0
2558 aese $dat1,q14
2559 aesmc $dat1,$dat1
2560 aese $dat2,q14
2561 aesmc $dat2,$dat2
2562 aese $dat3,q14
2563 aesmc $dat3,$dat3
2564 aese $dat4,q14
2565 aesmc $dat4,$dat4
2566
2567 veor $tmp0,$rndlast,$iv0
2568 aese $dat0,q15
2569 // The iv for first block of one iteration
2570 extr $midnumx,$ivh,$ivh,#32
2571 extr $ivh,$ivh,$ivl,#63
2572 and $tmpmw,$constnum,$midnum,asr#31
2573 eor $ivl,$tmpmx,$ivl,lsl#1
2574 fmov $ivd00,$ivl
2575 fmov $ivd01,$ivh
2576 veor $tmp1,$rndlast,$iv1
2577 vld1.8 {$in0},[$inp],#16
2578 aese $dat1,q15
2579 // The iv for second block
2580 extr $midnumx,$ivh,$ivh,#32
2581 extr $ivh,$ivh,$ivl,#63
2582 and $tmpmw,$constnum,$midnum,asr#31
2583 eor $ivl,$tmpmx,$ivl,lsl#1
2584 fmov $ivd10,$ivl
2585 fmov $ivd11,$ivh
2586 veor $tmp2,$rndlast,$iv2
2587 vld1.8 {$in1},[$inp],#16
2588 aese $dat2,q15
2589 // The iv for third block
2590 extr $midnumx,$ivh,$ivh,#32
2591 extr $ivh,$ivh,$ivl,#63
2592 and $tmpmw,$constnum,$midnum,asr#31
2593 eor $ivl,$tmpmx,$ivl,lsl#1
2594 fmov $ivd20,$ivl
2595 fmov $ivd21,$ivh
2596 veor $tmp3,$rndlast,$iv3
2597 vld1.8 {$in2},[$inp],#16
2598 aese $dat3,q15
2599 // The iv for fourth block
2600 extr $midnumx,$ivh,$ivh,#32
2601 extr $ivh,$ivh,$ivl,#63
2602 and $tmpmw,$constnum,$midnum,asr#31
2603 eor $ivl,$tmpmx,$ivl,lsl#1
2604 fmov $ivd30,$ivl
2605 fmov $ivd31,$ivh
2606 veor $tmp4,$rndlast,$iv4
2607 vld1.8 {$in3},[$inp],#16
2608 aese $dat4,q15
2609
2610 // The iv for fifth block
2611 extr $midnumx,$ivh,$ivh,#32
2612 extr $ivh,$ivh,$ivl,#63
2613 and $tmpmw,$constnum,$midnum,asr #31
2614 eor $ivl,$tmpmx,$ivl,lsl #1
2615 fmov $ivd40,$ivl
2616 fmov $ivd41,$ivh
2617
2618 vld1.8 {$in4},[$inp],#16
2619 cbz $xoffset,.Lxts_enc_tail4x
2620 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2621 veor $tmp0,$tmp0,$dat0
2622 veor $dat0,$in0,$iv0
2623 veor $tmp1,$tmp1,$dat1
2624 veor $dat1,$in1,$iv1
2625 veor $tmp2,$tmp2,$dat2
2626 veor $dat2,$in2,$iv2
2627 veor $tmp3,$tmp3,$dat3
2628 veor $dat3,$in3,$iv3
2629 veor $tmp4,$tmp4,$dat4
2630 vst1.8 {$tmp0},[$out],#16
2631 veor $dat4,$in4,$iv4
2632 vst1.8 {$tmp1},[$out],#16
2633 mov $rounds,$rounds0
2634 vst1.8 {$tmp2},[$out],#16
2635 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2636 vst1.8 {$tmp3},[$out],#16
2637 vst1.8 {$tmp4},[$out],#16
2638 b.hs .Loop5x_xts_enc
2639
2640
2641 // If left 4 blocks, borrow the five block's processing.
2642 cmn $len,#0x10
2643 b.ne .Loop5x_enc_after
2644 vorr $iv4,$iv3,$iv3
2645 vorr $iv3,$iv2,$iv2
2646 vorr $iv2,$iv1,$iv1
2647 vorr $iv1,$iv0,$iv0
2648 fmov $ivl,$ivd40
2649 fmov $ivh,$ivd41
2650 veor $dat0,$iv0,$in0
2651 veor $dat1,$iv1,$in1
2652 veor $dat2,$in2,$iv2
2653 veor $dat3,$in3,$iv3
2654 veor $dat4,$in4,$iv4
2655 b.eq .Loop5x_xts_enc
2656
2657.Loop5x_enc_after:
2658 add $len,$len,#0x50
2659 cbz $len,.Lxts_enc_done
2660
2661 add $rounds,$rounds0,#2
2662 subs $len,$len,#0x30
2663 b.lo .Lxts_inner_enc_tail
2664
2665 veor $dat0,$iv0,$in2
2666 veor $dat1,$iv1,$in3
2667 veor $dat2,$in4,$iv2
2668 b .Lxts_outer_enc_tail
2669
2670.align 4
2671.Lxts_enc_tail4x:
2672 add $inp,$inp,#16
2673 veor $tmp1,$dat1,$tmp1
2674 vst1.8 {$tmp1},[$out],#16
2675 veor $tmp2,$dat2,$tmp2
2676 vst1.8 {$tmp2},[$out],#16
2677 veor $tmp3,$dat3,$tmp3
2678 veor $tmp4,$dat4,$tmp4
2679 vst1.8 {$tmp3-$tmp4},[$out],#32
2680
2681 b .Lxts_enc_done
2682.align 4
2683.Lxts_outer_enc_tail:
2684 aese $dat0,q8
2685 aesmc $dat0,$dat0
2686 aese $dat1,q8
2687 aesmc $dat1,$dat1
2688 aese $dat2,q8
2689 aesmc $dat2,$dat2
2690 vld1.32 {q8},[$key_],#16
2691 subs $rounds,$rounds,#2
2692 aese $dat0,q9
2693 aesmc $dat0,$dat0
2694 aese $dat1,q9
2695 aesmc $dat1,$dat1
2696 aese $dat2,q9
2697 aesmc $dat2,$dat2
2698 vld1.32 {q9},[$key_],#16
2699 b.gt .Lxts_outer_enc_tail
2700
2701 aese $dat0,q8
2702 aesmc $dat0,$dat0
2703 aese $dat1,q8
2704 aesmc $dat1,$dat1
2705 aese $dat2,q8
2706 aesmc $dat2,$dat2
2707 veor $tmp0,$iv0,$rndlast
2708 subs $len,$len,#0x30
2709 // The iv for first block
2710 fmov $ivl,$ivd20
2711 fmov $ivh,$ivd21
2712 //mov $constnum,#0x87
2713 extr $midnumx,$ivh,$ivh,#32
2714 extr $ivh,$ivh,$ivl,#63
2715 and $tmpmw,$constnum,$midnum,asr#31
2716 eor $ivl,$tmpmx,$ivl,lsl#1
2717 fmov $ivd00,$ivl
2718 fmov $ivd01,$ivh
2719 veor $tmp1,$iv1,$rndlast
2720 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
2721 aese $dat0,q9
2722 aesmc $dat0,$dat0
2723 aese $dat1,q9
2724 aesmc $dat1,$dat1
2725 aese $dat2,q9
2726 aesmc $dat2,$dat2
2727 veor $tmp2,$iv2,$rndlast
2728
2729 add $xoffset,$xoffset,#0x20
2730 add $inp,$inp,$xoffset
2731 mov $key_,$key1
2732
2733 aese $dat0,q12
2734 aesmc $dat0,$dat0
2735 aese $dat1,q12
2736 aesmc $dat1,$dat1
2737 aese $dat2,q12
2738 aesmc $dat2,$dat2
2739 aese $dat0,q13
2740 aesmc $dat0,$dat0
2741 aese $dat1,q13
2742 aesmc $dat1,$dat1
2743 aese $dat2,q13
2744 aesmc $dat2,$dat2
2745 aese $dat0,q14
2746 aesmc $dat0,$dat0
2747 aese $dat1,q14
2748 aesmc $dat1,$dat1
2749 aese $dat2,q14
2750 aesmc $dat2,$dat2
2751 aese $dat0,q15
2752 aese $dat1,q15
2753 aese $dat2,q15
2754 vld1.8 {$in2},[$inp],#16
2755 add $rounds,$rounds0,#2
2756 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2757 veor $tmp0,$tmp0,$dat0
2758 veor $tmp1,$tmp1,$dat1
2759 veor $dat2,$dat2,$tmp2
2760 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2761 vst1.8 {$tmp0},[$out],#16
2762 vst1.8 {$tmp1},[$out],#16
2763 vst1.8 {$dat2},[$out],#16
2764 cmn $len,#0x30
2765 b.eq .Lxts_enc_done
2766.Lxts_encxor_one:
2767 vorr $in3,$in1,$in1
2768 vorr $in4,$in2,$in2
2769 nop
2770
2771.Lxts_inner_enc_tail:
2772 cmn $len,#0x10
2773 veor $dat1,$in3,$iv0
2774 veor $dat2,$in4,$iv1
2775 b.eq .Lxts_enc_tail_loop
2776 veor $dat2,$in4,$iv0
2777.Lxts_enc_tail_loop:
2778 aese $dat1,q8
2779 aesmc $dat1,$dat1
2780 aese $dat2,q8
2781 aesmc $dat2,$dat2
2782 vld1.32 {q8},[$key_],#16
2783 subs $rounds,$rounds,#2
2784 aese $dat1,q9
2785 aesmc $dat1,$dat1
2786 aese $dat2,q9
2787 aesmc $dat2,$dat2
2788 vld1.32 {q9},[$key_],#16
2789 b.gt .Lxts_enc_tail_loop
2790
2791 aese $dat1,q8
2792 aesmc $dat1,$dat1
2793 aese $dat2,q8
2794 aesmc $dat2,$dat2
2795 aese $dat1,q9
2796 aesmc $dat1,$dat1
2797 aese $dat2,q9
2798 aesmc $dat2,$dat2
2799 aese $dat1,q12
2800 aesmc $dat1,$dat1
2801 aese $dat2,q12
2802 aesmc $dat2,$dat2
2803 cmn $len,#0x20
2804 aese $dat1,q13
2805 aesmc $dat1,$dat1
2806 aese $dat2,q13
2807 aesmc $dat2,$dat2
2808 veor $tmp1,$iv0,$rndlast
2809 aese $dat1,q14
2810 aesmc $dat1,$dat1
2811 aese $dat2,q14
2812 aesmc $dat2,$dat2
2813 veor $tmp2,$iv1,$rndlast
2814 aese $dat1,q15
2815 aese $dat2,q15
2816 b.eq .Lxts_enc_one
2817 veor $tmp1,$tmp1,$dat1
2818 vst1.8 {$tmp1},[$out],#16
2819 veor $tmp2,$tmp2,$dat2
2820 vorr $iv0,$iv1,$iv1
2821 vst1.8 {$tmp2},[$out],#16
2822 fmov $ivl,$ivd10
2823 fmov $ivh,$ivd11
2824 mov $constnum,#0x87
2825 extr $midnumx,$ivh,$ivh,#32
2826 extr $ivh,$ivh,$ivl,#63
2827 and $tmpmw,$constnum,$midnum,asr #31
2828 eor $ivl,$tmpmx,$ivl,lsl #1
2829 fmov $ivd00,$ivl
2830 fmov $ivd01,$ivh
2831 b .Lxts_enc_done
2832
2833.Lxts_enc_one:
2834 veor $tmp1,$tmp1,$dat2
2835 vorr $iv0,$iv0,$iv0
2836 vst1.8 {$tmp1},[$out],#16
2837 fmov $ivl,$ivd00
2838 fmov $ivh,$ivd01
2839 mov $constnum,#0x87
2840 extr $midnumx,$ivh,$ivh,#32
2841 extr $ivh,$ivh,$ivl,#63
2842 and $tmpmw,$constnum,$midnum,asr #31
2843 eor $ivl,$tmpmx,$ivl,lsl #1
2844 fmov $ivd00,$ivl
2845 fmov $ivd01,$ivh
2846 b .Lxts_enc_done
2847.align 5
2848.Lxts_enc_done:
2849 // Process the tail block with cipher stealing.
2850 tst $tailcnt,#0xf
2851 b.eq .Lxts_abort
2852
2853 mov $tmpinp,$inp
2854 mov $tmpoutp,$out
2855 sub $out,$out,#16
2856.composite_enc_loop:
2857 subs $tailcnt,$tailcnt,#1
2858 ldrb $l2outp,[$out,$tailcnt]
2859 ldrb $loutp,[$tmpinp,$tailcnt]
2860 strb $l2outp,[$tmpoutp,$tailcnt]
2861 strb $loutp,[$out,$tailcnt]
2862 b.gt .composite_enc_loop
2863.Lxts_enc_load_done:
2864 vld1.8 {$tmpin},[$out]
2865 veor $tmpin,$tmpin,$iv0
2866
2867 // Encrypt the composite block to get the last second encrypted text block
2868 ldr $rounds,[$key1,#240] // load key schedule...
2869 vld1.8 {$dat},[$key1],#16
2870 sub $rounds,$rounds,#2
2871 vld1.8 {$dat1},[$key1],#16 // load key schedule...
2872.Loop_final_enc:
2873 aese $tmpin,$dat0
2874 aesmc $tmpin,$tmpin
2875 vld1.32 {$dat0},[$key1],#16
2876 subs $rounds,$rounds,#2
2877 aese $tmpin,$dat1
2878 aesmc $tmpin,$tmpin
2879 vld1.32 {$dat1},[$key1],#16
2880 b.gt .Loop_final_enc
2881
2882 aese $tmpin,$dat0
2883 aesmc $tmpin,$tmpin
2884 vld1.32 {$dat0},[$key1]
2885 aese $tmpin,$dat1
2886 veor $tmpin,$tmpin,$dat0
2887 veor $tmpin,$tmpin,$iv0
2888 vst1.8 {$tmpin},[$out]
2889
2890.Lxts_abort:
2891 ldp $tailcnt,$midnumx,[sp,#48]
2892 ldp $ivd10,$ivd20,[sp,#32]
2893 ldp $ivd30,$ivd40,[sp,#16]
2894 ldp $constnumx,$tmpinp,[sp],#64
2895.Lxts_enc_final_abort:
2896 ret
2897.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
2898___
2899
2900}}}
2901{{{
# Register/operand name map for the XTS-decrypt code path below.
# (Restored: the scraped copy had Trac line numbers fused onto every line.)
# General-purpose arguments follow the AArch64 calling convention (x0-x5).
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
# NEON data registers: working blocks, inputs, scratch and last round key.
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# The five tweak values plus one scratch vector, and the 64-bit lane views
# (dN / vN.d[1]) used to update the tweaks with integer instructions.
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);	# aliases for the 1-block path
2913
# Round-key register usage for the decrypt path:
# q7		last round key
# q10-q15, q7	last 7 round keys
# q8-q9		preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte

{
# Extra working registers for the multi-block path; the 32-bit defaults
# are remapped to q16-q23 when generating 64-bit code, where more NEON
# registers are available.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);		# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
2927
# ${prefix}_xts_decrypt(const uchar *inp, uchar *out, size_t len,
#                       const AES_KEY *key1, const AES_KEY *key2,
#                       const uchar iv[16]) — 64-bit builds only.
# Prologue, followed by a dedicated fast path for exactly one block
# (len == 16): compute the XEX tweak with key2, then decrypt with key1.
$code.=<<___ if ($flavour =~ /64/);
.globl	${prefix}_xts_decrypt
.type	${prefix}_xts_decrypt,%function
.align	5
${prefix}_xts_decrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.8	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.8	{$dat1},[$key2],#16

.Loop_dec_small_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aesd	$dat0,q20
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aesd	$dat0,q21
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// bias
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$iv0,$dat0
	vst1.8	{$dat0},[$out]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
___
3009$code.=<<___ if ($flavour =~ /64/);
3010 stp $constnumx,$tmpinp,[sp,#-64]!
3011 stp $tailcnt,$midnumx,[sp,#48]
3012 stp $ivd10,$ivd20,[sp,#32]
3013 stp $ivd30,$ivd40,[sp,#16]
3014
3015 and $tailcnt,$len,#0xf
3016 and $len,$len,#-16
3017 subs $len,$len,#16
3018 mov $step,#16
3019 b.lo .Lxts_dec_abort
3020
3021 // Encrypt the iv with key2, as the first XEX iv
3022 ldr $rounds,[$key2,#240]
3023 vld1.8 {$dat},[$key2],#16
3024 vld1.8 {$iv0},[$ivp]
3025 sub $rounds,$rounds,#2
3026 vld1.8 {$dat1},[$key2],#16
3027
3028.Loop_dec_iv_enc:
3029 aese $iv0,$dat
3030 aesmc $iv0,$iv0
3031 vld1.32 {$dat},[$key2],#16
3032 subs $rounds,$rounds,#2
3033 aese $iv0,$dat1
3034 aesmc $iv0,$iv0
3035 vld1.32 {$dat1},[$key2],#16
3036 b.gt .Loop_dec_iv_enc
3037
3038 aese $iv0,$dat
3039 aesmc $iv0,$iv0
3040 vld1.32 {$dat},[$key2]
3041 aese $iv0,$dat1
3042 veor $iv0,$iv0,$dat
3043
3044 // The iv for second block
3045 // $ivl- iv(low), $ivh - iv(high)
3046 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
3047 fmov $ivl,$ivd00
3048 fmov $ivh,$ivd01
3049 mov $constnum,#0x87
3050 extr $midnumx,$ivh,$ivh,#32
3051 extr $ivh,$ivh,$ivl,#63
3052 and $tmpmw,$constnum,$midnum,asr #31
3053 eor $ivl,$tmpmx,$ivl,lsl #1
3054 fmov $ivd10,$ivl
3055 fmov $ivd11,$ivh
3056
3057 ldr $rounds0,[$key1,#240] // load rounds number
3058
3059 // The iv for third block
3060 extr $midnumx,$ivh,$ivh,#32
3061 extr $ivh,$ivh,$ivl,#63
3062 and $tmpmw,$constnum,$midnum,asr #31
3063 eor $ivl,$tmpmx,$ivl,lsl #1
3064 fmov $ivd20,$ivl
3065 fmov $ivd21,$ivh
3066
3067 vld1.32 {q8-q9},[$key1] // load key schedule...
3068 sub $rounds0,$rounds0,#6
3069 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3070 sub $rounds0,$rounds0,#2
3071 vld1.32 {q10-q11},[$key_],#32 // load key schedule...
3072 vld1.32 {q12-q13},[$key_],#32
3073 vld1.32 {q14-q15},[$key_],#32
3074 vld1.32 {$rndlast},[$key_]
3075
3076 // The iv for fourth block
3077 extr $midnumx,$ivh,$ivh,#32
3078 extr $ivh,$ivh,$ivl,#63
3079 and $tmpmw,$constnum,$midnum,asr #31
3080 eor $ivl,$tmpmx,$ivl,lsl #1
3081 fmov $ivd30,$ivl
3082 fmov $ivd31,$ivh
3083
3084 add $key_,$key1,#32
3085 mov $rounds,$rounds0
3086 b .Lxts_dec
3087
3088 // Decryption
3089.align 5
3090.Lxts_dec:
3091 tst $tailcnt,#0xf
3092 b.eq .Lxts_dec_begin
3093 subs $len,$len,#16
3094 csel $step,xzr,$step,eq
3095 vld1.8 {$dat},[$inp],#16
3096 b.lo .Lxts_done
3097 sub $inp,$inp,#16
3098.Lxts_dec_begin:
3099 vld1.8 {$dat},[$inp],$step
3100 subs $len,$len,#32 // bias
3101 add $rounds,$rounds0,#2
3102 vorr $in1,$dat,$dat
3103 vorr $dat1,$dat,$dat
3104 vorr $in3,$dat,$dat
3105 vld1.8 {$dat2},[$inp],#16
3106 vorr $in2,$dat2,$dat2
3107 vorr $in4,$dat2,$dat2
3108 b.lo .Lxts_inner_dec_tail
3109 veor $dat,$dat,$iv0 // before decrypt, xor with iv
3110 veor $dat2,$dat2,$iv1
3111
3112 vorr $dat1,$dat2,$dat2
3113 vld1.8 {$dat2},[$inp],#16
3114 vorr $in0,$dat,$dat
3115 vorr $in1,$dat1,$dat1
3116 veor $in2,$dat2,$iv2 // third block xor with third iv
3117 veor $dat2,$dat2,$iv2
3118 cmp $len,#32
3119 b.lo .Lxts_outer_dec_tail
3120
3121 vld1.8 {$dat3},[$inp],#16
3122
3123 // The iv for fifth block
3124 extr $midnumx,$ivh,$ivh,#32
3125 extr $ivh,$ivh,$ivl,#63
3126 and $tmpmw,$constnum,$midnum,asr #31
3127 eor $ivl,$tmpmx,$ivl,lsl #1
3128 fmov $ivd40,$ivl
3129 fmov $ivd41,$ivh
3130
3131 vld1.8 {$dat4},[$inp],#16
3132 veor $dat3,$dat3,$iv3 // the fourth block
3133 veor $dat4,$dat4,$iv4
3134 sub $len,$len,#32 // bias
3135 mov $rounds,$rounds0
3136 b .Loop5x_xts_dec
3137
3138.align 4
3139.Loop5x_xts_dec:
3140 aesd $dat0,q8
3141 aesimc $dat0,$dat0
3142 aesd $dat1,q8
3143 aesimc $dat1,$dat1
3144 aesd $dat2,q8
3145 aesimc $dat2,$dat2
3146 aesd $dat3,q8
3147 aesimc $dat3,$dat3
3148 aesd $dat4,q8
3149 aesimc $dat4,$dat4
3150 vld1.32 {q8},[$key_],#16 // load key schedule...
3151 subs $rounds,$rounds,#2
3152 aesd $dat0,q9
3153 aesimc $dat0,$dat0
3154 aesd $dat1,q9
3155 aesimc $dat1,$dat1
3156 aesd $dat2,q9
3157 aesimc $dat2,$dat2
3158 aesd $dat3,q9
3159 aesimc $dat3,$dat3
3160 aesd $dat4,q9
3161 aesimc $dat4,$dat4
3162 vld1.32 {q9},[$key_],#16 // load key schedule...
3163 b.gt .Loop5x_xts_dec
3164
3165 aesd $dat0,q8
3166 aesimc $dat0,$dat0
3167 aesd $dat1,q8
3168 aesimc $dat1,$dat1
3169 aesd $dat2,q8
3170 aesimc $dat2,$dat2
3171 aesd $dat3,q8
3172 aesimc $dat3,$dat3
3173 aesd $dat4,q8
3174 aesimc $dat4,$dat4
3175 subs $len,$len,#0x50 // because .Lxts_dec_tail4x
3176
3177 aesd $dat0,q9
3178 aesimc $dat0,$dat
3179 aesd $dat1,q9
3180 aesimc $dat1,$dat1
3181 aesd $dat2,q9
3182 aesimc $dat2,$dat2
3183 aesd $dat3,q9
3184 aesimc $dat3,$dat3
3185 aesd $dat4,q9
3186 aesimc $dat4,$dat4
3187 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3188 mov $key_,$key1
3189
3190 aesd $dat0,q10
3191 aesimc $dat0,$dat0
3192 aesd $dat1,q10
3193 aesimc $dat1,$dat1
3194 aesd $dat2,q10
3195 aesimc $dat2,$dat2
3196 aesd $dat3,q10
3197 aesimc $dat3,$dat3
3198 aesd $dat4,q10
3199 aesimc $dat4,$dat4
3200 add $inp,$inp,$xoffset // x0 is adjusted in such way that
3201 // at exit from the loop v1.16b-v26.16b
3202 // are loaded with last "words"
3203 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3204
3205 aesd $dat0,q11
3206 aesimc $dat0,$dat0
3207 aesd $dat1,q11
3208 aesimc $dat1,$dat1
3209 aesd $dat2,q11
3210 aesimc $dat2,$dat2
3211 aesd $dat3,q11
3212 aesimc $dat3,$dat3
3213 aesd $dat4,q11
3214 aesimc $dat4,$dat4
3215
3216 aesd $dat0,q12
3217 aesimc $dat0,$dat0
3218 aesd $dat1,q12
3219 aesimc $dat1,$dat1
3220 aesd $dat2,q12
3221 aesimc $dat2,$dat2
3222 aesd $dat3,q12
3223 aesimc $dat3,$dat3
3224 aesd $dat4,q12
3225 aesimc $dat4,$dat4
3226
3227 aesd $dat0,q13
3228 aesimc $dat0,$dat0
3229 aesd $dat1,q13
3230 aesimc $dat1,$dat1
3231 aesd $dat2,q13
3232 aesimc $dat2,$dat2
3233 aesd $dat3,q13
3234 aesimc $dat3,$dat3
3235 aesd $dat4,q13
3236 aesimc $dat4,$dat4
3237
3238 aesd $dat0,q14
3239 aesimc $dat0,$dat0
3240 aesd $dat1,q14
3241 aesimc $dat1,$dat1
3242 aesd $dat2,q14
3243 aesimc $dat2,$dat2
3244 aesd $dat3,q14
3245 aesimc $dat3,$dat3
3246 aesd $dat4,q14
3247 aesimc $dat4,$dat4
3248
3249 veor $tmp0,$rndlast,$iv0
3250 aesd $dat0,q15
3251 // The iv for first block of next iteration.
3252 extr $midnumx,$ivh,$ivh,#32
3253 extr $ivh,$ivh,$ivl,#63
3254 and $tmpmw,$constnum,$midnum,asr #31
3255 eor $ivl,$tmpmx,$ivl,lsl #1
3256 fmov $ivd00,$ivl
3257 fmov $ivd01,$ivh
3258 veor $tmp1,$rndlast,$iv1
3259 vld1.8 {$in0},[$inp],#16
3260 aesd $dat1,q15
3261 // The iv for second block
3262 extr $midnumx,$ivh,$ivh,#32
3263 extr $ivh,$ivh,$ivl,#63
3264 and $tmpmw,$constnum,$midnum,asr #31
3265 eor $ivl,$tmpmx,$ivl,lsl #1
3266 fmov $ivd10,$ivl
3267 fmov $ivd11,$ivh
3268 veor $tmp2,$rndlast,$iv2
3269 vld1.8 {$in1},[$inp],#16
3270 aesd $dat2,q15
3271 // The iv for third block
3272 extr $midnumx,$ivh,$ivh,#32
3273 extr $ivh,$ivh,$ivl,#63
3274 and $tmpmw,$constnum,$midnum,asr #31
3275 eor $ivl,$tmpmx,$ivl,lsl #1
3276 fmov $ivd20,$ivl
3277 fmov $ivd21,$ivh
3278 veor $tmp3,$rndlast,$iv3
3279 vld1.8 {$in2},[$inp],#16
3280 aesd $dat3,q15
3281 // The iv for fourth block
3282 extr $midnumx,$ivh,$ivh,#32
3283 extr $ivh,$ivh,$ivl,#63
3284 and $tmpmw,$constnum,$midnum,asr #31
3285 eor $ivl,$tmpmx,$ivl,lsl #1
3286 fmov $ivd30,$ivl
3287 fmov $ivd31,$ivh
3288 veor $tmp4,$rndlast,$iv4
3289 vld1.8 {$in3},[$inp],#16
3290 aesd $dat4,q15
3291
3292 // The iv for fifth block
3293 extr $midnumx,$ivh,$ivh,#32
3294 extr $ivh,$ivh,$ivl,#63
3295 and $tmpmw,$constnum,$midnum,asr #31
3296 eor $ivl,$tmpmx,$ivl,lsl #1
3297 fmov $ivd40,$ivl
3298 fmov $ivd41,$ivh
3299
3300 vld1.8 {$in4},[$inp],#16
3301 cbz $xoffset,.Lxts_dec_tail4x
3302 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3303 veor $tmp0,$tmp0,$dat0
3304 veor $dat0,$in0,$iv0
3305 veor $tmp1,$tmp1,$dat1
3306 veor $dat1,$in1,$iv1
3307 veor $tmp2,$tmp2,$dat2
3308 veor $dat2,$in2,$iv2
3309 veor $tmp3,$tmp3,$dat3
3310 veor $dat3,$in3,$iv3
3311 veor $tmp4,$tmp4,$dat4
3312 vst1.8 {$tmp0},[$out],#16
3313 veor $dat4,$in4,$iv4
3314 vst1.8 {$tmp1},[$out],#16
3315 mov $rounds,$rounds0
3316 vst1.8 {$tmp2},[$out],#16
3317 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3318 vst1.8 {$tmp3},[$out],#16
3319 vst1.8 {$tmp4},[$out],#16
3320 b.hs .Loop5x_xts_dec
3321
3322 cmn $len,#0x10
3323 b.ne .Loop5x_dec_after
3324 // If x2($len) equal to -0x10, the left blocks is 4.
3325 // After specially processing, utilize the five blocks processing again.
3326 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
3327 vorr $iv4,$iv3,$iv3
3328 vorr $iv3,$iv2,$iv2
3329 vorr $iv2,$iv1,$iv1
3330 vorr $iv1,$iv0,$iv0
3331 fmov $ivl,$ivd40
3332 fmov $ivh,$ivd41
3333 veor $dat0,$iv0,$in0
3334 veor $dat1,$iv1,$in1
3335 veor $dat2,$in2,$iv2
3336 veor $dat3,$in3,$iv3
3337 veor $dat4,$in4,$iv4
3338 b.eq .Loop5x_xts_dec
3339
3340.Loop5x_dec_after:
3341 add $len,$len,#0x50
3342 cbz $len,.Lxts_done
3343
3344 add $rounds,$rounds0,#2
3345 subs $len,$len,#0x30
3346 b.lo .Lxts_inner_dec_tail
3347
3348 veor $dat0,$iv0,$in2
3349 veor $dat1,$iv1,$in3
3350 veor $dat2,$in4,$iv2
3351 b .Lxts_outer_dec_tail
3352
3353.align 4
3354.Lxts_dec_tail4x:
3355 add $inp,$inp,#16
3356 vld1.32 {$dat0},[$inp],#16
3357 veor $tmp1,$dat1,$tmp0
3358 vst1.8 {$tmp1},[$out],#16
3359 veor $tmp2,$dat2,$tmp2
3360 vst1.8 {$tmp2},[$out],#16
3361 veor $tmp3,$dat3,$tmp3
3362 veor $tmp4,$dat4,$tmp4
3363 vst1.8 {$tmp3-$tmp4},[$out],#32
3364
3365 b .Lxts_done
3366.align 4
3367.Lxts_outer_dec_tail:
3368 aesd $dat0,q8
3369 aesimc $dat0,$dat0
3370 aesd $dat1,q8
3371 aesimc $dat1,$dat1
3372 aesd $dat2,q8
3373 aesimc $dat2,$dat2
3374 vld1.32 {q8},[$key_],#16
3375 subs $rounds,$rounds,#2
3376 aesd $dat0,q9
3377 aesimc $dat0,$dat0
3378 aesd $dat1,q9
3379 aesimc $dat1,$dat1
3380 aesd $dat2,q9
3381 aesimc $dat2,$dat2
3382 vld1.32 {q9},[$key_],#16
3383 b.gt .Lxts_outer_dec_tail
3384
3385 aesd $dat0,q8
3386 aesimc $dat0,$dat0
3387 aesd $dat1,q8
3388 aesimc $dat1,$dat1
3389 aesd $dat2,q8
3390 aesimc $dat2,$dat2
3391 veor $tmp0,$iv0,$rndlast
3392 subs $len,$len,#0x30
3393 // The iv for first block
3394 fmov $ivl,$ivd20
3395 fmov $ivh,$ivd21
3396 mov $constnum,#0x87
3397 extr $midnumx,$ivh,$ivh,#32
3398 extr $ivh,$ivh,$ivl,#63
3399 and $tmpmw,$constnum,$midnum,asr #31
3400 eor $ivl,$tmpmx,$ivl,lsl #1
3401 fmov $ivd00,$ivl
3402 fmov $ivd01,$ivh
3403 veor $tmp1,$iv1,$rndlast
3404 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
3405 aesd $dat0,q9
3406 aesimc $dat0,$dat0
3407 aesd $dat1,q9
3408 aesimc $dat1,$dat1
3409 aesd $dat2,q9
3410 aesimc $dat2,$dat2
3411 veor $tmp2,$iv2,$rndlast
3412 // The iv for second block
3413 extr $midnumx,$ivh,$ivh,#32
3414 extr $ivh,$ivh,$ivl,#63
3415 and $tmpmw,$constnum,$midnum,asr #31
3416 eor $ivl,$tmpmx,$ivl,lsl #1
3417 fmov $ivd10,$ivl
3418 fmov $ivd11,$ivh
3419
3420 add $xoffset,$xoffset,#0x20
3421 add $inp,$inp,$xoffset // $inp is adjusted to the last data
3422
3423 mov $key_,$key1
3424
3425 // The iv for third block
3426 extr $midnumx,$ivh,$ivh,#32
3427 extr $ivh,$ivh,$ivl,#63
3428 and $tmpmw,$constnum,$midnum,asr #31
3429 eor $ivl,$tmpmx,$ivl,lsl #1
3430 fmov $ivd20,$ivl
3431 fmov $ivd21,$ivh
3432
3433 aesd $dat0,q12
3434 aesimc $dat0,$dat0
3435 aesd $dat1,q12
3436 aesimc $dat1,$dat1
3437 aesd $dat2,q12
3438 aesimc $dat2,$dat2
3439 aesd $dat0,q13
3440 aesimc $dat0,$dat0
3441 aesd $dat1,q13
3442 aesimc $dat1,$dat1
3443 aesd $dat2,q13
3444 aesimc $dat2,$dat2
3445 aesd $dat0,q14
3446 aesimc $dat0,$dat0
3447 aesd $dat1,q14
3448 aesimc $dat1,$dat1
3449 aesd $dat2,q14
3450 aesimc $dat2,$dat2
3451 vld1.8 {$in2},[$inp],#16
3452 aesd $dat0,q15
3453 aesd $dat1,q15
3454 aesd $dat2,q15
3455 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3456 add $rounds,$rounds0,#2
3457 veor $tmp0,$tmp0,$dat0
3458 veor $tmp1,$tmp1,$dat1
3459 veor $dat2,$dat2,$tmp2
3460 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3461 vst1.8 {$tmp0},[$out],#16
3462 vst1.8 {$tmp1},[$out],#16
3463 vst1.8 {$dat2},[$out],#16
3464
3465 cmn $len,#0x30
3466 add $len,$len,#0x30
3467 b.eq .Lxts_done
3468 sub $len,$len,#0x30
3469 vorr $in3,$in1,$in1
3470 vorr $in4,$in2,$in2
3471 nop
3472
3473.Lxts_inner_dec_tail:
3474 // $len == -0x10 means two blocks left.
3475 cmn $len,#0x10
3476 veor $dat1,$in3,$iv0
3477 veor $dat2,$in4,$iv1
3478 b.eq .Lxts_dec_tail_loop
3479 veor $dat2,$in4,$iv0
3480.Lxts_dec_tail_loop:
3481 aesd $dat1,q8
3482 aesimc $dat1,$dat1
3483 aesd $dat2,q8
3484 aesimc $dat2,$dat2
3485 vld1.32 {q8},[$key_],#16
3486 subs $rounds,$rounds,#2
3487 aesd $dat1,q9
3488 aesimc $dat1,$dat1
3489 aesd $dat2,q9
3490 aesimc $dat2,$dat2
3491 vld1.32 {q9},[$key_],#16
3492 b.gt .Lxts_dec_tail_loop
3493
3494 aesd $dat1,q8
3495 aesimc $dat1,$dat1
3496 aesd $dat2,q8
3497 aesimc $dat2,$dat2
3498 aesd $dat1,q9
3499 aesimc $dat1,$dat1
3500 aesd $dat2,q9
3501 aesimc $dat2,$dat2
3502 aesd $dat1,q12
3503 aesimc $dat1,$dat1
3504 aesd $dat2,q12
3505 aesimc $dat2,$dat2
3506 cmn $len,#0x20
3507 aesd $dat1,q13
3508 aesimc $dat1,$dat1
3509 aesd $dat2,q13
3510 aesimc $dat2,$dat2
3511 veor $tmp1,$iv0,$rndlast
3512 aesd $dat1,q14
3513 aesimc $dat1,$dat1
3514 aesd $dat2,q14
3515 aesimc $dat2,$dat2
3516 veor $tmp2,$iv1,$rndlast
3517 aesd $dat1,q15
3518 aesd $dat2,q15
3519 b.eq .Lxts_dec_one
3520 veor $tmp1,$tmp1,$dat1
3521 veor $tmp2,$tmp2,$dat2
3522 vorr $iv0,$iv2,$iv2
3523 vorr $iv1,$iv3,$iv3
3524 vst1.8 {$tmp1},[$out],#16
3525 vst1.8 {$tmp2},[$out],#16
3526 add $len,$len,#16
3527 b .Lxts_done
3528
3529.Lxts_dec_one:
3530 veor $tmp1,$tmp1,$dat2
3531 vorr $iv0,$iv1,$iv1
3532 vorr $iv1,$iv2,$iv2
3533 vst1.8 {$tmp1},[$out],#16
3534 add $len,$len,#32
3535
3536.Lxts_done:
3537 tst $tailcnt,#0xf
3538 b.eq .Lxts_dec_abort
3539 // Processing the last two blocks with cipher stealing.
3540 mov x7,x3
3541 cbnz x2,.Lxts_dec_1st_done
3542 vld1.32 {$dat0},[$inp],#16
3543
3544 // Decrypt the second-to-last block to get the last plain text block
3545.Lxts_dec_1st_done:
3546 eor $tmpin,$dat0,$iv1
3547 ldr $rounds,[$key1,#240]
3548 vld1.32 {$dat0},[$key1],#16
3549 sub $rounds,$rounds,#2
3550 vld1.32 {$dat1},[$key1],#16
3551.Loop_final_2nd_dec:
3552 aesd $tmpin,$dat0
3553 aesimc $tmpin,$tmpin
3554 vld1.32 {$dat0},[$key1],#16 // load key schedule...
3555 subs $rounds,$rounds,#2
3556 aesd $tmpin,$dat1
3557 aesimc $tmpin,$tmpin
3558 vld1.32 {$dat1},[$key1],#16 // load key schedule...
3559 b.gt .Loop_final_2nd_dec
3560
3561 aesd $tmpin,$dat0
3562 aesimc $tmpin,$tmpin
3563 vld1.32 {$dat0},[$key1]
3564 aesd $tmpin,$dat1
3565 veor $tmpin,$tmpin,$dat0
3566 veor $tmpin,$tmpin,$iv1
3567 vst1.8 {$tmpin},[$out]
3568
3569 mov $tmpinp,$inp
3570 add $tmpoutp,$out,#16
3571
3572 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
3573 // to get the last encrypted block.
3574.composite_dec_loop:
3575 subs $tailcnt,$tailcnt,#1
3576 ldrb $l2outp,[$out,$tailcnt]
3577 ldrb $loutp,[$tmpinp,$tailcnt]
3578 strb $l2outp,[$tmpoutp,$tailcnt]
3579 strb $loutp,[$out,$tailcnt]
3580 b.gt .composite_dec_loop
3581.Lxts_dec_load_done:
3582 vld1.8 {$tmpin},[$out]
3583 veor $tmpin,$tmpin,$iv0
3584
3585 // Decrypt the composite block to get the last second plain text block
3586 ldr $rounds,[$key_,#240]
3587 vld1.8 {$dat},[$key_],#16
3588 sub $rounds,$rounds,#2
3589 vld1.8 {$dat1},[$key_],#16
3590.Loop_final_dec:
3591 aesd $tmpin,$dat0
3592 aesimc $tmpin,$tmpin
3593 vld1.32 {$dat0},[$key_],#16 // load key schedule...
3594 subs $rounds,$rounds,#2
3595 aesd $tmpin,$dat1
3596 aesimc $tmpin,$tmpin
3597 vld1.32 {$dat1},[$key_],#16 // load key schedule...
3598 b.gt .Loop_final_dec
3599
3600 aesd $tmpin,$dat0
3601 aesimc $tmpin,$tmpin
3602 vld1.32 {$dat0},[$key_]
3603 aesd $tmpin,$dat1
3604 veor $tmpin,$tmpin,$dat0
3605 veor $tmpin,$tmpin,$iv0
3606 vst1.8 {$tmpin},[$out]
3607
3608.Lxts_dec_abort:
3609 ldp $tailcnt,$midnumx,[sp,#48]
3610 ldp $ivd10,$ivd20,[sp,#32]
3611 ldp $ivd30,$ivd40,[sp,#16]
3612 ldp $constnumx,$tmpinp,[sp],#64
3613
3614.Lxts_dec_final_abort:
3615 ret
3616.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
3617___
3618}
3619}}}
# Close the #if __ARM_MAX_ARCH__>=7 guard opened earlier in the file.
$code.=<<___;
#endif
___
########################################
# Post-process $code: the body above is written in a unified dialect;
# translate it line by line into either AArch64 (64-bit) or ARMv7 NEON
# (32-bit) assembler syntax and print the result to STDOUT.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word, for assemblers
    # that do not know the crypto-extension mnemonics.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Emit an AES instruction as raw bytes via the INST() macro.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Lower a q-register vtbl into the pair of d-register vtbl.8 ops.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Translate a lane-indexed vdup from q-register to d-register form.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Translate a lane-indexed vmov from q-register to d-register form.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	# Thumb-2 requires an IT block before a conditional mov.
	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "\tit\t$2\n";
	}

	print $_,"\n";
    }
}
3733
# Flush and close STDOUT, failing loudly so build errors are not masked.
close STDOUT or die "error closing STDOUT: $!";
# NOTE: the following lines are web-page footer residue captured during
# extraction (Trac browser / Oracle site footer), not part of the script:
# 注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器
# © 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette