VirtualBox

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/aes/asm/aesv8-armx.pl@ 94081

Last change on this file since 94081 was 91772, checked in by vboxsync, 3 years ago

openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126

  • Property svn:executable set to *
File size: 21.8 KB
 
1#! /usr/bin/env perl
2# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. As does it support both 32- and 64-bit modes
20# of operation. Latter is achieved by limiting amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# Performance in cycles per byte processed with 128-bit key:
31#
32# CBC enc CBC dec CTR
33# Apple A7 2.39 1.20 1.20
34# Cortex-A53 1.32 1.29 1.46
35# Cortex-A57(*) 1.95 0.85 0.93
36# Denver 1.96 0.86 0.80
37# Mongoose 1.33 1.20 1.20
38# Kryo 1.26 0.94 1.00
39#
40# (*) original 3.64/1.34/1.32 results were for r0p0 revision
41# and are still same even for updated module;
42
# Command line: the assembly "flavour" (e.g. linux64, linux32, ios64)
# and the output file name, both forwarded to the arm-xlate.pl wrapper.
$flavour = shift;
$output = shift;

# Locate the perlasm transliteration helper relative to this script,
# trying the local directory first, then the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe all generated code through arm-xlate.pl; quote $output so paths
# containing spaces survive the shell, and fail loudly if the pipe
# cannot be started (an unchecked open here would silently emit nothing).
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Symbol prefix for every exported function (aes_v8_set_encrypt_key, ...).
$prefix="aes_v8";
# Common file prologue.  arm_arch.h supplies __ARM_MAX_ARCH__; the
# whole module is preprocessed away on targets below ARMv7.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# 64-bit flavours can request the crypto extension by name ...
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
# ... while 32-bit builds stay on plain armv7-a/NEON and later emit the
# AES instructions as raw bytes (see the 32-bit unaes encoder below).
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu neon
.code 32
#undef __thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex vodoo.
#
{{{
# Register aliases for the key-schedule routines, written in AArch64
# names (the 32-bit flavour gets them rewritten by the regex pass at
# the bottom of the file).  In 32-bit mode the NEON temporaries skip
# q4-q7, whose d8-d15 halves are callee-saved (the bulk routines below
# save d8-d15 "as ABI specification says").
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table (.Lrcon) plus the key-expansion entry point.
# .Lenc_key is a second label on the same code so that
# ${prefix}_set_decrypt_key below can reach it with a plain bl.
# Return value convention (see .Ldone/.Lenc_key_abort): 0 on success,
# -1 for a NULL input/output pointer, -2 for an unsupported bit count.
$code.=<<___;
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b

.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# 64-bit only: minimal stack frame.
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
# Validate arguments (non-NULL pointers, bits in {128,192,256}), then
# dispatch to the per-key-size expansion loop.  $bits is reused as the
# iteration counter once its value has been classified.
$code.=<<___;
 mov $ptr,#-1
 cmp $inp,#0
 b.eq .Lenc_key_abort
 cmp $out,#0
 b.eq .Lenc_key_abort
 mov $ptr,#-2
 cmp $bits,#128
 b.lt .Lenc_key_abort
 cmp $bits,#256
 b.gt .Lenc_key_abort
 tst $bits,#0x3f
 b.ne .Lenc_key_abort

 adr $ptr,.Lrcon
 cmp $bits,#192

 veor $zero,$zero,$zero
 vld1.8 {$in0},[$inp],#16
 mov $bits,#8 // reuse $bits
 vld1.32 {$rcon,$mask},[$ptr],#32

 b.lt .Loop128
 b.eq .L192
 b .L256

.align 4
.Loop128:
 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key
 b.ne .Loop128

 vld1.32 {$rcon},[$ptr]

 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key

 vtbl.8 $key,{$in0},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in0},[$out],#16
 aese $key,$zero

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 veor $in0,$in0,$key
 vst1.32 {$in0},[$out]
 add $out,$out,#0x50

 mov $rounds,#10
 b .Ldone

.align 4
.L192:
 vld1.8 {$in1},[$inp],#8
 vmov.i8 $key,#8 // borrow $key
 vst1.32 {$in0},[$out],#16
 vsub.i8 $mask,$mask,$key // adjust the mask

.Loop192:
 vtbl.8 $key,{$in1},$mask
 vext.8 $tmp,$zero,$in0,#12
#ifdef __ARMEB__
 vst1.32 {$in1},[$out],#16
 sub $out,$out,#8
#else
 vst1.32 {$in1},[$out],#8
#endif
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp

 vdup.32 $tmp,${in0}[3]
 veor $tmp,$tmp,$in1
 veor $key,$key,$rcon
 vext.8 $in1,$zero,$in1,#12
 vshl.u8 $rcon,$rcon,#1
 veor $in1,$in1,$tmp
 veor $in0,$in0,$key
 veor $in1,$in1,$key
 vst1.32 {$in0},[$out],#16
 b.ne .Loop192

 mov $rounds,#12
 add $out,$out,#0x20
 b .Ldone

.align 4
.L256:
 vld1.8 {$in1},[$inp]
 mov $bits,#7
 mov $rounds,#14
 vst1.32 {$in0},[$out],#16

.Loop256:
 vtbl.8 $key,{$in1},$mask
 vext.8 $tmp,$zero,$in0,#12
 vst1.32 {$in1},[$out],#16
 aese $key,$zero
 subs $bits,$bits,#1

 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in0,$in0,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $key,$key,$rcon
 veor $in0,$in0,$tmp
 vshl.u8 $rcon,$rcon,#1
 veor $in0,$in0,$key
 vst1.32 {$in0},[$out],#16
 b.eq .Ldone

 vdup.32 $key,${in0}[3] // just splat
 vext.8 $tmp,$zero,$in1,#12
 aese $key,$zero

 veor $in1,$in1,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in1,$in1,$tmp
 vext.8 $tmp,$zero,$tmp,#12
 veor $in1,$in1,$tmp

 veor $in1,$in1,$key
 b .Loop256

.Ldone:
 str $rounds,[$out]
 mov $ptr,#0

.Lenc_key_abort:
 mov x0,$ptr // return value
 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
 ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
___
# 64-bit only: sign the return address (paciasp) and set up a frame;
# needed here because this function makes a call (bl .Lenc_key).
$code.=<<___ if ($flavour =~ /64/);
 .inst 0xd503233f // paciasp
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 stmdb sp!,{r4,lr}
___
# set_decrypt_key = run the encrypt expansion in place, then walk the
# schedule from both ends, swapping round keys and applying aesimc
# (InvMixColumns) so the result suits the equivalent-inverse-cipher
# decryption used by the bulk routines.
$code.=<<___;
 bl .Lenc_key

 cmp x0,#0
 b.ne .Ldec_key_abort

 sub $out,$out,#240 // restore original $out
 mov x4,#-16
 add $inp,$out,x12,lsl#4 // end of key schedule

 vld1.32 {v0.16b},[$out]
 vld1.32 {v1.16b},[$inp]
 vst1.32 {v0.16b},[$inp],x4
 vst1.32 {v1.16b},[$out],#16

.Loop_imc:
 vld1.32 {v0.16b},[$out]
 vld1.32 {v1.16b},[$inp]
 aesimc v0.16b,v0.16b
 aesimc v1.16b,v1.16b
 vst1.32 {v0.16b},[$inp],x4
 vst1.32 {v1.16b},[$out],#16
 cmp $inp,$out
 b.hi .Loop_imc

 vld1.32 {v0.16b},[$out]
 aesimc v0.16b,v0.16b
 vst1.32 {v0.16b},[$inp]

 eor x0,x0,x0 // return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
 ldmia sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldp x29,x30,[sp],#16
 .inst 0xd50323bf // autiasp
 ret
___
$code.=<<___;
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit one single-block routine.  $dir is "en" or "de"; it selects the
# aese/aesmc vs aesd/aesimc mnemonics and the exported symbol name
# (${prefix}_encrypt / ${prefix}_decrypt).  Register usage: x0 = input
# block, x1 = output block, x2 = key schedule (rounds at offset 240).
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

# The loop consumes two round keys per iteration; the final two rounds
# are peeled off after it (last round has no MixColumns step, only the
# closing veor with the final round key).
$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
 ldr $rounds,[$key,#240]
 vld1.32 {$rndkey0},[$key],#16
 vld1.8 {$inout},[$inp]
 sub $rounds,$rounds,#2
 vld1.32 {$rndkey1},[$key],#16

.Loop_${dir}c:
 aes$e $inout,$rndkey0
 aes$mc $inout,$inout
 vld1.32 {$rndkey0},[$key],#16
 subs $rounds,$rounds,#2
 aes$e $inout,$rndkey1
 aes$mc $inout,$inout
 vld1.32 {$rndkey1},[$key],#16
 b.gt .Loop_${dir}c

 aes$e $inout,$rndkey0
 aes$mc $inout,$inout
 vld1.32 {$rndkey0},[$key]
 aes$e $inout,$rndkey1
 veor $inout,$inout,$rndkey0

 vst1.8 {$inout},[$out]
 ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
# CBC bulk routine: ${prefix}_cbc_encrypt(inp=x0, out=x1, len=x2,
# key=x3, ivp=x4, enc=w5).  On 32-bit, the last two arguments arrive
# on the stack and are loaded below ("load remaining args").
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r8,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldmia ip,{r4-r5} @ load remaining args
___
# Common setup: bail out for len<16, preload the whole key schedule
# into q8-q15/$rndlast, then branch to either the encrypt paths
# (sequential, with a dedicated 128-bit variant) or the 3x-interleaved
# decrypt path.  $step becomes 0 on the last block so the final load
# does not run past the input buffer.
$code.=<<___;
 subs $len,$len,#16
 mov $step,#16
 b.lo .Lcbc_abort
 cclr $step,eq

 cmp $enc,#0 // en- or decrypting?
 ldr $rounds,[$key,#240]
 and $len,$len,#-16
 vld1.8 {$ivec},[$ivp]
 vld1.8 {$dat},[$inp],$step

 vld1.32 {q8-q9},[$key] // load key schedule...
 sub $rounds,$rounds,#6
 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q10-q11},[$key_],#32
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]

 add $key_,$key,#32
 mov $cnt,$rounds
 b.eq .Lcbc_dec

 cmp $rounds,#2
 veor $dat,$dat,$ivec
 veor $rndzero_n_last,q8,$rndlast
 b.eq .Lcbc_enc128

 vld1.32 {$in0-$in1},[$key_]
 add $key_,$key,#16
 add $key4,$key,#16*4
 add $key5,$key,#16*5
 aese $dat,q8
 aesmc $dat,$dat
 add $key6,$key,#16*6
 add $key7,$key,#16*7
 b .Lenter_cbc_enc

.align 4
.Loop_cbc_enc:
 aese $dat,q8
 aesmc $dat,$dat
 vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc:
 aese $dat,q9
 aesmc $dat,$dat
 aese $dat,$in0
 aesmc $dat,$dat
 vld1.32 {q8},[$key4]
 cmp $rounds,#4
 aese $dat,$in1
 aesmc $dat,$dat
 vld1.32 {q9},[$key5]
 b.eq .Lcbc_enc192

 aese $dat,q8
 aesmc $dat,$dat
 vld1.32 {q8},[$key6]
 aese $dat,q9
 aesmc $dat,$dat
 vld1.32 {q9},[$key7]
 nop

.Lcbc_enc192:
 aese $dat,q8
 aesmc $dat,$dat
 subs $len,$len,#16
 aese $dat,q9
 aesmc $dat,$dat
 cclr $step,eq
 aese $dat,q10
 aesmc $dat,$dat
 aese $dat,q11
 aesmc $dat,$dat
 vld1.8 {q8},[$inp],$step
 aese $dat,q12
 aesmc $dat,$dat
 veor q8,q8,$rndzero_n_last
 aese $dat,q13
 aesmc $dat,$dat
 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
 aese $dat,q14
 aesmc $dat,$dat
 aese $dat,q15
 veor $ivec,$dat,$rndlast
 b.hs .Loop_cbc_enc

 vst1.8 {$ivec},[$out],#16
 b .Lcbc_done

.align 5
.Lcbc_enc128:
 vld1.32 {$in0-$in1},[$key_]
 aese $dat,q8
 aesmc $dat,$dat
 b .Lenter_cbc_enc128
.Loop_cbc_enc128:
 aese $dat,q8
 aesmc $dat,$dat
 vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
 aese $dat,q9
 aesmc $dat,$dat
 subs $len,$len,#16
 aese $dat,$in0
 aesmc $dat,$dat
 cclr $step,eq
 aese $dat,$in1
 aesmc $dat,$dat
 aese $dat,q10
 aesmc $dat,$dat
 aese $dat,q11
 aesmc $dat,$dat
 vld1.8 {q8},[$inp],$step
 aese $dat,q12
 aesmc $dat,$dat
 aese $dat,q13
 aesmc $dat,$dat
 aese $dat,q14
 aesmc $dat,$dat
 veor q8,q8,$rndzero_n_last
 aese $dat,q15
 veor $ivec,$dat,$rndlast
 b.hs .Loop_cbc_enc128

 vst1.8 {$ivec},[$out],#16
 b .Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# CBC decryption is parallelizable: process three blocks per iteration
# ($dat0-$dat2), with a tail path for the final one or two blocks.
$code.=<<___;
.align 5
.Lcbc_dec:
 vld1.8 {$dat2},[$inp],#16
 subs $len,$len,#32 // bias
 add $cnt,$rounds,#2
 vorr $in1,$dat,$dat
 vorr $dat1,$dat,$dat
 vorr $in2,$dat2,$dat2
 b.lo .Lcbc_dec_tail

 vorr $dat1,$dat2,$dat2
 vld1.8 {$dat2},[$inp],#16
 vorr $in0,$dat,$dat
 vorr $in1,$dat1,$dat1
 vorr $in2,$dat2,$dat2

.Loop3x_cbc_dec:
 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_cbc_dec

 aesd $dat0,q8
 aesimc $dat0,$dat0
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 veor $tmp0,$ivec,$rndlast
 subs $len,$len,#0x30
 veor $tmp1,$in0,$rndlast
 mov.lo x6,$len // x6, $cnt, is zero at this point
 aesd $dat0,q9
 aesimc $dat0,$dat0
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 veor $tmp2,$in1,$rndlast
 add $inp,$inp,x6 // $inp is adjusted in such way that
 // at exit from the loop $dat1-$dat2
 // are loaded with last "words"
 vorr $ivec,$in2,$in2
 mov $key_,$key
 aesd $dat0,q12
 aesimc $dat0,$dat0
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 vld1.8 {$in0},[$inp],#16
 aesd $dat0,q13
 aesimc $dat0,$dat0
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 aesd $dat0,q14
 aesimc $dat0,$dat0
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 vld1.8 {$in2},[$inp],#16
 aesd $dat0,q15
 aesd $dat1,q15
 aesd $dat2,q15
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 add $cnt,$rounds,#2
 veor $tmp0,$tmp0,$dat0
 veor $tmp1,$tmp1,$dat1
 veor $dat2,$dat2,$tmp2
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$tmp0},[$out],#16
 vorr $dat0,$in0,$in0
 vst1.8 {$tmp1},[$out],#16
 vorr $dat1,$in1,$in1
 vst1.8 {$dat2},[$out],#16
 vorr $dat2,$in2,$in2
 b.hs .Loop3x_cbc_dec

 cmn $len,#0x30
 b.eq .Lcbc_done
 nop

.Lcbc_dec_tail:
 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Lcbc_dec_tail

 aesd $dat1,q8
 aesimc $dat1,$dat1
 aesd $dat2,q8
 aesimc $dat2,$dat2
 aesd $dat1,q9
 aesimc $dat1,$dat1
 aesd $dat2,q9
 aesimc $dat2,$dat2
 aesd $dat1,q12
 aesimc $dat1,$dat1
 aesd $dat2,q12
 aesimc $dat2,$dat2
 cmn $len,#0x20
 aesd $dat1,q13
 aesimc $dat1,$dat1
 aesd $dat2,q13
 aesimc $dat2,$dat2
 veor $tmp1,$ivec,$rndlast
 aesd $dat1,q14
 aesimc $dat1,$dat1
 aesd $dat2,q14
 aesimc $dat2,$dat2
 veor $tmp2,$in1,$rndlast
 aesd $dat1,q15
 aesd $dat2,q15
 b.eq .Lcbc_dec_one
 veor $tmp1,$tmp1,$dat1
 veor $tmp2,$tmp2,$dat2
 vorr $ivec,$in2,$in2
 vst1.8 {$tmp1},[$out],#16
 vst1.8 {$tmp2},[$out],#16
 b .Lcbc_done

.Lcbc_dec_one:
 veor $tmp1,$tmp1,$dat2
 vorr $ivec,$in2,$in2
 vst1.8 {$tmp1},[$out],#16

.Lcbc_done:
 vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
}
# Epilogues: restore callee-saved NEON/integer registers (32-bit) or
# pop the frame (64-bit).
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
 ret
___
$code.=<<___;
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# CTR bulk routine: ${prefix}_ctr32_encrypt_blocks(inp=x0, out=x1,
# len=x2 blocks, key=x3, ivp=x4).  Only the low 32 bits of the counter
# (last word of the IV) are incremented, as the name says.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
 stp x29,x30,[sp,#-16]!
 add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
 mov ip,sp
 stmdb sp!,{r4-r10,lr}
 vstmdb sp!,{d8-d15} @ ABI specification says so
 ldr r4, [ip] @ load remaining arg
___
# Setup: keep the counter word in $ctr in host order (rev on LE), seed
# three counter blocks ($dat0-$dat2) and run the 3x-interleaved main
# loop; one or two leftover blocks go through .Lctr32_tail.
$code.=<<___;
 ldr $rounds,[$key,#240]

 ldr $ctr, [$ivp, #12]
#ifdef __ARMEB__
 vld1.8 {$dat0},[$ivp]
#else
 vld1.32 {$dat0},[$ivp]
#endif
 vld1.32 {q8-q9},[$key] // load key schedule...
 sub $rounds,$rounds,#4
 mov $step,#16
 cmp $len,#2
 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
 sub $rounds,$rounds,#2
 vld1.32 {q12-q13},[$key_],#32
 vld1.32 {q14-q15},[$key_],#32
 vld1.32 {$rndlast},[$key_]
 add $key_,$key,#32
 mov $cnt,$rounds
 cclr $step,lo
#ifndef __ARMEB__
 rev $ctr, $ctr
#endif
 add $tctr1, $ctr, #1
 vorr $ivec,$dat0,$dat0
 rev $tctr1, $tctr1
 vmov.32 ${ivec}[3],$tctr1
 add $ctr, $ctr, #2
 vorr $dat1,$ivec,$ivec
 b.ls .Lctr32_tail
 rev $tctr2, $ctr
 vmov.32 ${ivec}[3],$tctr2
 sub $len,$len,#3 // bias
 vorr $dat2,$ivec,$ivec
 b .Loop3x_ctr32

.align 4
.Loop3x_ctr32:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 aese $dat2,q9
 aesmc $dat2,$dat2
 vld1.32 {q9},[$key_],#16
 b.gt .Loop3x_ctr32

 aese $dat0,q8
 aesmc $tmp0,$dat0
 aese $dat1,q8
 aesmc $tmp1,$dat1
 vld1.8 {$in0},[$inp],#16
 add $tctr0,$ctr,#1
 aese $dat2,q8
 aesmc $dat2,$dat2
 vld1.8 {$in1},[$inp],#16
 rev $tctr0,$tctr0
 aese $tmp0,q9
 aesmc $tmp0,$tmp0
 aese $tmp1,q9
 aesmc $tmp1,$tmp1
 vld1.8 {$in2},[$inp],#16
 mov $key_,$key
 aese $dat2,q9
 aesmc $tmp2,$dat2
 aese $tmp0,q12
 aesmc $tmp0,$tmp0
 aese $tmp1,q12
 aesmc $tmp1,$tmp1
 veor $in0,$in0,$rndlast
 add $tctr1,$ctr,#2
 aese $tmp2,q12
 aesmc $tmp2,$tmp2
 veor $in1,$in1,$rndlast
 add $ctr,$ctr,#3
 aese $tmp0,q13
 aesmc $tmp0,$tmp0
 aese $tmp1,q13
 aesmc $tmp1,$tmp1
 veor $in2,$in2,$rndlast
 vmov.32 ${ivec}[3], $tctr0
 aese $tmp2,q13
 aesmc $tmp2,$tmp2
 vorr $dat0,$ivec,$ivec
 rev $tctr1,$tctr1
 aese $tmp0,q14
 aesmc $tmp0,$tmp0
 vmov.32 ${ivec}[3], $tctr1
 rev $tctr2,$ctr
 aese $tmp1,q14
 aesmc $tmp1,$tmp1
 vorr $dat1,$ivec,$ivec
 vmov.32 ${ivec}[3], $tctr2
 aese $tmp2,q14
 aesmc $tmp2,$tmp2
 vorr $dat2,$ivec,$ivec
 subs $len,$len,#3
 aese $tmp0,q15
 aese $tmp1,q15
 aese $tmp2,q15

 veor $in0,$in0,$tmp0
 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
 vst1.8 {$in0},[$out],#16
 veor $in1,$in1,$tmp1
 mov $cnt,$rounds
 vst1.8 {$in1},[$out],#16
 veor $in2,$in2,$tmp2
 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
 vst1.8 {$in2},[$out],#16
 b.hs .Loop3x_ctr32

 adds $len,$len,#3
 b.eq .Lctr32_done
 cmp $len,#1
 mov $step,#16
 cclr $step,eq

.Lctr32_tail:
 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 vld1.32 {q8},[$key_],#16
 subs $cnt,$cnt,#2
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 vld1.32 {q9},[$key_],#16
 b.gt .Lctr32_tail

 aese $dat0,q8
 aesmc $dat0,$dat0
 aese $dat1,q8
 aesmc $dat1,$dat1
 aese $dat0,q9
 aesmc $dat0,$dat0
 aese $dat1,q9
 aesmc $dat1,$dat1
 vld1.8 {$in0},[$inp],$step
 aese $dat0,q12
 aesmc $dat0,$dat0
 aese $dat1,q12
 aesmc $dat1,$dat1
 vld1.8 {$in1},[$inp]
 aese $dat0,q13
 aesmc $dat0,$dat0
 aese $dat1,q13
 aesmc $dat1,$dat1
 veor $in0,$in0,$rndlast
 aese $dat0,q14
 aesmc $dat0,$dat0
 aese $dat1,q14
 aesmc $dat1,$dat1
 veor $in1,$in1,$rndlast
 aese $dat0,q15
 aese $dat1,q15

 cmp $len,#1
 veor $in0,$in0,$dat0
 veor $in1,$in1,$dat1
 vst1.8 {$in0},[$out],#16
 b.eq .Lctr32_done
 vst1.8 {$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
 ldr x29,[sp],#16
 ret
___
$code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Close the __ARM_MAX_ARCH__>=7 guard opened in the file prologue.
$code.=<<___;
#endif
___
########################################
# Post-processing: the common code above is written in a mixed
# 32-/64-bit dialect; the branch below transliterates it into the
# requested flavour, line by line, before printing to the output pipe.
if ($flavour =~ /64/) { ######## 64-bit code
 my %opcode = (
 "aesd" => 0x4e285800, "aese" => 0x4e284800,
 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );

 # Fallback encoder emitting AES instructions as raw .inst words for
 # assemblers without crypto support; currently unused (the
 # substitution that would invoke it is commented out below).
 local *unaes = sub {
 my ($mnemonic,$arg)=@_;

 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
 sprintf ".inst\t0x%08x\t//%s %s",
 $opcode{$mnemonic}|$1|($2<<5),
 $mnemonic,$arg;
 };

 foreach(split("\n",$code)) {
 s/\`([^\`]*)\`/eval($1)/geo;

 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
 s/@\s/\/\//o; # old->new style commentary

 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
 s/vmov\.i8/movi/o or # fix up legacy mnemonics
 s/vext\.8/ext/o or
 s/vrev32\.8/rev32/o or
 s/vtst\.8/cmtst/o or
 s/vshr/ushr/o or
 s/^(\s+)v/$1/o or # strip off v prefix
 s/\bbx\s+lr\b/ret/o;

 # fix up remaining legacy suffixes
 s/\.[ui]?8//o;
 m/\],#8/o and s/\.16b/\.8b/go;
 s/\.[ui]?32//o and s/\.16b/\.4s/go;
 s/\.[ui]?64//o and s/\.16b/\.2d/go;
 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

 print $_,"\n";
 }
} else { ######## 32-bit code
 my %opcode = (
 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );

 # Encode an AES instruction as raw .byte data so that pre-crypto
 # binutils can still assemble the module.
 local *unaes = sub {
 my ($mnemonic,$arg)=@_;

 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
 |(($2&7)<<1) |(($2&8)<<2);
 # since ARMv7 instructions are always encoded little-endian.
 # correct solution is to use .inst directive, but older
 # assemblers don't implement it:-(
 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 $word&0xff,($word>>8)&0xff,
 ($word>>16)&0xff,($word>>24)&0xff,
 $mnemonic,$arg;
 }
 };

973 sub unvtbl {
974 my $arg=shift;
975
976 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
977 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
978 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
979 }
980
981 sub unvdup32 {
982 my $arg=shift;
983
984 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
985 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
986 }
987
988 sub unvmov32 {
989 my $arg=shift;
990
991 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
992 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
993 }
994
 # Transliterate the common code into 32-bit syntax and print it.
 # Order matters: addressing-mode and suffix fixups must run before
 # the mnemonic substitutions that consume the rewritten operands.
 foreach(split("\n",$code)) {
 s/\`([^\`]*)\`/eval($1)/geo;

 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
 s/\/\/\s?/@ /o; # new->old style commentary

 # fix up remaining new-style suffixes
 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
 s/\],#[0-9]+/]!/o;

 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
 s/^(\s+)b\./$1b/o or
 s/^(\s+)mov\./$1mov/o or
 s/^(\s+)ret/$1bx\tlr/o;

 print $_,"\n";
 }
}

# Flush and close the pipe to arm-xlate.pl; a failure here means the
# transliterator itself failed, so propagate it as a fatal error.
close STDOUT or die "error closing STDOUT: $!";
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette