sha256-armv4.pl@ 94081

最後變更在這個檔案從94081是 91772,由 vboxsync 提交於 3 年前
openssl-1.1.1l: Applied and adjusted our OpenSSL changes to 1.1.1l. bugref:10126
檔案大小: 18.2 KB

行
#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

# Command line: [flavour] [...] output-file
#
# If the first argument already looks like a file name ("name.ext") it is
# taken as the output file and no flavour is set.  Otherwise the first
# argument is the flavour and the remaining arguments are consumed until
# one looks like a file name (or the arguments run out).
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Pipe everything printed from here on through arm-xlate.pl, which is
    # looked up next to this script first and in ../../perlasm/ second.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif ($output) {
    # Only redirect when an output name was actually found; without one the
    # generated text keeps going to the inherited STDOUT.
    open STDOUT,">",$output
        or die "can't open $output: $!";
}

# Register assignment for the integer-only code path.  r0-r3 each carry two
# names: the argument name ($ctx/$inp/$len) or $T1, and a scratch name
# ($t0/$t4/$t1/$t3) used by the round bodies.
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
# Working variables a..h; the list is rotated by the callers after each round.
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# Rotate/shift amounts used by the round bodies.  For @sigma0/@sigma1 the
# third element is applied as a logical shift right (lsr), the others as
# rotates.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# Append the assembly for one round to $code.
#
# Arguments: the round index $i followed by the eight register names that
# currently play the roles a..h.
#
# For $i<16 an extra prologue is emitted that assembles the message word in
# $t1 (one word load plus "rev" unless __ARMEB__ on ARMv7+, byte loads
# otherwise) and starts Sigma1(e) in $t0.  For later rounds the caller
# (BODY_16_XX) has already emitted code that prepares $t1 and $t0.
#
# The Maj(a,b,c) value is not added to h in this round; the "from the past"
# add at the top of the next round picks it up, which is why $t2 and $t3
# are exchanged on return.
sub BODY_00_15 {
    my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr $t1,[$inp],#4 @ $i
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
	rev $t1,$t1
# endif
#else
	@ ldrb $t1,[$inp,#3] @ $i
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	ldrb $t2,[$inp,#2]
	ldrb $t0,[$inp,#1]
	orr $t1,$t1,$t2,lsl#8
	ldrb $t2,[$inp],#4
	orr $t1,$t1,$t0,lsl#16
# if $i==15
	str $inp,[sp,#17*4] @ make room for $t4
# endif
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr $t1,$t1,$t2,lsl#24
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
	ldr $t2,[$Ktbl],#4 @ *K256++
	add $h,$h,$t1 @ h+=X[i]
	str $t1,[sp,#`$i%16`*4]
	eor $t1,$f,$g
	add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
	and $t1,$t1,$e
	add $h,$h,$t2 @ h+=K256[i]
	eor $t1,$t1,$g @ Ch(e,f,g)
	eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
	and $t2,$t2,#0xff
	cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4 @ prefetch
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t2,$a,$b @ a^b, b^c in next round
#else
	ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
	eor $t2,$a,$b @ a^b, b^c in next round
	ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
	eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
	and $t3,$t3,$t2 @ (b^c)&=(a^b)
	add $d,$d,$h @ d+=h
	eor $t3,$t3,$b @ Maj(a,b,c)
	add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
	@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
    # The register holding a^b becomes next round's b^c and vice versa.
    ($t2,$t3)=($t3,$t2);
}

# Append the assembly for one round with $i>=16 to $code.
#
# Takes the same arguments as BODY_00_15.  It first emits the message
# schedule update: sigma0 of X[i+1] (expected in $t1) and sigma1 of X[i+14]
# (expected in $t4) are added to the words reloaded from the stack slots
# (i+0)%16 and (i+9)%16, leaving the new X[i] in $t1.  The start of
# Sigma1(e) is interleaved into that sequence, after which the common round
# body is emitted by BODY_00_15.
sub BODY_16_XX {
    my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
	@ ldr $t4,[sp,#`($i+14)%16`*4]
	mov $t0,$t1,ror#$sigma0[0]
	add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
	mov $t2,$t4,ror#$sigma1[0]
	eor $t0,$t0,$t1,ror#$sigma0[1]
	eor $t2,$t2,$t4,ror#$sigma1[1]
	eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
	ldr $t1,[sp,#`($i+0)%16`*4]
	eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
	ldr $t4,[sp,#`($i+9)%16`*4]

	add $t2,$t2,$t0
	eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
	add $t1,$t1,$t2
	eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
	add $t1,$t1,$t4 @ X[i]
___
    &BODY_00_15(@_);
}

# Integer-only code path: the K256 constant table, the public entry point
# sha256_block_data_order (which branches to the ARMv8 or NEON code when the
# capability word says so), 16 unrolled rounds of BODY_00_15, 16 unrolled
# rounds of BODY_16_XX that are looped over at .Lrounds_16_xx, and the
# epilogue that accumulates the result into the context.
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

.type K256,%object
.align 5
K256:
	.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
	.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
	.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5

.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub r3,pc,#8 @ sha256_block_data_order
#else
	adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr r12,.LOPENSSL_armcap
	ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
	ldr r12,[r12]
#endif
	tst r12,#ARMV8_SHA256
	bne .LARMv8
	tst r12,#ARMV7_NEON
	bne .LNEON
#endif
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
	stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub $Ktbl,r3,#256+32 @ K256
	sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr $t1,[$inp],#4
# else
	ldrb $t1,[$inp,#3]
# endif
	eor $t3,$B,$C @ magic
	eor $t2,$t2,$t2
___
# @V is rotated right by one after every round so that each generated round
# sees the registers in their new a..h roles.
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef __thumb2__
	ite eq @ Thumb2 thing, sanity check in ARM
#endif
	ldreq $t3,[sp,#16*4] @ pull ctx
	bne .Lrounds_16_xx

	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t0,[$t3,#0]
	ldr $t1,[$t3,#4]
	ldr $t2,[$t3,#8]
	add $A,$A,$t0
	ldr $t0,[$t3,#12]
	add $B,$B,$t1
	ldr $t1,[$t3,#16]
	add $C,$C,$t2
	ldr $t2,[$t3,#20]
	add $D,$D,$t0
	ldr $t0,[$t3,#24]
	add $E,$E,$t1
	ldr $t1,[$t3,#28]
	add $F,$F,$t2
	ldr $inp,[sp,#17*4] @ pull inp
	ldr $t2,[sp,#18*4] @ pull inp+len
	add $G,$G,$t0
	add $H,$H,$t1
	stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp $inp,$t2
	sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
	bne .Loop

	add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
	ldmia sp!,{r4-r11,pc}
#else
	ldmia sp!,{r4-r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

# Map a quad register name "qN" to the name of its low / high double
# register ("d2N" / "d2N+1").  Anything that does not contain "q" followed
# by a number with an optional leading 1 yields "".
# Both are declared with an empty prototype, so they must be called with a
# leading "&" as done throughout this section.
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }

# Any call to an undefined sub, e.g. &vadd_i32(...), lands here and is
# appended to $code as one instruction line.  The sub name is the mnemonic,
# with its first "_" turned into "." (vadd_i32 -> vadd.i32).  A numeric
# last argument becomes an immediate with a leading "#".
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{   my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
    my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Emit the NEON message-schedule update for the next four words and, woven
# between its instructions, four integer rounds.  $body is a code ref that
# returns the round as a list of Perl snippets; each snippet is eval'ed at
# the point where its instruction(s) should appear.  The exact interleaving
# below is the instruction schedule and must not be reordered.
sub Xupdate()
{ use integer;
    my $body = shift;
    my @insns = (&$body,&$body,&$body,&$body);
    my ($a,$b,$c,$d,$e,$f,$g,$h);

    &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    &vshr_u32 ($T2,$T0,$sigma0[0]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
    eval(shift(@insns));
    eval(shift(@insns));
    &vshr_u32 ($T1,$T0,$sigma0[2]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vsli_32 ($T2,$T0,32-$sigma0[0]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vshr_u32 ($T3,$T0,$sigma0[1]);
    eval(shift(@insns));
    eval(shift(@insns));
    &veor ($T1,$T1,$T2);
    eval(shift(@insns));
    eval(shift(@insns));
    &vsli_32 ($T3,$T0,32-$sigma0[1]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
    eval(shift(@insns));
    eval(shift(@insns));
    &veor ($T1,$T1,$T3); # sigma0(X[1..4])
    eval(shift(@insns));
    eval(shift(@insns));
    &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
    eval(shift(@insns));
    eval(shift(@insns));
    &veor ($T5,$T5,$T4);
    eval(shift(@insns));
    eval(shift(@insns));
    &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
    eval(shift(@insns));
    eval(shift(@insns));
    &veor ($T5,$T5,$T4); # sigma1(X[14..15])
    eval(shift(@insns));
    eval(shift(@insns));
    &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
    eval(shift(@insns));
    eval(shift(@insns));
    &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
    eval(shift(@insns));
    eval(shift(@insns));
    &veor ($T5,$T5,$T4);
    eval(shift(@insns));
    eval(shift(@insns));
    &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
    eval(shift(@insns));
    eval(shift(@insns));
    &vld1_32 ("{$T0}","[$Ktbl,:128]!");
    eval(shift(@insns));
    eval(shift(@insns));
    &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
    eval(shift(@insns));
    eval(shift(@insns));
    &veor ($T5,$T5,$T4); # sigma1(X[16..17])
    eval(shift(@insns));
    eval(shift(@insns));
    &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
    eval(shift(@insns));
    eval(shift(@insns));
    &vadd_i32 ($T0,$T0,@X[0]);
    while($#insns>=2) { eval(shift(@insns)); }
    &vst1_32 ("{$T0}","[$Xfer,:128]!");
    eval(shift(@insns));
    eval(shift(@insns));

    push(@X,shift(@X)); # "rotate" X[]
}

# Emit four integer rounds interleaved with the byte swap of the already
# loaded next input vector @X[0], the addition of the next four K256 words
# and the store of the sum through $Xfer.  $body is used as in Xupdate.
sub Xpreload()
{ use integer;
    my $body = shift;
    my @insns = (&$body,&$body,&$body,&$body);
    my ($a,$b,$c,$d,$e,$f,$g,$h);

    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    &vld1_32 ("{$T0}","[$Ktbl,:128]!");
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    &vrev32_8 (@X[0],@X[0]);
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    eval(shift(@insns));
    &vadd_i32 ($T0,$T0,@X[0]);
    foreach (@insns) { eval; } # remaining instructions
    &vst1_32 ("{$T0}","[$Xfer,:128]!");

    push(@X,shift(@X)); # "rotate" X[]
}

# One integer round as a list of Perl snippets, each of which emits one or
# more instructions when eval'ed by Xupdate/Xpreload.  The final snippet
# advances the round counter $j, rotates @V and exchanges $t2 and $t3.
sub body_00_15 () {
    (
    '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
    '&add ($h,$h,$t1)', # h+=X[i]+K[i]
    '&eor ($t1,$f,$g)',
    '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
    '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
    '&and ($t1,$t1,$e)',
    '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
    '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
    '&eor ($t1,$t1,$g)', # Ch(e,f,g)
    '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
    '&eor ($t2,$a,$b)', # a^b, b^c in next round
    '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
    '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
    '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
    '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
    '&ldr ($t1,"[sp,#64]") if ($j==31)',
    '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
    '&add ($d,$d,$h)', # d+=h
    '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
    '&eor ($t3,$t3,$b)', # Maj(a,b,c)
    '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
    )
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
.LNEON:
	stmdb sp!,{r4-r12,lr}

	sub $H,sp,#16*4+16
	adr $Ktbl,K256
	bic $H,$H,#15 @ align for 128-bit stores
	mov $t2,sp
	mov sp,$H @ alloca
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp

	vld1.8 {@X[0]},[$inp]!
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	vld1.32 {$T0},[$Ktbl,:128]!
	vld1.32 {$T1},[$Ktbl,:128]!
	vld1.32 {$T2},[$Ktbl,:128]!
	vld1.32 {$T3},[$Ktbl,:128]!
	vrev32.8 @X[0],@X[0] @ yes, even on
	str $ctx,[sp,#64]
	vrev32.8 @X[1],@X[1] @ big-endian
	str $inp,[sp,#68]
	mov $Xfer,sp
	vrev32.8 @X[2],@X[2]
	str $len,[sp,#72]
	vrev32.8 @X[3],@X[3]
	str $t2,[sp,#76] @ save original sp
	vadd.i32 $T0,$T0,@X[0]
	vadd.i32 $T1,$T1,@X[1]
	vst1.32 {$T0},[$Xfer,:128]!
	vadd.i32 $T2,$T2,@X[2]
	vst1.32 {$T1},[$Xfer,:128]!
	vadd.i32 $T3,$T3,@X[3]
	vst1.32 {$T2},[$Xfer,:128]!
	vst1.32 {$T3},[$Xfer,:128]!

	ldmia $ctx,{$A-$H}
	sub $Xfer,$Xfer,#64
	ldr $t1,[sp,#0]
	eor $t2,$t2,$t2
	eor $t3,$B,$C
	b .L_00_48

.align 4
.L_00_48:
___
    &Xupdate(\&body_00_15);
    &Xupdate(\&body_00_15);
    &Xupdate(\&body_00_15);
    &Xupdate(\&body_00_15);
$code.=<<___;
	teq $t1,#0 @ check for K256 terminator
	ldr $t1,[sp,#0]
	sub $Xfer,$Xfer,#64
	bne .L_00_48

	ldr $inp,[sp,#68]
	ldr $t0,[sp,#72]
	sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
	teq $inp,$t0
	it eq
	subeq $inp,$inp,#64 @ avoid SEGV
	vld1.8 {@X[0]},[$inp]! @ load next input block
	vld1.8 {@X[1]},[$inp]!
	vld1.8 {@X[2]},[$inp]!
	vld1.8 {@X[3]},[$inp]!
	it ne
	strne $inp,[sp,#68]
	mov $Xfer,sp
___
    &Xpreload(\&body_00_15);
    &Xpreload(\&body_00_15);
    &Xpreload(\&body_00_15);
    &Xpreload(\&body_00_15);
$code.=<<___;
	ldr $t0,[$t1,#0]
	add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
	ldr $t2,[$t1,#4]
	ldr $t3,[$t1,#8]
	ldr $t4,[$t1,#12]
	add $A,$A,$t0 @ accumulate
	ldr $t0,[$t1,#16]
	add $B,$B,$t2
	ldr $t2,[$t1,#20]
	add $C,$C,$t3
	ldr $t3,[$t1,#24]
	add $D,$D,$t4
	ldr $t4,[$t1,#28]
	add $E,$E,$t0
	str $A,[$t1],#4
	add $F,$F,$t2
	str $B,[$t1],#4
	add $G,$G,$t3
	str $C,[$t1],#4
	add $H,$H,$t4
	str $D,[$t1],#4
	stmia $t1,{$E-$H}

	ittte ne
	movne $Xfer,sp
	ldrne $t1,[sp,#0]
	eorne $t2,$t2,$t2
	ldreq sp,[sp,#76] @ restore original sp
	itt ne
	eorne $t3,$B,$C
	bne .L_00_48

	ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
# Section-local: shadows the global $Ktbl with r3, the register in which
# the entry point leaves its own address before branching to .LARMv8.
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif

.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32 {$ABCD,$EFGH},[$ctx]
	sub $Ktbl,$Ktbl,#256+32
	add $len,$inp,$len,lsl#6 @ len to point at the end of inp
	b .Loop_v8

.align 4
.Loop_v8:
	vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
	vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
	vld1.32 {$W0},[$Ktbl]!
	vrev32.8 @MSG[0],@MSG[0]
	vrev32.8 @MSG[1],@MSG[1]
	vrev32.8 @MSG[2],@MSG[2]
	vrev32.8 @MSG[3],@MSG[3]
	vmov $ABCD_SAVE,$ABCD @ offload
	vmov $EFGH_SAVE,$EFGH
	teq $inp,$len
___
# Twelve quad-rounds that also update the message schedule; $W0/$W1 and the
# @MSG registers are rotated after each one.
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	sha256su0 @MSG[0],@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0
	sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
    ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
# Last four quad-rounds: no further schedule updates are emitted.
$code.=<<___;
	vld1.32 {$W1},[$Ktbl]!
	vadd.i32 $W0,$W0,@MSG[0]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0

	vld1.32 {$W0},[$Ktbl]!
	vadd.i32 $W1,$W1,@MSG[1]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1

	vld1.32 {$W1},[$Ktbl]
	vadd.i32 $W0,$W0,@MSG[2]
	sub $Ktbl,$Ktbl,#256-16 @ rewind
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W0
	sha256h2 $EFGH,$abcd,$W0

	vadd.i32 $W1,$W1,@MSG[3]
	vmov $abcd,$ABCD
	sha256h $ABCD,$EFGH,$W1
	sha256h2 $EFGH,$abcd,$W1

	vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
	vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
	it ne
	bne .Loop_v8

	vst1.32 {$ABCD,$EFGH},[$ctx]

	ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
# Trailer: identification string and, for non-kernel builds that may use
# the ARMv7+ paths, the common symbol holding the capability word that the
# entry point tests.
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___

# Copy this script's leading comment block into the output, replacing the
# leading '#' of each line with the assembler comment character '@'.  The
# '#!' line is skipped, empty lines are copied as they are, and copying
# stops at the first line that is neither a comment nor empty.
#
# The banner is informational only, so failing to re-read the script is
# reported on STDERR but does not abort code generation.
if (open(my $self, "<", $0)) {
    while (<$self>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
    }
    close $self;
} else {
    warn "can't read $0 to copy the banner comment: $!";
}

{   # Base encodings of the four SHA-256 instructions; the register fields
    # are OR-ed in by unsha256().
    my %opcode = (
        "sha256h"   => 0xf3000c40, "sha256h2"  => 0xf3100c40,
        "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );

    # Turn "sha256* qD[,qN],qM" into an INST(b0,b1,b2,b3) macro invocation
    # carrying the encoded instruction bytes (least significant byte
    # first), followed by the original text as an assembler comment.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       operand list: two or three q registers, comma separated,
    #              with optional white space after each comma
    #
    # Input that is not recognised is returned unchanged as
    # "$mnemonic\t$arg", so the assembler sees the instruction (and can
    # reject it) instead of the line silently becoming empty.
    sub unsha256 {
        my ($mnemonic,$arg)=@_;

        if (exists $opcode{$mnemonic} &&
            $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/) {
            # The middle register is absent in the two-operand form.
            my ($rd,$rn,$rm) = ($1, defined $2 ? $2 : 0, $3);
            my $word = $opcode{$mnemonic}|(($rd&7)<<13)|(($rd&8)<<19)
                                         |(($rn&7)<<17)|(($rn&8)<<4)
                                         |(($rm&7)<<1) |(($rm&8)<<2);
            # since ARMv7 instructions are always encoded little-endian.
            # correct solution is to use .inst directive, but older
            # assemblers don't implement it:-(
            return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                           $word&0xff,($word>>8)&0xff,
                           ($word>>16)&0xff,($word>>24)&0xff,
                           $mnemonic,$arg;
        }

        return "$mnemonic\t$arg";
    }
}

# Post-process the generated text line by line and print it:
#   - evaluate every `...` expression and substitute its value;
#   - replace the SHA-256 mnemonics with INST() byte sequences via unsha256();
#   - turn "ret" into "bx lr"; on lines that contain no "ret", turn "bx lr"
#     into its literal encoding.
foreach (split($/,$code)) {

    s/\`([^\`]*)\`/eval $1/geo;

    s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

    s/\bret\b/bx lr/go or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4

    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/libs/openssl-3.0.1/crypto/sha/asm/sha256-armv4.pl@ 94081

以其他格式下載: