#! /usr/bin/env perl
# Author: Min Zhou <[email protected]>
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# Reference to crypto/md5/asm/md5-x86_64.pl
# MD5 optimized for LoongArch.

use strict;
use warnings;

# Accumulates the generated assembly text; it is printed in one go at the
# end of the script.
my $code;

# Symbolic names for the LoongArch general registers \$r0..\$r22 that the
# generated code refers to.
my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21));

# The output file is the last command-line argument that ends in something
# looking like "name.ext".  Other arguments are ignored.
my $output;
for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }

# Redirect STDOUT only when an output file was actually named.  A failed
# open() would otherwise leave STDOUT closed and silently discard the
# generated code, so fail loudly instead.
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# round1_step() does:
#   dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
# with F(x,y,z) computed as z ^ (x & (y ^ z)).
#
# Arguments:
#   $pos      -1 for the first step of the round (the setup of $t0, $t1 and
#             $t2 is emitted as well), 1 for the last step (prepares $t0
#             and $t1 for round 2 from $a7), 0 for every other step
#   $dst,$x,$y,$z  registers holding the four working variables
#   $k_next   index of the message word used by the following step
#   $T_i      32-bit additive constant of this step
#   $s        left-rotate amount
#
# Expected on entry, and set up again for the following step:
#   $t1 = y ^ z
#   $t2 = dst + X[k]
sub round1_step
{
    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    my $T_i_h = ($T_i & 0xfffff000) >> 12;
    my $T_i_l = $T_i & 0xfff;

    # On LoongArch a 32-bit immediate is loaded with the instruction pair
    # lu12i.w (bits [31:12]) and ori (bits [11:0]).  lu12i.w treats its
    # 20-bit immediate as a signed number, so a value of T_i_h greater than
    # or equal to (1<<19) has to be handed over as the negative number with
    # the same low 20 bits.
    #
    # The details of the instruction lu12i.w can be found here:
    # https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_lu12i_w_lu32i_d_lu52i_d
    #
    # Subtracting (1<<20) maps [2^19, 2^20) onto [-2^19, 0) and, unlike an
    # expression built on (1<<32), does not need integers wider than 32 bits.
    $T_i_h -= (1<<20) if ($T_i_h >= (1<<19));

    if ($pos == -1) {
        $code .= " ld.w $t0,$a1,0 /* (NEXT STEP) X[0] */\n";
        $code .= " xor $t1,$y,$z /* y ^ z */\n";
        $code .= " add.w $t2,$dst,$t0 /* dst + X[k] */\n";
    }
    $code .= <<EOF;
lu12i.w $t8,$T_i_h /* load bits [31:12] of constant */
and $t1,$x,$t1 /* x & ... */
ori $t8,$t8,$T_i_l /* load bits [11:0] of constant */
xor $t1,$z,$t1 /* z ^ ... */
add.w $t7,$t2,$t8 /* dst + X[k] + Const */
ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */
add.w $dst,$t7,$t1 /* dst += ... */
add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */
EOF

    $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
    if ($pos != 1) {
        # The following step uses (x, y) of this step as its (y, z).
        $code .= " xor $t1,$x,$y /* (NEXT STEP) y ^ z */\n";
    } else {
        # Last step: hand z and not z over to the first step of round 2.
        $code .= " move $t0,$a7 /* (NEXT ROUND) $t0 = z' (copy of z) */\n";
        $code .= " nor $t1,$zero,$a7 /* (NEXT ROUND) $t1 = not z' (copy of not z) */\n";
    }
    $code .= " add.w $dst,$dst,$x /* dst += x */\n";
}

# round2_step() does:
#   dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
# with G(x,y,z) computed as (x & z) | (y & (not z)).
#
# Arguments:
#   $pos      1 for the last step of the round (prepares $t1 for round 3
#             from $a6 and $a7); any other value is an ordinary step
#   $dst,$x,$y,$z  registers holding the four working variables
#   $k_next   index of the message word used by the following step
#   $T_i      32-bit additive constant of this step
#   $s        left-rotate amount
#
# Expected on entry, and set up again for the following step:
#   $t0 = z' (copy of z for the next step)
#   $t1 = not z' (copy of not z for the next step)
#   $t2 = dst + X[k]
sub round2_step
{
    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    my $T_i_h = ($T_i & 0xfffff000) >> 12;
    my $T_i_l = $T_i & 0xfff;

    # lu12i.w takes a signed 20-bit immediate: values with bit 19 set are
    # passed as the negative number with the same low 20 bits.  Subtracting
    # (1<<20) does this without needing integers wider than 32 bits.
    $T_i_h -= (1<<20) if ($T_i_h >= (1<<19));

    $code .= <<EOF;
lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */
and $t0,$x,$t0 /* x & z */
ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */
and $t1,$y,$t1 /* y & (not z) */
add.w $t7,$t2,$t8 /* dst + X[k] + Const */
or $t1,$t0,$t1 /* (y & (not z)) | (x & z) */
ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */
add.w $dst,$t7,$t1 /* dst += ... */
add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */
EOF

    $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
    if ($pos != 1) {
        # The y of this step is the z of the following step.
        $code .= " move $t0,$y /* (NEXT STEP) z' = $y */\n";
        $code .= " nor $t1,$zero,$y /* (NEXT STEP) not z' = not $y */\n";
    } else {
        # Last step: hand y ^ z over to the first step of round 3.
        $code .= " xor $t1,$a6,$a7 /* (NEXT ROUND) $t1 = y ^ z */\n";
    }
    $code .= " add.w $dst,$dst,$x /* dst += x */\n";
}

# round3_step() does:
#   dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
# with H(x,y,z) computed as x ^ (y ^ z).
#
# Arguments:
#   $pos      1 for the last step of the round (prepares $t1 for round 4
#             from $a7); any other value is an ordinary step
#   $dst,$x,$y,$z  registers holding the four working variables
#   $k_next   index of the message word used by the following step
#   $T_i      32-bit additive constant of this step
#   $s        left-rotate amount
#
# Expected on entry, and set up again for the following step:
#   $t1 = y ^ z
#   $t2 = dst + X[k]
sub round3_step
{
    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    my $T_i_h = ($T_i & 0xfffff000) >> 12;
    my $T_i_l = $T_i & 0xfff;

    # lu12i.w takes a signed 20-bit immediate: values with bit 19 set are
    # passed as the negative number with the same low 20 bits.  Subtracting
    # (1<<20) does this without needing integers wider than 32 bits.
    $T_i_h -= (1<<20) if ($T_i_h >= (1<<19));

    $code .= <<EOF;
lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */
xor $t1,$x,$t1 /* x ^ ... */
ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */
add.w $t7,$t2,$t8 /* dst + X[k] + Const */
ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */
add.w $dst,$t7,$t1 /* dst += ... */
add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */
EOF

    $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
    if ($pos != 1) {
        # The following step uses (x, y) of this step as its (y, z).
        $code .= " xor $t1,$x,$y /* (NEXT STEP) y ^ z */\n";
    } else {
        # Last step: hand not z over to the first step of round 4.
        $code .= " nor $t1,$zero,$a7 /* (NEXT ROUND) $t1 = not z */\n";
    }
    $code .= " add.w $dst,$dst,$x /* dst += x */\n";
}

# round4_step() does:
#   dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
# with I(x,y,z) computed as y ^ (x | (not z)).
#
# Arguments:
#   $pos      1 for the very last step of a block; any other value is an
#             ordinary step.  The last step loads no further message word
#             and instead starts the work for the next loop iteration:
#             it adds the saved A and D ($t3, $t6) to $a4 and $a7 and
#             advances the data pointer $a1 by 64.
#   $dst,$x,$y,$z  registers holding the four working variables
#   $k_next   index of the message word used by the following step
#             (not used when $pos is 1)
#   $T_i      32-bit additive constant of this step
#   $s        left-rotate amount
#
# Expected on entry, and set up again for the following step:
#   $t1 = not z' (copy of not z for the next step)
#   $t2 = dst + X[k]
sub round4_step
{
    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    my $T_i_h = ($T_i & 0xfffff000) >> 12;
    my $T_i_l = $T_i & 0xfff;

    # lu12i.w takes a signed 20-bit immediate: values with bit 19 set are
    # passed as the negative number with the same low 20 bits.  Subtracting
    # (1<<20) does this without needing integers wider than 32 bits.
    $T_i_h -= (1<<20) if ($T_i_h >= (1<<19));

    $code .= <<EOF;
lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */
or $t1,$x,$t1 /* x | ... */
ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */
xor $t1,$y,$t1 /* y ^ ... */
add.w $t7,$t2,$t8 /* dst + X[k] + Const */
EOF

    if ($pos != 1) {
        $code .= " ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */\n";
        $code .= " add.w $dst,$t7,$t1 /* dst += ... */\n";
        $code .= " add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */\n";
        $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
        # The y of this step is the z of the following step.
        $code .= " nor $t1,$zero,$y /* (NEXT STEP) not z' = not $y */\n";
        $code .= " add.w $dst,$dst,$x /* dst += x */\n";
    } else {
        # $t1 already holds I(x,y,z), so $a4 and $a7 may be updated here.
        $code .= " add.w $a4,$t3,$a4 /* (NEXT LOOP) add old value of A */\n";
        $code .= " add.w $dst,$t7,$t1 /* dst += ... */\n";
        $code .= " add.w $a7,$t6,$a7 /* (NEXT LOOP) add old value of D */\n";
        $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
        $code .= " addi.d $a1,$a1,64 /* (NEXT LOOP) ptr += 64 */\n";
        $code .= " add.w $dst,$dst,$x /* dst += x */\n";
    }
}

# Entry of ossl_md5_block_asm_data_order: return at once when the block
# count is zero, compute the end pointer, load A..D from the context and
# open the per-block loop by saving A..D and prefetching the input.
my $prologue = <<"___";
.text

.globl ossl_md5_block_asm_data_order
.type ossl_md5_block_asm_data_order function
ossl_md5_block_asm_data_order:
# $a0 = arg #1 (ctx, MD5_CTX pointer)
# $a1 = arg #2 (ptr, data pointer)
# $a2 = arg #3 (nbr, number of 16-word blocks to process)
beqz $a2,.Lend # cmp nbr with 0, jmp if nbr == 0

# ptr is '$a1'
# end is '$a3'
slli.d $t0,$a2,6
add.d $a3,$a1,$t0

# A is '$a4'
# B is '$a5'
# C is '$a6'
# D is '$a7'
ld.w $a4,$a0,0 # a4 = ctx->A
ld.w $a5,$a0,4 # a5 = ctx->B
ld.w $a6,$a0,8 # a6 = ctx->C
ld.w $a7,$a0,12 # a7 = ctx->D

# BEGIN of loop over 16-word blocks
.align 6
.Lloop:
# save old values of A, B, C, D
move $t3,$a4
move $t4,$a5
move $t5,$a6
move $t6,$a7

preld 0,$a1,0
preld 0,$a1,64
___
$code .= $prologue;

# Emit the 64 steps of one MD5 block, 16 per round.  Each round is described
# by its step generator, its four rotate amounts (used cyclically), and per
# step the index of the message word needed by the FOLLOWING step together
# with the additive constant of the step itself.
my @md5_rounds = (
    {
        emit   => \&round1_step,
        shifts => [ 7, 12, 17, 22 ],
        k_next => [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1 ],
        T      => [
            0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
            0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
            0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
            0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
        ],
    },
    {
        emit   => \&round2_step,
        shifts => [ 5, 9, 14, 20 ],
        k_next => [ 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 5 ],
        T      => [
            0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
            0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
            0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
            0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
        ],
    },
    {
        emit   => \&round3_step,
        shifts => [ 4, 11, 16, 23 ],
        k_next => [ 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 0 ],
        T      => [
            0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
            0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
            0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
            0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
        ],
    },
    {
        emit   => \&round4_step,
        shifts => [ 6, 10, 15, 21 ],
        k_next => [ 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9, 0 ],
        T      => [
            0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
            0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
            0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
            0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
        ],
    },
);

for my $round (@md5_rounds) {
    # Every round starts with (dst, x, y, z) = (A, B, C, D).
    my @regs = ($a4, $a5, $a6, $a7);

    for my $i (0 .. 15) {
        # -1 marks the first step of a round, 1 the last, 0 the rest.
        my $pos = $i == 0 ? -1 : $i == 15 ? 1 : 0;

        $round->{emit}->($pos, @regs, $round->{k_next}[$i],
                         $round->{T}[$i], $round->{shifts}[$i % 4]);

        # Rotate the roles for the next step: (a,b,c,d) -> (d,a,b,c).
        unshift @regs, pop @regs;
    }
}

# Tail of the per-block loop and function exit.  A and D were already
# updated by the last round 4 step; B and C are updated here.  The state is
# written back to the context once the data pointer has reached the end.
$code .= <<EOF;
# add old values of B, C
add.w $a5,$t4,$a5
add.w $a6,$t5,$a6

bltu $a1,$a3,.Lloop # jmp if ptr < end

st.w $a4,$a0,0 # ctx->A = A
st.w $a5,$a0,4 # ctx->B = B
st.w $a6,$a0,8 # ctx->C = C
st.w $a7,$a0,12 # ctx->D = D

.Lend:
jr $ra
.size ossl_md5_block_asm_data_order,.-ossl_md5_block_asm_data_order
EOF

# Replace every `expr` in the generated text by the result of evaluating
# expr as Perl.  Only text produced by this script is evaluated.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Buffered write errors only show up at close time.  Fail the build rather
# than leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";