VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.0/crypto/modes/asm/aes-gcm-armv8_64.pl@ 99507

最後變更 在這個檔案從99507是 99366,由 vboxsync 提交於 2 年 前

openssl-3.1.0: Applied and adjusted our OpenSSL changes to 3.0.7. bugref:10418

檔案大小: 272.2 KB
 
1#! /usr/bin/env perl
2# Copyright 2019-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10#========================================================================
11# Written by Fangming Fang <[email protected]> for the OpenSSL project,
12# derived from https://github.com/ARM-software/AArch64cryptolib, original
13# author Samuel Lee <[email protected]>. The module is, however, dual
14# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
15# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
16#========================================================================
17#
18# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
19#
20# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
21#
22#  ____________________________________________________
23# |                                                    |
24# | PRE                                                |
25# |____________________________________________________|
26# |                |                |                  |
27# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
28# |________________|________________|__________________|
29# |                |                |                  |
30# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
31# |________________|________________|__________________|
32# |                |                |                  |
33# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
34# |________________|________________|__________________|
35# |                |                |                  |
36# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
37# |________________|____(mostly)____|__________________|
38# |                                                    |
39# | MODULO                                             |
40# |____________________________________________________|
41#
42# PRE:
43# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
44# EXT low_acc, low_acc, low_acc, #8
45# EOR res_curr (4k+0), res_curr (4k+0), low_acc
46#
47# CTR block:
48# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
49# REV ctr32, rev_ctr32
50# ORR ctr64, constctr96_top32, ctr32, LSL #32
51# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
52# INS ctr_next.d[1], ctr64X
53# ADD rev_ctr32, #1
54#
55# AES block:
56# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
57# We use a small trick here: load the input into scalar registers, EOR it with the last round key, and then transfer it to the SIMD registers.
58# Given how constrained we are in ASIMD registers, this is quite important.
59#
60# Encrypt:
61# LDR input_low, [ input_ptr ], #8
62# LDR input_high, [ input_ptr ], #8
63# EOR input_low, k14_low
64# EOR input_high, k14_high
65# INS res_curr.d[0], input_low
66# INS res_curr.d[1], input_high
67# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
68# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
69# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
70# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
71# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
72# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
73# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
74# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
75# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
76# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
77# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
78# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
79# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
80# AESE ctr_curr, k13
81# EOR res_curr, res_curr, ctr_curr
82# ST1 { res_curr.16b }, [ output_ptr ], #16
83#
84# Decrypt:
85# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
86# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
87# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
88# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
89# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
90# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
91# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
92# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
93# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
94# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
95# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
96# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
97# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
98# AESE ctr_curr, k13
99# LDR res_curr, [ input_ptr ], #16
100# EOR res_curr, res_curr, ctr_curr
101# MOV output_low, res_curr.d[0]
102# MOV output_high, res_curr.d[1]
103# EOR output_low, k14_low
104# EOR output_high, k14_high
105# STP output_low, output_high, [ output_ptr ], #16
106#
107# GHASH block X:
108# do 128b karatsuba polynomial multiplication on block
109# We only have 64b->128b polynomial multipliers; naively, that means we need to do 4 64b multiplies to generate a 128b
110# multiplication:
111#
112# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
113#
114# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
115# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
116#
117# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
118# multiplying with "twisted" powers of H
119#
120# Note: We can PMULL directly into the acc_x in first GHASH of the loop
121# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
122# path latency dominates the performance
123#
124# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
125# than indicated here
126# REV64 res_curr, res_curr
127# INS t_m.d[0], res_curr.d[1]
128# EOR t_m.8B, t_m.8B, res_curr.8B
129# PMULL2 t_h, res_curr, HX
130# PMULL t_l, res_curr, HX
131# PMULL t_m, t_m, HX_k
132# EOR acc_h, acc_h, t_h
133# EOR acc_l, acc_l, t_l
134# EOR acc_m, acc_m, t_m
135#
136# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
137# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
138# with a reversed constant
139# EOR acc_m, acc_m, acc_h
140# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
141# PMULL t_mod, acc_h, mod_constant
142# EXT acc_h, acc_h, acc_h, #8
143# EOR acc_m, acc_m, acc_h
144# EOR acc_m, acc_m, t_mod
145# PMULL acc_h, acc_m, mod_constant
146# EXT acc_m, acc_m, acc_m, #8
147# EOR acc_l, acc_l, acc_h
148# EOR acc_l, acc_l, acc_m
149
150$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; # a trailing argument ending in a file extension is taken as the output file
151$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; # a leading argument without a dot is taken as the perlasm flavour
152
153$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of the path this script was invoked with
154( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or # look for the translator next to this script first ...
155( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or # ... then in ../../perlasm relative to it
156die "can't locate arm-xlate.pl";
157
158open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # abort if the translator pipe cannot be started instead of writing to a dead handle
159*STDOUT=*OUT; # alias STDOUT to the pipe so output goes through arm-xlate.pl
160
161$input_ptr="x0"; # argument block: x0 is the input pointer (the visible encrypt kernel loads plaintext from it)
162$bit_length="x1"; # x1: input length in bits; the kernel shifts it right by 3 to get the byte length
163$output_ptr="x2"; # x2: output pointer; result blocks are stored through it with post-increment
164$current_tag="x3"; # x3: base address the running tag is loaded from; the H powers are loaded at fixed offsets from it
165$counter="x16"; # x16: address of the counter block; the visible kernel copies it from x4 on entry
166$cc="x8"; # x8: base address the round keys are loaded from; the visible kernel copies it from x5 on entry
167
168{
169my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); # x4..x7; x4/x5 arrive holding arguments and the visible kernel copies them to x16/x8 before reusing them
170my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); # x19..x24: low/high 64-bit halves of input blocks 1..3
171my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); # same x19..x24: output names alias the input names above
172my ($output_l0,$output_h0)=map("x$_",(6..7)); # x6..x7: alias $input_l0/$input_h0
173
174my $ctr32w="w9"; # 32-bit view of $ctr32x (x9)
175my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15)); # x9..x15; $rk10_l/$rk10_h hold round key 10 in scalar registers (loaded from [$cc, #160] in the visible kernel)
176my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); # w11/w12: 32-bit views of $ctr96_t32x (x11) and $rctr32x (x12)
177
178my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); # v0..v3: counter blocks, v4..v7: result blocks (16-byte vector view)
179my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); # same registers, bare vector names (for .d[n] / .1d / .2d suffixes)
180my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); # same registers, 64-bit d views
181my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); # 128-bit q views of the result registers v4..v7
182
183my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); # v9..v11: GHASH accumulators (high, mid, low), 16-byte vector view
184my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); # same accumulators, bare vector names
185my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); # same accumulators, 64-bit d views
186
187my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); # v12..v15: h1..h4; v16/v17: $h12k/$h34k, built in the kernel by trn1/trn2 of two H values followed by eor
188my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); # q views of h1..h4 (used for ldr)
189my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); # 16-byte vector views of h1..h4
190
191my $t0="v8"; # v8 is shared by $t0, $t4 and $mod_constant
192my $t0d="d8";
193
194my ($t1,$t2,$t3)=map("v$_",(28..30)); # v28..v30 temporaries
195my ($t1d,$t2d,$t3d)=map("d$_",(28..30));
196
197my $t4="v8"; # same register as $t0
198my $t4d="d8";
199my $t5="v28"; # same register as $t1
200my $t5d="d28";
201my $t6="v31"; # v31, also used as $mod_t
202my $t6d="d31";
203
204my $t7="v4"; # same register as $res0 / $ctr_t0
205my $t7d="d4";
206my $t8="v29"; # same register as $t2
207my $t8d="d29";
208my $t9="v30"; # same register as $t3
209my $t9d="d30";
210
211my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); # v4..v7: same registers as $res0..$res3
212my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
213my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
214
215my $mod_constantd="d8"; # reduction constant lives in v8/d8 (shared with $t0/$t4); the kernel rebuilds it with movi #0xc2 + shl #56
216my $mod_constant="v8";
217my $mod_t="v31"; # same register as $t6
218
219my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27)); # v18..v27: round keys 0..9, 16-byte vector view
220my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27)); # q views of the round keys (used for ldr)
221my $rk2q1="v20.1q"; # .1q view of the round key 2 register
222my $rk3q1="v21.1q"; # .1q view of the round key 3 register
223my $rk4v="v22"; # bare vector name of the round key 4 register
224my $rk4d="d22"; # 64-bit d view of the round key 4 register
225
226$code=<<___; # start the generated assembly: include arm_arch.h and open the __ARM_MAX_ARCH__>=8 guard
227#include "arm_arch.h"
228
229#if __ARM_MAX_ARCH__>=8
230___
231$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); # flavours containing "64" get the armv8-a+crypto .arch directive and .text
232$code.=<<___ if ($flavour !~ /64/); # all other flavours get the NEON/Thumb-2 preamble below; NOTE(review): $_byte is interpolated in it but is not assigned in the visible part of this file - confirm
233.fpu neon
234#ifdef __thumb2__
235.syntax unified
236.thumb
237# define INST(a,b,c,d) $_byte c,0xef,a,b
238#else
239.code 32
240# define INST(a,b,c,d) $_byte a,b,c,0xf2
241#endif
242
243.text
244___
245
246#########################################################################################
247# size_t aes_gcm_enc_128_kernel(const unsigned char *in,
248# size_t bit_length,
249# unsigned char *out,
250# u64 *Xi,
251# unsigned char ivec[16],
252# const void *key);
253#
254$code.=<<___;
255.global aes_gcm_enc_128_kernel
256.type aes_gcm_enc_128_kernel,%function
257.align 4
258aes_gcm_enc_128_kernel:
259 AARCH64_VALID_CALL_TARGET
260 cbz x1, .L128_enc_ret
261 stp x19, x20, [sp, #-112]!
262 mov x16, x4
263 mov x8, x5
264 stp x21, x22, [sp, #16]
265 stp x23, x24, [sp, #32]
266 stp d8, d9, [sp, #48]
267 stp d10, d11, [sp, #64]
268 stp d12, d13, [sp, #80]
269 stp d14, d15, [sp, #96]
270
271 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
272 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
273
274 ld1 {$acc_lb}, [$current_tag]
275 ext $acc_lb, $acc_lb, $acc_lb, #8
276 rev64 $acc_lb, $acc_lb
277 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
278 mov $len, $main_end_input_ptr
279
280 ldr $rk9q, [$cc, #144] @ load rk9
281 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
282 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
283
284 lsr $rctr32x, $ctr96_t32x, #32
285 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
286 ext $h4b, $h4b, $h4b, #8
287
288 fmov $ctr1d, $ctr96_b64x @ CTR block 1
289 rev $rctr32w, $rctr32w @ rev_ctr32
290
291 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
292 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
293 ldr $rk0q, [$cc, #0] @ load rk0
294
295 rev $ctr32w, $rctr32w @ CTR block 1
296 add $rctr32w, $rctr32w, #1 @ CTR block 1
297 fmov $ctr3d, $ctr96_b64x @ CTR block 3
298
299 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
300 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
301
302 fmov $ctr1.d[1], $ctr32x @ CTR block 1
303 rev $ctr32w, $rctr32w @ CTR block 2
304
305 fmov $ctr2d, $ctr96_b64x @ CTR block 2
306 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
307 add $rctr32w, $rctr32w, #1 @ CTR block 2
308
309 fmov $ctr2.d[1], $ctr32x @ CTR block 2
310 rev $ctr32w, $rctr32w @ CTR block 3
311
312 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
313 ldr $rk1q, [$cc, #16] @ load rk1
314
315 add $rctr32w, $rctr32w, #1 @ CTR block 3
316 fmov $ctr3.d[1], $ctr32x @ CTR block 3
317
318 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
319 ext $h3b, $h3b, $h3b, #8
320
321 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
322 ldr $rk2q, [$cc, #32] @ load rk2
323
324 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
325 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
326 ext $h1b, $h1b, $h1b, #8
327
328 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
329 ldr $rk8q, [$cc, #128] @ load rk8
330
331 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
332 ldr $rk3q, [$cc, #48] @ load rk3
333
334 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
335 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
336
337 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
338 ldr $rk6q, [$cc, #96] @ load rk6
339
340 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
341 ldr $rk7q, [$cc, #112] @ load rk7
342
343 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
344 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
345
346 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
347 ldr $rk5q, [$cc, #80] @ load rk5
348
349 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
350 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
351 ext $h2b, $h2b, $h2b, #8
352
353 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
354
355 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
356 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
357
358 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
359
360 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
361
362 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
363 ldr $rk4q, [$cc, #64] @ load rk4
364
365 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
366
367 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
368 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
369
370 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
371 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
372
373 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
374 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
375
376 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
377
378 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
379
380 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
381
382 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
383
384 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
385
386 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
387
388 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
389 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
390
391 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
392
393 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
394
395 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
396
397 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
398
399 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
400
401 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
402
403 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
404
405 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
406
407 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
408
409 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
410
411 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
412
413 aese $ctr2b, $rk9 @ AES block 2 - round 9
414
415 aese $ctr0b, $rk9 @ AES block 0 - round 9
416
417 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
418
419 aese $ctr1b, $rk9 @ AES block 1 - round 9
420
421 aese $ctr3b, $rk9 @ AES block 3 - round 9
422 b.ge .L128_enc_tail @ handle tail
423
424 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
425
426 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
427
428 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
429
430 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
431
432 eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
433 eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
434
435 eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
436 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
437
438 eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
439 eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
440 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
441
442 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
443 eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
444
445 eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
446 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
447
448 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
449 eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
450 rev $ctr32w, $rctr32w @ CTR block 4
451
452 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
453 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
454
455 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
456 fmov $ctr0d, $ctr96_b64x @ CTR block 4
457 add $rctr32w, $rctr32w, #1 @ CTR block 4
458
459 fmov $ctr0.d[1], $ctr32x @ CTR block 4
460 rev $ctr32w, $rctr32w @ CTR block 5
461
462 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
463 fmov $ctr1d, $ctr96_b64x @ CTR block 5
464 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
465
466 add $rctr32w, $rctr32w, #1 @ CTR block 5
467 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
468 fmov $ctr1.d[1], $ctr32x @ CTR block 5
469
470 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
471 rev $ctr32w, $rctr32w @ CTR block 6
472 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
473
474 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
475 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
476
477 add $rctr32w, $rctr32w, #1 @ CTR block 6
478 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
479 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
480
481 fmov $ctr2d, $ctr96_b64x @ CTR block 6
482 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
483
484 fmov $ctr2.d[1], $ctr32x @ CTR block 6
485 rev $ctr32w, $rctr32w @ CTR block 7
486 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
487
488 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
489
490 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
491 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
492 b.ge .L128_enc_prepretail @ do prepretail
493
494 .L128_enc_main_loop: @ main loop start
495 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
496 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
497 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
498
499 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
500 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
501
502 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
503 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
504
505 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
506 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
507 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
508
509 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
510 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
511
512 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
513 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
514
515 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
516 eor $res0b, $res0b, $acc_lb @ PRE 1
517
518 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
519 eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
520
521 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
522 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
523 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
524
525 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
526 rev $ctr32w, $rctr32w @ CTR block 4k+8
527
528 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
529 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
530 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
531
532 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
533 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
534 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
535
536 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
537
538 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
539 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
540
541 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
542
543 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
544 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
545
546 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
547
548 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
549 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
550
551 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
552
553 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
554 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
555
556 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
557 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
558
559 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
560 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
561
562 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
563 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
564
565 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
566 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
567
568 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
569 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
570
571 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
572
573 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
574 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
575
576 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
577
578 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
579 movi $mod_constant.8b, #0xc2
580
581 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
582 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
583
584 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
585
586 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
587 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
588
589 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
590 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
591
592 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
593 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
594
595 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
596 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
597
598 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
599 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
600
601 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
602 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
603
604 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
605 eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
606
607 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
608 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
609
610 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
611 eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
612
613 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
614 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
615
616 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
617 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
618 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
619
620 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
621 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
622 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
623
624 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
625 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
626
627 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
628 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
629
630 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
631 eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
632
633 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
634 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
635
636 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
637 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
638
639 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
640 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
641
642 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
643 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
644
645 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
646 eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
647 eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
648
649 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
650 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
651
652 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
653 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
654
655 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
656 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
657
658 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
659 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
660
661 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
662 rev $ctr32w, $rctr32w @ CTR block 4k+9
663 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
664
665 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
666 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
667
668 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
669 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
670 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
671
672 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
673 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
674 rev $ctr32w, $rctr32w @ CTR block 4k+10
675
676 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
677 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
678 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
679 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
680
681 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
682 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
683 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
684 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
685
686 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
687 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
688
689 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
690 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
691 rev $ctr32w, $rctr32w @ CTR block 4k+11
692
693 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
694 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
695
696 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
697 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
698 b.lt .L128_enc_main_loop
699
700 .L128_enc_prepretail: @ PREPRETAIL
701 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
702 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
703 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
704
705 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
706 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
707 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
708
709 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
710 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
711
712 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
713
714 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
715 eor $res0b, $res0b, $acc_lb @ PRE 1
716
717 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
718
719 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
720 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
721
722 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
723 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
724
725 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
726 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
727
728 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
729 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
730
731 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
732
733 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
734 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
735
736 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
737
738 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
739 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
740
741 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
742
743 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
744 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
745
746 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
747
748 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
749 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
750
751 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
752 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
753
754 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
755
756 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
757 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
758
759 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
760
761 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
762
763 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
764 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
765
766 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
767
768 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
769 movi $mod_constant.8b, #0xc2
770
771 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
772 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
773
774 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
775
776 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
777 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
778
779 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
780
781 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
782 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
783
784 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
785
786 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
787 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
788
789 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
790 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
791
792 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
793
794 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
795 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
796
797 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
798
799 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
800 ext $acc_hb, $acc_hb, $acc_hb, #8
801
802 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
803
804 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
805 eor $acc_mb, $acc_mb, $acc_lb
806
807 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
808
809 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
810
811 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
812
813 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
814 eor $acc_mb, $acc_mb, $t1.16b
815
816 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
817
818 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
819
820 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
821
822 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
823 eor $acc_mb, $acc_mb, $acc_hb
824
825 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
826
827 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
828
829 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
830
831 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
832
833 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
834 ext $acc_mb, $acc_mb, $acc_mb, #8
835
836 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
837
838 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
839 eor $acc_lb, $acc_lb, $t1.16b
840
841 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
842
843 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
844
845 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
846
847 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
848
849 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
850 eor $acc_lb, $acc_lb, $acc_mb
851
852 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
853 .L128_enc_tail: @ TAIL
854
855 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
856 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
857
858 cmp $main_end_input_ptr, #48
859
860 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
861 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
862 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
863
864 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
865
866 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
867
868 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
869
870 b.gt .L128_enc_blocks_more_than_3
871
872 sub $rctr32w, $rctr32w, #1
873 movi $acc_l.8b, #0
874 mov $ctr3b, $ctr2b
875
876 cmp $main_end_input_ptr, #32
877 mov $ctr2b, $ctr1b
878 movi $acc_h.8b, #0
879
880 movi $acc_m.8b, #0
881 b.gt .L128_enc_blocks_more_than_2
882
883 mov $ctr3b, $ctr1b
884 cmp $main_end_input_ptr, #16
885
886 sub $rctr32w, $rctr32w, #1
887 b.gt .L128_enc_blocks_more_than_1
888
889 sub $rctr32w, $rctr32w, #1
890 b .L128_enc_blocks_less_than_1
891 .L128_enc_blocks_more_than_3: @ blocks left > 3
892 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
893
894 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
895
896 rev64 $res0b, $res1b @ GHASH final-3 block
897
898 eor $res0b, $res0b, $t0.16b @ feed in partial tag
899 eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
900 eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
901
902 fmov $res1d, $input_l0 @ AES final-2 block - mov low
903
904 movi $t0.8b, #0 @ suppress further partial tag feed in
905 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
906
907 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
908 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
909
910 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
911
912 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
913
914 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
915 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
916
917 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
918 .L128_enc_blocks_more_than_2: @ blocks left > 2
919
920 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
921
922 rev64 $res0b, $res1b @ GHASH final-2 block
923 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
924
925 eor $res0b, $res0b, $t0.16b @ feed in partial tag
926
927 eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
928
929 fmov $res1d, $input_l0 @ AES final-1 block - mov low
930 eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
931
932 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
933 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
934
935 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
936
937 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
938
939 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
940
941 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
942
943 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
944
945 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
946
947 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
948
949 movi $t0.8b, #0 @ suppress further partial tag feed in
950
951 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
952 .L128_enc_blocks_more_than_1: @ blocks left > 1
953
954 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
955
956 rev64 $res0b, $res1b @ GHASH final-1 block
957 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
958
959 eor $res0b, $res0b, $t0.16b @ feed in partial tag
960
961 eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
962 eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
963
964 fmov $res1d, $input_l0 @ AES final block - mov low
965
966 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
967 fmov $res1.d[1], $input_h0 @ AES final block - mov high
968
969 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
970
971 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
972
973 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
974
975 eor $res1b, $res1b, $ctr3b @ AES final block - result
976
977 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
978
979 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
980
981 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
982
983 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
984
985 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
986 movi $t0.8b, #0 @ suppress further partial tag feed in
987 .L128_enc_blocks_less_than_1: @ blocks left <= 1
988
989 and $bit_length, $bit_length, #127 @ bit_length %= 128
990 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
991
992 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
993 sub $bit_length, $bit_length, #128 @ bit_length -= 128
994
995 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
996
997 and $bit_length, $bit_length, #127 @ bit_length %= 128
998
999 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
1000 cmp $bit_length, #64
1001
1002 csel $input_l0, $rk10_l, $rk10_h, lt
1003 csel $input_h0, $rk10_h, xzr, lt
1004
1005 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
1006
1007 fmov $ctr0.d[1], $input_h0
1008
1009 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1010
1011 rev64 $res0b, $res1b @ GHASH final block
1012
1013 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1014
1015 mov $t0d, $res0.d[1] @ GHASH final block - mid
1016
1017 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1018 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
1019
1020 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1021
1022 rev $ctr32w, $rctr32w
1023
1024 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1025
1026 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1027
1028 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1029
1030 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1031
1032 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1033 movi $mod_constant.8b, #0xc2
1034
1035 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1036
1037 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1038
1039 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1040
1041 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1042
1043 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1044
1045 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1046
1047 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1048
1049 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1050
1051 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1052
1053 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
1054
1055 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
1056 st1 { $res1b}, [$output_ptr] @ store all 16B
1057
1058 str $ctr32w, [$counter, #12] @ store the updated counter
1059
1060 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1061 ext $acc_lb, $acc_lb, $acc_lb, #8
1062 rev64 $acc_lb, $acc_lb
1063 mov x0, $len
1064 st1 { $acc_l.16b }, [$current_tag]
1065 ldp x21, x22, [sp, #16]
1066 ldp x23, x24, [sp, #32]
1067 ldp d8, d9, [sp, #48]
1068 ldp d10, d11, [sp, #64]
1069 ldp d12, d13, [sp, #80]
1070 ldp d14, d15, [sp, #96]
1071 ldp x19, x20, [sp], #112
1072 ret
1073
1074.L128_enc_ret:
1075 mov w0, #0x0
1076 ret
1077.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
1078___
1079
1080#########################################################################################
1081# size_t aes_gcm_dec_128_kernel(const unsigned char *in,
1082# size_t len,
1083# unsigned char *out,
1084# const void *key,
1085# unsigned char ivec[16],
1086# u64 *Xi);
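1087# NOTE(review): the prologue below derives byte_len as (second argument >> 3), so `len` appears to be a bit count, and it takes the counter block and round keys through copies of x4/x5 - presumably the ivec/key/Xi order differs from this prototype; confirm against the C declaration.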
1088$code.=<<___;
1089.global aes_gcm_dec_128_kernel
1090.type aes_gcm_dec_128_kernel,%function
1091.align 4
1092aes_gcm_dec_128_kernel:
1093 AARCH64_VALID_CALL_TARGET
1094 cbz x1, .L128_dec_ret
1095 stp x19, x20, [sp, #-112]!
1096 mov x16, x4
1097 mov x8, x5
1098 stp x21, x22, [sp, #16]
1099 stp x23, x24, [sp, #32]
1100 stp d8, d9, [sp, #48]
1101 stp d10, d11, [sp, #64]
1102 stp d12, d13, [sp, #80]
1103 stp d14, d15, [sp, #96]
1104
1105 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
1106 mov $len, $main_end_input_ptr
1107 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
1108
1109 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
1110 ldr $rk0q, [$cc, #0] @ load rk0
1111
1112 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1113 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
1114
1115 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
1116 ext $h2b, $h2b, $h2b, #8
1117
1118 lsr $rctr32x, $ctr96_t32x, #32
1119 fmov $ctr2d, $ctr96_b64x @ CTR block 2
1120
1121 ldr $rk1q, [$cc, #16] @ load rk1
1122 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
1123 rev $rctr32w, $rctr32w @ rev_ctr32
1124
1125 fmov $ctr1d, $ctr96_b64x @ CTR block 1
1126 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
1127
1128 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
1129 rev $ctr32w, $rctr32w @ CTR block 1
1130
1131 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
1132 ldr $rk2q, [$cc, #32] @ load rk2
1133 add $rctr32w, $rctr32w, #1 @ CTR block 1
1134
1135 fmov $ctr1.d[1], $ctr32x @ CTR block 1
1136 rev $ctr32w, $rctr32w @ CTR block 2
1137 add $rctr32w, $rctr32w, #1 @ CTR block 2
1138
1139 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
1140 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
1141
1142 fmov $ctr2.d[1], $ctr32x @ CTR block 2
1143 rev $ctr32w, $rctr32w @ CTR block 3
1144
1145 fmov $ctr3d, $ctr96_b64x @ CTR block 3
1146 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
1147 add $rctr32w, $rctr32w, #1 @ CTR block 3
1148
1149 fmov $ctr3.d[1], $ctr32x @ CTR block 3
1150 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
1151
1152 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
1153 ldr $rk3q, [$cc, #48] @ load rk3
1154
1155 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
1156 ldr $rk6q, [$cc, #96] @ load rk6
1157
1158 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
1159 ldr $rk7q, [$cc, #112] @ load rk7
1160
1161 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
1162 ldr $rk4q, [$cc, #64] @ load rk4
1163
1164 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
1165
1166 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
1167
1168 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
1169 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
1170
1171 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
1172 ld1 { $acc_lb}, [$current_tag]
1173 ext $acc_lb, $acc_lb, $acc_lb, #8
1174 rev64 $acc_lb, $acc_lb
1175
1176 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
1177 ldr $rk5q, [$cc, #80] @ load rk5
1178
1179 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
1180
1181 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
1182
1183 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
1184 ldr $rk9q, [$cc, #144] @ load rk9
1185
1186 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
1187
1188 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
1189
1190 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
1191 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1192 ext $h3b, $h3b, $h3b, #8
1193
1194 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
1195 ldr $rk8q, [$cc, #128] @ load rk8
1196
1197 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
1198
1199 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
1200
1201 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
1202
1203 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
1204
1205 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
1206 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1207 ext $h1b, $h1b, $h1b, #8
1208
1209 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
1210
1211 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
1212
1213 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
1214
1215 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
1216
1217 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
1218 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
1219
1220 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1221 ext $h4b, $h4b, $h4b, #8
1222 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
1223 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1224
1225 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
1226
1227 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
1228
1229 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
1230 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
1231
1232 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
1233
1234 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
1235 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
1236
1237 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
1238
1239 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
1240
1241 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
1242 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
1243
1244 aese $ctr2b, $rk9 @ AES block 2 - round 9
1245
1246 aese $ctr3b, $rk9 @ AES block 3 - round 9
1247
1248 aese $ctr0b, $rk9 @ AES block 0 - round 9
1249 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
1250
1251 aese $ctr1b, $rk9 @ AES block 1 - round 9
1252 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
1253 b.ge .L128_dec_tail @ handle tail
1254
1255 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
1256
1257 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
1258
1259 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
1260 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
1261
1262 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
1263 rev64 $res0b, $res0b @ GHASH block 0
1264 rev $ctr32w, $rctr32w @ CTR block 4
1265
1266 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
1267 add $rctr32w, $rctr32w, #1 @ CTR block 4
1268 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
1269
1270 rev64 $res1b, $res1b @ GHASH block 1
1271 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
1272 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
1273
1274 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
1275
1276 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
1277 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
1278
1279 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
1280
1281 fmov $ctr0d, $ctr96_b64x @ CTR block 4
1282
1283 fmov $ctr0.d[1], $ctr32x @ CTR block 4
1284 rev $ctr32w, $rctr32w @ CTR block 5
1285 eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
1286
1287 fmov $ctr1d, $ctr96_b64x @ CTR block 5
1288 add $rctr32w, $rctr32w, #1 @ CTR block 5
1289 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
1290
1291 fmov $ctr1.d[1], $ctr32x @ CTR block 5
1292 rev $ctr32w, $rctr32w @ CTR block 6
1293 add $rctr32w, $rctr32w, #1 @ CTR block 6
1294
1295 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
1296
1297 eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
1298 eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
1299 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
1300
1301 eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
1302 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
1303
1304 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
1305 b.ge .L128_dec_prepretail @ do prepretail
1306
1307 .L128_dec_main_loop: @ main loop start
1308 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
1309 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1310 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
1311
1312 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
1313 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
1314
1315 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
1316 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
1317
1318 rev64 $res2b, $res2b @ GHASH block 4k+2
1319 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
1320 rev $ctr32w, $rctr32w @ CTR block 4k+7
1321
1322 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
1323 eor $res0b, $res0b, $acc_lb @ PRE 1
1324 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
1325
1326 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
1327 rev64 $res3b, $res3b @ GHASH block 4k+3
1328
1329 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
1330 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
1331 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
1332
1333 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
1334 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
1335 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
1336
1337 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
1338 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
1339
1340 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
1341 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
1342
1343 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
1344 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
1345
1346 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
1347
1348 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
1349 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
1350
1351 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
1352 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
1353
1354 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
1355
1356 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
1357 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
1358
1359 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
1360 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
1361
1362 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
1363 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
1364 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
1365
1366 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
1367 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
1368
1369 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
1370
1371 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
1372 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
1373
1374 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
1375
1376 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
1377 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
1378
1379 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
1380
1381 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
1382 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
1383
1384 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
1385
1386 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
1387 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
1388
1389 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
1390 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
1391
1392 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
1393 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
1394
1395 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
1396 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
1397
1398 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
1399 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
1400
1401 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
1402 movi $mod_constant.8b, #0xc2
1403
1404 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
1405 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
1406
1407 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
1408
1409 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
1410 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
1411
1412 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
1413 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
1414
1415 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
1416 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
1417 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
1418
1419 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
1420 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
1421
1422 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
1423 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1424
1425 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
1426 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
1427
1428 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
1429 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
1430
1431 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
1432 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1433
1434 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
1435 rev $ctr32w, $rctr32w @ CTR block 4k+8
1436
1437 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1438 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
1439 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1440
1441 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
1442 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
1443
1444 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
1445 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1446
1447 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
1448
1449 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
1450 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
1451
1452 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
1453 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
1454
1455 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
1456 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1457 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
1458
1459 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
1460 ldr $res3q, [$input_ptr, #48] @ AES block 4k+3 - load ciphertext
1461
1462 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
1463 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
1464
1465 rev64 $res1b, $res1b @ GHASH block 4k+5
1466 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1467 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
1468
1469 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
1470 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
1471
1472 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
1473 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
1474
1475 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1476 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
1477 rev $ctr32w, $rctr32w @ CTR block 4k+9
1478
1479 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
1480 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
1481 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1482
1483 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
1484 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
1485
1486 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1487 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
1488 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
1489
1490 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
1491 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
1492 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
1493
1494 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
1495 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
1496 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
1497
1498 rev64 $res0b, $res0b @ GHASH block 4k+4
1499 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1500 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
1501
1502 rev $ctr32w, $rctr32w @ CTR block 4k+10
1503 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
1504
1505 eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
1506 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
1507
1508 eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
1509 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
1510
1511 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
1512 b.lt .L128_dec_main_loop @ LOOP CONTROL - branch back to the local label at the loop head while input_ptr < main_end_input_ptr (flags set by the cmp above)
1513
1514 .L128_dec_prepretail: @ PREPRETAIL
1515 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1516 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
1517 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
1518
1519 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
1520 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
1521
1522 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
1523 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
1524
1525 eor $res0b, $res0b, $acc_lb @ PRE 1
1526 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
1527 rev64 $res2b, $res2b @ GHASH block 4k+2
1528
1529 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
1530 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
1531
1532 rev $ctr32w, $rctr32w @ CTR block 4k+7
1533 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
1534 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
1535
1536 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
1537 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
1538 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
1539
1540 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
1541 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
1542
1543 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
1544 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
1545
1546 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
1547 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
1548 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
1549
1550 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
1551 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
1552
1553 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
1554 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
1555
1556 rev64 $res3b, $res3b @ GHASH block 4k+3
1557
1558 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
1559 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
1560
1561 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
1562
1563 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
1564 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
1565
1566 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
1567
1568 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
1569 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
1570
1571 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
1572
1573 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
1574 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
1575
1576 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
1577
1578 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
1579
1580 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
1581 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
1582
1583 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
1584 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
1585
1586 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
1587
1588 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
1589 movi $mod_constant.8b, #0xc2
1590
1591 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
1592 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
1593
1594 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
1595
1596 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
1597 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
1598
1599 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
1600 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
1601
1602 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
1603 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
1604 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
1605
1606 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
1607
1608 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
1609 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1610
1611 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
1612
1613 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
1614 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
1615
1616 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
1617
1618 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
1619 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1620
1621 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
1622
1623 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
1624
1625 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
1626
1627 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
1628 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1629
1630 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1631
1632 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
1633 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1634
1635 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
1636
1637 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
1638 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1639
1640 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
1641
1642 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
1643
1644 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
1645
1646 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
1647 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1648
1649 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
1650
1651 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
1652
1653 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
1654
1655 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1656 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
1657
1658 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
1659 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1660
1661 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
1662
1663 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
1664 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1665
1666 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
1667
1668 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
1669 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
1670
1671 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
1672 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
1673
1674 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
1675 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
1676 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
1677
1678 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
1679 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1680 .L128_dec_tail: @ TAIL
1681
1682 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
1683 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
1684
1685 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
1686
1687 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
1688
1689 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
1690
1691 cmp $main_end_input_ptr, #48
1692
1693 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
1694
1695 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
1696 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
1697 b.gt .L128_dec_blocks_more_than_3
1698
1699 mov $ctr3b, $ctr2b
1700 sub $rctr32w, $rctr32w, #1
1701 movi $acc_l.8b, #0
1702
1703 movi $acc_h.8b, #0
1704 mov $ctr2b, $ctr1b
1705
1706 movi $acc_m.8b, #0
1707 cmp $main_end_input_ptr, #32
1708 b.gt .L128_dec_blocks_more_than_2
1709
1710 cmp $main_end_input_ptr, #16
1711
1712 mov $ctr3b, $ctr1b
1713 sub $rctr32w, $rctr32w, #1
1714 b.gt .L128_dec_blocks_more_than_1
1715
1716 sub $rctr32w, $rctr32w, #1
1717 b .L128_dec_blocks_less_than_1
1718 .L128_dec_blocks_more_than_3: @ blocks left > 3
1719 rev64 $res0b, $res1b @ GHASH final-3 block
1720 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
1721
1722 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1723
1724 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
1725 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
1726 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
1727
1728 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
1729 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
1730
1731 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
1732 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
1733
1734 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
1735
1736 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
1737
1738 movi $t0.8b, #0 @ suppress further partial tag feed in
1739 eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
1740
1741 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
1742 eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
1743 .L128_dec_blocks_more_than_2: @ blocks left > 2
1744
1745 rev64 $res0b, $res1b @ GHASH final-2 block
1746 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
1747
1748 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1749
1750 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
1751 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
1752
1753 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
1754
1755 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
1756
1757 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
1758 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
1759
1760 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
1761 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
1762
1763 movi $t0.8b, #0 @ suppress further partial tag feed in
1764
1765 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
1766
1767 eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
1768 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
1769
1770 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
1771
1772 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
1773 eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
1774 .L128_dec_blocks_more_than_1: @ blocks left > 1
1775
1776 rev64 $res0b, $res1b @ GHASH final-1 block
1777
1778 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
1779 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1780
1781 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
1782
1783 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
1784
1785 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
1786
1787 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
1788 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
1789
1790 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
1791 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
1792
1793 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
1794
1795 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
1796
1797 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
1798 movi $t0.8b, #0 @ suppress further partial tag feed in
1799
1800 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
1801
1802 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
1803 eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
1804
1805 eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
1806 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
1807 .L128_dec_blocks_less_than_1: @ blocks left <= 1
1808
1809 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
1810 and $bit_length, $bit_length, #127 @ bit_length %= 128
1811
1812 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
1813 sub $bit_length, $bit_length, #128 @ bit_length -= 128
1814
1815 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
1816
1817 and $bit_length, $bit_length, #127 @ bit_length %= 128
1818
1819 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
1820 cmp $bit_length, #64
1821
1822 csel $ctr96_b64x, $rk10_h, xzr, lt
1823 csel $ctr32x, $rk10_l, $rk10_h, lt
1824
1825 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
1826
1827 mov $ctr0.d[1], $ctr96_b64x
1828
1829 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1830
1831 rev64 $res0b, $res1b @ GHASH final block
1832
1833 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1834
1835 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
1836
1837 and $output_h0, $output_h0, $ctr96_b64x
1838
1839 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1840 mov $t0d, $res0.d[1] @ GHASH final block - mid
1841
1842 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1843 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1844
1845 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1846
1847 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1848 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
1849 and $output_l0, $output_l0, $ctr32x
1850
1851 rev $ctr32w, $rctr32w
1852
1853 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1854 movi $mod_constant.8b, #0xc2
1855
1856 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1857
1858 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
1859 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1860
1861 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1862
1863 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1864
1865 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1866
1867 orr $output_l0, $output_l0, $end_input_ptr
1868 str $ctr32w, [$counter, #12] @ store the updated counter
1869
1870 orr $output_h0, $output_h0, $main_end_input_ptr
1871 stp $output_l0, $output_h0, [$output_ptr]
1872 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1873
1874 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1875
1876 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1877
1878 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1879 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1880
1881 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1882
1883 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1884 ext $acc_lb, $acc_lb, $acc_lb, #8
1885 rev64 $acc_lb, $acc_lb
1886 mov x0, $len
1887 st1 { $acc_l.16b }, [$current_tag]
1888
1889 ldp x21, x22, [sp, #16]
1890 ldp x23, x24, [sp, #32]
1891 ldp d8, d9, [sp, #48]
1892 ldp d10, d11, [sp, #64]
1893 ldp d12, d13, [sp, #80]
1894 ldp d14, d15, [sp, #96]
1895 ldp x19, x20, [sp], #112
1896 ret
1897
1898 .L128_dec_ret:
1899 mov w0, #0x0
1900 ret
1901.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
1902___
1903}
1904
1905{
# Register aliases for the 192-bit kernels. Each variable holds the textual
# name of an AArch64 register and is interpolated into the assembly heredoc
# that follows. Several names intentionally map to the same register; writing
# one of them clobbers every other name that shares it.
#
# General-purpose registers x4-x7: input end pointers and the first block's
# two 64-bit halves.
1906my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# Blocks 1-3 low/high halves live in x19-x24. The input_* and output_* names
# below are the same registers (x19-x24), as are input_l0/h0 and
# output_l0/h0 (x6/x7).
1907my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
1908my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
1909my ($output_l0,$output_h0)=map("x$_",(6..7));
1910
# Counter handling and the final round key, x9-x15. $ctr32w (w9) is the
# 32-bit view of $ctr32x (x9); likewise $ctr96_t32w/$rctr32w are the 32-bit
# views of $ctr96_t32x (x11) and $rctr32x (x12). $rk12_l/$rk12_h (x13/x14)
# hold round key 12 in general-purpose registers rather than in a vector
# register; $len is x15.
1911my $ctr32w="w9";
1912my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
1913my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
1914
# Vector registers v0-v7: v0-v3 are the four counter/keystream blocks
# (ctr0-ctr3), v4-v7 are the four result blocks (res0-res3). The b/d/q
# suffixed names are different views (.16b, dN, qN) of the same registers.
1915my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
1916my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
1917my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
1918my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
1919
# GHASH accumulators: high, mid and low partial products in v9, v10, v11.
1920my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
1921my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
1922my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
1923
# Hash key values h1-h4 in v12-v15, plus the combined h12k/h34k values in
# v16/v17 that feed the "mid" multiplications.
1924my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
1925my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
1926my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
1927
# Temporaries. These are packed onto very few registers:
#   v8  : $t0, $t5, $mod_constant
#   v30 : $t1, $t4, $t9
#   v31 : $t2, $t6, $mod_t
#   v4  : $t3 (also $res0 / $ctr_t0)
#   v5  : $t7 (also $res1 / $ctr_t1)
#   v6  : $t8 (also $res2 / $ctr_t2)
1928my $t0="v8";
1929my $t0d="d8";
1930my $t3="v4";
1931my $t3d="d4";
1932
1933my ($t1,$t2)=map("v$_",(30..31));
1934my ($t1d,$t2d)=map("d$_",(30..31));
1935
1936my $t4="v30";
1937my $t4d="d30";
1938my $t5="v8";
1939my $t5d="d8";
1940my $t6="v31";
1941my $t6d="d31";
1942
1943my $t7="v5";
1944my $t7d="d5";
1945my $t8="v6";
1946my $t8d="d6";
1947my $t9="v30";
1948my $t9d="d30";
1949
# Staging registers for input blocks moved from general-purpose registers
# into vectors; these are the same registers as res0-res3 (v4-v7).
1950my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
1951my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
1952my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
1953
# Reduction ("MODULO") constant and scratch; the constant is rebuilt with
# movi #0xc2 followed by shl #56 wherever it is needed, since v8 is shared.
1954my $mod_constantd="d8";
1955my $mod_constant="v8";
1956my $mod_t="v31";
1957
# AES round keys rk0-rk11 in v18-v29. The extra $rk2q1/$rk3q1/$rk4v/$rk4d
# names are alternative views of v20/v21/v22 (the rk2/rk3/rk4 registers).
1958my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
1959my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
1960my $rk2q1="v20.1q";
1961my $rk3q1="v21.1q";
1962my $rk4v="v22";
1963my $rk4d="d22";
1964
1965#########################################################################################
1966# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
1967# size_t len, /* length in bits: the code shifts it right by 3 to get bytes */
1968# unsigned char *out,
1969# u64 *Xi,
1970# unsigned char ivec[16],
1971# const void *key);
1972#
1973$code.=<<___;
1974.global aes_gcm_enc_192_kernel
1975.type aes_gcm_enc_192_kernel,%function
1976.align 4
1977aes_gcm_enc_192_kernel:
1978 AARCH64_VALID_CALL_TARGET
1979 cbz x1, .L192_enc_ret
1980 stp x19, x20, [sp, #-112]!
1981 mov x16, x4
1982 mov x8, x5
1983 stp x21, x22, [sp, #16]
1984 stp x23, x24, [sp, #32]
1985 stp d8, d9, [sp, #48]
1986 stp d10, d11, [sp, #64]
1987 stp d12, d13, [sp, #80]
1988 stp d14, d15, [sp, #96]
1989
1990 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
1991
1992 ldr $rk5q, [$cc, #80] @ load rk5
1993
1994 ldr $rk4q, [$cc, #64] @ load rk4
1995
1996 ldr $rk8q, [$cc, #128] @ load rk8
1997
1998 lsr $rctr32x, $ctr96_t32x, #32
1999 ldr $rk6q, [$cc, #96] @ load rk6
2000 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
2001
2002 ldr $rk7q, [$cc, #112] @ load rk7
2003 rev $rctr32w, $rctr32w @ rev_ctr32
2004
2005 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
2006 fmov $ctr3d, $ctr96_b64x @ CTR block 3
2007
2008 rev $ctr32w, $rctr32w @ CTR block 1
2009 add $rctr32w, $rctr32w, #1 @ CTR block 1
2010 fmov $ctr1d, $ctr96_b64x @ CTR block 1
2011
2012 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
2013 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
2014
2015 fmov $ctr1.d[1], $ctr32x @ CTR block 1
2016 rev $ctr32w, $rctr32w @ CTR block 2
2017 add $rctr32w, $rctr32w, #1 @ CTR block 2
2018
2019 fmov $ctr2d, $ctr96_b64x @ CTR block 2
2020 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
2021
2022 fmov $ctr2.d[1], $ctr32x @ CTR block 2
2023 rev $ctr32w, $rctr32w @ CTR block 3
2024
2025 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
2026 ldr $rk0q, [$cc, #0] @ load rk0
2027
2028 fmov $ctr3.d[1], $ctr32x @ CTR block 3
2029
2030 ldr $rk3q, [$cc, #48] @ load rk3
2031
2032 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
2033
2034 ldr $rk1q, [$cc, #16] @ load rk1
2035
2036 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2037 ld1 { $acc_lb}, [$current_tag]
2038 ext $acc_lb, $acc_lb, $acc_lb, #8
2039 rev64 $acc_lb, $acc_lb
2040
2041 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2042 ldr $rk11q, [$cc, #176] @ load rk11
2043
2044 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2045 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2046 ext $h4b, $h4b, $h4b, #8
2047
2048 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2049 ldr $rk2q, [$cc, #32] @ load rk2
2050
2051 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2052 ldr $rk10q, [$cc, #160] @ load rk10
2053
2054 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2055 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2056 ext $h1b, $h1b, $h1b, #8
2057
2058 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2059 ldr $rk9q, [$cc, #144] @ load rk9
2060
2061 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2062 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2063 ext $h3b, $h3b, $h3b, #8
2064
2065 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2066
2067 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2068
2069 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2070
2071 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2072 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
2073
2074 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2075
2076 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2077 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
2078
2079 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2080
2081 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2082
2083 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2084
2085 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2086
2087 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2088
2089 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2090
2091 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2092
2093 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2094
2095 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2096
2097 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2098
2099 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2100
2101 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2102 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2103 ext $h2b, $h2b, $h2b, #8
2104
2105 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
2106
2107 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2108
2109 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2110
2111 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
2112 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
2113
2114 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2115
2116 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
2117
2118 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2119 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
2120
2121 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
2122
2123 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
2124
2125 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
2126
2127 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
2128
2129 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
2130
2131 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
2132
2133 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
2134
2135 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
2136
2137 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
2138
2139 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
2140 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2141 mov $len, $main_end_input_ptr
2142
2143 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
2144 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
2145
2146 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
2147 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2148
2149 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
2150
2151 aese $ctr2b, $rk11 @ AES block 2 - round 11
2152 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2153 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2154
2155 aese $ctr1b, $rk11 @ AES block 1 - round 11
2156 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
2157
2158 aese $ctr0b, $rk11 @ AES block 0 - round 11
2159 add $rctr32w, $rctr32w, #1 @ CTR block 3
2160
2161 aese $ctr3b, $rk11 @ AES block 3 - round 11
2162 b.ge .L192_enc_tail @ handle tail
2163
2164 rev $ctr32w, $rctr32w @ CTR block 4
2165 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
2166
2167 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
2168 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
2169
2170 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
2171
2172 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
2173 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2174 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
2175
2176 eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
2177
2178 eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
2179 eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
2180 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
2181
2182 eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
2183 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
2184
2185 eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
2186 eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
2187
2188 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
2189 eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
2190
2191 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
2192
2193 eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
2194 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
2195
2196 add $rctr32w, $rctr32w, #1 @ CTR block 4
2197 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
2198 fmov $ctr0d, $ctr96_b64x @ CTR block 4
2199
2200 fmov $ctr0.d[1], $ctr32x @ CTR block 4
2201 rev $ctr32w, $rctr32w @ CTR block 5
2202
2203 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
2204 add $rctr32w, $rctr32w, #1 @ CTR block 5
2205
2206 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
2207 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
2208
2209 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
2210
2211 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
2212 fmov $ctr1d, $ctr96_b64x @ CTR block 5
2213 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
2214
2215 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
2216
2217 fmov $ctr1.d[1], $ctr32x @ CTR block 5
2218 rev $ctr32w, $rctr32w @ CTR block 6
2219
2220 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
2221
2222 add $rctr32w, $rctr32w, #1 @ CTR block 6
2223 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
2224 fmov $ctr2d, $ctr96_b64x @ CTR block 6
2225
2226 fmov $ctr2.d[1], $ctr32x @ CTR block 6
2227 rev $ctr32w, $rctr32w @ CTR block 7
2228
2229 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
2230 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
2231
2232 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
2233 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
2234 b.ge .L192_enc_prepretail @ do prepretail
2235
2236 .L192_enc_main_loop: @ main loop start
2237 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2238 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2239
2240 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2241 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
2242
2243 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2244 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2245 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2246
2247 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2248 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2249
2250 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2251 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2252 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
2253
2254 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2255 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
2256
2257 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2258 eor $res0b, $res0b, $acc_lb @ PRE 1
2259
2260 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2261
2262 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2263 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2264
2265 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2266 eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
2267
2268 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2269 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2270
2271 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2272
2273 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2274 eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
2275
2276 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2277 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2278
2279 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2280 eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
2281
2282 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2283 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2284
2285 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2286 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2287
2288 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2289
2290 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2291
2292 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2293 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2294
2295 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2296 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2297
2298 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2299
2300 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2301 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2302
2303 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2304
2305 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2306 eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
2307 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2308
2309 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2310 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2311
2312 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2313 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2314
2315 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2316 eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
2317
2318 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2319 eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
2320 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2321
2322 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2323 rev $ctr32w, $rctr32w @ CTR block 4k+8
2324
2325 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2326 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
2327
2328 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2329 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2330
2331 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2332 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
2333
2334 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2335 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2336
2337 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2338 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2339
2340 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2341 movi $mod_constant.8b, #0xc2
2342
2343 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2344 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2345 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2346
2347 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2348 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2349
2350 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2351 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2352
2353 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2354 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2355
2356 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2357 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
2358
2359 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2360 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2361
2362 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2363 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
2364
2365 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2366 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2367
2368 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2369 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
2370 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2371
2372 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2373 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2374
2375 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2376 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
2377
2378 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2379 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2380 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
2381
2382 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2383 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
2384
2385 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2386 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2387 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
2388
2389 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2390
2391 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2392 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2393
2394 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2395
2396 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2397
2398 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2399
2400 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2401 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2402
2403 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2404
2405 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2406
2407 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2408
2409 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2410 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2411
2412 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2413
2414 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2415 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
2416
2417 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2418 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
2419 rev $ctr32w, $rctr32w @ CTR block 4k+9
2420
2421 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2422 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
2423 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
2424
2425 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2426 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
2427
2428 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
2429 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
2430 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
2431
2432 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2433 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
2434 rev $ctr32w, $rctr32w @ CTR block 4k+10
2435
2436 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
2437 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2438 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
2439
2440 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
2441 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
2442
2443 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2444 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
2445 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
2446
2447 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
2448 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
2449 rev $ctr32w, $rctr32w @ CTR block 4k+11
2450
2451 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2452 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
2453
2454 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
2455 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
2456 b.lt .L192_enc_main_loop
2457
2458 .L192_enc_prepretail: @ PREPRETAIL
2459 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2460 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2461
2462 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2463 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2464 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2465
2466 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2467 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2468
2469 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2470
2471 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2472 eor $res0b, $res0b, $acc_lb @ PRE 1
2473 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2474
2475 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2476 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2477
2478 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2479
2480 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2481 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2482
2483 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2484 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2485
2486 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2487
2488 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2489 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2490
2491 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2492 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2493
2494 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2495 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2496
2497 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2498
2499 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2500 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2501
2502 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2503
2504 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2505 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2506
2507 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2508
2509 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2510 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2511
2512 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2513 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2514
2515 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2516
2517 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2518 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2519
2520 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2521
2522 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2523
2524 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2525
2526 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2527 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2528
2529 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2530
2531 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2532 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2533
2534 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2535
2536 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2537 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2538
2539 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2540
2541 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2542 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2543
2544 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2545
2546 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2547 movi $mod_constant.8b, #0xc2
2548
2549 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2550
2551 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2552
2553 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2554 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2555
2556 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2557
2558 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2559
2560 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2561 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2562
2563 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2564
2565 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2566 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
2567
2568 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2569
2570 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2571 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2572
2573 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2574
2575 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2576 eor $acc_mb, $acc_mb, $acc_lb
2577
2578 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2579
2580 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
2581
2582 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2583 ext $acc_hb, $acc_hb, $acc_hb, #8
2584
2585 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2586
2587 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2588 eor $acc_mb, $acc_mb, $t1.16b
2589
2590 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2591
2592 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2593
2594 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2595
2596 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2597 eor $acc_mb, $acc_mb, $acc_hb
2598
2599 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2600
2601 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2602
2603 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2604
2605 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
2606
2607 ext $acc_mb, $acc_mb, $acc_mb, #8
2608
2609 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2610
2611 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2612
2613 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2614
2615 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2616 eor $acc_lb, $acc_lb, $t1.16b
2617
2618 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2619
2620 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2621
2622 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2623
2624 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2625 eor $acc_lb, $acc_lb, $acc_mb
2626 .L192_enc_tail: @ TAIL
2627
2628 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
2629 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
2630
2631 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2632 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2633
2634 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2635
2636 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2637 cmp $main_end_input_ptr, #48
2638
2639 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2640
2641 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
2642 b.gt .L192_enc_blocks_more_than_3
2643
2644 sub $rctr32w, $rctr32w, #1
2645 movi $acc_m.8b, #0
2646
2647 mov $ctr3b, $ctr2b
2648 movi $acc_h.8b, #0
2649 cmp $main_end_input_ptr, #32
2650
2651 mov $ctr2b, $ctr1b
2652 movi $acc_l.8b, #0
2653 b.gt .L192_enc_blocks_more_than_2
2654
2655 sub $rctr32w, $rctr32w, #1
2656
2657 mov $ctr3b, $ctr1b
2658 cmp $main_end_input_ptr, #16
2659 b.gt .L192_enc_blocks_more_than_1
2660
2661 sub $rctr32w, $rctr32w, #1
2662 b .L192_enc_blocks_less_than_1
2663 .L192_enc_blocks_more_than_3: @ blocks left > 3
2664 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
2665
2666 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
2667
2668 rev64 $res0b, $res1b @ GHASH final-3 block
2669
2670 eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
2671 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2672
2673 eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
2674 fmov $res1d, $input_l0 @ AES final-2 block - mov low
2675
2676 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
2677
2678 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
2679
2680 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
2681
2682 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
2683
2684 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
2685
2686 movi $t0.8b, #0 @ suppress further partial tag feed in
2687
2688 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
2689
2690 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
2691 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
2692 .L192_enc_blocks_more_than_2: @ blocks left > 2
2693
2694 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
2695
2696 rev64 $res0b, $res1b @ GHASH final-2 block
2697 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
2698
2699 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2700
2701 eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
2702
2703 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
2704 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
2705
2706 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
2707 eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
2708
2709 fmov $res1d, $input_l0 @ AES final-1 block - mov low
2710
2711 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
2712 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
2713 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
2714
2715 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
2716
2717 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
2718
2719 movi $t0.8b, #0 @ suppress further partial tag feed in
2720
2721 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
2722
2723 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
2724 .L192_enc_blocks_more_than_1: @ blocks left > 1
2725
2726 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
2727
2728 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
2729
2730 rev64 $res0b, $res1b @ GHASH final-1 block
2731
2732 eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
2733 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2734 movi $t0.8b, #0 @ suppress further partial tag feed in
2735
2736 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
2737
2738 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
2739 eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
2740 fmov $res1d, $input_l0 @ AES final block - mov low
2741
2742 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
2743 fmov $res1.d[1], $input_h0 @ AES final block - mov high
2744
2745 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
2746
2747 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
2748
2749 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
2750
2751 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
2752
2753 eor $res1b, $res1b, $ctr3b @ AES final block - result
2754
2755 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
2756
2757 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
2758 .L192_enc_blocks_less_than_1: @ blocks left <= 1
2759
2760 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
2761 rev $ctr32w, $rctr32w
2762 and $bit_length, $bit_length, #127 @ bit_length %= 128
2763
2764 sub $bit_length, $bit_length, #128 @ bit_length -= 128
2765 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
2766
2767 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
2768 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
2769
2770 and $bit_length, $bit_length, #127 @ bit_length %= 128
2771
2772 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
2773 cmp $bit_length, #64
2774
2775 csel $input_l0, $rk12_l, $rk12_h, lt
2776 csel $input_h0, $rk12_h, xzr, lt
2777
2778 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
2779
2780 fmov $ctr0.d[1], $input_h0
2781
2782 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
2783
2784 rev64 $res0b, $res1b @ GHASH final block
2785
2786 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2787
2788 mov $t0d, $res0.d[1] @ GHASH final block - mid
2789
2790 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
2791
2792 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
2793
2794 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
2795
2796 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
2797
2798 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
2799
2800 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
2801
2802 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
2803 movi $mod_constant.8b, #0xc2
2804
2805 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2806
2807 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2808
2809 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
2810
2811 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2812
2813 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2814
2815 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2816
2817 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2818
2819 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2820
2821 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2822
2823 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2824
2825 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
2826 str $ctr32w, [$counter, #12] @ store the updated counter
2827
2828 st1 { $res1b}, [$output_ptr] @ store all 16B
2829
2830 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2831 ext $acc_lb, $acc_lb, $acc_lb, #8
2832 rev64 $acc_lb, $acc_lb
2833 mov x0, $len
2834 st1 { $acc_l.16b }, [$current_tag]
2835
2836 ldp x21, x22, [sp, #16]
2837 ldp x23, x24, [sp, #32]
2838 ldp d8, d9, [sp, #48]
2839 ldp d10, d11, [sp, #64]
2840 ldp d12, d13, [sp, #80]
2841 ldp d14, d15, [sp, #96]
2842 ldp x19, x20, [sp], #112
2843 ret
2844
2845.L192_enc_ret:
2846 mov w0, #0x0
2847 ret
2848.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
2849___
2850
2851#########################################################################################
2852# size_t aes_gcm_dec_192_kernel(const unsigned char *in,
2853# size_t len,
2854# unsigned char *out,
2855# const void *key,
2856# unsigned char ivec[16],
2857# u64 *Xi);
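2858# NOTE(review): the body derives byte_len as $bit_length >> 3, so the length argument appears to be a bit count, not a byte count; also confirm that the key / ivec / Xi order listed above matches the register assignments near the top of this file.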
2859$code.=<<___;
2860.global aes_gcm_dec_192_kernel
2861.type aes_gcm_dec_192_kernel,%function
2862.align 4
2863aes_gcm_dec_192_kernel:
2864 AARCH64_VALID_CALL_TARGET
2865 cbz x1, .L192_dec_ret
2866 stp x19, x20, [sp, #-112]!
2867 mov x16, x4
2868 mov x8, x5
2869 stp x21, x22, [sp, #16]
2870 stp x23, x24, [sp, #32]
2871 stp d8, d9, [sp, #48]
2872 stp d10, d11, [sp, #64]
2873 stp d12, d13, [sp, #80]
2874 stp d14, d15, [sp, #96]
2875
2876 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2877 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
2878
2879 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
2880
2881 ldr $rk0q, [$cc, #0] @ load rk0
2882
2883 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2884 mov $len, $main_end_input_ptr
2885 ldr $rk2q, [$cc, #32] @ load rk2
2886
2887 lsr $rctr32x, $ctr96_t32x, #32
2888 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
2889 fmov $ctr3d, $ctr96_b64x @ CTR block 3
2890
2891 rev $rctr32w, $rctr32w @ rev_ctr32
2892 fmov $ctr1d, $ctr96_b64x @ CTR block 1
2893
2894 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
2895 ldr $rk1q, [$cc, #16] @ load rk1
2896
2897 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2898 rev $ctr32w, $rctr32w @ CTR block 1
2899
2900 add $rctr32w, $rctr32w, #1 @ CTR block 1
2901 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
2902 ldr $rk3q, [$cc, #48] @ load rk3
2903
2904 fmov $ctr1.d[1], $ctr32x @ CTR block 1
2905 rev $ctr32w, $rctr32w @ CTR block 2
2906 add $rctr32w, $rctr32w, #1 @ CTR block 2
2907
2908 fmov $ctr2d, $ctr96_b64x @ CTR block 2
2909 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
2910
2911 fmov $ctr2.d[1], $ctr32x @ CTR block 2
2912 rev $ctr32w, $rctr32w @ CTR block 3
2913
2914 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2915 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
2916
2917 fmov $ctr3.d[1], $ctr32x @ CTR block 3
2918
2919 ldr $rk8q, [$cc, #128] @ load rk8
2920
2921 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2922
2923 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2924 ldr $rk11q, [$cc, #176] @ load rk11
2925
2926 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2927 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2928 ext $h4b, $h4b, $h4b, #8
2929
2930 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2931 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2932 ext $h2b, $h2b, $h2b, #8
2933
2934 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2935 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2936 ext $h3b, $h3b, $h3b, #8
2937
2938 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2939 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
2940
2941 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2942 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2943 ext $h1b, $h1b, $h1b, #8
2944
2945 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2946 ldr $rk10q, [$cc, #160] @ load rk10
2947
2948 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2949 ldr $rk9q, [$cc, #144] @ load rk9
2950
2951 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2952 ldr $rk7q, [$cc, #112] @ load rk7
2953
2954 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2955 ldr $rk4q, [$cc, #64] @ load rk4
2956
2957 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2958 ld1 { $acc_lb}, [$current_tag]
2959 ext $acc_lb, $acc_lb, $acc_lb, #8
2960 rev64 $acc_lb, $acc_lb
2961
2962 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2963 add $rctr32w, $rctr32w, #1 @ CTR block 3
2964
2965 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2966 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
2967
2968 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2969 ldr $rk5q, [$cc, #80] @ load rk5
2970
2971 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2972 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
2973
2974 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2975
2976 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2977 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
2978
2979 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2980 ldr $rk6q, [$cc, #96] @ load rk6
2981
2982 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2983
2984 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2985
2986 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2987
2988 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2989
2990 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2991
2992 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2993
2994 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2995
2996 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2997
2998 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2999
3000 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
3001
3002 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3003
3004 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3005
3006 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3007
3008 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3009
3010 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3011
3012 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3013 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3014
3015 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3016 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3017
3018 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3019 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3020
3021 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3022 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
3023
3024 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3025 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
3026
3027 aese $ctr3b, $rk11 @ AES block 3 - round 11
3028
3029 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3030
3031 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3032
3033 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3034 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
3035
3036 aese $ctr2b, $rk11 @ AES block 2 - round 11
3037
3038 aese $ctr1b, $rk11 @ AES block 1 - round 11
3039 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
3040
3041 aese $ctr0b, $rk11 @ AES block 0 - round 11
3042 b.ge .L192_dec_tail @ handle tail
3043
3044 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
3045
3046 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
3047
3048 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
3049
3050 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
3051 rev $ctr32w, $rctr32w @ CTR block 4
3052 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
3053
3054 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
3055
3056 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
3057
3058 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
3059
3060 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
3061 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
3062 add $rctr32w, $rctr32w, #1 @ CTR block 4
3063
3064 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
3065 rev64 $res0b, $res0b @ GHASH block 0
3066 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
3067
3068 fmov $ctr0d, $ctr96_b64x @ CTR block 4
3069 rev64 $res1b, $res1b @ GHASH block 1
3070 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
3071
3072 eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
3073 fmov $ctr0.d[1], $ctr32x @ CTR block 4
3074 rev $ctr32w, $rctr32w @ CTR block 5
3075
3076 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
3077 fmov $ctr1d, $ctr96_b64x @ CTR block 5
3078 eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
3079
3080 add $rctr32w, $rctr32w, #1 @ CTR block 5
3081 fmov $ctr1.d[1], $ctr32x @ CTR block 5
3082 eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
3083
3084 rev $ctr32w, $rctr32w @ CTR block 6
3085 eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
3086
3087 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
3088 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
3089
3090 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
3091
3092 add $rctr32w, $rctr32w, #1 @ CTR block 6
3093 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
3094 b.ge .L192_dec_prepretail @ do prepretail
3095
3096 .L192_dec_main_loop: @ main loop start
3097 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3098 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3099
3100 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3101 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3102
3103 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3104 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3105 rev64 $res3b, $res3b @ GHASH block 4k+3
3106
3107 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3108 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3109
3110 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3111 eor $res0b, $res0b, $acc_lb @ PRE 1
3112
3113 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3114 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3115
3116 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3117 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3118
3119 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3120 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3121
3122 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3123 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3124 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3125
3126 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3127 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3128 rev $ctr32w, $rctr32w @ CTR block 4k+7
3129
3130 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3131 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3132
3133 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3134 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3135 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3136
3137 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3138
3139 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3140 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3141
3142 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3143 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3144
3145 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3146
3147 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3148 rev64 $res2b, $res2b @ GHASH block 4k+2
3149
3150 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3151
3152 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3153 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3154 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3155
3156 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3157
3158 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3159
3160 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3161 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3162
3163 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3164 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3165
3166 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3167
3168 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3169 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3170
3171 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3172
3173 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3174
3175 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3176 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3177
3178 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3179
3180 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3181
3182 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3183 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3184
3185 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3186
3187 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3188 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3189
3190 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3191
3192 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3193 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3194
3195 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3196
3197 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3198 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3199
3200 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3201
3202 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3203 movi $mod_constant.8b, #0xc2
3204
3205 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3206
3207 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3208 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3209
3210 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3211
3212 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3213 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3214
3215 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3216
3217 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3218 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3219
3220 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3221
3222 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3223 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3224
3225 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3226
3227 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3228 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3229
3230 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3231
3232 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3233 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
3234
3235 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3236 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3237
3238 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3239 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
3240 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3241
3242 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3243 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3244
3245 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
3246 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3247
3248 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3249 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3250
3251 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3252 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
3253
3254 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
3255 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
3256 rev $ctr32w, $rctr32w @ CTR block 4k+8
3257
3258 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3259 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3260
3261 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3262 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3263
3264 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
3265 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
3266
3267 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
3268 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3269 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
3270
3271 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3272 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
3273
3274 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3275
3276 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3277 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
3278
3279 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
3280 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3281 rev64 $res1b, $res1b @ GHASH block 4k+5
3282
3283 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
3284 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3285
3286 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3287 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
3288
3289 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
3290 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
3291 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3292
3293 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
3294 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
3295 rev $ctr32w, $rctr32w @ CTR block 4k+9
3296
3297 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3298 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
3299 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3300
3301 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
3302 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
3303 eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
3304
3305 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
3306 rev $ctr32w, $rctr32w @ CTR block 4k+10
3307 eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
3308
3309 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3310 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
3311 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3312
3313 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
3314 rev64 $res0b, $res0b @ GHASH block 4k+4
3315 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
3316
3317 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
3318 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
3319 b.lt .L192_dec_main_loop
3320
3321 .L192_dec_prepretail: @ PREPRETAIL
3322 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3323 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3324 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3325
3326 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3327 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3328
3329 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3330 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3331
3332 eor $res0b, $res0b, $acc_lb @ PRE 1
3333 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3334
3335 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3336 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3337
3338 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3339 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3340
3341 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3342 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3343 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3344
3345 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3346 rev64 $res2b, $res2b @ GHASH block 4k+2
3347
3348 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3349 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3350 rev $ctr32w, $rctr32w @ CTR block 4k+7
3351
3352 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3353 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3354 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3355
3356 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3357 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3358 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3359
3360 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3361 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3362
3363 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3364 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3365 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3366
3367 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3368 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3369 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3370
3371 rev64 $res3b, $res3b @ GHASH block 4k+3
3372 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3373
3374 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3375 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3376
3377 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3378 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3379
3380 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3381 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3382
3383 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3384
3385 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3386 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3387
3388 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3389
3390 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3391 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3392
3393 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3394
3395 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3396
3397 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3398 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3399
3400 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3401 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3402
3403 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3404
3405 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3406 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3407
3408 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3409
3410 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3411 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3412
3413 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3414
3415 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3416 movi $mod_constant.8b, #0xc2
3417
3418 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3419
3420 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3421
3422 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3423 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3424
3425 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3426 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3427
3428 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3429
3430 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3431 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3432
3433 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3434
3435 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3436 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3437
3438 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3439
3440 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3441 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3442
3443 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3444
3445 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3446 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3447
3448 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3449
3450 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3451 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3452
3453 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3454
3455 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3456
3457 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3458
3459 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3460
3461 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3462 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3463
3464 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3465
3466 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3467
3468 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3469
3470 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3471 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3472
3473 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3474
3475 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3476
3477 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3478
3479 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3480
3481 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3482
3483 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3484
3485 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3486
3487 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3488
3489 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3490 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3491
3492 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3493
3494 aese $ctr0b, $rk11
3495 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3496
3497 aese $ctr2b, $rk11
3498
3499 aese $ctr1b, $rk11
3500
3501 aese $ctr3b, $rk11
3502
3503 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3504 .L192_dec_tail: @ TAIL
3505
3506 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
3507 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
3508
3509 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
3510
3511 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3512
3513 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
3514
3515 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
3516
3517 cmp $main_end_input_ptr, #48
3518
3519 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3520
3521 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3522 b.gt .L192_dec_blocks_more_than_3
3523
3524 movi $acc_l.8b, #0
3525 movi $acc_h.8b, #0
3526
3527 mov $ctr3b, $ctr2b
3528 mov $ctr2b, $ctr1b
3529 sub $rctr32w, $rctr32w, #1
3530
3531 movi $acc_m.8b, #0
3532 cmp $main_end_input_ptr, #32
3533 b.gt .L192_dec_blocks_more_than_2
3534
3535 mov $ctr3b, $ctr1b
3536 cmp $main_end_input_ptr, #16
3537 sub $rctr32w, $rctr32w, #1
3538
3539 b.gt .L192_dec_blocks_more_than_1
3540
3541 sub $rctr32w, $rctr32w, #1
3542 b .L192_dec_blocks_less_than_1
3543 .L192_dec_blocks_more_than_3: @ blocks left > 3
3544 rev64 $res0b, $res1b @ GHASH final-3 block
3545 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
3546
3547 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
3548
3549 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3550
3551 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
3552
3553 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
3554 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
3555 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
3556
3557 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
3558
3559 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
3560 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
3561
3562 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
3563
3564 eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
3565 movi $t0.8b, #0 @ suppress further partial tag feed in
3566
3567 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
3568 eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
3569 .L192_dec_blocks_more_than_2: @ blocks left > 2
3570
3571 rev64 $res0b, $res1b @ GHASH final-2 block
3572 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
3573
3574 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3575
3576 movi $t0.8b, #0 @ suppress further partial tag feed in
3577
3578 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
3579
3580 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
3581
3582 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
3583
3584 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
3585
3586 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
3587 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
3588
3589 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
3590 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
3591
3592 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
3593
3594 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
3595
3596 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
3597 eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
3598
3599 eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
3600 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
3601 .L192_dec_blocks_more_than_1: @ blocks left > 1
3602
3603 rev64 $res0b, $res1b @ GHASH final-1 block
3604
3605 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3606 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
3607
3608 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
3609
3610 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
3611
3612 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
3613 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
3614
3615 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
3616
3617 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
3618
3619 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
3620 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
3621
3622 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
3623 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
3624
3625 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
3626
3627 movi $t0.8b, #0 @ suppress further partial tag feed in
3628 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
3629 eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
3630
3631 eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
3632
3633 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
3634 .L192_dec_blocks_less_than_1: @ blocks left <= 1
3635
3636 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
3637 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
3638 and $bit_length, $bit_length, #127 @ bit_length %= 128
3639
3640 sub $bit_length, $bit_length, #128 @ bit_length -= 128
3641
3642 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
3643
3644 and $bit_length, $bit_length, #127 @ bit_length %= 128
3645 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
3646
3647 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
3648 cmp $bit_length, #64
3649
3650 csel $ctr32x, $rk12_l, $rk12_h, lt
3651 csel $ctr96_b64x, $rk12_h, xzr, lt
3652
3653 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
3654 and $output_l0, $output_l0, $ctr32x
3655 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
3656
3657 orr $output_l0, $output_l0, $end_input_ptr
3658 mov $ctr0.d[1], $ctr96_b64x
3659
3660 rev $ctr32w, $rctr32w
3661
3662 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
3663 str $ctr32w, [$counter, #12] @ store the updated counter
3664
3665 rev64 $res0b, $res1b @ GHASH final block
3666
3667 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3668 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
3669
3670 and $output_h0, $output_h0, $ctr96_b64x
3671
3672 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
3673 mov $t0d, $res0.d[1] @ GHASH final block - mid
3674
3675 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
3676
3677 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
3678
3679 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
3680
3681 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
3682
3683 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
3684
3685 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
3686 movi $mod_constant.8b, #0xc2
3687
3688 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3689
3690 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3691
3692 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3693
3694 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3695 orr $output_h0, $output_h0, $main_end_input_ptr
3696 stp $output_l0, $output_h0, [$output_ptr]
3697
3698 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3699
3700 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3701
3702 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3703
3704 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3705
3706 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3707
3708 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3709
3710 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3711 ext $acc_lb, $acc_lb, $acc_lb, #8
3712 rev64 $acc_lb, $acc_lb
3713 mov x0, $len
3714 st1 { $acc_l.16b }, [$current_tag]
3715
3716 ldp x21, x22, [sp, #16]
3717 ldp x23, x24, [sp, #32]
3718 ldp d8, d9, [sp, #48]
3719 ldp d10, d11, [sp, #64]
3720 ldp d12, d13, [sp, #80]
3721 ldp d14, d15, [sp, #96]
3722 ldp x19, x20, [sp], #112
3723 ret
3724
3725.L192_dec_ret:
3726 mov w0, #0x0
3727 ret
3728.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
3729___
3730}
3731
3732{
# Register aliases for this scope. Every variable below is a plain string
# naming an AArch64 register (or one particular view of it) that gets
# interpolated into the assembly template appended to $code further down.
# Several aliases deliberately name the SAME physical register; those are
# called out below because values held under two such names cannot be live
# at the same time.

# General-purpose registers x4..x7.
3733my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# Low/high 64-bit halves of blocks 1..3 live in x19..x24; the input_* and
# output_* names below are two sets of aliases for exactly the same registers.
3734my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
3735my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# x6/x7 again: same registers as $input_l0/$input_h0.
3736my ($output_l0,$output_h0)=map("x$_",(6..7));
3737
# w9 is the 32-bit view of x9 ($ctr32x).
3738my $ctr32w="w9";
# x9..x15. $rk14_l/$rk14_h receive the two 64-bit halves of rk14 (loaded with
# ldp in the kernel below); $len receives the byte length computed at entry.
3739my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
# 32-bit views of x11 ($ctr96_t32x) and x12 ($rctr32x).
3740my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
3741
# v0..v3 hold the four counter blocks, v4..v7 the four result blocks.
# The next three lines give the same eight registers as .16b vectors,
# bare vN names (for forms such as vN.d[1] / vN.1d) and dN scalars.
3742my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
3743my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
3744my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
# q-register views of the result registers v4..v7.
3745my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
3746
# GHASH accumulators: high = v9, mid = v10, low = v11 (three views each).
3747my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
3748my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
3749my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
3750
# v12..v15 are h1..h4; v16/v17 are the combined h12k/h34k values built with
# trn1/trn2/eor in the kernel below. Only h1..h4 get q and .16b views.
3751my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
3752my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
3753my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
3754
# Temporaries t0..t9 are packed onto only four registers:
#   v8: t0, t2, t6   (also $mod_constant below)
#   v4: t1, t3, t4, t9   (also $res0 / $ctr_t0)
#   v5: t5, t7           (also $res1 / $ctr_t1)
#   v6: t8               (also $res2 / $ctr_t2)
3755my $t0="v8";
3756my $t0d="d8";
3757my $t1="v4";
3758my $t1d="d4";
3759my $t2="v8";
3760my $t2d="d8";
3761my $t3="v4";
3762my $t3d="d4";
3763my $t4="v4";
3764my $t4d="d4";
3765my $t5="v5";
3766my $t5d="d5";
3767my $t6="v8";
3768my $t6d="d8";
3769my $t7="v5";
3770my $t7d="d5";
3771my $t8="v6";
3772my $t8d="d6";
3773my $t9="v4";
3774my $t9d="d4";
3775
# ctr_t0..ctr_t3 are v4..v7, i.e. the same registers as res0..res3.
3776my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
3777my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
3778my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
3779
# Reduction constant lives in v8 (shared with t0/t2/t6); $mod_t is v7
# (shared with $res3 / $ctr_t3).
3780my $mod_constantd="d8";
3781my $mod_constant="v8";
3782my $mod_t="v7";
3783
# Round keys rk0..rk13 occupy v18..v31 (as .16b vectors and as q registers).
3784my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
3785my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
# Alternate views of the registers that hold rk2 (v20), rk3 (v21) and rk4 (v22).
3786my $rk2q1="v20.1q";
3787my $rk3q1="v21.1q";
3788my $rk4v="v22";
3789my $rk4d="d22";
3790
3791#########################################################################################
3792# size_t aes_gcm_enc_256_kernel(const unsigned char *in,
3793# size_t len,
3794# unsigned char *out,
3795# const void *key,
3796# unsigned char ivec[16],
3797# u64 *Xi);
3798#
3799$code.=<<___;
3800.global aes_gcm_enc_256_kernel
3801.type aes_gcm_enc_256_kernel,%function
3802.align 4
3803aes_gcm_enc_256_kernel:
3804 AARCH64_VALID_CALL_TARGET
3805 cbz x1, .L256_enc_ret
3806 stp x19, x20, [sp, #-112]!
3807 mov x16, x4
3808 mov x8, x5
3809 stp x21, x22, [sp, #16]
3810 stp x23, x24, [sp, #32]
3811 stp d8, d9, [sp, #48]
3812 stp d10, d11, [sp, #64]
3813 stp d12, d13, [sp, #80]
3814 stp d14, d15, [sp, #96]
3815
3816 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
3817 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
3818 mov $len, $main_end_input_ptr
3819 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
3820
3821 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
3822 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3823
3824 ldr $rk0q, [$cc, #0] @ load rk0
3825 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3826
3827 ldr $rk7q, [$cc, #112] @ load rk7
3828 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3829
3830 lsr $rctr32x, $ctr96_t32x, #32
3831 fmov $ctr2d, $ctr96_b64x @ CTR block 2
3832 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
3833
3834 rev $rctr32w, $rctr32w @ rev_ctr32
3835 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
3836 fmov $ctr1d, $ctr96_b64x @ CTR block 1
3837
3838 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
3839 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
3840
3841 rev $ctr32w, $rctr32w @ CTR block 1
3842 fmov $ctr3d, $ctr96_b64x @ CTR block 3
3843
3844 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
3845 add $rctr32w, $rctr32w, #1 @ CTR block 1
3846 ldr $rk1q, [$cc, #16] @ load rk1
3847
3848 fmov $ctr1.d[1], $ctr32x @ CTR block 1
3849 rev $ctr32w, $rctr32w @ CTR block 2
3850 add $rctr32w, $rctr32w, #1 @ CTR block 2
3851
3852 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
3853 ldr $rk2q, [$cc, #32] @ load rk2
3854
3855 fmov $ctr2.d[1], $ctr32x @ CTR block 2
3856 rev $ctr32w, $rctr32w @ CTR block 3
3857
3858 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
3859 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
3860
3861 fmov $ctr3.d[1], $ctr32x @ CTR block 3
3862
3863 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
3864 ldr $rk3q, [$cc, #48] @ load rk3
3865
3866 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
3867 ldr $rk6q, [$cc, #96] @ load rk6
3868
3869 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
3870 ldr $rk5q, [$cc, #80] @ load rk5
3871
3872 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
3873 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
3874 ext $h3b, $h3b, $h3b, #8
3875
3876 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
3877 ldr $rk13q, [$cc, #208] @ load rk13
3878
3879 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
3880 ldr $rk4q, [$cc, #64] @ load rk4
3881
3882 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
3883 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
3884 ext $h2b, $h2b, $h2b, #8
3885
3886 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
3887 ldr $rk12q, [$cc, #192] @ load rk12
3888
3889 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
3890 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
3891 ext $h4b, $h4b, $h4b, #8
3892
3893 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
3894 ldr $rk11q, [$cc, #176] @ load rk11
3895
3896 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
3897 ldr $rk8q, [$cc, #128] @ load rk8
3898
3899 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
3900 add $rctr32w, $rctr32w, #1 @ CTR block 3
3901
3902 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
3903 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
3904
3905 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
3906 ld1 { $acc_lb}, [$current_tag]
3907 ext $acc_lb, $acc_lb, $acc_lb, #8
3908 rev64 $acc_lb, $acc_lb
3909
3910 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
3911
3912 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
3913
3914 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
3915
3916 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
3917
3918 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
3919
3920 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
3921
3922 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
3923
3924 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
3925
3926 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
3927 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
3928
3929 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
3930 ldr $rk9q, [$cc, #144] @ load rk9
3931
3932 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
3933 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
3934 ext $h1b, $h1b, $h1b, #8
3935
3936 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
3937 ldr $rk10q, [$cc, #160] @ load rk10
3938
3939 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3940 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
3941
3942 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
3943
3944 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
3945
3946 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
3947 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
3948
3949 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3950
3951 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3952
3953 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3954
3955 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3956
3957 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3958
3959 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3960
3961 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3962
3963 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3964
3965 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3966
3967 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3968
3969 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3970
3971 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
3972
3973 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
3974
3975 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3976
3977 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
3978
3979 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
3980
3981 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
3982 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
3983
3984 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
3985
3986 aese $ctr2b, $rk13 @ AES block 2 - round 13
3987 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
3988
3989 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
3990
3991 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
3992
3993 aese $ctr1b, $rk13 @ AES block 1 - round 13
3994
3995 aese $ctr0b, $rk13 @ AES block 0 - round 13
3996
3997 aese $ctr3b, $rk13 @ AES block 3 - round 13
3998 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
3999 b.ge .L256_enc_tail @ handle tail
4000
4001 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
4002
4003 rev $ctr32w, $rctr32w @ CTR block 4
4004 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
4005
4006 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
4007
4008 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
4009 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4010
4011 eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
4012 eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
4013
4014 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
4015 eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
4016
4017 eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
4018 eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
4019 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
4020
4021 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
4022 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
4023 eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
4024
4025 eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
4026 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
4027
4028 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
4029 add $rctr32w, $rctr32w, #1 @ CTR block 4
4030
4031 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
4032 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
4033 eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
4034
4035 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
4036
4037 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
4038 fmov $ctr0d, $ctr96_b64x @ CTR block 4
4039
4040 fmov $ctr0.d[1], $ctr32x @ CTR block 4
4041 rev $ctr32w, $rctr32w @ CTR block 5
4042 add $rctr32w, $rctr32w, #1 @ CTR block 5
4043
4044 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
4045 fmov $ctr1d, $ctr96_b64x @ CTR block 5
4046 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
4047
4048 fmov $ctr1.d[1], $ctr32x @ CTR block 5
4049 rev $ctr32w, $rctr32w @ CTR block 6
4050 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
4051
4052 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
4053 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
4054 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
4055
4056 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
4057
4058 add $rctr32w, $rctr32w, #1 @ CTR block 6
4059 fmov $ctr2d, $ctr96_b64x @ CTR block 6
4060
4061 fmov $ctr2.d[1], $ctr32x @ CTR block 6
4062 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
4063 rev $ctr32w, $rctr32w @ CTR block 7
4064
4065 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
4066
4067 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
4068 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
4069 b.ge .L256_enc_prepretail @ input_ptr >= main_end_input_ptr: skip main loop, do prepretail
4070
4071 .L256_enc_main_loop: @ main loop start
4072 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4073 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4074
4075 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4076 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4077
4078 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4079 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4080
4081 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4082 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4083
4084 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4085 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
4086
4087 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4088 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
4089
4090 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4091 eor $res0b, $res0b, $acc_lb @ PRE 1
4092
4093 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4094
4095 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4096 eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
4097
4098 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4099 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4100
4101 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4102 eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
4103 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4104
4105 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4106 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4107
4108 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4109
4110 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4111 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4112
4113 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4114
4115 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4116 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4117
4118 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4119
4120 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4121 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4122
4123 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4124
4125 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4126 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4127
4128 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4129
4130 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4131 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4132
4133 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4134
4135 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4136 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4137
4138 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4139 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4140
4141 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4142
4143 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4144 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4145
4146 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4147
4148 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4149
4150 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4151
4152 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4153 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4154
4155 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4156
4157 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4158
4159 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4160
4161 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4162 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4163
4164 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4165
4166 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4167
4168 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4169
4170 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4171 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4172
4173 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4174 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
4175
4176 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4177 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4178
4179 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4180 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4181
4182 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4183
4184 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4185 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4186
4187 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4188 eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
4189
4190 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4191 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4192
4193 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4194 eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
4195
4196 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4197 movi $mod_constant.8b, #0xc2
4198
4199 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4200 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4201 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
4202
4203 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4204 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
4205
4206 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4207 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4208
4209 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4210 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4211
4212 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4213
4214 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4215 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4216
4217 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4218 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4219
4220 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4221 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4222
4223 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4224 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4225
4226 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4227 rev $ctr32w, $rctr32w @ CTR block 4k+8
4228 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4229
4230 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4231 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4232
4233 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4234 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4235
4236 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4237 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4238
4239 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4240 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
4241 eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
4242
4243 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4244 eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
4245
4246 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4247 eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
4248
4249 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4250 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
4251
4252 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4253 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4254 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4255
4256 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4257 fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
4258
4259 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4260 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
4261
4262 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
4263 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
4264
4265 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
4266
4267 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4268 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4269 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
4270
4271 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
4272 rev $ctr32w, $rctr32w @ CTR block 4k+9
4273 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
4274
4275 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
4276 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
4277 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
4278
4279 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4280 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
4281
4282 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4283 rev $ctr32w, $rctr32w @ CTR block 4k+10
4284 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
4285
4286 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
4287 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4288 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
4289
4290 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4291 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
4292 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
4293
4294 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4295 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
4296 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
4297
4298 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
4299 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
4300 rev $ctr32w, $rctr32w @ CTR block 4k+11
4301
4302 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4303 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
4304
4305 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
4306 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
4307 b.lt L256_enc_main_loop
4308
4309 .L256_enc_prepretail: @ PREPRETAIL
4310 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4311 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4312
4313 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4314 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4315
4316 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4317 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4318
4319 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4320 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4321
4322 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4323
4324 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4325
4326 eor $res0b, $res0b, $acc_lb @ PRE 1
4327 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4328
4329 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4330
4331 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4332 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4333
4334 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4335
4336 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4337 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4338
4339 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4340
4341 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4342
4343 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4344 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4345
4346 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4347
4348 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4349
4350 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4351
4352 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4353
4354 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4355
4356 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4357
4358 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4359
4360 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4361 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4362
4363 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4364 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4365
4366 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4367
4368 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4369 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4370
4371 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4372 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4373
4374 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4375
4376 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4377 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4378 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4379
4380 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4381
4382 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4383
4384 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4385 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4386
4387 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4388
4389 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4390 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4391
4392 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4393
4394 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4395 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4396
4397 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4398
4399 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4400
4401 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4402
4403 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4404
4405 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4406
4407 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4408 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4409
4410 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4411
4412 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4413
4414 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4415
4416 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4417 movi $mod_constant.8b, #0xc2
4418
4419 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4420
4421 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4422 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4423
4424 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4425
4426 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4427 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4428
4429 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4430 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4431
4432 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4433
4434 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4435
4436 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4437
4438 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4439 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4440
4441 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4442
4443 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
4444
4445 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
4446 ext $acc_hb, $acc_hb, $acc_hb, #8
4447
4448 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4449
4450 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4451 eor $acc_mb, $acc_mb, $acc_lb
4452
4453 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4454
4455 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4456
4457 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4458
4459 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4460 eor $acc_mb, $acc_mb, $t1.16b
4461
4462 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4463
4464 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4465
4466 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4467
4468 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4469 eor $acc_mb, $acc_mb, $acc_hb
4470
4471 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4472
4473 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4474
4475 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4476
4477 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
4478
4479 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4480 ext $acc_mb, $acc_mb, $acc_mb, #8
4481
4482 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4483
4484 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4485 eor $acc_lb, $acc_lb, $t1.16b
4486
4487 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4488
4489 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4490
4491 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4492
4493 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4494 eor $acc_lb, $acc_lb, $acc_mb
4495 .L256_enc_tail: @ TAIL
4496
4497 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
4498 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
4499 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
4500
4501 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4502 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4503
4504 cmp $main_end_input_ptr, #48
4505 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4506
4507 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4508
4509 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4510 b.gt .L256_enc_blocks_more_than_3
4511
4512 cmp $main_end_input_ptr, #32
4513 mov $ctr3b, $ctr2b
4514 movi $acc_l.8b, #0
4515
4516 movi $acc_h.8b, #0
4517 sub $rctr32w, $rctr32w, #1
4518
4519 mov $ctr2b, $ctr1b
4520 movi $acc_m.8b, #0
4521 b.gt .L256_enc_blocks_more_than_2
4522
4523 mov $ctr3b, $ctr1b
4524 sub $rctr32w, $rctr32w, #1
4525 cmp $main_end_input_ptr, #16
4526
4527 b.gt .L256_enc_blocks_more_than_1
4528
4529 sub $rctr32w, $rctr32w, #1
4530 b .L256_enc_blocks_less_than_1
4531 .L256_enc_blocks_more_than_3: @ blocks left > 3
4532 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
4533
4534 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
4535
4536 rev64 $res0b, $res1b @ GHASH final-3 block
4537
4538 eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
4539 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4540
4541 eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
4542
4543 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
4544 fmov $res1d, $input_l0 @ AES final-2 block - mov low
4545
4546 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
4547
4548 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
4549 movi $t0.8b, #0 @ suppress further partial tag feed in
4550
4551 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
4552
4553 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
4554
4555 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
4556
4557 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
4558 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
4559 .L256_enc_blocks_more_than_2: @ blocks left > 2
4560
4561 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
4562
4563 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
4564
4565 rev64 $res0b, $res1b @ GHASH final-2 block
4566
4567 eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
4568 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4569
4570 fmov $res1d, $input_l0 @ AES final-1 block - mov low
4571 eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
4572
4573 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
4574
4575 movi $t0.8b, #0 @ suppress further partial tag feed in
4576
4577 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
4578 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
4579
4580 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
4581
4582 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
4583
4584 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
4585
4586 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
4587
4588 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
4589
4590 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
4591
4592 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
4593 .L256_enc_blocks_more_than_1: @ blocks left > 1
4594
4595 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
4596
4597 rev64 $res0b, $res1b @ GHASH final-1 block
4598
4599 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
4600
4601 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4602
4603 movi $t0.8b, #0 @ suppress further partial tag feed in
4604
4605 eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
4606 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
4607
4608 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
4609 eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
4610
4611 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
4612
4613 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
4614
4615 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
4616 fmov $res1d, $input_l0 @ AES final block - mov low
4617
4618 fmov $res1.d[1], $input_h0 @ AES final block - mov high
4619
4620 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
4621
4622 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
4623
4624 eor $res1b, $res1b, $ctr3b @ AES final block - result
4625 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
4626
4627 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
4628 .L256_enc_blocks_less_than_1: @ blocks left <= 1
4629
4630 and $bit_length, $bit_length, #127 @ bit_length %= 128
4631
4632 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
4633 sub $bit_length, $bit_length, #128 @ bit_length -= 128
4634
4635 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
4636 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
4637
4638 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
4639 and $bit_length, $bit_length, #127 @ bit_length %= 128
4640
4641 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
4642 cmp $bit_length, #64
4643
4644 csel $input_l0, $rk14_l, $rk14_h, lt
4645 csel $input_h0, $rk14_h, xzr, lt
4646
4647 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
4648
4649 fmov $ctr0.d[1], $input_h0
4650
4651 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
4652
4653 rev64 $res0b, $res1b @ GHASH final block
4654
4655 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4656
4657 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
4658
4659 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
4660 mov $t0d, $res0.d[1] @ GHASH final block - mid
4661 rev $ctr32w, $rctr32w
4662
4663 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
4664
4665 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
4666 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
4667
4668 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
4669
4670 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
4671
4672 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
4673 movi $mod_constant.8b, #0xc2
4674
4675 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4676
4677 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4678
4679 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4680
4681 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4682
4683 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4684
4685 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4686
4687 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
4688
4689 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4690
4691 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4692
4693 str $ctr32w, [$counter, #12] @ store the updated counter
4694
4695 st1 { $res1b}, [$output_ptr] @ store all 16B
4696 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4697
4698 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4699 ext $acc_lb, $acc_lb, $acc_lb, #8
4700 rev64 $acc_lb, $acc_lb
4701 mov x0, $len
4702 st1 { $acc_l.16b }, [$current_tag]
4703
4704 ldp x21, x22, [sp, #16]
4705 ldp x23, x24, [sp, #32]
4706 ldp d8, d9, [sp, #48]
4707 ldp d10, d11, [sp, #64]
4708 ldp d12, d13, [sp, #80]
4709 ldp d14, d15, [sp, #96]
4710 ldp x19, x20, [sp], #112
4711 ret
4712
4713.L256_enc_ret:
4714 mov w0, #0x0
4715 ret
4716.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
4717___
4718
4719{
# Register-name aliases local to this brace scope: t8 is bound to v4 and t9 to
# v6, each declared together with the matching d-register name (d4 / d6).
# Presumably consumed by the aes_gcm_dec_256_kernel template that follows;
# being `my` variables they stop applying once the enclosing scope closes.
my ($t8, $t8d) = ("v4", "d4");
my ($t9, $t9d) = ("v6", "d6");
4724#########################################################################################
4725# size_t aes_gcm_dec_256_kernel(const unsigned char *in,
4726# size_t len, /* length in bits: the kernel derives byte_len as len >> 3 */
4727# unsigned char *out,
4728# const void *key,
4729# unsigned char ivec[16],
4730# u64 *Xi);
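4731# NOTE(review): the prologue below copies x4 to x16 and x5 to x8 before use; confirm that the key/ivec/Xi order shown above matches the C prototype.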
4732$code.=<<___;
4733.global aes_gcm_dec_256_kernel
4734.type aes_gcm_dec_256_kernel,%function
4735.align 4
4736aes_gcm_dec_256_kernel:
4737 AARCH64_VALID_CALL_TARGET
4738 cbz x1, .L256_dec_ret
4739 stp x19, x20, [sp, #-112]!
4740 mov x16, x4
4741 mov x8, x5
4742 stp x21, x22, [sp, #16]
4743 stp x23, x24, [sp, #32]
4744 stp d8, d9, [sp, #48]
4745 stp d10, d11, [sp, #64]
4746 stp d12, d13, [sp, #80]
4747 stp d14, d15, [sp, #96]
4748
4749 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
4750 mov $len, $main_end_input_ptr
4751 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
4752
4753 ldr $rk8q, [$cc, #128] @ load rk8
4754 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
4755
4756 ldr $rk7q, [$cc, #112] @ load rk7
4757 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4758
4759 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
4760 ldr $rk6q, [$cc, #96] @ load rk6
4761
4762 lsr $rctr32x, $ctr96_t32x, #32
4763 ldr $rk5q, [$cc, #80] @ load rk5
4764 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
4765
4766 ldr $rk3q, [$cc, #48] @ load rk3
4767 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4768 rev $rctr32w, $rctr32w @ rev_ctr32
4769
4770 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
4771 fmov $ctr3d, $ctr96_b64x @ CTR block 3
4772
4773 rev $ctr32w, $rctr32w @ CTR block 1
4774 add $rctr32w, $rctr32w, #1 @ CTR block 1
4775 fmov $ctr1d, $ctr96_b64x @ CTR block 1
4776
4777 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
4778 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
4779
4780 fmov $ctr1.d[1], $ctr32x @ CTR block 1
4781 rev $ctr32w, $rctr32w @ CTR block 2
4782 add $rctr32w, $rctr32w, #1 @ CTR block 2
4783
4784 fmov $ctr2d, $ctr96_b64x @ CTR block 2
4785 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
4786
4787 fmov $ctr2.d[1], $ctr32x @ CTR block 2
4788 rev $ctr32w, $rctr32w @ CTR block 3
4789
4790 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
4791 ldr $rk0q, [$cc, #0] @ load rk0
4792
4793 fmov $ctr3.d[1], $ctr32x @ CTR block 3
4794 add $rctr32w, $rctr32w, #1 @ CTR block 3
4795
4796 ldr $rk4q, [$cc, #64] @ load rk4
4797
4798 ldr $rk13q, [$cc, #208] @ load rk13
4799
4800 ldr $rk1q, [$cc, #16] @ load rk1
4801
4802 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
4803 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4804 ext $h3b, $h3b, $h3b, #8
4805
4806 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
4807 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4808 ext $h4b, $h4b, $h4b, #8
4809
4810 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
4811 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
4812 ext $h2b, $h2b, $h2b, #8
4813
4814 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
4815 ldr $rk2q, [$cc, #32] @ load rk2
4816
4817 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
4818 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
4819
4820 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
4821 ld1 { $acc_lb}, [$current_tag]
4822 ext $acc_lb, $acc_lb, $acc_lb, #8
4823 rev64 $acc_lb, $acc_lb
4824
4825 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
4826 ldr $rk9q, [$cc, #144] @ load rk9
4827
4828 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
4829 ldr $rk12q, [$cc, #192] @ load rk12
4830
4831 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
4832 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4833 ext $h1b, $h1b, $h1b, #8
4834
4835 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
4836 ldr $rk10q, [$cc, #160] @ load rk10
4837
4838 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
4839
4840 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
4841
4842 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
4843
4844 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
4845
4846 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
4847 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
4848
4849 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
4850
4851 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
4852
4853 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
4854
4855 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
4856
4857 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
4858
4859 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
4860
4861 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
4862
4863 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
4864
4865 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
4866
4867 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
4868
4869 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
4870
4871 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
4872
4873 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
4874
4875 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
4876
4877 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
4878
4879 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
4880
4881 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
4882
4883 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
4884
4885 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
4886
4887 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
4888
4889 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
4890
4891 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
4892 ldr $rk11q, [$cc, #176] @ load rk11
4893
4894 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
4895
4896 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
4897
4898 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
4899
4900 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
4901
4902 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
4903
4904 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
4905
4906 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
4907
4908 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
4909
4910 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
4911
4912 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
4913
4914 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
4915
4916 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
4917
4918 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
4919
4920 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
4921 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
4922
4923 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
4924
4925 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
4926
4927 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
4928
4929 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
4930 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
4931
4932 aese $ctr1b, $rk13 @ AES block 1 - round 13
4933
4934 aese $ctr2b, $rk13 @ AES block 2 - round 13
4935 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
4936
4937 aese $ctr3b, $rk13 @ AES block 3 - round 13
4938
4939 aese $ctr0b, $rk13 @ AES block 0 - round 13
4940 b.ge .L256_dec_tail @ handle tail
4941
4942 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
4943
4944 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
4945
4946 rev $ctr32w, $rctr32w @ CTR block 4
4947
4948 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
4949
4950 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
4951 rev64 $res1b, $res1b @ GHASH block 1
4952 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
4953
4954 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
4955
4956 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
4957 rev64 $res0b, $res0b @ GHASH block 0
4958 add $rctr32w, $rctr32w, #1 @ CTR block 4
4959
4960 fmov $ctr0d, $ctr96_b64x @ CTR block 4
4961 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
4962
4963 fmov $ctr0.d[1], $ctr32x @ CTR block 4
4964 rev $ctr32w, $rctr32w @ CTR block 5
4965 add $rctr32w, $rctr32w, #1 @ CTR block 5
4966
4967 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
4968
4969 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
4970 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
4971 eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
4972
4973 eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
4974 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
4975 fmov $ctr1d, $ctr96_b64x @ CTR block 5
4976
4977 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
4978 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4979
4980 fmov $ctr1.d[1], $ctr32x @ CTR block 5
4981 rev $ctr32w, $rctr32w @ CTR block 6
4982 add $rctr32w, $rctr32w, #1 @ CTR block 6
4983
4984 eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
4985 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
4986
4987 eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
4988 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
4989
4990 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
4991 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
4992 b.ge .L256_dec_prepretail @ do prepretail
4993
4994 .L256_dec_main_loop: @ main loop start
4995 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
4996 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4997 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
4998
4999 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
5000 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
5001
5002 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
5003 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
5004
5005 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5006 eor $res0b, $res0b, $acc_lb @ PRE 1
5007 rev $ctr32w, $rctr32w @ CTR block 4k+7
5008
5009 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5010 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5011
5012 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5013 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5014
5015 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5016 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5017 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5018
5019 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5020 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5021
5022 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5023 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5024
5025 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5026 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5027
5028 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5029 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5030
5031 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5032 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5033
5034 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5035 rev64 $res2b, $res2b @ GHASH block 4k+2
5036
5037 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5038 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5039
5040 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5041 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5042
5043 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5044
5045 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5046
5047 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5048 rev64 $res3b, $res3b @ GHASH block 4k+3
5049
5050 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5051 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5052
5053 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5054 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5055 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5056
5057 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5058
5059 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5060 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5061
5062 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5063 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5064
5065 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5066 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5067
5068 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5069 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5070
5071 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5072 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5073
5074 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5075
5076 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5077 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5078
5079 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5080
5081 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5082 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5083
5084 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5085 rev $ctr32w, $rctr32w @ CTR block 4k+8
5086
5087 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5088 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5089
5090 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5091 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
5092
5093 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5094
5095 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5096 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5097
5098 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5099
5100 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5101 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5102
5103 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5104
5105 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5106
5107 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5108 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5109
5110 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5111
5112 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5113 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
5114 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5115
5116 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5117
5118 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5119 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5120
5121 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5122
5123 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5124 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5125
5126 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5127
5128 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5129 movi $mod_constant.8b, #0xc2
5130
5131 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5132 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5133
5134 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5135
5136 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5137 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5138
5139 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5140 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5141
5142 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5143
5144 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5145 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5146
5147 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5148 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
5149
5150 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5151 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5152
5153 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5154 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5155
5156 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5157 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
5158
5159 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5160 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
5161
5162 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5163 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5164
5165 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5166 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5167
5168 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5169 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
5170
5171 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5172 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
5173
5174 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5175 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5176
5177 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5178 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5179
5180 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5181 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
5182 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5183
5184 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5185 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
5186
5187 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5188 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
5189
5190 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5191 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
5192 rev $ctr32w, $rctr32w @ CTR block 4k+9
5193
5194 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5195 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
5196 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
5197
5198 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
5199
5200 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5201 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5202
5203 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
5204 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
5205 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5206
5207 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5208 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
5209
5210 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
5211 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5212
5213 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
5214 rev $ctr32w, $rctr32w @ CTR block 4k+10
5215 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
5216
5217 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5218 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
5219
5220 rev64 $res1b, $res1b @ GHASH block 4k+5
5221 eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
5222 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
5223
5224 eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
5225 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
5226
5227 rev64 $res0b, $res0b @ GHASH block 4k+4
5228 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5229 b.lt .L256_dec_main_loop
5230
5231
5232 .L256_dec_prepretail: @ PREPRETAIL
5233 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5234 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
5235 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
5236
5237 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
5238 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
5239
5240 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
5241 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
5242
5243 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5244 rev $ctr32w, $rctr32w @ CTR block 4k+7
5245 eor $res0b, $res0b, $acc_lb @ PRE 1
5246
5247 rev64 $res2b, $res2b @ GHASH block 4k+2
5248 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5249 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5250
5251 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5252 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5253
5254 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5255 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5256 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5257
5258 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5259 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5260
5261 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5262 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5263
5264 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5265 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5266
5267 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5268
5269 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5270 rev64 $res3b, $res3b @ GHASH block 4k+3
5271
5272 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5273
5274 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5275 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5276
5277 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5278
5279 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5280 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5281
5282 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5283
5284 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5285 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5286
5287 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5288
5289 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5290 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5291
5292 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5293 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5294
5295 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5296
5297 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5298
5299 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5300 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5301
5302 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5303
5304 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5305 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5306
5307 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5308
5309 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5310 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5311
5312 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5313
5314 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5315 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5316
5317 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5318
5319 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5320 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5321
5322 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5323
5324 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5325 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5326
5327 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5328
5329 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5330
5331 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5332 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5333
5334 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5335
5336 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5337 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5338
5339 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5340
5341 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5342 movi $mod_constant.8b, #0xc2
5343
5344 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5345 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5346
5347 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5348
5349 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5350 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5351
5352 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5353
5354 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5355 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5356
5357 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5358
5359 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5360 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5361
5362 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5363
5364 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5365 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5366
5367 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5368
5369 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5370 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5371
5372 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5373
5374 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5375 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5376
5377 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5378
5379 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5380 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5381
5382 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5383
5384 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5385
5386 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5387 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5388
5389 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5390 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5391
5392 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5393 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5394
5395 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5396 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5397
5398 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5399 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5400
5401 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5402
5403 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5404 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5405
5406 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5407 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5408
5409 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5410 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5411
5412 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5413 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5414
5415 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5416 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5417
5418 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5419
5420 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5421
5422 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5423
5424 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5425 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5426 .L256_dec_tail: @ TAIL
5427
5428 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
5429 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
5430
5431 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
5432
5433 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5434
5435 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5436 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
5437
5438 cmp $main_end_input_ptr, #48
5439
5440 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5441
5442 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5443 b.gt .L256_dec_blocks_more_than_3
5444
5445 sub $rctr32w, $rctr32w, #1
5446 mov $ctr3b, $ctr2b
5447 movi $acc_m.8b, #0
5448
5449 movi $acc_l.8b, #0
5450 cmp $main_end_input_ptr, #32
5451
5452 movi $acc_h.8b, #0
5453 mov $ctr2b, $ctr1b
5454 b.gt .L256_dec_blocks_more_than_2
5455
5456 sub $rctr32w, $rctr32w, #1
5457
5458 mov $ctr3b, $ctr1b
5459 cmp $main_end_input_ptr, #16
5460 b.gt .L256_dec_blocks_more_than_1
5461
5462 sub $rctr32w, $rctr32w, #1
5463 b .L256_dec_blocks_less_than_1
5464 .L256_dec_blocks_more_than_3: @ blocks left > 3
5465 rev64 $res0b, $res1b @ GHASH final-3 block
5466 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
5467
5468 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
5469
5470 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
5471
5472 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5473
5474 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
5475
5476 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
5477
5478 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
5479
5480 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
5481
5482 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
5483
5484 movi $t0.8b, #0 @ suppress further partial tag feed in
5485
5486 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
5487
5488 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
5489 eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
5490
5491 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
5492 eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
5493 .L256_dec_blocks_more_than_2: @ blocks left > 2
5494
5495 rev64 $res0b, $res1b @ GHASH final-2 block
5496 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
5497
5498 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5499 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
5500
5501 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
5502
5503 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
5504
5505 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
5506
5507 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
5508
5509 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
5510 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
5511
5512 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
5513 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
5514 movi $t0.8b, #0 @ suppress further partial tag feed in
5515
5516 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
5517
5518 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
5519 eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
5520
5521 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
5522 eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
5523 .L256_dec_blocks_more_than_1: @ blocks left > 1
5524
5525 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
5526 rev64 $res0b, $res1b @ GHASH final-1 block
5527
5528 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
5529
5530 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5531 movi $t0.8b, #0 @ suppress further partial tag feed in
5532
5533 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
5534
5535 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
5536
5537 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
5538
5539 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
5540
5541 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
5542 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
5543
5544 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
5545
5546 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
5547
5548 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
5549 eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
5550
5551 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
5552
5553 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
5554
5555 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
5556 eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
5557 .L256_dec_blocks_less_than_1: @ blocks left <= 1
5558
5559 and $bit_length, $bit_length, #127 @ bit_length %= 128
5560 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
5561
5562 sub $bit_length, $bit_length, #128 @ bit_length -= 128
5563 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
5564
5565 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
5566 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
5567
5568 and $bit_length, $bit_length, #127 @ bit_length %= 128
5569
5570 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
5571 cmp $bit_length, #64
5572
5573 csel $ctr32x, $rk14_l, $rk14_h, lt
5574 csel $ctr96_b64x, $rk14_h, xzr, lt
5575
5576 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
5577 and $output_l0, $output_l0, $ctr32x
5578
5579 mov $ctr0.d[1], $ctr96_b64x
5580 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
5581
5582 rev $ctr32w, $rctr32w
5583
5584 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
5585
5586 orr $output_l0, $output_l0, $end_input_ptr
5587
5588 and $output_h0, $output_h0, $ctr96_b64x
5589
5590 orr $output_h0, $output_h0, $main_end_input_ptr
5591
5592 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
5593
5594 rev64 $res0b, $res1b @ GHASH final block
5595
5596 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5597
5598 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
5599
5600 mov $t0d, $res0.d[1] @ GHASH final block - mid
5601
5602 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
5603
5604 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
5605
5606 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
5607
5608 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
5609
5610 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
5611
5612 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
5613 movi $mod_constant.8b, #0xc2
5614
5615 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5616
5617 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5618
5619 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5620
5621 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5622
5623 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5624
5625 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5626
5627 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5628
5629 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5630
5631 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5632
5633 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5634
5635 stp $output_l0, $output_h0, [$output_ptr]
5636
5637 str $ctr32w, [$counter, #12] @ store the updated counter
5638
5639 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5640 ext $acc_lb, $acc_lb, $acc_lb, #8
5641 rev64 $acc_lb, $acc_lb
5642 mov x0, $len
5643 st1 { $acc_l.16b }, [$current_tag]
5644
5645 ldp x21, x22, [sp, #16]
5646 ldp x23, x24, [sp, #32]
5647 ldp d8, d9, [sp, #48]
5648 ldp d10, d11, [sp, #64]
5649 ldp d12, d13, [sp, #80]
5650 ldp d14, d15, [sp, #96]
5651 ldp x19, x20, [sp], #112
5652 ret
5653
5654.L256_dec_ret:
5655 mov w0, #0x0
5656 ret
5657.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
5658___
5659}
5660}
5661
# Append the trailer of the generated assembly text: an identification
# string, an alignment directive and a closing "#endif" (presumably paired
# with a conditional emitted earlier in $code -- confirm against the top of
# the file).  The empty last element supplies the final newline.
$code .= join "\n",
    '.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"',
    '.align 2',
    '#endif',
    '';
5667
######## Operand translators used while post-processing $code.
# Named subs are defined at compile time, so all three exist no matter
# which flavour branch below is taken.

# unvmov: turn a "qN#lo|hi, qM#lo|hi" operand pair into an "ins" between
# 64-bit vector lanes.  Register numbers of 8 and above are shifted up by 8;
# "lo" selects lane 0 and anything else lane 1.  Gives an empty string when
# the operands do not have that shape.  Not called by either loop below.
sub unvmov {
    my ($operands) = @_;

    my ($dst, $dst_half, $src, $src_half) =
        $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/
        or return "";

    return sprintf "ins v%d.d[%d],v%d.d[%d]",
        ($dst < 8 ? $dst : $dst + 8), ($dst_half eq "lo" ? 0 : 1),
        ($src < 8 ? $src : $src + 8), ($src_half eq "lo" ? 0 : 1);
}

# unvdup32: rewrite "qD, qS[lane]" (lane 0..3) as a vdup.32 from the
# matching d-register: register index 2*S + lane/2, element lane%2.
# Gives an empty string when the operands do not have that shape.
sub unvdup32 {
    my ($operands) = @_;

    my ($dst, $src, $lane) =
        $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/
        or return "";

    return sprintf "vdup.32 q%d,d%d[%d]",
        $dst, 2 * $src + ($lane >> 1), $lane & 1;
}

# unvpmullp64: hand-encode a pmull/pmull2 on three q-registers as raw bytes.
# The register numbers are packed into the 0xf2a00e00 template, and
# 0x00010001 is OR-ed in when the mnemonic contains a "2".
# Gives an empty string when the operands do not have that shape.
sub unvpmullp64 {
    my ($mnemonic, $operands) = @_;

    my ($rd, $rn, $rm) =
        $operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/
        or return "";

    my $word = 0xf2a00e00
             | (($rd & 7) << 13) | (($rd & 8) << 19)
             | (($rn & 7) << 17) | (($rn & 8) << 4)
             | (($rm & 7) << 1)  | (($rm & 8) << 2);
    $word |= 0x00010001 if $mnemonic =~ /2/;

    # ARMv7 instructions are always encoded little-endian, so the bytes are
    # listed least-significant first.  The correct solution would be the
    # .inst directive, but older assemblers don't implement it.
    my @le_bytes = map { ($word >> $_) & 0xff } (0, 8, 16, 24);

    return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
        @le_bytes, $mnemonic, $operands;
}

if ($flavour =~ /64/) {
    ######## 64-bit code
    for my $line (split /\n/, $code) {
        $line =~ s/@\s/\/\//;           # old->new style commentary
        print $line, "\n";
    }
}
else {
    ######## 32-bit code
    for my $line (split /\n/, $code) {
        $line =~ s/\b[wx]([0-9]+)\b/r$1/g;              # new->old registers
        $line =~ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/g;   # new->old registers
        $line =~ s/\/\/\s?/@ /;                         # new->old style commentary

        # fix up remaining new-style suffixes
        $line =~ s/\],#[0-9]+/]!/;

        # At most one of these rewrites is applied to a line: the chain
        # stops at the first substitution that succeeds.
        $line =~ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/
            || $line =~ s/vdup\.32\s+(.*)/unvdup32($1)/ge
            || $line =~ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/ge
            || $line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge
            || $line =~ s/^(\s+)b\./$1b/
            || $line =~ s/^(\s+)ret/$1bx\tlr/;

        # A "mov.<cond>" becomes "mov<cond>", preceded by its own
        # "it <cond>" line.
        if ($line =~ s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
            print " it $2\n";
        }

        print $line, "\n";
    }
}
5727
# Close STDOUT explicitly to enforce a flush of the generated output; a
# failing close aborts the script with the system error text.
close(STDOUT) or die "error closing STDOUT: $!";
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette