VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.7/crypto/modes/asm/aes-gcm-armv8_64.pl@ 108344

最後變更 在這個檔案從108344是 104078,由 vboxsync 提交於 12 月 前

openssl-3.1.5: Applied and adjusted our OpenSSL changes to 3.1.4. bugref:10638

檔案大小: 280.9 KB
 
1#! /usr/bin/env perl
2# Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10#========================================================================
11# Written by Fangming Fang <[email protected]> for the OpenSSL project,
12# derived from https://github.com/ARM-software/AArch64cryptolib, original
13# author Samuel Lee <[email protected]>. The module is, however, dual
14# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
15# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
16#========================================================================
17#
18# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
19#
20# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
21#
22# ____________________________________________________
23# | |
24# | PRE |
25# |____________________________________________________|
26# | | | |
27# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
28# |________________|________________|__________________|
29# | | | |
30# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
31# |________________|________________|__________________|
32# | | | |
33# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
34# |________________|________________|__________________|
35# | | | |
36# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
37# |________________|____(mostly)____|__________________|
38# | |
39# | MODULO |
40# |____________________________________________________|
41#
42# PRE:
43# Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
44# EXT low_acc, low_acc, low_acc, #8
45# EOR res_curr (4k+0), res_curr (4k+0), low_acc
46#
47# CTR block:
48# Increment and byte reverse counter in scalar registers and transfer to SIMD registers
49# REV ctr32, rev_ctr32
50# ORR ctr64, constctr96_top32, ctr32, LSL #32
51# INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
52# INS ctr_next.d[1], ctr64X
53# ADD rev_ctr32, #1
54#
55# AES block:
56# Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
57# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
58# Given we are very constrained in our ASIMD registers this is quite important
59#
60# Encrypt:
61# LDR input_low, [ input_ptr ], #8
62# LDR input_high, [ input_ptr ], #8
63# EOR input_low, k14_low
64# EOR input_high, k14_high
65# INS res_curr.d[0], input_low
66# INS res_curr.d[1], input_high
67# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
68# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
69# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
70# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
71# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
72# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
73# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
74# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
75# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
76# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
77# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
78# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
79# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
80# AESE ctr_curr, k13
81# EOR res_curr, res_curr, ctr_curr
82# ST1 { res_curr.16b }, [ output_ptr ], #16
83#
84# Decrypt:
85# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
86# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
87# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
88# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
89# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
90# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
91# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
92# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
93# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
94# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
95# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
96# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
97# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
98# AESE ctr_curr, k13
99# LDR res_curr, [ input_ptr ], #16
100# EOR res_curr, res_curr, ctr_curr
101# MOV output_low, res_curr.d[0]
102# MOV output_high, res_curr.d[1]
103# EOR output_low, k14_low
104# EOR output_high, k14_high
105# STP output_low, output_high, [ output_ptr ], #16
106#
107# GHASH block X:
108# do 128b karatsuba polynomial multiplication on block
109# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
110#
111# multiplication:
112# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
113#
114# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
115# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
116#
117# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
118# multiplying with "twisted" powers of H
119#
120# Note: We can PMULL directly into the acc_x in first GHASH of the loop
121# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
122# path latency dominates the performance
123#
124# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
125# than indicated here
126# REV64 res_curr, res_curr
127# INS t_m.d[0], res_curr.d[1]
128# EOR t_m.8B, t_m.8B, res_curr.8B
129# PMULL2 t_h, res_curr, HX
130# PMULL t_l, res_curr, HX
131# PMULL t_m, t_m, HX_k
132# EOR acc_h, acc_h, t_h
133# EOR acc_l, acc_l, t_l
134# EOR acc_m, acc_m, t_m
135#
136# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
137# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
138# with a reversed constant
139# EOR acc_m, acc_m, acc_h
140# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
141# PMULL t_mod, acc_h, mod_constant
142# EXT acc_h, acc_h, acc_h, #8
143# EOR acc_m, acc_m, acc_h
144# EOR acc_m, acc_m, t_mod
145# PMULL acc_h, acc_m, mod_constant
146# EXT acc_m, acc_m, acc_m, #8
147# EOR acc_l, acc_l, acc_h
148# EOR acc_l, acc_l, acc_m
149
150$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; # last argument is the output file if it ends in ".<word>"
151$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; # first remaining argument is the flavour if it contains no dot
152
153$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = directory part of this script's path, including the trailing / or \
154( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or # first look for the translator next to this script ...
155( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or # ... then in ../../perlasm/ relative to it
156die "can't locate arm-xlate.pl"; # neither candidate is a plain file: give up
157
158open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # pipe everything we print through arm-xlate.pl, run with the current perl ($^X); abort if the pipe cannot be started
159*STDOUT=*OUT; # alias STDOUT to the pipe so later prints to STDOUT go to the translator
160
161$input_ptr="x0"; # argument block: x0 = input pointer (advanced by 64 per four blocks in the kernel)
162$bit_length="x1"; # length in bits; the kernel shifts it right by 3 to obtain the byte length
163$output_ptr="x2"; # output pointer, post-incremented by the st1 stores
164$current_tag="x3"; # base for the current tag (loaded at offset 0) and the H values loaded at #32/#64/#80/#112
165$counter="x16"; # counter block pointer; the kernel prologue copies x4 here
166$cc="x8"; # round-key pointer (rk0.. loaded from it, rk10 at #160); the kernel prologue copies x5 here
167
168{
169my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); # x4..x7; x4 is reused once the prologue has copied it to x16
170my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); # x19..x24, which the prologue saves on the stack
171my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); # same registers as $input_l1..$input_h3 (alias names)
172my ($output_l0,$output_h0)=map("x$_",(6..7)); # same registers as $input_l0/$input_h0
173
174my $ctr32w="w9"; # 32-bit view of x9 ($ctr32x)
175my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15)); # x9..x15: counter pieces, round key 10 held in scalar registers, byte length
176my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); # 32-bit views of x11 ($ctr96_t32x) and x12 ($rctr32x)
177
178my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); # v0..v3 = counter blocks, v4..v7 = result blocks, as 16-byte vectors
179my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); # same registers without an arrangement suffix (suffix is appended at the use site)
180my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); # same registers, low 64-bit (d) view
181my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); # result registers, 128-bit (q) view
182
183my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); # v9..v11 = GHASH accumulators (high, mid, low) as 16-byte vectors
184my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); # same accumulators, no arrangement suffix
185my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); # same accumulators, d view
186
187my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); # v12..v15 = H values loaded via $current_tag; v16/v17 = combined values built from them with trn1/trn2 + eor
188my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); # q view used by the ldr loads
189my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); # 16-byte view used by the ext swaps
190
191my $t0="v8"; # temporary; v8 is also named $t4 and $mod_constant below
192my $t0d="d8";
193
194my ($t1,$t2,$t3)=map("v$_",(28..30)); # temporaries v28..v30
195my ($t1d,$t2d,$t3d)=map("d$_",(28..30));
196
197my $t4="v8"; # alias of $t0
198my $t4d="d8";
199my $t5="v28"; # alias of $t1
200my $t5d="d28";
201my $t6="v31"; # v31 is also named $mod_t below
202my $t6d="d31";
203
204my $t7="v4"; # alias of $res0 / $ctr_t0
205my $t7d="d4";
206my $t8="v29"; # alias of $t2
207my $t8d="d29";
208my $t9="v30"; # alias of $t3
209my $t9d="d30";
210
211my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); # same registers as $res0..$res3; receive the input data moved in from scalar registers
212my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
213my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
214
215my $mod_constantd="d8"; # reduction constant, built in v8 with movi #0xc2 followed by shl #56
216my $mod_constant="v8";
217my $mod_t="v31"; # temporary for the modulo step; alias of $t6
218
219my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27)); # v18..v27 = round keys 0..9 as aese operands
220my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27)); # .4s view used by the ld1 loads
221my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27)); # q view
222my $rk2q1="v20.1q"; # .1q view of the rk2 register
223my $rk3q1="v21.1q"; # .1q view of the rk3 register
224my $rk4v="v22"; # rk4 register without arrangement suffix
225my $rk4d="d22"; # rk4 register, d view
226
227$code=<<___; # start the generated text: include arm_arch.h and open the __ARM_MAX_ARCH__>=8 guard
228#include "arm_arch.h"
229
230#if __ARM_MAX_ARCH__>=8
231___
232$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); # flavours containing "64": enable the crypto extension and switch to .text
233$code.=<<___ if ($flavour !~ /64/); # other flavours: NEON/Thumb-2 preamble; NOTE(review): $_byte is interpolated below but is not assigned in the visible part of the file
234.fpu neon
235#ifdef __thumb2__
236.syntax unified
237.thumb
238# define INST(a,b,c,d) $_byte c,0xef,a,b
239#else
240.code 32
241# define INST(a,b,c,d) $_byte a,b,c,0xf2
242#endif
243
244.text
245___
246
247#########################################################################################
248# size_t aes_gcm_enc_128_kernel(const unsigned char *in,
249# size_t len, /* length in bits */
250# unsigned char *out,
251# u64 *Xi,
252# unsigned char ivec[16],
253# const void *key);
254#
255$code.=<<___;
256.global aes_gcm_enc_128_kernel
257.type aes_gcm_enc_128_kernel,%function
258.align 4
259aes_gcm_enc_128_kernel:
260 AARCH64_VALID_CALL_TARGET
261 cbz x1, .L128_enc_ret
262 stp x19, x20, [sp, #-112]!
263 mov x16, x4
264 mov x8, x5
265 stp x21, x22, [sp, #16]
266 stp x23, x24, [sp, #32]
267 stp d8, d9, [sp, #48]
268 stp d10, d11, [sp, #64]
269 stp d12, d13, [sp, #80]
270 stp d14, d15, [sp, #96]
271
272 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
273#ifdef __AARCH64EB__
274 rev $ctr96_b64x, $ctr96_b64x
275 rev $ctr96_t32x, $ctr96_t32x
276#endif
277 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
278#ifdef __AARCH64EB__
279 ror $rk10_l, $rk10_l, #32
280 ror $rk10_h, $rk10_h, #32
281#endif
282 ld1 {$acc_lb}, [$current_tag]
283 ext $acc_lb, $acc_lb, $acc_lb, #8
284 rev64 $acc_lb, $acc_lb
285 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
286 mov $len, $main_end_input_ptr
287
288 ld1 {$rk0s}, [$cc], #16 @ load rk0
289 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
290 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
291
292 lsr $rctr32x, $ctr96_t32x, #32
293 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
294#ifndef __AARCH64EB__
295 ext $h4b, $h4b, $h4b, #8
296#endif
297 fmov $ctr1d, $ctr96_b64x @ CTR block 1
298 rev $rctr32w, $rctr32w @ rev_ctr32
299
300 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
301 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
302 ld1 {$rk1s}, [$cc], #16 @ load rk1
303
304 rev $ctr32w, $rctr32w @ CTR block 1
305 add $rctr32w, $rctr32w, #1 @ CTR block 1
306 fmov $ctr3d, $ctr96_b64x @ CTR block 3
307
308 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
309 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
310
311 fmov $ctr1.d[1], $ctr32x @ CTR block 1
312 rev $ctr32w, $rctr32w @ CTR block 2
313
314 fmov $ctr2d, $ctr96_b64x @ CTR block 2
315 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
316 add $rctr32w, $rctr32w, #1 @ CTR block 2
317
318 fmov $ctr2.d[1], $ctr32x @ CTR block 2
319 rev $ctr32w, $rctr32w @ CTR block 3
320
321 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
322 ld1 {$rk2s}, [$cc], #16 @ load rk2
323
324 add $rctr32w, $rctr32w, #1 @ CTR block 3
325 fmov $ctr3.d[1], $ctr32x @ CTR block 3
326
327 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
328#ifndef __AARCH64EB__
329 ext $h3b, $h3b, $h3b, #8
330#endif
331 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
332 ld1 {$rk3s}, [$cc], #16 @ load rk3
333
334 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
335 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
336#ifndef __AARCH64EB__
337 ext $h1b, $h1b, $h1b, #8
338#endif
339
340 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
341 ld1 {$rk4s}, [$cc], #16 @ load rk4
342
343 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
344 ld1 {$rk5s}, [$cc], #16 @ load rk5
345
346 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
347 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
348
349 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
350 ld1 {$rk6s}, [$cc], #16 @ load rk6
351
352 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
353 ld1 {$rk7s}, [$cc], #16 @ load rk7
354
355 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
356 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
357
358 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
359 ld1 {$rk8s}, [$cc], #16 @ load rk8
360
361 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
362 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
363#ifndef __AARCH64EB__
364 ext $h2b, $h2b, $h2b, #8
365#endif
366
367 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
368
369 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
370 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
371
372 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
373
374 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
375
376 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
377 ld1 {$rk9s}, [$cc], #16 @ load rk9
378
379 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
380
381 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
382 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
383
384 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
385 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
386
387 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
388 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
389
390 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
391
392 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
393
394 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
395
396 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
397
398 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
399
400 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
401
402 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
403 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
404
405 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
406
407 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
408
409 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
410
411 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
412
413 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
414
415 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
416
417 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
418
419 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
420
421 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
422
423 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
424
425 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
426
427 aese $ctr2b, $rk9 @ AES block 2 - round 9
428
429 aese $ctr0b, $rk9 @ AES block 0 - round 9
430
431 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
432
433 aese $ctr1b, $rk9 @ AES block 1 - round 9
434
435 aese $ctr3b, $rk9 @ AES block 3 - round 9
436 b.ge .L128_enc_tail @ handle tail
437
438 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
439#ifdef __AARCH64EB__
440 rev $input_l0, $input_l0
441 rev $input_h0, $input_h0
442#endif
443 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
444#ifdef __AARCH64EB__
445 rev $input_l2, $input_l2
446 rev $input_h2, $input_h2
447#endif
448 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
449#ifdef __AARCH64EB__
450 rev $input_l1, $input_l1
451 rev $input_h1, $input_h1
452#endif
453 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
454#ifdef __AARCH64EB__
455 rev $input_l3, $input_l3
456 rev $input_h3, $input_h3
457#endif
458 eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
459 eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
460
461 eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
462 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
463
464 eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
465 eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
466 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
467
468 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
469 eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
470
471 eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
472 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
473
474 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
475 eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
476 rev $ctr32w, $rctr32w @ CTR block 4
477
478 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
479 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
480
481 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
482 fmov $ctr0d, $ctr96_b64x @ CTR block 4
483 add $rctr32w, $rctr32w, #1 @ CTR block 4
484
485 fmov $ctr0.d[1], $ctr32x @ CTR block 4
486 rev $ctr32w, $rctr32w @ CTR block 5
487
488 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
489 fmov $ctr1d, $ctr96_b64x @ CTR block 5
490 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
491
492 add $rctr32w, $rctr32w, #1 @ CTR block 5
493 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
494 fmov $ctr1.d[1], $ctr32x @ CTR block 5
495
496 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
497 rev $ctr32w, $rctr32w @ CTR block 6
498 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
499
500 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
501 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
502
503 add $rctr32w, $rctr32w, #1 @ CTR block 6
504 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
505 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
506
507 fmov $ctr2d, $ctr96_b64x @ CTR block 6
508 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
509
510 fmov $ctr2.d[1], $ctr32x @ CTR block 6
511 rev $ctr32w, $rctr32w @ CTR block 7
512 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
513
514 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
515
516 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
517 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
518 b.ge .L128_enc_prepretail @ do prepretail
519
520 .L128_enc_main_loop: @ main loop start
521 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
522#ifdef __AARCH64EB__
523 rev $input_l3, $input_l3
524 rev $input_h3, $input_h3
525#endif
526 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
527 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
528
529 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
530 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
531
532 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
533 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
534
535 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
536 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
537 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
538
539 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
540 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
541
542 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
543 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
544
545 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
546 eor $res0b, $res0b, $acc_lb @ PRE 1
547
548 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
549 eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
550
551 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
552 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
553 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
554#ifdef __AARCH64EB__
555 rev $input_l0, $input_l0
556 rev $input_h0, $input_h0
557#endif
558 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
559 rev $ctr32w, $rctr32w @ CTR block 4k+8
560
561 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
562 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
563 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
564
565 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
566 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
567 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
568
569 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
570
571 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
572 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
573
574 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
575
576 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
577 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
578
579 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
580
581 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
582 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
583
584 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
585
586 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
587 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
588
589 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
590 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
591
592 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
593 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
594
595 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
596 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
597
598 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
599 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
600
601 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
602 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
603
604 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
605
606 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
607 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
608
609 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
610
611 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
612 movi $mod_constant.8b, #0xc2
613
614 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
615 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
616
617 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
618
619 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
620 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
621
622 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
623 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
624
625 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
626 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
627#ifdef __AARCH64EB__
628 rev $input_l1, $input_l1
629 rev $input_h1, $input_h1
630#endif
631 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
632 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
633
634 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
635 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
636#ifdef __AARCH64EB__
637 rev $input_l2, $input_l2
638 rev $input_h2, $input_h2
639#endif
640 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
641 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
642
643 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
644 eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
645
646 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
647 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
648
649 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
650 eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
651
652 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
653 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
654
655 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
656 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
657 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
658
659 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
660 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
661 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
662
663 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
664 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
665
666 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
667 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
668
669 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
670 eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
671
672 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
673 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
674
675 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
676 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
677
678 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
679 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
680
681 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
682 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
683
684 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
685 eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
686 eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
687
688 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
689 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
690
691 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
692 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
693
694 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
695 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
696
697 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
698 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
699
700 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
701 rev $ctr32w, $rctr32w @ CTR block 4k+9
702 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
703
704 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
705 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
706
707 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
708 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
709 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
710
711 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
712 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
713 rev $ctr32w, $rctr32w @ CTR block 4k+10
714
715 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
716 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
717 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
718 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
719
720 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
721 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
722 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
723 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
724
725 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
726 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
727
728 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
729 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
730 rev $ctr32w, $rctr32w @ CTR block 4k+11
731
732 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
733 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
734
735 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
736 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
737 b.lt .L128_enc_main_loop
738
739 .L128_enc_prepretail: @ PREPRETAIL
740 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
741 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
742 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
743
744 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
745 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
746 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
747
748 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
749 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
750
751 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
752
753 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
754 eor $res0b, $res0b, $acc_lb @ PRE 1
755
756 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
757
758 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
759 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
760
761 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
762 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
763
764 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
765 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
766
767 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
768 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
769
770 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
771
772 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
773 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
774
775 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
776
777 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
778 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
779
780 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
781
782 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
783 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
784
785 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
786
787 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
788 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
789
790 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
791 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
792
793 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
794
795 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
796 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
797
798 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
799
800 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
801
802 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
803 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
804
805 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
806
807 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
808 movi $mod_constant.8b, #0xc2
809
810 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
811 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
812
813 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
814
815 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
816 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
817
818 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
819
820 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
821 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
822
823 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
824
825 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
826 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
827
828 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
829 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
830
831 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
832
833 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
834 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
835
836 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
837
838 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
839 ext $acc_hb, $acc_hb, $acc_hb, #8
840
841 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
842
843 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
844 eor $acc_mb, $acc_mb, $acc_lb
845
846 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
847
848 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
849
850 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
851
852 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
853 eor $acc_mb, $acc_mb, $t1.16b
854
855 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
856
857 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
858
859 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
860
861 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
862 eor $acc_mb, $acc_mb, $acc_hb
863
864 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
865
866 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
867
868 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
869
870 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
871
872 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
873 ext $acc_mb, $acc_mb, $acc_mb, #8
874
875 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
876
877 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
878 eor $acc_lb, $acc_lb, $t1.16b
879
880 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
881
882 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
883
884 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
885
886 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
887
888 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
889 eor $acc_lb, $acc_lb, $acc_mb
890
891 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
892 .L128_enc_tail: @ TAIL
893
894 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
895 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
896#ifdef __AARCH64EB__
897 rev $input_l0, $input_l0
898 rev $input_h0, $input_h0
899#endif
900 cmp $main_end_input_ptr, #48
901
902 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
903 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
904 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
905
906 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
907
908 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
909
910 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
911
912 b.gt .L128_enc_blocks_more_than_3
913
914 sub $rctr32w, $rctr32w, #1
915 movi $acc_l.8b, #0
916 mov $ctr3b, $ctr2b
917
918 cmp $main_end_input_ptr, #32
919 mov $ctr2b, $ctr1b
920 movi $acc_h.8b, #0
921
922 movi $acc_m.8b, #0
923 b.gt .L128_enc_blocks_more_than_2
924
925 mov $ctr3b, $ctr1b
926 cmp $main_end_input_ptr, #16
927
928 sub $rctr32w, $rctr32w, #1
929 b.gt .L128_enc_blocks_more_than_1
930
931 sub $rctr32w, $rctr32w, #1
932 b .L128_enc_blocks_less_than_1
933 .L128_enc_blocks_more_than_3: @ blocks left > 3
934 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
935
936 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
937#ifdef __AARCH64EB__
938 rev $input_l0, $input_l0
939 rev $input_h0, $input_h0
940#endif
941 rev64 $res0b, $res1b @ GHASH final-3 block
942
943 eor $res0b, $res0b, $t0.16b @ feed in partial tag
944 eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
945 eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
946
947 fmov $res1d, $input_l0 @ AES final-2 block - mov low
948
949 movi $t0.8b, #0 @ suppress further partial tag feed in
950 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
951
952 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
953 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
954
955 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
956
957 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
958
959 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
960 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
961
962 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
963 .L128_enc_blocks_more_than_2: @ blocks left > 2
964
965 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
966
967 rev64 $res0b, $res1b @ GHASH final-2 block
968 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
969#ifdef __AARCH64EB__
970 rev $input_l0, $input_l0
971 rev $input_h0, $input_h0
972#endif
973 eor $res0b, $res0b, $t0.16b @ feed in partial tag
974
975 eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
976
977 fmov $res1d, $input_l0 @ AES final-1 block - mov low
978 eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
979
980 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
981 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
982
983 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
984
985 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
986
987 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
988
989 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
990
991 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
992
993 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
994
995 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
996
997 movi $t0.8b, #0 @ suppress further partial tag feed in
998
999 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
1000 .L128_enc_blocks_more_than_1: @ blocks left > 1
1001
1002 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
1003
1004 rev64 $res0b, $res1b @ GHASH final-1 block
1005 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
1006#ifdef __AARCH64EB__
1007 rev $input_l0, $input_l0
1008 rev $input_h0, $input_h0
1009#endif
1010 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1011
1012 eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
1013 eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
1014
1015 fmov $res1d, $input_l0 @ AES final block - mov low
1016
1017 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
1018 fmov $res1.d[1], $input_h0 @ AES final block - mov high
1019
1020 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
1021
1022 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
1023
1024 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
1025
1026 eor $res1b, $res1b, $ctr3b @ AES final block - result
1027
1028 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
1029
1030 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
1031
1032 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
1033
1034 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
1035
1036 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
1037 movi $t0.8b, #0 @ suppress further partial tag feed in
1038 .L128_enc_blocks_less_than_1: @ blocks left <= 1
1039
1040 and $bit_length, $bit_length, #127 @ bit_length %= 128
1041 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
1042
1043 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
1044 sub $bit_length, $bit_length, #128 @ bit_length -= 128
1045
1046 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
1047
1048 and $bit_length, $bit_length, #127 @ bit_length %= 128
1049
1050 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
1051 cmp $bit_length, #64
1052
1053 csel $input_l0, $rk10_l, $rk10_h, lt
1054 csel $input_h0, $rk10_h, xzr, lt
1055
1056 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
1057
1058 fmov $ctr0.d[1], $input_h0
1059
1060 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1061
1062 rev64 $res0b, $res1b @ GHASH final block
1063
1064 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1065
1066 mov $t0d, $res0.d[1] @ GHASH final block - mid
1067
1068 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1069 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
1070
1071 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1072#ifndef __AARCH64EB__
1073 rev $ctr32w, $rctr32w
1074#else
1075 mov $ctr32w, $rctr32w
1076#endif
1077 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1078
1079 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1080
1081 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1082
1083 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1084
1085 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1086 movi $mod_constant.8b, #0xc2
1087
1088 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1089
1090 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1091
1092 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1093
1094 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1095
1096 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1097
1098 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1099
1100 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1101
1102 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1103
1104 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1105
1106 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
1107
1108 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
1109 st1 { $res1b}, [$output_ptr] @ store all 16B
1110
1111 str $ctr32w, [$counter, #12] @ store the updated counter
1112
1113 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1114 ext $acc_lb, $acc_lb, $acc_lb, #8
1115 rev64 $acc_lb, $acc_lb
1116 mov x0, $len
1117 st1 { $acc_l.16b }, [$current_tag]
1118 ldp x21, x22, [sp, #16]
1119 ldp x23, x24, [sp, #32]
1120 ldp d8, d9, [sp, #48]
1121 ldp d10, d11, [sp, #64]
1122 ldp d12, d13, [sp, #80]
1123 ldp d14, d15, [sp, #96]
1124 ldp x19, x20, [sp], #112
1125 ret
1126
1127.L128_enc_ret:
1128 mov w0, #0x0
1129 ret
1130.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
1131___
1132
1133#########################################################################################
1134# size_t aes_gcm_dec_128_kernel(const unsigned char *in,
1135# size_t len,
1136# unsigned char *out,
1137# const void *key,
1138# unsigned char ivec[16],
1139# u64 *Xi);
1140#
1141$code.=<<___;
1142.global aes_gcm_dec_128_kernel
1143.type aes_gcm_dec_128_kernel,%function
1144.align 4
1145aes_gcm_dec_128_kernel:
1146 AARCH64_VALID_CALL_TARGET
1147 cbz x1, .L128_dec_ret
1148 stp x19, x20, [sp, #-112]!
1149 mov x16, x4
1150 mov x8, x5
1151 stp x21, x22, [sp, #16]
1152 stp x23, x24, [sp, #32]
1153 stp d8, d9, [sp, #48]
1154 stp d10, d11, [sp, #64]
1155 stp d12, d13, [sp, #80]
1156 stp d14, d15, [sp, #96]
1157
1158 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
1159 mov $len, $main_end_input_ptr
1160 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
1161#ifdef __AARCH64EB__
1162 rev $ctr96_b64x, $ctr96_b64x
1163 rev $ctr96_t32x, $ctr96_t32x
1164#endif
1165 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
1166#ifdef __AARCH64EB__
1167 ror $rk10_h, $rk10_h, 32
1168 ror $rk10_l, $rk10_l, 32
1169#endif
1170 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
1171 ld1 {$rk0s}, [$cc], #16 @ load rk0
1172
1173 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1174 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
1175
1176 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
1177#ifndef __AARCH64EB__
1178 ext $h2b, $h2b, $h2b, #8
1179#endif
1180 lsr $rctr32x, $ctr96_t32x, #32
1181 fmov $ctr2d, $ctr96_b64x @ CTR block 2
1182
1183 ld1 {$rk1s}, [$cc], #16 @ load rk1
1184 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
1185 rev $rctr32w, $rctr32w @ rev_ctr32
1186
1187 fmov $ctr1d, $ctr96_b64x @ CTR block 1
1188 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
1189
1190 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
1191 rev $ctr32w, $rctr32w @ CTR block 1
1192
1193 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
1194 ld1 {$rk2s}, [$cc], #16 @ load rk2
1195 add $rctr32w, $rctr32w, #1 @ CTR block 1
1196
1197 fmov $ctr1.d[1], $ctr32x @ CTR block 1
1198 rev $ctr32w, $rctr32w @ CTR block 2
1199 add $rctr32w, $rctr32w, #1 @ CTR block 2
1200
1201 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
1202 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
1203
1204 fmov $ctr2.d[1], $ctr32x @ CTR block 2
1205 rev $ctr32w, $rctr32w @ CTR block 3
1206
1207 fmov $ctr3d, $ctr96_b64x @ CTR block 3
1208 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
1209 add $rctr32w, $rctr32w, #1 @ CTR block 3
1210
1211 fmov $ctr3.d[1], $ctr32x @ CTR block 3
1212 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
1213
1214 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
1215 ld1 {$rk3s}, [$cc], #16 @ load rk3
1216
1217 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
1218 ld1 {$rk4s}, [$cc], #16 @ load rk4
1219
1220 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
1221 ld1 {$rk5s}, [$cc], #16 @ load rk5
1222
1223 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
1224 ld1 {$rk6s}, [$cc], #16 @ load rk6
1225
1226 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
1227
1228 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
1229
1230 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
1231
1232 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
1233 ld1 { $acc_lb}, [$current_tag]
1234 ext $acc_lb, $acc_lb, $acc_lb, #8
1235 rev64 $acc_lb, $acc_lb
1236
1237 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
1238 ld1 {$rk7s}, [$cc], #16 @ load rk7
1239
1240 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
1241
1242 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
1243
1244 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
1245 ld1 {$rk8s}, [$cc], #16 @ load rk8
1246
1247 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
1248
1249 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
1250
1251 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
1252 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1253#ifndef __AARCH64EB__
1254 ext $h3b, $h3b, $h3b, #8
1255#endif
1256 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
1257 ld1 {$rk9s}, [$cc], #16 @ load rk9
1258
1259 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
1260
1261 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
1262
1263 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
1264
1265 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
1266
1267 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
1268 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1269#ifndef __AARCH64EB__
1270 ext $h1b, $h1b, $h1b, #8
1271#endif
1272 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
1273
1274 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
1275
1276 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
1277
1278 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
1279
1280 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
1281 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
1282
1283 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1284#ifndef __AARCH64EB__
1285 ext $h4b, $h4b, $h4b, #8
1286#endif
1287 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
1288 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1289
1290 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
1291
1292 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
1293
1294 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
1295 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
1296
1297 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
1298
1299 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
1300 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
1301
1302 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
1303
1304 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
1305
1306 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
1307 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
1308
1309 aese $ctr2b, $rk9 @ AES block 2 - round 9
1310
1311 aese $ctr3b, $rk9 @ AES block 3 - round 9
1312
1313 aese $ctr0b, $rk9 @ AES block 0 - round 9
1314 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
1315
1316 aese $ctr1b, $rk9 @ AES block 1 - round 9
1317 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
1318 b.ge .L128_dec_tail @ handle tail
1319
1320 ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
1321
1322 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
1323 ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext
1324
1325 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
1326 rev64 $res0b, $res0b @ GHASH block 0
1327 rev $ctr32w, $rctr32w @ CTR block 4
1328
1329 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
1330 add $rctr32w, $rctr32w, #1 @ CTR block 4
1331 ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext
1332
1333 rev64 $res1b, $res1b @ GHASH block 1
1334 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
1335
1336 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
1337
1338 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
1339 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
1340
1341 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
1342
1343 fmov $ctr0d, $ctr96_b64x @ CTR block 4
1344
1345 fmov $ctr0.d[1], $ctr32x @ CTR block 4
1346 rev $ctr32w, $rctr32w @ CTR block 5
1347 eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
1348#ifdef __AARCH64EB__
1349 rev $output_l1, $output_l1
1350#endif
1351 fmov $ctr1d, $ctr96_b64x @ CTR block 5
1352 add $rctr32w, $rctr32w, #1 @ CTR block 5
1353 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
1354
1355 fmov $ctr1.d[1], $ctr32x @ CTR block 5
1356 rev $ctr32w, $rctr32w @ CTR block 6
1357 add $rctr32w, $rctr32w, #1 @ CTR block 6
1358
1359 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
1360
1361 eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
1362#ifdef __AARCH64EB__
1363 rev $output_h1, $output_h1
1364#endif
1365 eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
1366#ifdef __AARCH64EB__
1367 rev $output_l0, $output_l0
1368#endif
1369 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
1370
1371 eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
1372#ifdef __AARCH64EB__
1373 rev $output_h0, $output_h0
1374#endif
1375 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
1376
1377 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
1378 b.ge .L128_dec_prepretail @ do prepretail
1379
1380 .L128_dec_main_loop: @ main loop start
1381 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
1382 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1383 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
1384
1385 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
1386 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
1387
1388 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
1389 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
1390
1391 rev64 $res2b, $res2b @ GHASH block 4k+2
1392 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
1393 rev $ctr32w, $rctr32w @ CTR block 4k+7
1394
1395 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
1396 eor $res0b, $res0b, $acc_lb @ PRE 1
1397 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
1398
1399 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
1400 rev64 $res3b, $res3b @ GHASH block 4k+3
1401
1402 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
1403 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
1404 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
1405
1406 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
1407 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
1408 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
1409
1410 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
1411 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
1412
1413 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
1414 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
1415
1416 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
1417 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
1418
1419 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
1420
1421 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
1422 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
1423
1424 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
1425 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
1426
1427 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
1428
1429 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
1430 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
1431
1432 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
1433 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
1434#ifdef __AARCH64EB__
1435 rev $output_l3, $output_l3
1436#endif
1437 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
1438 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
1439#ifdef __AARCH64EB__
1440 rev $output_h2, $output_h2
1441#endif
1442 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
1443
1444 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
1445 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
1446
1447 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
1448
1449 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
1450 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
1451
1452 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
1453
1454 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
1455 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
1456
1457 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
1458
1459 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
1460 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
1461
1462 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
1463
1464 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
1465 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
1466
1467 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
1468 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
1469
1470 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
1471 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
1472#ifdef __AARCH64EB__
1473 rev $output_h3, $output_h3
1474#endif
1475 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
1476 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
1477
1478 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
1479 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
1480#ifdef __AARCH64EB__
1481 rev $output_l2, $output_l2
1482#endif
1483 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
1484 movi $mod_constant.8b, #0xc2
1485
1486 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
1487 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
1488
1489 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
1490
1491 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
1492 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
1493
1494 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
1495 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
1496
1497 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
1498 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
1499 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+3 - load ciphertext
1500
1501 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
1502 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
1503
1504 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
1505 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1506
1507 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
1508 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
1509
1510 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
1511 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
1512
1513 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
1514 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1515
1516 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
1517 rev $ctr32w, $rctr32w @ CTR block 4k+8
1518
1519 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1520 ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
1521 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1522
1523 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
1524 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
1525
1526 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
1527 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1528
1529 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
1530
1531 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
1532 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
1533
1534 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
1535 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext
1536
1537 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
1538 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1539 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
1540
1541 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
1542 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext
1543
1544 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
1545
1546 rev64 $res1b, $res1b @ GHASH block 4k+5
1547 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1548 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
1549
1550 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
1551 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
1552
1553 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
1554 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
1555
1556 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1557 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
1558 rev $ctr32w, $rctr32w @ CTR block 4k+9
1559
1560 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
1561 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
1562 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1563
1564 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
1565 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
1566#ifdef __AARCH64EB__
1567 rev $output_h0, $output_h0
1568#endif
1569 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1570 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
1571 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
1572#ifdef __AARCH64EB__
1573 rev $output_l0, $output_l0
1574#endif
1575 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
1576 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
1577 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
1578
1579 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
1580 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
1581 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
1582
1583 rev64 $res0b, $res0b @ GHASH block 4k+4
1584 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1585 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
1586
1587 rev $ctr32w, $rctr32w @ CTR block 4k+10
1588 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
1589
1590 eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
1591#ifdef __AARCH64EB__
1592 rev $output_h1, $output_h1
1593#endif
1594 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
1595
1596 eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
1597#ifdef __AARCH64EB__
1598 rev $output_l1, $output_l1
1599#endif
1600 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
1601
1602 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
1603 b.lt .L128_dec_main_loop @ LOOP CONTROL - branch back to the main loop label while input_ptr < main_end_input_ptr (flags set by the cmp above)
1604
1605 .L128_dec_prepretail: @ PREPRETAIL
1606 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1607 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
1608 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
1609
1610 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
1611 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
1612
1613 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
1614 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
1615
1616 eor $res0b, $res0b, $acc_lb @ PRE 1
1617 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
1618 rev64 $res2b, $res2b @ GHASH block 4k+2
1619
1620 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
1621 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
1622
1623 rev $ctr32w, $rctr32w @ CTR block 4k+7
1624 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
1625 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
1626
1627 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
1628 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
1629 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
1630
1631 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
1632 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
1633
1634 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
1635 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
1636
1637 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
1638 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
1639 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
1640
1641 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
1642 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
1643
1644 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
1645 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
1646
1647 rev64 $res3b, $res3b @ GHASH block 4k+3
1648
1649 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
1650 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
1651
1652 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
1653
1654 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
1655 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
1656
1657 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
1658
1659 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
1660 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
1661
1662 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
1663
1664 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
1665 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
1666
1667 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
1668
1669 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
1670
1671 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
1672 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
1673
1674 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
1675 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
1676
1677 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
1678
1679 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
1680 movi $mod_constant.8b, #0xc2
1681
1682 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
1683 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
1684
1685 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
1686
1687 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
1688 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
1689
1690 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
1691 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
1692#ifdef __AARCH64EB__
1693 rev $output_l3, $output_l3
1694#endif
1695 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
1696 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
1697#ifdef __AARCH64EB__
1698 rev $output_l2, $output_l2
1699#endif
1700 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
1701
1702 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
1703
1704 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
1705 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1706
1707 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
1708
1709 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
1710 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
1711
1712 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
1713
1714 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
1715 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1716
1717 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
1718
1719 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
1720
1721 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
1722
1723 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
1724 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1725
1726 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1727
1728 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
1729 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1730
1731 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
1732
1733 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
1734 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1735
1736 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
1737
1738 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
1739
1740 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
1741
1742 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
1743 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1744
1745 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
1746
1747 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
1748
1749 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
1750
1751 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1752 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
1753#ifdef __AARCH64EB__
1754 rev $output_h3, $output_h3
1755#endif
1756 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
1757 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1758
1759 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
1760
1761 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
1762 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1763
1764 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
1765
1766 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
1767 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
1768#ifdef __AARCH64EB__
1769 rev $output_h2, $output_h2
1770#endif
1771 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
1772 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
1773
1774 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
1775 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
1776 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
1777
1778 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
1779 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1780 .L128_dec_tail: @ TAIL
1781
1782 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
1783 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
1784
1785 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
1786
1787 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
1788
1789 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
1790
1791 cmp $main_end_input_ptr, #48
1792
1793 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
1794#ifdef __AARCH64EB__
1795 rev $output_h0, $output_h0
1796#endif
1797 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
1798 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
1799#ifdef __AARCH64EB__
1800 rev $output_l0, $output_l0
1801#endif
1802 b.gt .L128_dec_blocks_more_than_3
1803
1804 mov $ctr3b, $ctr2b
1805 sub $rctr32w, $rctr32w, #1
1806 movi $acc_l.8b, #0
1807
1808 movi $acc_h.8b, #0
1809 mov $ctr2b, $ctr1b
1810
1811 movi $acc_m.8b, #0
1812 cmp $main_end_input_ptr, #32
1813 b.gt .L128_dec_blocks_more_than_2
1814
1815 cmp $main_end_input_ptr, #16
1816
1817 mov $ctr3b, $ctr1b
1818 sub $rctr32w, $rctr32w, #1
1819 b.gt .L128_dec_blocks_more_than_1
1820
1821 sub $rctr32w, $rctr32w, #1
1822 b .L128_dec_blocks_less_than_1
1823 .L128_dec_blocks_more_than_3: @ blocks left > 3
1824 rev64 $res0b, $res1b @ GHASH final-3 block
1825 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
1826
1827 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1828
1829 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
1830 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
1831 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
1832
1833 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
1834 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
1835
1836 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
1837 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
1838
1839 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
1840
1841 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
1842
1843 movi $t0.8b, #0 @ suppress further partial tag feed in
1844 eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
1845#ifdef __AARCH64EB__
1846 rev $output_h0, $output_h0
1847#endif
1848 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
1849 eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
1850#ifdef __AARCH64EB__
1851 rev $output_l0, $output_l0
1852#endif
1853 .L128_dec_blocks_more_than_2: @ blocks left > 2
1854
1855 rev64 $res0b, $res1b @ GHASH final-2 block
1856 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
1857
1858 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1859
1860 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
1861 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
1862
1863 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
1864
1865 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
1866
1867 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
1868 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
1869
1870 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
1871 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
1872
1873 movi $t0.8b, #0 @ suppress further partial tag feed in
1874
1875 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
1876
1877 eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
1878#ifdef __AARCH64EB__
1879 rev $output_l0, $output_l0
1880#endif
1881 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
1882
1883 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
1884
1885 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
1886 eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
1887#ifdef __AARCH64EB__
1888 rev $output_h0, $output_h0
1889#endif
1890 .L128_dec_blocks_more_than_1: @ blocks left > 1
1891
1892 rev64 $res0b, $res1b @ GHASH final-1 block
1893
1894 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
1895 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1896
1897 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
1898
1899 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
1900
1901 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
1902
1903 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
1904 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
1905
1906 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
1907 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
1908
1909 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
1910
1911 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
1912
1913 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
1914 movi $t0.8b, #0 @ suppress further partial tag feed in
1915
1916 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
1917
1918 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
1919 eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
1920#ifdef __AARCH64EB__
1921 rev $output_h0, $output_h0
1922#endif
1923 eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
1924#ifdef __AARCH64EB__
1925 rev $output_l0, $output_l0
1926#endif
1927 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
1928 .L128_dec_blocks_less_than_1: @ blocks left <= 1
1929
1930 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
1931 and $bit_length, $bit_length, #127 @ bit_length %= 128
1932
1933 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
1934 sub $bit_length, $bit_length, #128 @ bit_length -= 128
1935
1936 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
1937
1938 and $bit_length, $bit_length, #127 @ bit_length %= 128
1939
1940 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
1941 cmp $bit_length, #64
1942
1943 csel $ctr96_b64x, $rk10_h, xzr, lt
1944 csel $ctr32x, $rk10_l, $rk10_h, lt
1945
1946 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
1947
1948 mov $ctr0.d[1], $ctr96_b64x
1949
1950 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1951
1952 rev64 $res0b, $res1b @ GHASH final block
1953
1954 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1955
1956 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
1957
1958 and $output_h0, $output_h0, $ctr96_b64x
1959
1960 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1961 mov $t0d, $res0.d[1] @ GHASH final block - mid
1962
1963 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1964 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1965
1966 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1967
1968 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1969 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
1970 and $output_l0, $output_l0, $ctr32x
1971
1972#ifndef __AARCH64EB__
1973 rev $ctr32w, $rctr32w
1974#else
1975 mov $ctr32w, $rctr32w
1976#endif
1977
1978 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1979 movi $mod_constant.8b, #0xc2
1980
1981 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1982
1983 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
1984 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1985
1986 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1987
1988 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1989
1990 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1991
1992 orr $output_l0, $output_l0, $end_input_ptr
1993 str $ctr32w, [$counter, #12] @ store the updated counter
1994
1995 orr $output_h0, $output_h0, $main_end_input_ptr
1996 stp $output_l0, $output_h0, [$output_ptr]
1997 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1998
1999 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2000
2001 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2002
2003 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2004 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2005
2006 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
2007
2008 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2009 ext $acc_lb, $acc_lb, $acc_lb, #8
2010 rev64 $acc_lb, $acc_lb
2011 mov x0, $len
2012 st1 { $acc_l.16b }, [$current_tag]
2013
2014 ldp x21, x22, [sp, #16]
2015 ldp x23, x24, [sp, #32]
2016 ldp d8, d9, [sp, #48]
2017 ldp d10, d11, [sp, #64]
2018 ldp d12, d13, [sp, #80]
2019 ldp d14, d15, [sp, #96]
2020 ldp x19, x20, [sp], #112
2021 ret
2022
2023 .L128_dec_ret:
2024 mov w0, #0x0
2025 ret
2026.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
2027___
2028}
2029
2030{
# Symbolic register names interpolated into the assembly heredoc(s) of this
# lexical scope. Several names intentionally map to the SAME physical
# register; the shared mappings are called out below so that a change to one
# alias is not made without considering the others.

# General-purpose registers x4..x7: input end pointers and the first block's
# 64-bit low/high halves.
2031my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# x19..x24 carry the low/high halves of blocks 1..3. The input_* and output_*
# names below are two sets of names for the same six registers.
# x19..x24 are pushed on the stack by the aes_gcm_enc_192_kernel prologue.
2032my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
2033my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# output_l0/output_h0 are x6/x7, i.e. the same registers as input_l0/input_h0.
2034my ($output_l0,$output_h0)=map("x$_",(6..7));
2035
# w9 is the 32-bit view of x9 ($ctr32x).
2036my $ctr32w="w9";
# x9..x15: counter pieces, the two 64-bit halves of round key 12, and $len.
2037my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
# w11/w12 are the 32-bit views of x11 ($ctr96_t32x) and x12 ($rctr32x).
2038my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
2039
# Vector registers v0..v3 hold the four counter/AES state blocks and v4..v7
# the four result blocks. The same registers are named four ways depending on
# the operand form an instruction needs: "vN.16b", bare "vN" (so a lane or
# arrangement suffix can be appended in the template), "dN" and "qN".
2040my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
2041my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
2042my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
2043my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
2044
# GHASH accumulators: high = v9, mid = v10, low = v11.
2045my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
2046my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
2047my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
2048
# Hash key material: h1..h4 = v12..v15, h12k = v16, h34k = v17.
2049my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
2050my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
2051my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
2052
# Temporaries. NOTE: these are not distinct registers:
#   v8  : $t0, $t5, $mod_constant
#   v30 : $t1, $t4, $t9
#   v31 : $t2, $t6, $mod_t
#   v4  : $t3  (also $res0 / $ctr_t0)
#   v5  : $t7  (also $res1 / $ctr_t1)
#   v6  : $t8  (also $res2 / $ctr_t2)
# Two names that share a register must never be live at the same time.
2053my $t0="v8";
2054my $t0d="d8";
2055my $t3="v4";
2056my $t3d="d4";
2057
2058my ($t1,$t2)=map("v$_",(30..31));
2059my ($t1d,$t2d)=map("d$_",(30..31));
2060
2061my $t4="v30";
2062my $t4d="d30";
2063my $t5="v8";
2064my $t5d="d8";
2065my $t6="v31";
2066my $t6d="d31";
2067
2068my $t7="v5";
2069my $t7d="d5";
2070my $t8="v6";
2071my $t8d="d6";
2072my $t9="v30";
2073my $t9d="d30";
2074
# ctr_t0..ctr_t3 are v4..v7, i.e. further names for the $res0..$res3 registers.
2075my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
2076my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
2077my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
2078
# GHASH reduction constant and scratch; these share v8 / v31 with the
# temporaries listed above.
2079my $mod_constantd="d8";
2080my $mod_constant="v8";
2081my $mod_t="v31";
2082
# Round keys rk0..rk11 live in v18..v29 (rk12 is kept in the general-purpose
# pair $rk12_l/$rk12_h declared above).
2083my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
2084my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
2085my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29));
# Alternate operand forms of v20/v21/v22, the registers that hold rk2/rk3/rk4.
# NOTE(review): presumably used as GHASH scratch in the tail code once those
# round keys are no longer needed -- confirm against the tail sections.
2086my $rk2q1="v20.1q";
2087my $rk3q1="v21.1q";
2088my $rk4v="v22";
2089my $rk4d="d22";
2090
2091#########################################################################################
2092# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
2093# size_t len,
2094# unsigned char *out,
2095# const void *key,
2096# unsigned char ivec[16],
2097# u64 *Xi);
2098#
2099$code.=<<___;
2100.global aes_gcm_enc_192_kernel
2101.type aes_gcm_enc_192_kernel,%function
2102.align 4
2103aes_gcm_enc_192_kernel:
2104 AARCH64_VALID_CALL_TARGET
2105 cbz x1, .L192_enc_ret
2106 stp x19, x20, [sp, #-112]!
2107 mov x16, x4
2108 mov x8, x5
2109 stp x21, x22, [sp, #16]
2110 stp x23, x24, [sp, #32]
2111 stp d8, d9, [sp, #48]
2112 stp d10, d11, [sp, #64]
2113 stp d12, d13, [sp, #80]
2114 stp d14, d15, [sp, #96]
2115
2116 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
2117#ifdef __AARCH64EB__
2118 rev $ctr96_b64x, $ctr96_b64x
2119 rev $ctr96_t32x, $ctr96_t32x
2120#endif
2121 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
2122#ifdef __AARCH64EB__
2123 ror $rk12_l, $rk12_l, #32
2124 ror $rk12_h, $rk12_h, #32
2125#endif
2126 ld1 {$rk0s}, [$cc], #16 @ load rk0
2127
2128 ld1 {$rk1s}, [$cc], #16 @ load rk1
2129
2130 ld1 {$rk2s}, [$cc], #16 @ load rk2
2131
2132 lsr $rctr32x, $ctr96_t32x, #32
2133 ld1 {$rk3s}, [$cc], #16 @ load rk3
2134 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
2135
2136 ld1 {$rk4s}, [$cc], #16 @ load rk4
2137 rev $rctr32w, $rctr32w @ rev_ctr32
2138
2139 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
2140 fmov $ctr3d, $ctr96_b64x @ CTR block 3
2141
2142 rev $ctr32w, $rctr32w @ CTR block 1
2143 add $rctr32w, $rctr32w, #1 @ CTR block 1
2144 fmov $ctr1d, $ctr96_b64x @ CTR block 1
2145
2146 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
2147 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
2148
2149 fmov $ctr1.d[1], $ctr32x @ CTR block 1
2150 rev $ctr32w, $rctr32w @ CTR block 2
2151 add $rctr32w, $rctr32w, #1 @ CTR block 2
2152
2153 fmov $ctr2d, $ctr96_b64x @ CTR block 2
2154 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
2155
2156 fmov $ctr2.d[1], $ctr32x @ CTR block 2
2157 rev $ctr32w, $rctr32w @ CTR block 3
2158
2159 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
2160 ld1 {$rk5s}, [$cc], #16 @ load rk5
2161
2162 fmov $ctr3.d[1], $ctr32x @ CTR block 3
2163
2164 ld1 {$rk6s}, [$cc], #16 @ load rk6
2165
2166 ld1 {$rk7s}, [$cc], #16 @ load rk7
2167
2168 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2169 ld1 { $acc_lb}, [$current_tag]
2170 ext $acc_lb, $acc_lb, $acc_lb, #8
2171 rev64 $acc_lb, $acc_lb
2172
2173 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2174 ld1 {$rk8s}, [$cc], #16 @ load rk8
2175
2176 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2177 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2178#ifndef __AARCH64EB__
2179 ext $h4b, $h4b, $h4b, #8
2180#endif
2181 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2182 ld1 {$rk9s}, [$cc], #16 @ load rk9
2183
2184 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2185 ld1 {$rk10s}, [$cc], #16 @ load rk10
2186
2187 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2188 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2189#ifndef __AARCH64EB__
2190 ext $h1b, $h1b, $h1b, #8
2191#endif
2192 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2193 ld1 {$rk11s}, [$cc], #16 @ load rk11
2194
2195 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2196 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2197#ifndef __AARCH64EB__
2198 ext $h3b, $h3b, $h3b, #8
2199#endif
2200 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2201
2202 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2203
2204 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2205
2206 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2207 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
2208
2209 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2210
2211 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2212 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
2213
2214 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2215
2216 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2217
2218 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2219
2220 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2221
2222 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2223
2224 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2225
2226 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2227
2228 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2229
2230 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2231
2232 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2233
2234 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2235
2236 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2237 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2238#ifndef __AARCH64EB__
2239 ext $h2b, $h2b, $h2b, #8
2240#endif
2241 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
2242
2243 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2244
2245 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2246
2247 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
2248 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
2249
2250 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2251
2252 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
2253
2254 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2255 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
2256
2257 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
2258
2259 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
2260
2261 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
2262
2263 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
2264
2265 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
2266
2267 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
2268
2269 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
2270
2271 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
2272
2273 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
2274
2275 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
2276 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2277 mov $len, $main_end_input_ptr
2278
2279 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
2280 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
2281
2282 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
2283 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2284
2285 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
2286
2287 aese $ctr2b, $rk11 @ AES block 2 - round 11
2288 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2289 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2290
2291 aese $ctr1b, $rk11 @ AES block 1 - round 11
2292 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
2293
2294 aese $ctr0b, $rk11 @ AES block 0 - round 11
2295 add $rctr32w, $rctr32w, #1 @ CTR block 3
2296
2297 aese $ctr3b, $rk11 @ AES block 3 - round 11
2298 b.ge .L192_enc_tail @ handle tail
2299
2300 rev $ctr32w, $rctr32w @ CTR block 4
2301 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
2302#ifdef __AARCH64EB__
2303 rev $input_l0, $input_l0
2304 rev $input_h0, $input_h0
2305#endif
2306 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
2307 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
2308#ifdef __AARCH64EB__
2309 rev $input_l2, $input_l2
2310 rev $input_h2, $input_h2
2311#endif
2312 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
2313#ifdef __AARCH64EB__
2314 rev $input_l3, $input_l3
2315 rev $input_h3, $input_h3
2316#endif
2317 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
2318#ifdef __AARCH64EB__
2319 rev $input_l1, $input_l1
2320 rev $input_h1, $input_h1
2321#endif
2322 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2323 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
2324
2325 eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
2326
2327 eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
2328 eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
2329 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
2330
2331 eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
2332 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
2333
2334 eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
2335 eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
2336
2337 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
2338 eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
2339
2340 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
2341
2342 eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
2343 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
2344
2345 add $rctr32w, $rctr32w, #1 @ CTR block 4
2346 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
2347 fmov $ctr0d, $ctr96_b64x @ CTR block 4
2348
2349 fmov $ctr0.d[1], $ctr32x @ CTR block 4
2350 rev $ctr32w, $rctr32w @ CTR block 5
2351
2352 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
2353 add $rctr32w, $rctr32w, #1 @ CTR block 5
2354
2355 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
2356 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
2357
2358 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
2359
2360 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
2361 fmov $ctr1d, $ctr96_b64x @ CTR block 5
2362 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
2363
2364 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
2365
2366 fmov $ctr1.d[1], $ctr32x @ CTR block 5
2367 rev $ctr32w, $rctr32w @ CTR block 6
2368
2369 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
2370
2371 add $rctr32w, $rctr32w, #1 @ CTR block 6
2372 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
2373 fmov $ctr2d, $ctr96_b64x @ CTR block 6
2374
2375 fmov $ctr2.d[1], $ctr32x @ CTR block 6
2376 rev $ctr32w, $rctr32w @ CTR block 7
2377
2378 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
2379 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
2380
2381 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
2382 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
2383 b.ge .L192_enc_prepretail @ do prepretail
2384
2385 .L192_enc_main_loop: @ main loop start
2386 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2387 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2388
2389 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2390 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
2391#ifdef __AARCH64EB__
2392 rev $input_l1, $input_l1
2393 rev $input_h1, $input_h1
2394#endif
2395 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2396 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2397 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2398
2399 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2400 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2401
2402 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2403 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2404 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
2405#ifdef __AARCH64EB__
2406 rev $input_l2, $input_l2
2407 rev $input_h2, $input_h2
2408#endif
2409 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2410 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
2411#ifdef __AARCH64EB__
2412 rev $input_l3, $input_l3
2413 rev $input_h3, $input_h3
2414#endif
2415 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2416 eor $res0b, $res0b, $acc_lb @ PRE 1
2417
2418 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2419
2420 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2421 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2422
2423 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2424 eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
2425
2426 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2427 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2428
2429 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2430
2431 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2432 eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
2433
2434 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2435 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2436
2437 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2438 eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
2439
2440 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2441 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2442
2443 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2444 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2445
2446 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2447
2448 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2449
2450 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2451 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2452
2453 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2454 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2455
2456 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2457
2458 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2459 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2460
2461 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2462
2463 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2464 eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
2465 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2466
2467 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2468 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2469
2470 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2471 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2472
2473 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2474 eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
2475
2476 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2477 eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
2478 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2479
2480 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2481 rev $ctr32w, $rctr32w @ CTR block 4k+8
2482
2483 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2484 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
2485
2486 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2487 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2488
2489 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2490 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
2491#ifdef __AARCH64EB__
2492 rev $input_l0, $input_l0
2493 rev $input_h0, $input_h0
2494#endif
2495 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2496 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2497
2498 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2499 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2500
2501 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2502 movi $mod_constant.8b, #0xc2
2503
2504 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2505 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2506 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2507
2508 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2509 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2510
2511 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2512 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2513
2514 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2515 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2516
2517 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2518 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
2519
2520 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2521 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2522
2523 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2524 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
2525
2526 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2527 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2528
2529 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2530 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
2531 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2532
2533 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2534 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2535
2536 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2537 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
2538
2539 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2540 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2541 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
2542
2543 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2544 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
2545
2546 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2547 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2548 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
2549
2550 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2551
2552 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2553 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2554
2555 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2556
2557 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2558
2559 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2560
2561 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2562 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2563
2564 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2565
2566 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2567
2568 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2569
2570 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2571 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2572
2573 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2574
2575 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2576 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
2577
2578 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2579 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
2580 rev $ctr32w, $rctr32w @ CTR block 4k+9
2581
2582 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2583 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
2584 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
2585
2586 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2587 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
2588
2589 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
2590 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
2591 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
2592
2593 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2594 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
2595 rev $ctr32w, $rctr32w @ CTR block 4k+10
2596
2597 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
2598 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2599 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
2600
2601 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
2602 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
2603
2604 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2605 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
2606 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
2607
2608 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
2609 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
2610 rev $ctr32w, $rctr32w @ CTR block 4k+11
2611
2612 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2613 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
2614
2615 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
2616 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
2617 b.lt .L192_enc_main_loop
2618
2619 .L192_enc_prepretail: @ PREPRETAIL
2620 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2621 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2622
2623 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2624 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2625 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2626
2627 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2628 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2629
2630 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2631
2632 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2633 eor $res0b, $res0b, $acc_lb @ PRE 1
2634 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2635
2636 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2637 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2638
2639 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2640
2641 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2642 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2643
2644 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2645 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2646
2647 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2648
2649 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2650 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2651
2652 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2653 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2654
2655 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2656 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2657
2658 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2659
2660 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2661 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2662
2663 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2664
2665 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2666 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2667
2668 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2669
2670 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2671 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2672
2673 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2674 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2675
2676 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2677
2678 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2679 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2680
2681 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2682
2683 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2684
2685 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2686
2687 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2688 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2689
2690 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2691
2692 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2693 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2694
2695 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2696
2697 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2698 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2699
2700 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2701
2702 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2703 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2704
2705 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2706
2707 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2708 movi $mod_constant.8b, #0xc2
2709
2710 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2711
2712 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2713
2714 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2715 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2716
2717 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2718
2719 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2720
2721 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2722 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2723
2724 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2725
2726 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2727 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
2728
2729 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2730
2731 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2732 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2733
2734 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2735
2736 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2737 eor $acc_mb, $acc_mb, $acc_lb
2738
2739 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2740
2741 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
2742
2743 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2744 ext $acc_hb, $acc_hb, $acc_hb, #8
2745
2746 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2747
2748 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2749 eor $acc_mb, $acc_mb, $t1.16b
2750
2751 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2752
2753 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2754
2755 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2756
2757 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2758 eor $acc_mb, $acc_mb, $acc_hb
2759
2760 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2761
2762 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2763
2764 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2765
2766 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
2767
2768 ext $acc_mb, $acc_mb, $acc_mb, #8
2769
2770 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2771
2772 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2773
2774 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2775
2776 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2777 eor $acc_lb, $acc_lb, $t1.16b
2778
2779 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2780
2781 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2782
2783 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2784
2785 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2786 eor $acc_lb, $acc_lb, $acc_mb
2787 .L192_enc_tail: @ TAIL
2788
2789 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
2790 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
2791#ifdef __AARCH64EB__
2792 rev $input_l0, $input_l0
2793 rev $input_h0, $input_h0
2794#endif
2795 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2796 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2797
2798 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2799
2800 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2801 cmp $main_end_input_ptr, #48
2802
2803 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2804
2805 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
2806 b.gt .L192_enc_blocks_more_than_3
2807
2808 sub $rctr32w, $rctr32w, #1
2809 movi $acc_m.8b, #0
2810
2811 mov $ctr3b, $ctr2b
2812 movi $acc_h.8b, #0
2813 cmp $main_end_input_ptr, #32
2814
2815 mov $ctr2b, $ctr1b
2816 movi $acc_l.8b, #0
2817 b.gt .L192_enc_blocks_more_than_2
2818
2819 sub $rctr32w, $rctr32w, #1
2820
2821 mov $ctr3b, $ctr1b
2822 cmp $main_end_input_ptr, #16
2823 b.gt .L192_enc_blocks_more_than_1
2824
2825 sub $rctr32w, $rctr32w, #1
2826 b .L192_enc_blocks_less_than_1
2827 .L192_enc_blocks_more_than_3: @ blocks left > 3
2828 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
2829
2830 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
2831#ifdef __AARCH64EB__
2832 rev $input_l0, $input_l0
2833 rev $input_h0, $input_h0
2834#endif
2835 rev64 $res0b, $res1b @ GHASH final-3 block
2836
2837 eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
2838 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2839
2840 eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
2841 fmov $res1d, $input_l0 @ AES final-2 block - mov low
2842
2843 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
2844
2845 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
2846
2847 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
2848
2849 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
2850
2851 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
2852
2853 movi $t0.8b, #0 @ suppress further partial tag feed in
2854
2855 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
2856
2857 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
2858 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
2859 .L192_enc_blocks_more_than_2: @ blocks left > 2
2860
2861 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
2862
2863 rev64 $res0b, $res1b @ GHASH final-2 block
2864 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
2865#ifdef __AARCH64EB__
2866 rev $input_l0, $input_l0
2867 rev $input_h0, $input_h0
2868#endif
2869 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2870
2871 eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
2872
2873 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
2874 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
2875
2876 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
2877 eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
2878
2879 fmov $res1d, $input_l0 @ AES final-1 block - mov low
2880
2881 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
2882 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
2883 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
2884
2885 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
2886
2887 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
2888
2889 movi $t0.8b, #0 @ suppress further partial tag feed in
2890
2891 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
2892
2893 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
2894 .L192_enc_blocks_more_than_1: @ blocks left > 1
2895
2896 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
2897
2898 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
2899#ifdef __AARCH64EB__
2900 rev $input_l0, $input_l0
2901 rev $input_h0, $input_h0
2902#endif
2903 rev64 $res0b, $res1b @ GHASH final-1 block
2904
2905 eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
2906 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2907 movi $t0.8b, #0 @ suppress further partial tag feed in
2908
2909 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
2910
2911 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
2912 eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
2913 fmov $res1d, $input_l0 @ AES final block - mov low
2914
2915 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
2916 fmov $res1.d[1], $input_h0 @ AES final block - mov high
2917
2918 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
2919
2920 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
2921
2922 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
2923
2924 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
2925
2926 eor $res1b, $res1b, $ctr3b @ AES final block - result
2927
2928 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
2929
2930 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
2931 .L192_enc_blocks_less_than_1: @ blocks left <= 1
2932
2933 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
2934#ifndef __AARCH64EB__
2935 rev $ctr32w, $rctr32w
2936#else
2937 mov $ctr32w, $rctr32w
2938#endif
2939 and $bit_length, $bit_length, #127 @ bit_length %= 128
2940
2941 sub $bit_length, $bit_length, #128 @ bit_length -= 128
2942 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
2943
2944 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
2945 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
2946
2947 and $bit_length, $bit_length, #127 @ bit_length %= 128
2948
2949 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
2950 cmp $bit_length, #64
2951
2952 csel $input_l0, $rk12_l, $rk12_h, lt
2953 csel $input_h0, $rk12_h, xzr, lt
2954
2955 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
2956
2957 fmov $ctr0.d[1], $input_h0
2958
2959 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
2960
2961 rev64 $res0b, $res1b @ GHASH final block
2962
2963 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2964
2965 mov $t0d, $res0.d[1] @ GHASH final block - mid
2966
2967 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
2968
2969 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
2970
2971 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
2972
2973 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
2974
2975 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
2976
2977 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
2978
2979 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
2980 movi $mod_constant.8b, #0xc2
2981
2982 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2983
2984 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2985
2986 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
2987
2988 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2989
2990 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2991
2992 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2993
2994 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2995
2996 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2997
2998 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2999
3000 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3001
3002 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
3003 str $ctr32w, [$counter, #12] @ store the updated counter
3004
3005 st1 { $res1b}, [$output_ptr] @ store all 16B
3006
3007 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3008 ext $acc_lb, $acc_lb, $acc_lb, #8
3009 rev64 $acc_lb, $acc_lb
3010 mov x0, $len
3011 st1 { $acc_l.16b }, [$current_tag]
3012
3013 ldp x21, x22, [sp, #16]
3014 ldp x23, x24, [sp, #32]
3015 ldp d8, d9, [sp, #48]
3016 ldp d10, d11, [sp, #64]
3017 ldp d12, d13, [sp, #80]
3018 ldp d14, d15, [sp, #96]
3019 ldp x19, x20, [sp], #112
3020 ret
3021
3022.L192_enc_ret:
3023 mov w0, #0x0
3024 ret
3025.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
3026___
3027
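3028#########################################################################################
3029# size_t aes_gcm_dec_192_kernel(const unsigned char *in,
3030# size_t len, /* x1: length in BITS (shifted right by 3 to get bytes) */
3031# unsigned char *out,
3032# u64 *Xi, /* x3: current tag; H powers are read at offsets from it */
3033# unsigned char ivec[16], /* x4 (copied to x16): counter block */
3034# const void *key); /* x5 (copied to x8): AES-192 round keys */
3035# NOTE: parameter order above follows the register usage in the code below.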
3036$code.=<<___;
3037.global aes_gcm_dec_192_kernel
3038.type aes_gcm_dec_192_kernel,%function
3039.align 4
3040aes_gcm_dec_192_kernel:
3041 AARCH64_VALID_CALL_TARGET
3042 cbz x1, .L192_dec_ret
3043 stp x19, x20, [sp, #-112]!
3044 mov x16, x4
3045 mov x8, x5
3046 stp x21, x22, [sp, #16]
3047 stp x23, x24, [sp, #32]
3048 stp d8, d9, [sp, #48]
3049 stp d10, d11, [sp, #64]
3050 stp d12, d13, [sp, #80]
3051 stp d14, d15, [sp, #96]
3052
3053 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
3054 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
3055#ifdef __AARCH64EB__
3056 rev $ctr96_b64x, $ctr96_b64x
3057 rev $ctr96_t32x, $ctr96_t32x
3058#endif
3059 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
3060#ifdef __AARCH64EB__
3061 ror $rk12_l, $rk12_l, #32
3062 ror $rk12_h, $rk12_h, #32
3063#endif
3064 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
3065
3066 ld1 {$rk0s}, [$cc], #16 @ load rk0
3067
3068 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
3069 mov $len, $main_end_input_ptr
3070 ld1 {$rk1s}, [$cc], #16 @ load rk1
3071
3072 lsr $rctr32x, $ctr96_t32x, #32
3073 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
3074 fmov $ctr3d, $ctr96_b64x @ CTR block 3
3075
3076 rev $rctr32w, $rctr32w @ rev_ctr32
3077 fmov $ctr1d, $ctr96_b64x @ CTR block 1
3078
3079 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
3080 ld1 {$rk2s}, [$cc], #16 @ load rk2
3081
3082 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
3083 rev $ctr32w, $rctr32w @ CTR block 1
3084
3085 add $rctr32w, $rctr32w, #1 @ CTR block 1
3086 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
3087 ld1 {$rk3s}, [$cc], #16 @ load rk3
3088
3089 fmov $ctr1.d[1], $ctr32x @ CTR block 1
3090 rev $ctr32w, $rctr32w @ CTR block 2
3091 add $rctr32w, $rctr32w, #1 @ CTR block 2
3092
3093 fmov $ctr2d, $ctr96_b64x @ CTR block 2
3094 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
3095
3096 fmov $ctr2.d[1], $ctr32x @ CTR block 2
3097 rev $ctr32w, $rctr32w @ CTR block 3
3098
3099 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
3100 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
3101
3102 fmov $ctr3.d[1], $ctr32x @ CTR block 3
3103
3104 ld1 {$rk4s}, [$cc], #16 @ load rk4
3105
3106 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
3107
3108 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
3109 ld1 {$rk5s}, [$cc], #16 @ load rk5
3110
3111 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
3112 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
3113#ifndef __AARCH64EB__
3114 ext $h4b, $h4b, $h4b, #8
3115#endif
3116 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
3117 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
3118#ifndef __AARCH64EB__
3119 ext $h2b, $h2b, $h2b, #8
3120#endif
3121 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
3122 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
3123#ifndef __AARCH64EB__
3124 ext $h3b, $h3b, $h3b, #8
3125#endif
3126 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
3127
3128 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
3129 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
3130#ifndef __AARCH64EB__
3131 ext $h1b, $h1b, $h1b, #8
3132#endif
3133 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
3134 ld1 {$rk6s}, [$cc], #16 @ load rk6
3135
3136 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
3137 ld1 {$rk7s}, [$cc], #16 @ load rk7
3138
3139 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
3140 ld1 {$rk8s}, [$cc], #16 @ load rk8
3141
3142 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
3143 ld1 {$rk9s}, [$cc], #16 @ load rk9
3144
3145 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
3146 ld1 { $acc_lb}, [$current_tag]
3147 ext $acc_lb, $acc_lb, $acc_lb, #8
3148 rev64 $acc_lb, $acc_lb
3149
3150 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
3151 add $rctr32w, $rctr32w, #1 @ CTR block 3
3152
3153 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
3154 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
3155
3156 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
3157 ld1 {$rk10s}, [$cc], #16 @ load rk10
3158
3159 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
3160 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
3161
3162 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
3163
3164 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
3165 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
3166
3167 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
3168 ld1 {$rk11s}, [$cc], #16 @ load rk11
3169
3170 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
3171
3172 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
3173
3174 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
3175
3176 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
3177
3178 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
3179
3180 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
3181
3182 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
3183
3184 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
3185
3186 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
3187
3188 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
3189
3190 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3191
3192 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3193
3194 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3195
3196 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3197
3198 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3199
3200 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3201 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3202
3203 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3204 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3205
3206 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3207 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3208
3209 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3210 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
3211
3212 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3213 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
3214
3215 aese $ctr3b, $rk11 @ AES block 3 - round 11
3216
3217 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3218
3219 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3220
3221 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3222 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
3223
3224 aese $ctr2b, $rk11 @ AES block 2 - round 11
3225
3226 aese $ctr1b, $rk11 @ AES block 1 - round 11
3227 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
3228
3229 aese $ctr0b, $rk11 @ AES block 0 - round 11
3230 b.ge .L192_dec_tail @ handle tail
3231
3232 ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext
3233
3234 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
3235
3236 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
3237 rev $ctr32w, $rctr32w @ CTR block 4
3238 ld1 {$res2b, $res3b}, [$input_ptr], #32 @ AES block 2,3 - load ciphertext
3239
3240 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
3241
3242 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
3243
3244 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
3245 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
3246 add $rctr32w, $rctr32w, #1 @ CTR block 4
3247
3248 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
3249 rev64 $res0b, $res0b @ GHASH block 0
3250
3251 fmov $ctr0d, $ctr96_b64x @ CTR block 4
3252 rev64 $res1b, $res1b @ GHASH block 1
3253 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
3254
3255 eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
3256#ifdef __AARCH64EB__
3257 rev $output_l1, $output_l1
3258#endif
3259 fmov $ctr0.d[1], $ctr32x @ CTR block 4
3260 rev $ctr32w, $rctr32w @ CTR block 5
3261
3262 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
3263 fmov $ctr1d, $ctr96_b64x @ CTR block 5
3264 eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
3265#ifdef __AARCH64EB__
3266 rev $output_h1, $output_h1
3267#endif
3268 add $rctr32w, $rctr32w, #1 @ CTR block 5
3269 fmov $ctr1.d[1], $ctr32x @ CTR block 5
3270 eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
3271#ifdef __AARCH64EB__
3272 rev $output_l0, $output_l0
3273#endif
3274 rev $ctr32w, $rctr32w @ CTR block 6
3275 eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
3276#ifdef __AARCH64EB__
3277 rev $output_h0, $output_h0
3278#endif
3279 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
3280 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
3281
3282 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
3283
3284 add $rctr32w, $rctr32w, #1 @ CTR block 6
3285 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
3286 b.ge .L192_dec_prepretail @ do prepretail
3287
3288 .L192_dec_main_loop: @ main loop start
3289 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3290 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3291
3292 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3293 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3294
3295 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3296 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3297 rev64 $res3b, $res3b @ GHASH block 4k+3
3298
3299 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3300 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3301
3302 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3303 eor $res0b, $res0b, $acc_lb @ PRE 1
3304
3305 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3306 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3307
3308 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3309 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3310
3311 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3312 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3313
3314 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3315 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3316 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3317
3318 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3319 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3320 rev $ctr32w, $rctr32w @ CTR block 4k+7
3321
3322 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3323 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3324
3325 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3326 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3327 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3328
3329 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3330
3331 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3332 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3333#ifdef __AARCH64EB__
3334 rev $output_h2, $output_h2
3335#endif
3336 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3337 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3338
3339 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3340
3341 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3342 rev64 $res2b, $res2b @ GHASH block 4k+2
3343
3344 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3345
3346 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3347 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3348 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3349#ifdef __AARCH64EB__
3350 rev $output_l2, $output_l2
3351#endif
3352 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3353
3354 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3355
3356 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3357 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3358
3359 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3360 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3361
3362 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3363
3364 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3365 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3366
3367 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3368
3369 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3370
3371 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3372 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3373
3374 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3375
3376 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3377
3378 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3379 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3380
3381 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3382
3383 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3384 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3385
3386 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3387
3388 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3389 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3390
3391 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3392
3393 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3394 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3395
3396 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3397
3398 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3399 movi $mod_constant.8b, #0xc2
3400
3401 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3402
3403 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3404 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3405
3406 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3407
3408 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3409 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3410
3411 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3412
3413 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3414 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3415
3416 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3417
3418 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3419 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3420
3421 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3422
3423 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3424 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3425
3426 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3427
3428 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3429 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
3430
3431 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3432 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3433
3434 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3435 ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext
3436 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3437#ifdef __AARCH64EB__
3438 rev $output_l3, $output_l3
3439#endif
3440 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3441 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3442
3443 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
3444 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3445
3446 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3447 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3448
3449 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3450 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext
3451
3452 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
3453 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext
3454 rev $ctr32w, $rctr32w @ CTR block 4k+8
3455
3456 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3457 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3458
3459 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3460 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3461
3462 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
3463
3464 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
3465 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3466#ifdef __AARCH64EB__
3467 rev $output_h3, $output_h3
3468#endif
3469 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
3470
3471 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3472 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
3473
3474 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3475
3476 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3477 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
3478
3479 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
3480 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3481 rev64 $res1b, $res1b @ GHASH block 4k+5
3482
3483 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
3484 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3485
3486 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3487 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
3488
3489 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
3490 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
3491 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3492
3493 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
3494 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
3495 rev $ctr32w, $rctr32w @ CTR block 4k+9
3496
3497 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3498#ifdef __AARCH64EB__
3499 rev $output_l0, $output_l0
3500#endif
3501 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
3502 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3503
3504 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
3505 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
3506 eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
3507#ifdef __AARCH64EB__
3508 rev $output_l1, $output_l1
3509#endif
3510 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
3511 rev $ctr32w, $rctr32w @ CTR block 4k+10
3512 eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
3513#ifdef __AARCH64EB__
3514 rev $output_h1, $output_h1
3515#endif
3516 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3517#ifdef __AARCH64EB__
3518 rev $output_h0, $output_h0
3519#endif
3520 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
3521 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3522
3523 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
3524 rev64 $res0b, $res0b @ GHASH block 4k+4
3525 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
3526
3527 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
3528 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
3529 b.lt .L192_dec_main_loop
3530
3531 .L192_dec_prepretail: @ PREPRETAIL
3532 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3533 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3534 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3535
3536 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3537 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3538
3539 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3540 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3541
3542 eor $res0b, $res0b, $acc_lb @ PRE 1
3543 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3544
3545 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3546 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3547
3548 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3549 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3550
3551 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3552 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3553 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3554
3555 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3556 rev64 $res2b, $res2b @ GHASH block 4k+2
3557
3558 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3559 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3560 rev $ctr32w, $rctr32w @ CTR block 4k+7
3561
3562 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3563 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3564 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3565
3566 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3567 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3568#ifdef __AARCH64EB__
3569 rev $output_h3, $output_h3
3570#endif
3571 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3572
3573 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3574 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3575#ifdef __AARCH64EB__
3576 rev $output_l2, $output_l2
3577#endif
3578 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3579 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3580#ifdef __AARCH64EB__
3581 rev $output_h2, $output_h2
3582#endif
3583 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3584
3585 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3586 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3587#ifdef __AARCH64EB__
3588 rev $output_l3, $output_l3
3589#endif
3590 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3591
3592 rev64 $res3b, $res3b @ GHASH block 4k+3
3593 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3594
3595 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3596 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3597
3598 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3599 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3600
3601 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3602 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3603
3604 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3605
3606 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3607 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3608
3609 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3610
3611 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3612 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3613
3614 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3615
3616 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3617
3618 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3619 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3620
3621 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3622 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3623
3624 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3625
3626 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3627 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3628
3629 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3630
3631 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3632 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3633
3634 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3635
3636 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3637 movi $mod_constant.8b, #0xc2
3638
3639 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3640
3641 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3642
3643 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3644 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3645
3646 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3647 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3648
3649 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3650
3651 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3652 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3653
3654 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3655
3656 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3657 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3658
3659 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3660
3661 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3662 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3663
3664 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3665
3666 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3667 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3668
3669 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3670
3671 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3672 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3673
3674 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3675
3676 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3677
3678 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3679
3680 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3681
3682 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3683 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3684
3685 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3686
3687 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3688
3689 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3690
3691 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3692 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3693
3694 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3695
3696 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3697
3698 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3699
3700 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3701
3702 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3703
3704 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3705
3706 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3707
3708 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3709
3710 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3711 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3712
3713 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3714
3715 aese $ctr0b, $rk11
3716 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3717
3718 aese $ctr2b, $rk11
3719
3720 aese $ctr1b, $rk11
3721
3722 aese $ctr3b, $rk11
3723
3724 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3725 .L192_dec_tail: @ TAIL
3726
3727 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
3728 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
3729
3730 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
3731
3732 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3733
3734 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
3735
3736 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
3737
3738 cmp $main_end_input_ptr, #48
3739
3740 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3741#ifdef __AARCH64EB__
3742 rev $output_h0, $output_h0
3743#endif
3744 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3745#ifdef __AARCH64EB__
3746 rev $output_l0, $output_l0
3747#endif
3748 b.gt .L192_dec_blocks_more_than_3
3749
3750 movi $acc_l.8b, #0
3751 movi $acc_h.8b, #0
3752
3753 mov $ctr3b, $ctr2b
3754 mov $ctr2b, $ctr1b
3755 sub $rctr32w, $rctr32w, #1
3756
3757 movi $acc_m.8b, #0
3758 cmp $main_end_input_ptr, #32
3759 b.gt .L192_dec_blocks_more_than_2
3760
3761 mov $ctr3b, $ctr1b
3762 cmp $main_end_input_ptr, #16
3763 sub $rctr32w, $rctr32w, #1
3764
3765 b.gt .L192_dec_blocks_more_than_1
3766
3767 sub $rctr32w, $rctr32w, #1
3768 b .L192_dec_blocks_less_than_1
3769 .L192_dec_blocks_more_than_3: @ blocks left > 3
3770 rev64 $res0b, $res1b @ GHASH final-3 block
3771 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
3772
3773 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
3774
3775 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3776
3777 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
3778
3779 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
3780 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
3781 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
3782
3783 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
3784
3785 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
3786 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
3787
3788 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
3789
3790 eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
3791#ifdef __AARCH64EB__
3792 rev $output_l0, $output_l0
3793#endif
3794 movi $t0.8b, #0 @ suppress further partial tag feed in
3795
3796 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
3797 eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
3798#ifdef __AARCH64EB__
3799 rev $output_h0, $output_h0
3800#endif
3801 .L192_dec_blocks_more_than_2: @ blocks left > 2
3802
3803 rev64 $res0b, $res1b @ GHASH final-2 block
3804 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
3805
3806 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3807
3808 movi $t0.8b, #0 @ suppress further partial tag feed in
3809
3810 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
3811
3812 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
3813
3814 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
3815
3816 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
3817
3818 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
3819 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
3820
3821 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
3822 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
3823
3824 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
3825
3826 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
3827
3828 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
3829 eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
3830#ifdef __AARCH64EB__
3831 rev $output_h0, $output_h0
3832#endif
3833 eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
3834#ifdef __AARCH64EB__
3835 rev $output_l0, $output_l0
3836#endif
3837 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
3838 .L192_dec_blocks_more_than_1: @ blocks left > 1
3839
3840 rev64 $res0b, $res1b @ GHASH final-1 block
3841
3842 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3843 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
3844
3845 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
3846
3847 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
3848
3849 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
3850 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
3851
3852 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
3853
3854 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
3855
3856 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
3857 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
3858
3859 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
3860 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
3861
3862 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
3863
3864 movi $t0.8b, #0 @ suppress further partial tag feed in
3865 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
3866 eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
3867#ifdef __AARCH64EB__
3868 rev $output_h0, $output_h0
3869#endif
3870 eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
3871#ifdef __AARCH64EB__
3872 rev $output_l0, $output_l0
3873#endif
3874 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
3875 .L192_dec_blocks_less_than_1: @ blocks left <= 1
3876
3877 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
3878 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
3879 and $bit_length, $bit_length, #127 @ bit_length %= 128
3880
3881 sub $bit_length, $bit_length, #128 @ bit_length -= 128
3882
3883 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
3884
3885 and $bit_length, $bit_length, #127 @ bit_length %= 128
3886 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
3887
3888 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
3889 cmp $bit_length, #64
3890
3891 csel $ctr32x, $rk12_l, $rk12_h, lt
3892 csel $ctr96_b64x, $rk12_h, xzr, lt
3893
3894 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
3895 and $output_l0, $output_l0, $ctr32x
3896 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
3897
3898 orr $output_l0, $output_l0, $end_input_ptr
3899 mov $ctr0.d[1], $ctr96_b64x
3900#ifndef __AARCH64EB__
3901 rev $ctr32w, $rctr32w
3902#else
3903 mov $ctr32w, $rctr32w
3904#endif
3905
3906 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
3907 str $ctr32w, [$counter, #12] @ store the updated counter
3908
3909 rev64 $res0b, $res1b @ GHASH final block
3910
3911 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3912 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
3913
3914 and $output_h0, $output_h0, $ctr96_b64x
3915
3916 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
3917 mov $t0d, $res0.d[1] @ GHASH final block - mid
3918
3919 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
3920
3921 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
3922
3923 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
3924
3925 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
3926
3927 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
3928
3929 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
3930 movi $mod_constant.8b, #0xc2
3931
3932 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3933
3934 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3935
3936 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3937
3938 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3939 orr $output_h0, $output_h0, $main_end_input_ptr
3940 stp $output_l0, $output_h0, [$output_ptr]
3941
3942 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3943
3944 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3945
3946 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3947
3948 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3949
3950 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3951
3952 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3953
3954 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3955 ext $acc_lb, $acc_lb, $acc_lb, #8
3956 rev64 $acc_lb, $acc_lb
3957 mov x0, $len
3958 st1 { $acc_l.16b }, [$current_tag]
3959
3960 ldp x21, x22, [sp, #16]
3961 ldp x23, x24, [sp, #32]
3962 ldp d8, d9, [sp, #48]
3963 ldp d10, d11, [sp, #64]
3964 ldp d12, d13, [sp, #80]
3965 ldp d14, d15, [sp, #96]
3966 ldp x19, x20, [sp], #112
3967 ret
3968
3969.L192_dec_ret:
3970 mov w0, #0x0
3971 ret
3972.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
3973___
3974}
3975
3976{
# Register aliases for the 256-bit-key kernel that follows. Each variable is
# only a string holding an AArch64 register name; it is interpolated into the
# assembly text appended to $code. Several names deliberately map to the same
# physical register, so two such names must never be live at the same time.
#
# General-purpose scratch registers:
#   x4 = $end_input_ptr, x5 = $main_end_input_ptr,
#   x6/x7 = $input_l0/$input_h0 and also $output_l0/$output_h0.
# The incoming x4 and x5 are copied to x16 and x8 by the prologue before
# these registers are overwritten.
3977my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# x19..x24 carry both the $input_* and the $output_* names for blocks 1..3
# (same registers, two sets of names). The prologue stores x19..x24 on the
# stack before the kernel uses them.
3978my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
3979my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
3980my ($output_l0,$output_h0)=map("x$_",(6..7));
3981
# Counter bookkeeping and last round key:
#   x9  = $ctr32x     (w9  = $ctr32w is its 32-bit view)
#   x10 = $ctr96_b64x
#   x11 = $ctr96_t32x (w11 = $ctr96_t32w)
#   x12 = $rctr32x    (w12 = $rctr32w)
#   x13/x14 = $rk14_l/$rk14_h, the two 64-bit halves of rk14 loaded from
#             offset 224 of the key schedule
#   x15 = $len, set by the prologue to the length in bytes
3982my $ctr32w="w9";
3983my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
3984my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
3985
# Vector registers v0..v3 hold the four counter blocks and v4..v7 the four
# result blocks. The same registers are named as .16b vectors, bare vN (for
# lane/arrangement suffixes added at the use site), dN and qN views.
3986my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
3987my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
3988my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
3989my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
3990
# Accumulators: v9 = acc_h, v10 = acc_m, v11 = acc_l (presumably the
# high/middle/low GHASH partial sums, matching how the same names are used
# in the 192-bit kernel above). $acc_lb is loaded from [$current_tag] in the
# prologue.
3991my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
3992my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
3993my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
3994
# Hash-key registers: v12..v15 = $h1..$h4 (loaded from offsets of
# $current_tag in the prologue), v16 = $h12k, v17 = $h34k ($h34k is built
# from $h3/$h4 with trn2). v8..v15 are used by this kernel, which is why the
# prologue stores d8..d15 on the stack.
3995my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
3996my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
3997my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
3998
# Temporaries $t0..$t9 are packed into only four physical registers:
#   v8: $t0, $t2, $t6      v4: $t1, $t3, $t4, $t9
#   v5: $t5, $t7           v6: $t8
# v4..v6 are also $res0..$res2, so a temporary clobbers the result block
# that shares its register.
3999my $t0="v8";
4000my $t0d="d8";
4001my $t1="v4";
4002my $t1d="d4";
4003my $t2="v8";
4004my $t2d="d8";
4005my $t3="v4";
4006my $t3d="d4";
4007my $t4="v4";
4008my $t4d="d4";
4009my $t5="v5";
4010my $t5d="d5";
4011my $t6="v8";
4012my $t6d="d8";
4013my $t7="v5";
4014my $t7d="d5";
4015my $t8="v6";
4016my $t8d="d6";
4017my $t9="v4";
4018my $t9d="d4";
4019
# $ctr_t0..$ctr_t3 are further names for v4..v7, i.e. the same registers as
# $res0..$res3.
4020my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
4021my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
4022my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
4023
# Reduction helpers: $mod_constant shares v8 with $t0/$t2/$t6, and $mod_t
# shares v7 with $res3/$ctr_t3.
4024my $mod_constantd="d8";
4025my $mod_constant="v8";
4026my $mod_t="v7";
4027
# Round keys rk0..rk13 live in v18..v31 (named as .16b, .4s and qN views).
# There is no vector register for rk14; it is kept in $rk14_l/$rk14_h.
4028my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
4029my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31));
4030my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
# Alternate views of v20, v21 and v22, the registers that hold rk2, rk3 and
# rk4. NOTE(review): presumably used as scratch destinations once those round
# keys are no longer needed, as the identically named aliases are in the
# 192-bit kernel tail above -- confirm against the 256-bit tail code.
4031my $rk2q1="v20.1q";
4032my $rk3q1="v21.1q";
4033my $rk4v="v22";
4034my $rk4d="d22";
4035
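4036#########################################################################################
4037# size_t aes_gcm_enc_256_kernel(const unsigned char *in,
4038# size_t len, /* appears to be a length in BITS: the prologue derives byte_len with "lsr #3" */
4039# unsigned char *out,
4040# const void *key,
4041# unsigned char ivec[16],
4042# u64 *Xi);
4043#
# NOTE(review): the prologue copies x4 and x5 (the 5th and 6th arguments) to
# x16 and x8 and leaves x3 (the 4th argument) in place. Confirm that the
# parameter order listed above matches the C declaration used by the callers.
#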
4044$code.=<<___;
4045.global aes_gcm_enc_256_kernel
4046.type aes_gcm_enc_256_kernel,%function
4047.align 4
4048aes_gcm_enc_256_kernel:
4049 AARCH64_VALID_CALL_TARGET
4050 cbz x1, .L256_enc_ret
4051 stp x19, x20, [sp, #-112]!
4052 mov x16, x4
4053 mov x8, x5
4054 stp x21, x22, [sp, #16]
4055 stp x23, x24, [sp, #32]
4056 stp d8, d9, [sp, #48]
4057 stp d10, d11, [sp, #64]
4058 stp d12, d13, [sp, #80]
4059 stp d14, d15, [sp, #96]
4060
4061 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
4062 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
4063 mov $len, $main_end_input_ptr
4064 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
4065#ifdef __AARCH64EB__
4066 rev $ctr96_b64x, $ctr96_b64x
4067 rev $ctr96_t32x, $ctr96_t32x
4068#endif
4069 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
4070#ifdef __AARCH64EB__
4071 ror $rk14_l, $rk14_l, #32
4072 ror $rk14_h, $rk14_h, #32
4073#endif
4074 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
4075 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
4076
4077 ld1 {$rk0s}, [$cc], #16 @ load rk0
4078 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4079
4080 ld1 {$rk1s}, [$cc], #16 @ load rk1
4081 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4082
4083 lsr $rctr32x, $ctr96_t32x, #32
4084 fmov $ctr2d, $ctr96_b64x @ CTR block 2
4085 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
4086
4087 rev $rctr32w, $rctr32w @ rev_ctr32
4088 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
4089 fmov $ctr1d, $ctr96_b64x @ CTR block 1
4090
4091 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
4092 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
4093
4094 rev $ctr32w, $rctr32w @ CTR block 1
4095 fmov $ctr3d, $ctr96_b64x @ CTR block 3
4096
4097 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
4098 add $rctr32w, $rctr32w, #1 @ CTR block 1
4099 ld1 {$rk2s}, [$cc], #16 @ load rk2
4100
4101 fmov $ctr1.d[1], $ctr32x @ CTR block 1
4102 rev $ctr32w, $rctr32w @ CTR block 2
4103 add $rctr32w, $rctr32w, #1 @ CTR block 2
4104
4105 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
4106 ld1 {$rk3s}, [$cc], #16 @ load rk3
4107
4108 fmov $ctr2.d[1], $ctr32x @ CTR block 2
4109 rev $ctr32w, $rctr32w @ CTR block 3
4110
4111 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
4112 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
4113
4114 fmov $ctr3.d[1], $ctr32x @ CTR block 3
4115
4116 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
4117 ld1 {$rk4s}, [$cc], #16 @ load rk4
4118
4119 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
4120 ld1 {$rk5s}, [$cc], #16 @ load rk5
4121
4122 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
4123 ld1 {$rk6s}, [$cc], #16 @ load rk6
4124
4125 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
4126 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4127#ifndef __AARCH64EB__
4128 ext $h3b, $h3b, $h3b, #8
4129#endif
4130 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
4131 ld1 {$rk7s}, [$cc], #16 @ load rk7
4132
4133 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
4134 ld1 {$rk8s}, [$cc], #16 @ load rk8
4135
4136 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
4137 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
4138#ifndef __AARCH64EB__
4139 ext $h2b, $h2b, $h2b, #8
4140#endif
4141 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
4142 ld1 {$rk9s}, [$cc], #16 @ load rk9
4143
4144 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
4145 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4146#ifndef __AARCH64EB__
4147 ext $h4b, $h4b, $h4b, #8
4148#endif
4149 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
4150 ld1 {$rk10s}, [$cc], #16 @ load rk10
4151
4152 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
4153 ld1 {$rk11s}, [$cc], #16 @ load rk11
4154
4155 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
4156 add $rctr32w, $rctr32w, #1 @ CTR block 3
4157
4158 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
4159
4160 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
4161 ld1 { $acc_lb}, [$current_tag]
4162 ext $acc_lb, $acc_lb, $acc_lb, #8
4163 rev64 $acc_lb, $acc_lb
4164
4165 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
4166
4167 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
4168
4169 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
4170
4171 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
4172
4173 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
4174
4175 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
4176
4177 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
4178
4179 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
4180
4181 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
4182 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
4183
4184 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
4185 ld1 {$rk12s}, [$cc], #16 @ load rk12
4186
4187 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
4188 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4189#ifndef __AARCH64EB__
4190 ext $h1b, $h1b, $h1b, #8
4191#endif
4192 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
4193 ld1 {$rk13s}, [$cc], #16 @ load rk13
4194
4195 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
4196 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
4197
4198 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
4199
4200 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
4201
4202 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
4203 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
4204
4205 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
4206
4207 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
4208
4209 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
4210
4211 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
4212
4213 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
4214
4215 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
4216
4217 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
4218
4219 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
4220
4221 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
4222
4223 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
4224
4225 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
4226
4227 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
4228
4229 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
4230
4231 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
4232
4233 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
4234
4235 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
4236
4237 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
4238 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
4239
4240 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
4241
4242 aese $ctr2b, $rk13 @ AES block 2 - round 13
4243 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
4244
4245 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
4246
4247 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
4248
4249 aese $ctr1b, $rk13 @ AES block 1 - round 13
4250
4251 aese $ctr0b, $rk13 @ AES block 0 - round 13
4252
4253 aese $ctr3b, $rk13 @ AES block 3 - round 13
4254 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
4255 b.ge .L256_enc_tail @ handle tail
4256
4257 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
4258#ifdef __AARCH64EB__
4259 rev $input_l1, $input_l1
4260 rev $input_h1, $input_h1
4261#endif
4262 rev $ctr32w, $rctr32w @ CTR block 4
4263 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
4264#ifdef __AARCH64EB__
4265 rev $input_l0, $input_l0
4266 rev $input_h0, $input_h0
4267#endif
4268 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
4269#ifdef __AARCH64EB__
4270 rev $input_l3, $input_l3
4271 rev $input_h3, $input_h3
4272#endif
4273 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
4274#ifdef __AARCH64EB__
4275 rev $input_l2, $input_l2
4276 rev $input_h2, $input_h2
4277#endif
4278 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4279
4280 eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
4281 eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
4282
4283 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
4284 eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
4285
4286 eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
4287 eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
4288 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
4289
4290 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
4291 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
4292 eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
4293
4294 eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
4295 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
4296
4297 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
4298 add $rctr32w, $rctr32w, #1 @ CTR block 4
4299
4300 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
4301 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
4302 eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
4303
4304 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
4305
4306 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
4307 fmov $ctr0d, $ctr96_b64x @ CTR block 4
4308
4309 fmov $ctr0.d[1], $ctr32x @ CTR block 4
4310 rev $ctr32w, $rctr32w @ CTR block 5
4311 add $rctr32w, $rctr32w, #1 @ CTR block 5
4312
4313 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
4314 fmov $ctr1d, $ctr96_b64x @ CTR block 5
4315 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
4316
4317 fmov $ctr1.d[1], $ctr32x @ CTR block 5
4318 rev $ctr32w, $rctr32w @ CTR block 6
4319 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
4320
4321 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
4322 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
4323 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
4324
4325 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
4326
4327 add $rctr32w, $rctr32w, #1 @ CTR block 6
4328 fmov $ctr2d, $ctr96_b64x @ CTR block 6
4329
4330 fmov $ctr2.d[1], $ctr32x @ CTR block 6
4331 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
4332 rev $ctr32w, $rctr32w @ CTR block 7
4333
4334 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
4335
4336 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
4337 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
4338 b.ge L256_enc_prepretail @ do prepretail
4339
4340 .L256_enc_main_loop: @ main loop start - each pass runs AES on counter blocks 4k+4..4k+7, GHASHes stored ciphertext blocks 4k..4k+3 and reduces the accumulator
4341 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4342 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4343
4344 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4345 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4346
4347 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4348 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4349
4350 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4351 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4352
4353 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4354 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
4355#ifdef __AARCH64EB__
4356 rev $input_l3, $input_l3
4357 rev $input_h3, $input_h3
4358#endif
4359 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4360 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
4361#ifdef __AARCH64EB__
4362 rev $input_l2, $input_l2
4363 rev $input_h2, $input_h2
4364#endif
4365 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4366 eor $res0b, $res0b, $acc_lb @ PRE 1
4367
4368 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4369
4370 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4371 eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
4372
4373 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4374 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4375
4376 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4377 eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
4378 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4379
4380 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4381 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4382
4383 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4384
4385 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4386 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4387
4388 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4389
4390 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4391 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4392
4393 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4394
4395 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4396 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4397
4398 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4399
4400 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4401 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4402
4403 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4404
4405 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4406 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4407
4408 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4409
4410 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4411 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4412
4413 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4414 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4415
4416 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4417
4418 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4419 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4420
4421 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4422
4423 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4424
4425 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4426
4427 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4428 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4429
4430 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4431
4432 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4433
4434 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4435
4436 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4437 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4438
4439 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4440
4441 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4442
4443 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4444
4445 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4446 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4447
4448 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4449 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
4450#ifdef __AARCH64EB__
4451 rev $input_l1, $input_l1
4452 rev $input_h1, $input_h1
4453#endif
4454 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4455 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4456
4457 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4458 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4459
4460 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4461
4462 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4463 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4464
4465 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4466 eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
4467
4468 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4469 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4470
4471 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4472 eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
4473
4474 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4475 movi $mod_constant.8b, #0xc2 @ mod_constant - 0xc2 in each of the low 8 bytes, shifted left by 56 below
4476
4477 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4478 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4479 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
4480
4481 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4482 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
4483#ifdef __AARCH64EB__
4484 rev $input_l0, $input_l0
4485 rev $input_h0, $input_h0
4486#endif
4487 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4488 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4489
4490 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4491 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4492
4493 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4494
4495 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4496 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4497
4498 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4499 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4500
4501 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4502 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4503
4504 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4505 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4506
4507 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4508 rev $ctr32w, $rctr32w @ CTR block 4k+8
4509 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4510
4511 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4512 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4513
4514 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4515 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4516
4517 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4518 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4519
4520 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4521 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
4522 eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
4523
4524 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4525 eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
4526
4527 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4528 eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
4529
4530 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4531 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
4532
4533 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4534 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4535 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4536
4537 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4538 fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
4539
4540 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4541 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
4542
4543 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
4544 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
4545
4546 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
4547
4548 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4549 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4550 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
4551
4552 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
4553 rev $ctr32w, $rctr32w @ CTR block 4k+9
4554 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
4555
4556 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
4557 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
4558 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
4559
4560 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4561 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
4562
4563 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4564 rev $ctr32w, $rctr32w @ CTR block 4k+10
4565 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
4566
4567 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
4568 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4569 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
4570
4571 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4572 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
4573 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
4574
4575 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4576 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
4577 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
4578
4579 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
4580 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
4581 rev $ctr32w, $rctr32w @ CTR block 4k+11
4582
4583 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4584 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
4585
4586 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
4587 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
4588 b.lt .L256_enc_main_loop @ LOOP CONTROL - repeat while input_ptr < main_end_input_ptr (flags from the cmp above)
4589
4590 .L256_enc_prepretail: @ PREPRETAIL - GHASH the four ciphertext blocks held in res0..res3 and run the AES rounds on ctr0..ctr3; no plaintext is loaded or stored here, then fall through to the tail
4591 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4592 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4593
4594 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4595 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4596
4597 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4598 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4599
4600 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4601 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4602
4603 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4604
4605 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4606
4607 eor $res0b, $res0b, $acc_lb @ PRE 1
4608 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4609
4610 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4611
4612 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4613 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4614
4615 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4616
4617 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4618 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4619
4620 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4621
4622 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4623
4624 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4625 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4626
4627 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4628
4629 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4630
4631 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4632
4633 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4634
4635 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4636
4637 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4638
4639 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4640
4641 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4642 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4643
4644 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4645 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4646
4647 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4648
4649 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4650 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4651
4652 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4653 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4654
4655 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4656
4657 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4658 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4659 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4660
4661 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4662
4663 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4664
4665 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4666 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4667
4668 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4669
4670 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4671 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4672
4673 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4674
4675 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4676 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4677
4678 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4679
4680 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4681
4682 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4683
4684 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4685
4686 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4687
4688 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4689 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4690
4691 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4692
4693 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4694
4695 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4696
4697 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4698 movi $mod_constant.8b, #0xc2 @ mod_constant - 0xc2 in each of the low 8 bytes, shifted left by 56 below
4699
4700 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4701
4702 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4703 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4704
4705 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4706
4707 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4708 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4709
4710 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4711 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4712
4713 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4714
4715 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4716
4717 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4718
4719 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4720 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4721
4722 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4723
4724 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
4725
4726 pmull $t1.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4727 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4728
4729 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4730
4731 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4732 eor $acc_mb, $acc_mb, $acc_lb @ karatsuba tidy up (low half)
4733
4734 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4735
4736 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4737
4738 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4739
4740 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4741 eor $acc_mb, $acc_mb, $t1.16b @ MODULO - fold into mid
4742
4743 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4744
4745 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4746
4747 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4748
4749 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4750 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
4751
4752 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4753
4754 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4755
4756 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4757
4758 pmull $t1.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4759
4760 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4761 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4762
4763 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4764
4765 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4766 eor $acc_lb, $acc_lb, $t1.16b @ MODULO - fold into low
4767
4768 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4769
4770 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4771
4772 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4773
4774 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4775 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4776 .L256_enc_tail: @ TAIL - encrypt the first remaining block with ctr0, then dispatch on the number of bytes left
4777
4778 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
4779 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
4780 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
4781#ifdef __AARCH64EB__
4782 rev $input_l0, $input_l0
4783 rev $input_h0, $input_h0
4784#endif
4785 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4786 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4787
4788 cmp $main_end_input_ptr, #48 @ more than 48 bytes left means more than 3 blocks left
4789 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4790
4791 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4792
4793 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4794 b.gt .L256_enc_blocks_more_than_3
4795
4796 cmp $main_end_input_ptr, #32 @ more than 32 bytes left means more than 2 blocks left
4797 mov $ctr3b, $ctr2b @ at most 3 blocks: the last-block stage reads ctr3, so move the ctr2 keystream there
4798 movi $acc_l.8b, #0 @ final-3 stage is skipped, so the GHASH low accumulator starts at zero
4799
4800 movi $acc_h.8b, #0 @ GHASH high accumulator starts at zero
4801 sub $rctr32w, $rctr32w, #1 @ rev_ctr32 -= 1 (presumably one pre-incremented counter block goes unused - confirm against the epilogue)
4802
4803 mov $ctr2b, $ctr1b @ the final-1 stage reads ctr2, so move the ctr1 keystream there
4804 movi $acc_m.8b, #0 @ GHASH mid accumulator starts at zero
4805 b.gt .L256_enc_blocks_more_than_2
4806
4807 mov $ctr3b, $ctr1b @ at most 2 blocks: the last-block stage must use the ctr1 keystream
4808 sub $rctr32w, $rctr32w, #1 @ rev_ctr32 -= 1 again (see note on the first decrement)
4809 cmp $main_end_input_ptr, #16 @ more than 16 bytes left means more than 1 block left
4810
4811 b.gt .L256_enc_blocks_more_than_1
4812
4813 sub $rctr32w, $rctr32w, #1 @ at most 1 block: rev_ctr32 -= 1 a third time
4814 b .L256_enc_blocks_less_than_1
4815 .L256_enc_blocks_more_than_3: @ blocks left > 3 - store the final-3 ciphertext, GHASH it with h4 (this initialises acc_l/acc_m/acc_h), build the final-2 ciphertext with ctr1
4816 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
4817
4818 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
4819#ifdef __AARCH64EB__
4820 rev $input_l0, $input_l0
4821 rev $input_h0, $input_h0
4822#endif
4823 rev64 $res0b, $res1b @ GHASH final-3 block
4824
4825 eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
4826 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4827
4828 eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
4829
4830 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid (the rk4 register is overwritten and used as scratch from here on)
4831 fmov $res1d, $input_l0 @ AES final-2 block - mov low
4832
4833 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
4834
4835 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
4836 movi $t0.8b, #0 @ suppress further partial tag feed in
4837
4838 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
4839
4840 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
4841
4842 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
4843
4844 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
4845 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
4846 .L256_enc_blocks_more_than_2: @ blocks left > 2 - store the final-2 ciphertext, GHASH it with h3 into the accumulators, build the final-1 ciphertext with ctr2
4847
4848 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
4849
4850 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
4851#ifdef __AARCH64EB__
4852 rev $input_l0, $input_l0
4853 rev $input_h0, $input_h0
4854#endif
4855 rev64 $res0b, $res1b @ GHASH final-2 block
4856
4857 eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
4858 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4859
4860 fmov $res1d, $input_l0 @ AES final-1 block - mov low
4861 eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
4862
4863 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
4864
4865 movi $t0.8b, #0 @ suppress further partial tag feed in
4866
4867 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
4868 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
4869
4870 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
4871
4872 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
4873
4874 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
4875
4876 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
4877
4878 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
4879
4880 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
4881
4882 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
4883 .L256_enc_blocks_more_than_1: @ blocks left > 1 - store the final-1 ciphertext, GHASH it with h2 into the accumulators, build the final ciphertext with ctr3 (it is masked and stored by the following stage)
4884
4885 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
4886
4887 rev64 $res0b, $res1b @ GHASH final-1 block
4888
4889 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
4890#ifdef __AARCH64EB__
4891 rev $input_l0, $input_l0
4892 rev $input_h0, $input_h0
4893#endif
4894 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4895
4896 movi $t0.8b, #0 @ suppress further partial tag feed in
4897
4898 eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
4899 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
4900
4901 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
4902 eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
4903
4904 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
4905
4906 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
4907
4908 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
4909 fmov $res1d, $input_l0 @ AES final block - mov low
4910
4911 fmov $res1.d[1], $input_h0 @ AES final block - mov high
4912
4913 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
4914
4915 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
4916
4917 eor $res1b, $res1b, $ctr3b @ AES final block - result
4918 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
4919
4920 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
4921 .L256_enc_blocks_less_than_1: @ blocks left <= 1
4922
4923 and $bit_length, $bit_length, #127 @ bit_length %= 128
4924
4925 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
4926 sub $bit_length, $bit_length, #128 @ bit_length -= 128
4927
4928 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
4929 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
4930
4931 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
4932 and $bit_length, $bit_length, #127 @ bit_length %= 128
4933
4934 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
4935 cmp $bit_length, #64
4936
4937 csel $input_l0, $rk14_l, $rk14_h, lt
4938 csel $input_h0, $rk14_h, xzr, lt
4939
4940 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
4941
4942 fmov $ctr0.d[1], $input_h0
4943
4944 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
4945
4946 rev64 $res0b, $res1b @ GHASH final block
4947
4948 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4949
4950 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
4951
4952 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
4953 mov $t0d, $res0.d[1] @ GHASH final block - mid
4954#ifndef __AARCH64EB__
4955 rev $ctr32w, $rctr32w
4956#else
4957 mov $ctr32w, $rctr32w
4958#endif
4959
4960 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
4961
4962 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
4963 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
4964
4965 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
4966
4967 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
4968
4969 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
4970 movi $mod_constant.8b, #0xc2
4971
4972 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4973
4974 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4975
4976 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4977
4978 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4979
4980 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4981
4982 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4983
4984 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
4985
4986 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4987
4988 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4989
4990 str $ctr32w, [$counter, #12] @ store the updated counter
4991
4992 st1 { $res1b}, [$output_ptr] @ store all 16B
4993 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4994
4995 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4996 ext $acc_lb, $acc_lb, $acc_lb, #8
4997 rev64 $acc_lb, $acc_lb
4998 mov x0, $len
4999 st1 { $acc_l.16b }, [$current_tag]
5000
5001 ldp x21, x22, [sp, #16]
5002 ldp x23, x24, [sp, #32]
5003 ldp d8, d9, [sp, #48]
5004 ldp d10, d11, [sp, #64]
5005 ldp d12, d13, [sp, #80]
5006 ldp d14, d15, [sp, #96]
5007 ldp x19, x20, [sp], #112
5008 ret
5009
5010.L256_enc_ret:
5011 mov w0, #0x0
5012 ret
5013.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
5014___
5015
5016{
5017my $t8="v4";   # vector register v4; the decrypt kernel below uses it for the "GHASH block 4k+3 - low" product
5018my $t8d="d4";  # 64-bit (d) view of the same register number as $t8
5019my $t9="v6";   # vector register v6; used below for the "GHASH block 4k+3 - mid" term and the MODULO karatsuba tidy-up
5020my $t9d="d6";  # 64-bit (d) view of the same register number as $t9
5021#########################################################################################
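5022# size_t aes_gcm_dec_256_kernel(const unsigned char *in,
5023# size_t len, /* NOTE(review): the body uses its length operand as a BIT count (lsr #3 yields byte_len) - confirm callers pass bits */
5024# unsigned char *out,
5025# const void *key, /* NOTE(review): prologue copies x4->x16 and x5->x8; if x8 is $cc (round keys) then key is really the 6th argument */
5026# unsigned char ivec[16],
5027# u64 *Xi); /* NOTE(review): ...and Xi ($current_tag, also the base for the H-power loads) would be the 4th - verify against the C declaration */
5028# Returns 0 without touching the stack or any buffer when x1 (the second argument) is zero.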
5029$code.=<<___;
5030.global aes_gcm_dec_256_kernel
5031.type aes_gcm_dec_256_kernel,%function
5032.align 4
5033aes_gcm_dec_256_kernel:
5034 AARCH64_VALID_CALL_TARGET
5035 cbz x1, .L256_dec_ret
5036 stp x19, x20, [sp, #-112]!
5037 mov x16, x4
5038 mov x8, x5
5039 stp x21, x22, [sp, #16]
5040 stp x23, x24, [sp, #32]
5041 stp d8, d9, [sp, #48]
5042 stp d10, d11, [sp, #64]
5043 stp d12, d13, [sp, #80]
5044 stp d14, d15, [sp, #96]
5045
5046 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
5047 mov $len, $main_end_input_ptr
5048 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
5049#ifdef __AARCH64EB__
5050 rev $ctr96_b64x, $ctr96_b64x
5051 rev $ctr96_t32x, $ctr96_t32x
5052#endif
5053 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
5054#ifdef __AARCH64EB__
5055 ror $rk14_h, $rk14_h, #32
5056 ror $rk14_l, $rk14_l, #32
5057#endif
5058 ld1 {$rk0s}, [$cc], #16 @ load rk0
5059 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
5060
5061 ld1 {$rk1s}, [$cc], #16 @ load rk1
5062 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
5063
5064 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
5065 ld1 {$rk2s}, [$cc], #16 @ load rk2
5066
5067 lsr $rctr32x, $ctr96_t32x, #32
5068 ld1 {$rk3s}, [$cc], #16 @ load rk3
5069 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
5070
5071 ld1 {$rk4s}, [$cc], #16 @ load rk4
5072 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
5073 rev $rctr32w, $rctr32w @ rev_ctr32
5074
5075 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
5076 fmov $ctr3d, $ctr96_b64x @ CTR block 3
5077
5078 rev $ctr32w, $rctr32w @ CTR block 1
5079 add $rctr32w, $rctr32w, #1 @ CTR block 1
5080 fmov $ctr1d, $ctr96_b64x @ CTR block 1
5081
5082 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
5083 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
5084
5085 fmov $ctr1.d[1], $ctr32x @ CTR block 1
5086 rev $ctr32w, $rctr32w @ CTR block 2
5087 add $rctr32w, $rctr32w, #1 @ CTR block 2
5088
5089 fmov $ctr2d, $ctr96_b64x @ CTR block 2
5090 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
5091
5092 fmov $ctr2.d[1], $ctr32x @ CTR block 2
5093 rev $ctr32w, $rctr32w @ CTR block 3
5094
5095 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
5096 ld1 {$rk5s}, [$cc], #16 @ load rk5
5097
5098 fmov $ctr3.d[1], $ctr32x @ CTR block 3
5099 add $rctr32w, $rctr32w, #1 @ CTR block 3
5100
5101 ld1 {$rk6s}, [$cc], #16 @ load rk6
5102
5103 ld1 {$rk7s}, [$cc], #16 @ load rk7
5104
5105 ld1 {$rk8s}, [$cc], #16 @ load rk8
5106
5107 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
5108 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
5109#ifndef __AARCH64EB__
5110 ext $h3b, $h3b, $h3b, #8
5111#endif
5112
5113 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
5114 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
5115#ifndef __AARCH64EB__
5116 ext $h4b, $h4b, $h4b, #8
5117#endif
5118
5119 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
5120 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
5121#ifndef __AARCH64EB__
5122 ext $h2b, $h2b, $h2b, #8
5123#endif
5124
5125 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
5126 ld1 {$rk9s}, [$cc], #16 @ load rk9
5127
5128 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
5129
5130 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
5131 ld1 { $acc_lb}, [$current_tag]
5132 ext $acc_lb, $acc_lb, $acc_lb, #8
5133 rev64 $acc_lb, $acc_lb
5134
5135 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
5136 ld1 {$rk10s}, [$cc], #16 @ load rk10
5137
5138 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
5139 ld1 {$rk11s}, [$cc], #16 @ load rk11
5140
5141 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
5142 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
5143#ifndef __AARCH64EB__
5144 ext $h1b, $h1b, $h1b, #8
5145#endif
5146 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
5147 ld1 {$rk12s}, [$cc], #16 @ load rk12
5148
5149 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
5150
5151 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
5152
5153 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
5154
5155 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
5156
5157 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
5158 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
5159
5160 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
5161
5162 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
5163
5164 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
5165
5166 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
5167
5168 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
5169
5170 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
5171
5172 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
5173
5174 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
5175
5176 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
5177
5178 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
5179
5180 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
5181
5182 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
5183
5184 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
5185
5186 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
5187
5188 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
5189
5190 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
5191
5192 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
5193
5194 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
5195
5196 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
5197
5198 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
5199
5200 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
5201
5202 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
5203 ld1 {$rk13s}, [$cc], #16 @ load rk13
5204
5205 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
5206
5207 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
5208
5209 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
5210
5211 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
5212
5213 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
5214
5215 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
5216
5217 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
5218
5219 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
5220
5221 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
5222
5223 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
5224
5225 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
5226
5227 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
5228
5229 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
5230
5231 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
5232 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
5233
5234 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
5235
5236 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
5237
5238 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
5239
5240 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
5241 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
5242
5243 aese $ctr1b, $rk13 @ AES block 1 - round 13
5244
5245 aese $ctr2b, $rk13 @ AES block 2 - round 13
5246 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
5247
5248 aese $ctr3b, $rk13 @ AES block 3 - round 13
5249
5250 aese $ctr0b, $rk13 @ AES block 0 - round 13
5251 b.ge .L256_dec_tail @ handle tail
5252
5253 ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext
5254
5255 rev $ctr32w, $rctr32w @ CTR block 4
5256
5257 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
5258
5259 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
5260 rev64 $res1b, $res1b @ GHASH block 1
5261 ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext
5262
5263 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
5264
5265 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
5266 rev64 $res0b, $res0b @ GHASH block 0
5267 add $rctr32w, $rctr32w, #1 @ CTR block 4
5268
5269 fmov $ctr0d, $ctr96_b64x @ CTR block 4
5270 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
5271
5272 fmov $ctr0.d[1], $ctr32x @ CTR block 4
5273 rev $ctr32w, $rctr32w @ CTR block 5
5274 add $rctr32w, $rctr32w, #1 @ CTR block 5
5275
5276 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
5277
5278 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
5279 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
5280 eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
5281#ifdef __AARCH64EB__
5282 rev $output_h0, $output_h0
5283#endif
5284 eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
5285#ifdef __AARCH64EB__
5286 rev $output_l0, $output_l0
5287#endif
5288 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
5289 fmov $ctr1d, $ctr96_b64x @ CTR block 5
5290
5291 ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext
5292
5293 fmov $ctr1.d[1], $ctr32x @ CTR block 5
5294 rev $ctr32w, $rctr32w @ CTR block 6
5295 add $rctr32w, $rctr32w, #1 @ CTR block 6
5296
5297 eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
5298#ifdef __AARCH64EB__
5299 rev $output_l1, $output_l1
5300#endif
5301 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
5302
5303 eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
5304#ifdef __AARCH64EB__
5305 rev $output_h1, $output_h1
5306#endif
5307 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
5308
5309 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
5310 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
5311 b.ge .L256_dec_prepretail @ do prepretail
5312
5313 .L256_dec_main_loop: @ main loop start
5314 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
5315 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5316 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
5317
5318 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
5319 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
5320
5321 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
5322 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
5323
5324 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5325 eor $res0b, $res0b, $acc_lb @ PRE 1
5326 rev $ctr32w, $rctr32w @ CTR block 4k+7
5327
5328 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5329 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5330
5331 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5332 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5333
5334 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5335 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5336 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5337
5338 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5339 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5340
5341 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5342 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5343
5344 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5345 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5346
5347 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5348 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5349#ifdef __AARCH64EB__
5350 rev $output_h2, $output_h2
5351#endif
5352 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5353 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5354
5355 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5356 rev64 $res2b, $res2b @ GHASH block 4k+2
5357
5358 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5359 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5360#ifdef __AARCH64EB__
5361 rev $output_l2, $output_l2
5362#endif
5363 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5364 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5365
5366 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5367
5368 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5369
5370 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5371 rev64 $res3b, $res3b @ GHASH block 4k+3
5372
5373 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5374 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5375#ifdef __AARCH64EB__
5376 rev $output_l3, $output_l3
5377#endif
5378 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5379 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5380#ifdef __AARCH64EB__
5381 rev $output_h3, $output_h3
5382#endif
5383 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5384
5385 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5386
5387 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5388 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5389
5390 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5391 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5392
5393 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5394 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5395
5396 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5397 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5398
5399 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5400 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5401
5402 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5403
5404 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5405 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5406
5407 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5408
5409 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5410 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5411
5412 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5413 rev $ctr32w, $rctr32w @ CTR block 4k+8
5414
5415 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5416 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5417
5418 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5419 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
5420
5421 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5422
5423 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5424 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5425
5426 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5427
5428 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5429 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5430
5431 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5432
5433 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5434
5435 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5436 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5437
5438 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5439
5440 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5441 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
5442 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5443
5444 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5445
5446 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5447 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5448
5449 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5450
5451 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5452 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5453
5454 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5455
5456 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5457 movi $mod_constant.8b, #0xc2
5458
5459 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5460 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5461
5462 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5463
5464 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5465 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5466
5467 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5468 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5469
5470 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5471
5472 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5473 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5474
5475 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5476 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
5477
5478 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5479 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5480
5481 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5482 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5483
5484 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5485 ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext
5486
5487 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5488 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
5489
5490 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5491 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5492
5493 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5494 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5495
5496 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5497 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext
5498
5499 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5500 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext
5501
5502 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5503 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5504
5505 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5506 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5507
5508 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5509 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5510
5511 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5512 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
5513
5514 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5515 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
5516
5517 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5518 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
5519 rev $ctr32w, $rctr32w @ CTR block 4k+9
5520
5521 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5522 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
5523 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
5524
5525 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
5526
5527 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5528#ifdef __AARCH64EB__
5529 rev $output_l0, $output_l0
5530#endif
5531 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5532#ifdef __AARCH64EB__
5533 rev $output_h0, $output_h0
5534#endif
5535 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
5536 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
5537 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5538
5539 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5540 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
5541
5542 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
5543 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5544
5545 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
5546 rev $ctr32w, $rctr32w @ CTR block 4k+10
5547 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
5548
5549 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5550 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
5551
5552 rev64 $res1b, $res1b @ GHASH block 4k+5
5553 eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
5554#ifdef __AARCH64EB__
5555 rev $output_h1, $output_h1
5556#endif
5557 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
5558
5559 eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
5560#ifdef __AARCH64EB__
5561 rev $output_l1, $output_l1
5562#endif
5563 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
5564
5565 rev64 $res0b, $res0b @ GHASH block 4k+4
5566 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5567 b.lt .L256_dec_main_loop
5568
5569
5570 .L256_dec_prepretail: @ PREPRETAIL
5571 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5572 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
5573 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
5574
5575 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
5576 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
5577
5578 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
5579 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
5580
5581 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5582 rev $ctr32w, $rctr32w @ CTR block 4k+7
5583 eor $res0b, $res0b, $acc_lb @ PRE 1
5584
5585 rev64 $res2b, $res2b @ GHASH block 4k+2
5586 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5587 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5588
5589 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5590 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5591
5592 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5593 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5594 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5595
5596 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5597 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5598
5599 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5600 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5601
5602 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5603 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5604
5605 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5606
5607 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5608 rev64 $res3b, $res3b @ GHASH block 4k+3
5609
5610 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5611
5612 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5613 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5614
5615 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5616
5617 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5618 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5619
5620 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5621
5622 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5623 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5624
5625 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5626
5627 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5628 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5629
5630 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5631 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5632
5633 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5634
5635 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5636
5637 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5638 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5639
5640 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5641
5642 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5643 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5644
5645 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5646
5647 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5648 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5649
5650 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5651
5652 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5653 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5654
5655 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5656
5657 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5658 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5659
5660 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5661
5662 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5663 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5664
5665 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5666
5667 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5668
5669 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5670 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5671
5672 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5673
5674 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5675 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5676
5677 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5678
5679 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5680 movi $mod_constant.8b, #0xc2
5681
5682 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5683 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5684
5685 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5686
5687 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5688 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5689
5690 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5691
5692 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5693 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5694
5695 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5696
5697 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5698 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5699
5700 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5701
5702 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5703 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5704
5705 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5706
5707 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5708 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5709
5710 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5711
5712 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5713 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5714
5715 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5716
5717 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5718 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5719
5720 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5721
5722 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5723
5724 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5725 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5726#ifdef __AARCH64EB__
5727 rev $output_h2, $output_h2
5728#endif
5729 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5730 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5731#ifdef __AARCH64EB__
5732 rev $output_l3, $output_l3
5733#endif
5734 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5735 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5736
5737 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5738 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5739
5740 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5741 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5742#ifdef __AARCH64EB__
5743 rev $output_l2, $output_l2
5744#endif
5745
5746 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5747
5748 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5749 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5750#ifdef __AARCH64EB__
5751 rev $output_h3, $output_h3
5752#endif
5753
5754 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5755 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5756
5757 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5758 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5759
5760 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5761 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5762
5763 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5764 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5765
5766 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5767
5768 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5769
5770 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5771
5772 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5773 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5774 .L256_dec_tail: @ TAIL
5775
5776 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
5777 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
5778
5779 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
5780
5781 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5782
5783 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5784 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
5785
5786 cmp $main_end_input_ptr, #48
5787
5788 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5789#ifdef __AARCH64EB__
5790 rev $output_l0, $output_l0
5791#endif
5792
5793 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5794#ifdef __AARCH64EB__
5795 rev $output_h0, $output_h0
5796#endif
5797 b.gt .L256_dec_blocks_more_than_3
5798
5799 sub $rctr32w, $rctr32w, #1
5800 mov $ctr3b, $ctr2b
5801 movi $acc_m.8b, #0
5802
5803 movi $acc_l.8b, #0
5804 cmp $main_end_input_ptr, #32
5805
5806 movi $acc_h.8b, #0
5807 mov $ctr2b, $ctr1b
5808 b.gt .L256_dec_blocks_more_than_2
5809
5810 sub $rctr32w, $rctr32w, #1
5811
5812 mov $ctr3b, $ctr1b
5813 cmp $main_end_input_ptr, #16
5814 b.gt .L256_dec_blocks_more_than_1
5815
5816 sub $rctr32w, $rctr32w, #1
5817 b .L256_dec_blocks_less_than_1
5818 .L256_dec_blocks_more_than_3: @ blocks left > 3
5819 rev64 $res0b, $res1b @ GHASH final-3 block
5820 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
5821
5822 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
5823
5824 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
5825
5826 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5827
5828 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
5829
5830 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
5831
5832 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
5833
5834 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
5835
5836 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
5837
5838 movi $t0.8b, #0 @ suppress further partial tag feed in
5839
5840 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
5841
5842 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
5843 eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
5844#ifdef __AARCH64EB__
5845 rev $output_l0, $output_l0
5846#endif
5847
5848 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
5849 eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
5850#ifdef __AARCH64EB__
5851 rev $output_h0, $output_h0
5852#endif
5853 .L256_dec_blocks_more_than_2: @ blocks left > 2
5854
5855 rev64 $res0b, $res1b @ GHASH final-2 block
5856 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
5857
5858 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5859 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
5860
5861 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
5862
5863 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
5864
5865 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
5866
5867 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
5868
5869 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
5870 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
5871
5872 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
5873 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
5874 movi $t0.8b, #0 @ suppress further partial tag feed in
5875
5876 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
5877
5878 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
5879 eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
5880#ifdef __AARCH64EB__
5881 rev $output_l0, $output_l0
5882#endif
5883
5884 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
5885 eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
5886#ifdef __AARCH64EB__
5887 rev $output_h0, $output_h0
5888#endif
5889 .L256_dec_blocks_more_than_1: @ blocks left > 1
5890
5891 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
5892 rev64 $res0b, $res1b @ GHASH final-1 block
5893
5894 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
5895
5896 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5897 movi $t0.8b, #0 @ suppress further partial tag feed in
5898
5899 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
5900
5901 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
5902
5903 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
5904
5905 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
5906
5907 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
5908 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
5909
5910 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
5911
5912 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
5913
5914 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
5915 eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
5916#ifdef __AARCH64EB__
5917 rev $output_l0, $output_l0
5918#endif
5919 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
5920
5921 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
5922
5923 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
5924 eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
5925#ifdef __AARCH64EB__
5926 rev $output_h0, $output_h0
5927#endif
5928 .L256_dec_blocks_less_than_1: @ blocks left <= 1
5929
5930 and $bit_length, $bit_length, #127 @ bit_length %= 128
5931 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
5932
5933 sub $bit_length, $bit_length, #128 @ bit_length -= 128
5934 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
5935
5936 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
5937 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
5938
5939 and $bit_length, $bit_length, #127 @ bit_length %= 128
5940
5941 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
5942 cmp $bit_length, #64
5943
5944 csel $ctr32x, $rk14_l, $rk14_h, lt
5945 csel $ctr96_b64x, $rk14_h, xzr, lt
5946
5947 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
5948 and $output_l0, $output_l0, $ctr32x
5949
5950 mov $ctr0.d[1], $ctr96_b64x
5951 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
5952
5953#ifndef __AARCH64EB__
5954 rev $ctr32w, $rctr32w
5955#else
5956 mov $ctr32w, $rctr32w
5957#endif
5958
5959 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
5960
5961 orr $output_l0, $output_l0, $end_input_ptr
5962
5963 and $output_h0, $output_h0, $ctr96_b64x
5964
5965 orr $output_h0, $output_h0, $main_end_input_ptr
5966
5967 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
5968
5969 rev64 $res0b, $res1b @ GHASH final block
5970
5971 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5972
5973 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
5974
5975 mov $t0d, $res0.d[1] @ GHASH final block - mid
5976
5977 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
5978
5979 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
5980
5981 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
5982
5983 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
5984
5985 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
5986
5987 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
5988 movi $mod_constant.8b, #0xc2
5989
5990 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5991
5992 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5993
5994 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5995
5996 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5997
5998 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5999
6000 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
6001
6002 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
6003
6004 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
6005
6006 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
6007
6008 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
6009
6010 stp $output_l0, $output_h0, [$output_ptr]
6011
6012 str $ctr32w, [$counter, #12] @ store the updated counter
6013
6014 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
6015 ext $acc_lb, $acc_lb, $acc_lb, #8
6016 rev64 $acc_lb, $acc_lb
6017 mov x0, $len
6018 st1 { $acc_l.16b }, [$current_tag]
6019
6020 ldp x21, x22, [sp, #16]
6021 ldp x23, x24, [sp, #32]
6022 ldp d8, d9, [sp, #48]
6023 ldp d10, d11, [sp, #64]
6024 ldp d12, d13, [sp, #80]
6025 ldp d14, d15, [sp, #96]
6026 ldp x19, x20, [sp], #112
6027 ret
6028
6029.L256_dec_ret:
6030 mov w0, #0x0
6031 ret
6032.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
6033___
6034}
6035}
6036
# Append the trailer of the generated assembly to $code: an identification
# string (.asciz), an alignment directive and a closing "#endif".
# NOTE(review): the "#endif" presumably closes a preprocessor conditional
# opened earlier in $code -- not visible in this part of the file; confirm.
6037$code.=<<___;
6038.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
6039.align 2
6040#endif
6041___
6042
6043if ($flavour =~ /64/) { ######## 64-bit code
# unvmov(OPERANDS)
#
# Turn an operand pair of the form "qN#lo|hi, qM#lo|hi" into the text of an
# "ins vX.d[i],vY.d[j]" instruction.  Register numbers below 8 are used as
# they are, numbers 8 and above are shifted up by 8; "lo" selects element 0
# and "hi" element 1.  If OPERANDS does not have that form, the false result
# of the failed match is returned.
sub unvmov {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o;
    return $matched unless $matched;    # no match: hand back the false value

    my ($dst, $dst_half, $src, $src_half) = ($1, $2, $3, $4);
    $dst += 8 if $dst >= 8;
    $src += 8 if $src >= 8;

    return sprintf "ins v%d.d[%d],v%d.d[%d]",
        $dst, ($dst_half eq "lo" ? 0 : 1),
        $src, ($src_half eq "lo" ? 0 : 1);
}
# 64-bit flavour: print the accumulated code one line at a time.  On each
# line the first "@" that is followed by a whitespace character is replaced
# (together with that character) by "//".
for my $line (split("\n",$code)) {
    $line =~ s/@\s/\/\//o;              # old->new style commentary
    print $line,"\n";
}
6055} else { ######## 32-bit code
# unvdup32(OPERANDS)
#
# Turn operands of the form "qD, qS[lane]" (lane 0..3) into the text of a
# "vdup.32 qD,dX[Y]" instruction, where X = 2*S + (lane >> 1) and
# Y = lane & 1.  If OPERANDS does not have that form, the false result of
# the failed match is returned.
sub unvdup32 {
    my ($operands) = @_;

    my $matched = $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o;
    return $matched unless $matched;    # no match: hand back the false value

    my ($dst_q, $src_q, $lane) = ($1, $2, $3);
    my $src_d  = 2 * $src_q + ($lane >> 1);
    my $d_lane = $lane & 1;

    return sprintf "vdup.32 q%d,d%d[%d]", $dst_q, $src_d, $d_lane;
}
# unvpmullp64(MNEMONIC, OPERANDS)
#
# Hand-assemble a "pmull"/"pmull2" instruction whose operands are three
# q registers ("qA,qB,qC").  The three register numbers are packed into a
# base opcode word, two extra bits are set when MNEMONIC contains a "2",
# and the result is returned as an "INST(b0,b1,b2,b3)" line followed by an
# "@" comment holding the original mnemonic and operands.  Nothing is built
# when OPERANDS does not have the expected form.
sub unvpmullp64 {
    my ($mnemonic, $operands) = @_;

    if ($operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
        my ($op1, $op2, $op3) = ($1, $2, $3);

        my $word = 0xf2a00e00;
        $word |= (($op1&7)<<13) | (($op1&8)<<19);
        $word |= (($op2&7)<<17) | (($op2&8)<<4);
        $word |= (($op3&7)<<1)  | (($op3&8)<<2);
        $word |= 0x00010001 if $mnemonic =~ /2/;

        # ARMv7 instructions are always encoded little-endian, so the word
        # is emitted byte by byte, lowest byte first.  The correct solution
        # would be the .inst directive, but older assemblers don't
        # implement it :-(
        my @le_bytes = map { ($word >> $_) & 0xff } (0, 8, 16, 24);

        return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
            @le_bytes, $mnemonic, $operands;
    }
}
6079
# 32-bit flavour: rewrite every line of the accumulated code from the
# new-style spelling to the old-style one, then print it.
for my $line (split("\n",$code)) {
    # new->old registers
    $line =~ s/\b[wx]([0-9]+)\b/r$1/go;
    $line =~ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;
    # new->old style commentary
    $line =~ s/\/\/\s?/@ /o;

    # fix up remaining new-style suffixes
    $line =~ s/\],#[0-9]+/]!/o;

    # The rewrites below are alternatives: the chain stops at the first
    # substitution that succeeds, so at most one of them is applied per line.
       $line =~ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o
    or $line =~ s/vdup\.32\s+(.*)/unvdup32($1)/geo
    or $line =~ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo
    or $line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo
    or $line =~ s/^(\s+)b\./$1b/o
    or $line =~ s/^(\s+)ret/$1bx\tlr/o;

    # "mov.<cond>" becomes "mov<cond>", preceded by a separate "it <cond>"
    # line printed first.
    if ($line =~ s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
        my $cond = $2;
        print " it $cond\n";
    }

    $line =~ s/__AARCH64E([BL])__/__ARME$1__/go;
    print $line,"\n";
}
6101}
6102
# Close STDOUT explicitly to enforce a flush; die if the close fails.
close(STDOUT) or die "error closing STDOUT: $!";
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette