#! /usr/bin/env perl
# Copyright 2010-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2010.
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# was measured to be ~18 cycles per processed byte on z10, which is
# almost 40% better than gcc-generated code. It should be noted that
# 18 cycles is a worse result than expected: loop is scheduled for 12
# and the result should be close to 12. In the absence of instruction-
# level profiling data it's impossible to tell why...

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.

# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce correct
# result and therefore is engaged. On z196 it was measured to process
# 8KB buffer ~7x faster than software implementation. It's not as
# impressive for smaller buffer sizes and for smallest 16-bytes buffer
# it's actually almost 2 times slower. Which is the reason why
# KIMD-GHASH is not used in gcm_gmult_4bit.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# A flavour containing "31" or "32" selects 4-byte stack slots and the
# unsuffixed stm/lm mnemonics; anything else, including no flavour at all,
# selects 8-byte slots and the "g"-suffixed forms (stmg/lmg).
if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Send the generated assembly to $output when one was given; otherwise it
# goes to the inherited STDOUT.  Three-argument open keeps the file name from
# being parsed for mode characters, and a failed open is fatal rather than
# silently leaving the output on the original STDOUT.
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# When $softonly is non-zero the hardware (KIMD) path of gcm_ghash_4bit is
# not emitted at all.
$softonly=0;

# Symbolic names for the registers used by the generated code.  They are
# interpolated into the assembly templates below.
$Zhi="%r0";	# the two 64-bit halves of the value being computed
$Zlo="%r1";

$Xi="%r2";	# argument block
$Htbl="%r3";
$inp="%r4";
$len="%r5";

$rem0="%r6";	# variables
$rem1="%r7";
$nlo="%r8";
$nhi="%r9";
$xi="%r10";
$cnt="%r11";
$tmp="%r12";
$x78="%r13";	# loaded with the constant 0xf<<3 by the generated code
$rem_4bit="%r14";	# loaded with the address of the rem_4bit table

$sp="%r15";

# File prologue and the gcm_gmult_4bit entry point.
#
# gcm_gmult_4bit has no loop of its own.  It saves %r6-%r14, sets $len to 1
# so that the shared outer loop in gcm_ghash_4bit runs exactly once, loads
# the low half of Xi and jumps to .Lgmult_shortcut inside gcm_ghash_4bit.
# $Xi is biased by -1 because the shared inner loop reads the bytes of Xi
# as 0($cnt,$Xi) with $cnt counting down from 14 to 1.
$code.=<<___;
#include "s390x_arch.h"

.text

.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
___
# The second template also opens gcm_ghash_4bit: its body is appended by the
# templates that follow.
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	lghi	$len,1
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	j	.Lgmult_shortcut
.type	gcm_gmult_4bit,\@function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
___
# Hardware path of gcm_ghash_4bit, emitted unless $softonly is set.
#
# The generated code tests the KIMD capability word for function 65 and
# branches to .Lsoft_ghash when it is absent.  Otherwise it builds the
# parameter block on the stack at 8($sp): 16 bytes copied from 0($Xi)
# followed by 16 bytes copied from 8*16($Htbl).  It then issues KIMD
# (re-issuing it on partial completion) and copies the first 16 bytes of
# the block back to Xi.  Each field is moved with one two-register
# lmg/stmg, i.e. 128 bits, which is what the emitted comments state.
$code.=<<___ if(!$softonly);
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,S390X_KIMD+8(%r1)	# load second word of kimd capabilities
					#  vector
	tmhh	%r0,0x4000	# check for function 65
	jz	.Lsoft_ghash
	# Do not assume this function is called from a gcm128_context.
	# This is not true, e.g., for AES-GCM-SIV.
	# Parameter Block:
	#	Chaining Value (XI)	128 bits
	#	Key (Htable[8])		128 bits
	lmg	%r0,%r1,0($Xi)
	stmg	%r0,%r1,8($sp)
	lmg	%r0,%r1,8*16($Htbl)
	stmg	%r0,%r1,24($sp)
	la	%r1,8($sp)
	lghi	%r0,S390X_GHASH	# function 65
	.long	0xb93e0004	# kimd %r0,$inp
	brc	1,.-4		# pay attention to "partial completion"
	lmg	%r0,%r1,8($sp)
	stmg	%r0,%r1,0($Xi)
	br	%r14
.align	32
.Lsoft_ghash:
___
# Software path of gcm_ghash_4bit, followed by the shared rem_4bit table.
#
# For 31/32-bit flavours the length argument is first zero-extended to 64
# bits, because the code below operates on it with 64-bit instructions.
$code.=<<___ if ($flavour =~ /3[12]/);
	llgfr	$len,$len
___
# $len is converted from a byte count to a count of 16-byte blocks (srlg by
# 4) and drives the outer loop via brctg; no check for a zero or non-multiple
# length is made here, so callers are presumably expected to pass a non-zero
# multiple of 16 -- confirm against the callers.
#
# Each outer iteration XORs one 16-byte input block into Xi, stores it back
# and then processes it; gcm_gmult_4bit joins at .Lgmult_shortcut.  The
# rem_4bit correction for the final step of an iteration is kept in $tmp and
# applied at the top of the next iteration or after the loop.
#
# rem_4bit holds 16 entries of 8 bytes each (every "value,0" pair of .long);
# it is indexed with a value masked by $x78 (0xf<<3).
$code.=<<___;
	stm${g}	%r6,%r14,6*$SIZE_T($sp)

	aghi	$Xi,-1
	srlg	$len,$len,4
	lghi	$x78,`0xf<<3`
	larl	$rem_4bit,rem_4bit

	lg	$Zlo,8+1($Xi)		# Xi
	lg	$Zhi,0+1($Xi)
	lghi	$tmp,0
.Louter:
	xg	$Zhi,0($inp)		# Xi ^= inp
	xg	$Zlo,8($inp)
	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)

.Lgmult_shortcut:
	lghi	$tmp,0xf0
	sllg	$nlo,$Zlo,4
	srlg	$xi,$Zlo,8		# extract second byte
	ngr	$nlo,$tmp
	lgr	$nhi,$Zlo
	lghi	$cnt,14
	ngr	$nhi,$tmp

	lg	$Zlo,8($nlo,$Htbl)
	lg	$Zhi,0($nlo,$Htbl)

	sllg	$nlo,$xi,4
	sllg	$rem0,$Zlo,3
	ngr	$nlo,$tmp
	ngr	$rem0,$x78
	ngr	$xi,$tmp

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	j	.Lghash_inner
.align	16
.Lghash_inner:
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	llgc	$xi,0($cnt,$Xi)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$nlo,$xi,4
	xg	$Zhi,0($rem0,$rem_4bit)
	nill	$nlo,0xf0
	sllg	$rem0,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem0,$x78
	nill	$xi,0xf0

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	lgr	$nhi,$xi
	xg	$Zhi,0($rem1,$rem_4bit)
	sllg	$rem1,$Zlo,3
	xgr	$Zlo,$tmp
	ngr	$rem1,$x78
	sllg	$tmp,$Zhi,60
	brct	$cnt,.Lghash_inner

	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nlo,$Htbl)
	xg	$Zhi,0($nlo,$Htbl)
	sllg	$xi,$Zlo,3
	xg	$Zhi,0($rem0,$rem_4bit)
	xgr	$Zlo,$tmp
	ngr	$xi,$x78

	sllg	$tmp,$Zhi,60
	srlg	$Zlo,$Zlo,4
	srlg	$Zhi,$Zhi,4
	xg	$Zlo,8($nhi,$Htbl)
	xg	$Zhi,0($nhi,$Htbl)
	xgr	$Zlo,$tmp
	xg	$Zhi,0($rem1,$rem_4bit)

	lg	$tmp,0($xi,$rem_4bit)
	la	$inp,16($inp)
	sllg	$tmp,$tmp,4		# correct last rem_4bit[rem]
	brctg	$len,.Louter

	xgr	$Zhi,$tmp
	stg	$Zlo,8+1($Xi)
	stg	$Zhi,0+1($Xi)
	lm${g}	%r6,%r14,6*$SIZE_T($sp)
	br	%r14
.type	gcm_ghash_4bit,\@function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)

.align	64
rem_4bit:
	.long	`0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
	.long	`0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
	.long	`0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
	.long	`0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type	rem_4bit,\@object
.size	rem_4bit,(.-rem_4bit)
.string	"GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `...` span in the generated text with the value of the Perl
# expression it contains (for example `0xf<<3` becomes 120), then write the
# result out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors only surface when the handle is closed, so a failed
# close is treated as fatal.
close STDOUT or die "error closing STDOUT: $!";