VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.7/crypto/modes/asm/ghash-s390x.pl@ 108344

最後變更 在這個檔案從108344是 104078,由 vboxsync 提交於 12 月 前

openssl-3.1.5: Applied and adjusted our OpenSSL changes to 3.1.4. bugref:10638

檔案大小: 6.2 KB
 
1#! /usr/bin/env perl
2# Copyright 2010-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# September 2010.
18#
19# The module implements "4-bit" GCM GHASH function and underlying
20# single multiplication operation in GF(2^128). "4-bit" means that it
21# uses 256 bytes per-key table [+128 bytes shared table]. Performance
22# was measured to be ~18 cycles per processed byte on z10, which is
23# almost 40% better than gcc-generated code. It should be noted that
24# 18 cycles is a worse result than expected: the loop is scheduled for 12
25# and the result should be close to 12. In the absence of instruction-
26# level profiling data it's impossible to tell why...
27
28# November 2010.
29#
30# Adapt for -m31 build. If kernel supports what's called "highgprs"
31# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
32# instructions and achieve "64-bit" performance even in 31-bit legacy
33# application context. The feature is not specific to any particular
34# processor, as long as it's "z-CPU". Latter implies that the code
35# remains z/Architecture specific. On z990 it was measured to perform
36# 2.8x better than 32-bit code generated by gcc 4.3.
37
38# March 2011.
39#
40# Support for hardware KIMD-GHASH is verified to produce correct
41# result and therefore is engaged. On z196 it was measured to process
42# 8KB buffer ~7 times faster than software implementation. It's not as
43# impressive for smaller buffer sizes, and for the smallest 16-byte buffer
44# it's actually almost 2 times slower, which is the reason why
45# KIMD-GHASH is not used in gcm_gmult_4bit.
46
47# $output is the last argument if it looks like a file (it has an extension)
48# $flavour is the first argument if it doesn't look like a file
# Split the command line: the last argument is the output file when it ends
# in an extension, and the first remaining argument is the flavour when it
# contains no dot. Either may be absent, in which case it stays undef.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Flavours containing "31" or "32" get 4-byte slots and the plain stm/lm
# mnemonics; everything else (including no flavour at all) gets 8-byte slots
# and the "g" mnemonic suffix. The defined() guard keeps the match from
# being applied to undef when no flavour was given.
if (defined($flavour) && $flavour =~ /3[12]/) {
    $SIZE_T=4;
    $g="";
} else {
    $SIZE_T=8;
    $g="g";
}

# Redirect STDOUT to the requested file. The three-argument form keeps the
# file name from being parsed as part of the open mode, and the explicit
# check reports an unwritable path here instead of letting it surface only
# as a failure of the final close.
if ($output) {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}
61
# A true value here leaves the KIMD-GHASH hardware path out of the
# generated gcm_ghash_4bit (the corresponding heredoc is appended only
# when this is false).
$softonly = 0;

# Halves of the 128-bit accumulator Z.
($Zhi, $Zlo) = qw(%r0 %r1);

# Argument block.
($Xi, $Htbl, $inp, $len) = qw(%r2 %r3 %r4 %r5);

# Working variables.
($rem0, $rem1) = qw(%r6 %r7);
($nlo, $nhi)   = qw(%r8 %r9);
($xi, $cnt)    = qw(%r10 %r11);
($tmp, $x78)   = qw(%r12 %r13);
$rem_4bit      = "%r14";

# Base register for every stack access in the generated code.
$sp = "%r15";
83
# Assembly preamble: include s390x_arch.h (the later code uses S390X_KIMD and
# S390X_GHASH, which are not defined in this file; presumably they come from
# that header), switch to .text and open the exported gcm_gmult_4bit symbol
# behind an .align 32.
84$code.=<<___;
85#include "s390x_arch.h"
86
87.text
88
89.globl gcm_gmult_4bit
90.align 32
91gcm_gmult_4bit:
92___
# Body of gcm_gmult_4bit: one multiplication, performed by jumping into the
# block loop of gcm_ghash_4bit at .Lgmult_shortcut. Inputs are $Xi and $Htbl
# from the argument block of the register map.
#   * %r6-%r14 are stored at 6*$SIZE_T($sp); the exit path shared with
#     gcm_ghash_4bit reloads them from the same place.
#   * $Xi is biased by -1. Xi itself is therefore addressed as 0+1($Xi) and
#     8+1($Xi), and the byte loads 0($cnt,$Xi) of the inner loop, where $cnt
#     runs from 14 down to 1, read Xi[13] down to Xi[0].
#   * $len = 1, so the "brctg $len,.Louter" that ends a block falls through
#     after a single pass.
#   * $x78 = 0xf<<3 (0x78) masks a 4-bit value that has been scaled by 8,
#     i.e. a byte offset into the 8-byte entries of rem_4bit.
#   * Only the low half of Xi is loaded (into $Zlo). The code at
#     .Lgmult_shortcut takes the last two bytes of Xi from $Zlo, fetches the
#     remaining bytes from memory, and overwrites $Zhi with a table load
#     before reading it.
# The same heredoc then closes the symbol (.type/.size) and opens the
# exported gcm_ghash_4bit label.
93$code.=<<___;
94 stm${g} %r6,%r14,6*$SIZE_T($sp)
95
96 aghi $Xi,-1
97 lghi $len,1
98 lghi $x78,`0xf<<3`
99 larl $rem_4bit,rem_4bit
100
101 lg $Zlo,8+1($Xi) # Xi
102 j .Lgmult_shortcut
103.type gcm_gmult_4bit,\@function
104.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
105
106.globl gcm_ghash_4bit
107.align 32
108gcm_ghash_4bit:
109___
# Hardware path of gcm_ghash_4bit; appended only when $softonly is false.
#
# The doubleword at S390X_KIMD+8 inside OPENSSL_s390xcap_P is loaded and
# tested with "tmhh ...,0x4000" (the check for function 65, per the inline
# comment). If the bit is clear, control goes to .Lsoft_ghash.
#
# Otherwise a 32-byte parameter block is built at 8($sp): 16 bytes copied
# from 0($Xi), followed by 16 bytes copied from 8*16($Htbl).
# NOTE(review): the emitted comment labels both fields "128byte", but each
# one is moved with a single lmg/stmg register pair, i.e. 16 bytes
# (128 bits).
#
# %r1 then points at the block and %r0 holds S390X_GHASH. The instruction is
# emitted as a raw word (.long 0xb93e0004, "kimd %r0,$inp"), and "brc 1,.-4"
# branches back onto that word for what the inline comment calls partial
# completion. Afterwards the first 16 bytes of the block are copied back to
# 0($Xi) and the function returns through %r14. This path stores nothing at
# 6*$SIZE_T($sp) and explicitly writes only %r0 and %r1.
110$code.=<<___ if(!$softonly);
111 larl %r1,OPENSSL_s390xcap_P
112 lg %r0,S390X_KIMD+8(%r1) # load second word of kimd capabilities
113 # vector
114 tmhh %r0,0x4000 # check for function 65
115 jz .Lsoft_ghash
116 # Do not assume this function is called from a gcm128_context.
117 # This is not true, e.g., for AES-GCM-SIV.
118 # Parameter Block:
119 # Chaining Value (XI) 128byte
120 # Key (Htable[8]) 128byte
121 lmg %r0,%r1,0($Xi)
122 stmg %r0,%r1,8($sp)
123 lmg %r0,%r1,8*16($Htbl)
124 stmg %r0,%r1,24($sp)
125 la %r1,8($sp)
126 lghi %r0,S390X_GHASH # function 65
127 .long 0xb93e0004 # kimd %r0,$inp
128 brc 1,.-4 # pay attention to "partial completion"
129 lmg %r0,%r1,8($sp)
130 stmg %r0,%r1,0($Xi)
131 br %r14
132.align 32
133.Lsoft_ghash:
134___
125 la %r1,8($sp)
126 lghi %r0,S390X_GHASH # function 65
127 .long 0xb93e0004 # kimd %r0,$inp
128 brc 1,.-4 # pay attention to "partial completion"
129 lmg %r0,%r1,8($sp)
130 stmg %r0,%r1,0($Xi)
131 br %r14
132.align 32
133.Lsoft_ghash:
134___
135$code.=<<___ if ($flavour =~ /3[12]/);
136 llgfr $len,$len
137___
138$code.=<<___;
139 stm${g} %r6,%r14,6*$SIZE_T($sp)
140
141 aghi $Xi,-1
142 srlg $len,$len,4
143 lghi $x78,`0xf<<3`
144 larl $rem_4bit,rem_4bit
145
146 lg $Zlo,8+1($Xi) # Xi
147 lg $Zhi,0+1($Xi)
148 lghi $tmp,0
149.Louter:
150 xg $Zhi,0($inp) # Xi ^= inp
151 xg $Zlo,8($inp)
152 xgr $Zhi,$tmp
153 stg $Zlo,8+1($Xi)
154 stg $Zhi,0+1($Xi)
155
156.Lgmult_shortcut:
157 lghi $tmp,0xf0
158 sllg $nlo,$Zlo,4
159 srlg $xi,$Zlo,8 # extract second byte
160 ngr $nlo,$tmp
161 lgr $nhi,$Zlo
162 lghi $cnt,14
163 ngr $nhi,$tmp
164
165 lg $Zlo,8($nlo,$Htbl)
166 lg $Zhi,0($nlo,$Htbl)
167
168 sllg $nlo,$xi,4
169 sllg $rem0,$Zlo,3
170 ngr $nlo,$tmp
171 ngr $rem0,$x78
172 ngr $xi,$tmp
173
174 sllg $tmp,$Zhi,60
175 srlg $Zlo,$Zlo,4
176 srlg $Zhi,$Zhi,4
177 xg $Zlo,8($nhi,$Htbl)
178 xg $Zhi,0($nhi,$Htbl)
179 lgr $nhi,$xi
180 sllg $rem1,$Zlo,3
181 xgr $Zlo,$tmp
182 ngr $rem1,$x78
183 sllg $tmp,$Zhi,60
184 j .Lghash_inner
185.align 16
186.Lghash_inner:
187 srlg $Zlo,$Zlo,4
188 srlg $Zhi,$Zhi,4
189 xg $Zlo,8($nlo,$Htbl)
190 llgc $xi,0($cnt,$Xi)
191 xg $Zhi,0($nlo,$Htbl)
192 sllg $nlo,$xi,4
193 xg $Zhi,0($rem0,$rem_4bit)
194 nill $nlo,0xf0
195 sllg $rem0,$Zlo,3
196 xgr $Zlo,$tmp
197 ngr $rem0,$x78
198 nill $xi,0xf0
199
200 sllg $tmp,$Zhi,60
201 srlg $Zlo,$Zlo,4
202 srlg $Zhi,$Zhi,4
203 xg $Zlo,8($nhi,$Htbl)
204 xg $Zhi,0($nhi,$Htbl)
205 lgr $nhi,$xi
206 xg $Zhi,0($rem1,$rem_4bit)
207 sllg $rem1,$Zlo,3
208 xgr $Zlo,$tmp
209 ngr $rem1,$x78
210 sllg $tmp,$Zhi,60
211 brct $cnt,.Lghash_inner
212
213 srlg $Zlo,$Zlo,4
214 srlg $Zhi,$Zhi,4
215 xg $Zlo,8($nlo,$Htbl)
216 xg $Zhi,0($nlo,$Htbl)
217 sllg $xi,$Zlo,3
218 xg $Zhi,0($rem0,$rem_4bit)
219 xgr $Zlo,$tmp
220 ngr $xi,$x78
221
222 sllg $tmp,$Zhi,60
223 srlg $Zlo,$Zlo,4
224 srlg $Zhi,$Zhi,4
225 xg $Zlo,8($nhi,$Htbl)
226 xg $Zhi,0($nhi,$Htbl)
227 xgr $Zlo,$tmp
228 xg $Zhi,0($rem1,$rem_4bit)
229
230 lg $tmp,0($xi,$rem_4bit)
231 la $inp,16($inp)
232 sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
233 brctg $len,.Louter
234
235 xgr $Zhi,$tmp
236 stg $Zlo,8+1($Xi)
237 stg $Zhi,0+1($Xi)
238 lm${g} %r6,%r14,6*$SIZE_T($sp)
239 br %r14
240.type gcm_ghash_4bit,\@function
241.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
242
243.align 64
244rem_4bit:
245 .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
246 .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
247 .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
248 .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
249.type rem_4bit,\@object
250.size rem_4bit,(.-rem_4bit)
251.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
252___
253
# Replace every backtick-quoted span in the accumulated text with the value
# of that span evaluated as a Perl expression (e.g. the 0xf<<3 and the
# rem_4bit constants).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

# Write the finished assembly, then close STDOUT explicitly so that a
# failure while closing it is reported rather than lost.
print $code;
unless (close STDOUT) {
    die "error closing STDOUT: $!";
}
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette