#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers], at least hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for longest keys. Well, on a second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on CPU ability to pipe-line instructions and have several
# of them "in-flight" at the same time. I mean while other methods,
# for example Karatsuba, aim to minimize amount of multiplications at
# the cost of other operations increase, bn_mul_mont aims to neatly
# "overlap" multiplications and the other operations [and on most
# platforms even minimize the amount of the other operations, in
# particular references to memory]. But it's possible to improve this
# module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops...

# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...

# Command-line handling:
#   $output  - the last argument, taken only when it looks like a file name
#              (it ends in ".<extension>");
#   $flavour - the first remaining argument, taken only when it contains
#              no dot.
# Either one stays undefined when the corresponding argument is absent.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# A flavour containing "31" or "32" selects the -m31 build: 4-byte size_t
# and no mnemonic suffix. Anything else, including no flavour at all,
# selects 8-byte size_t and the "g" suffix that is spliced into
# st${g}/stm${g}/l${g}/lm${g}/cl${g} further down.
if (defined($flavour) && $flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Send the generated assembly to the requested file. The three-argument
# form keeps characters in the file name from being read as an open mode,
# and a failure stops the script right here with the offending path
# instead of surfacing (if at all) only when STDOUT is closed.
if ($output) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Fixed part of the stack frame in bytes (160 when $SIZE_T is 8). The
# sixth argument, num, is fetched from just above it by the prologue.
$stdframe=16*$SIZE_T+4*8;

# Symbolic names for the general purpose registers used by bn_mul_mont.

# Work registers outside the argument list. $num is loaded from the stack
# by the prologue rather than arriving in a register.
($mn0, $num) = ("%r0", "%r1");

# Register arguments, in declaration order:
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# num itself lives on the stack [160(%r15) in the 64-bit build].
($rp, $ap, $bp, $np, $n0) = map { "%r$_" } 2..6;

# $bi shares %r2 with $rp, i.e. loading bp[i] zaps rp; the routine reloads
# rp from its stack slot before the final subtraction.
$bi = $rp;
$j  = "%r7";

# Product halves, accumulators, loop counter and stack pointer. $count
# lives in %r14, the register the routine also returns through.
($ahi, $alo, $nhi, $nlo, $AHI, $NHI, $count, $sp) = map { "%r$_" } 8..15;

# Entry sequence of bn_mul_mont: fetch num from the stack, scale it to a
# byte count, point $bp at the end of the bp vector, park rp (%r2) in its
# stack slot and return 0 when the operand is shorter than 16 bytes.
$code.=<<___;
.text
.globl	bn_mul_mont
.type	bn_mul_mont,\@function
bn_mul_mont:
	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
	la	$bp,0($num,$bp)

	st${g}	%r2,2*$SIZE_T($sp)

	cghi	$num,16		#
	lghi	%r2,0		#
	blr	%r14		# if($num<16) return 0;
___

# One extra length check, depending on the build: the -m31 flavours refuse
# an odd number of 32-bit words, every other flavour refuses operands
# longer than 96 bytes. Both cases return 0 to the caller.
if ($flavour =~ /3[12]/) {
	$code.=<<___;
	tmll	$num,4
	bnzr	%r14		# if ($num&1) return 0;
___
} else {
	$code.=<<___;
	cghi	$num,96		#
	bhr	%r14		# if($num>96) return 0;
___
}
# Main body of bn_mul_mont (Montgomery multiplication, see the .string
# directive at the end). In order of appearance:
#
#   frame setup  %r3-%r15 are saved at 3*$SIZE_T($sp); the stack pointer is
#                then lowered by $stdframe+8 plus the operand length in
#                bytes ("alloca") and the old value stored as back chain.
#                The area starting at $stdframe($sp) serves as tp[], the
#                extra 8 bytes hold the topmost overflow bit.
#   .L1st        first pass: ap[j]*bp[0] and np[j]*m1 are accumulated and
#                written to tp[j-1].
#   .Louter /    one pass per remaining bp[i]; the inner loop adds
#   .Linner      ap[j]*bp[i] and np[j]*m1 to tp[j] and stores tp[j-1].
#                The outer loop stops when $bp reaches the saved &bp[num].
#   .Lsub        rp[] = tp[] - np[] with borrow propagation; the topmost
#                overflow bit absorbs the final borrow and is turned into
#                a pair of complementary masks in $AHI/$NHI.
#   .Lcopy       mask-based conditional copy of either tp[] or the
#                difference into rp[], overwriting ("zapping") tp[] as it
#                goes.
#   epilogue     %r6-%r15 are restored and 1 is returned in %r2.
#
# When the loops finish, $j equals the operand length in bytes, which is
# why displacements of the form $stdframe+8+k*$SIZE_T($j,$sp) address the
# register save slots written before the stack pointer was lowered.
#
# _dswap is not a machine instruction: the post-processing loop at the end
# of this script turns it into "rllg reg,reg,32" when $SIZE_T is 4 and
# removes it otherwise. Backtick-quoted spans are Perl expressions that the
# same loop evaluates.
$code.=<<___;
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	$rp,-$stdframe-8	# leave room for carry bit
	lcgr	$j,$num		# -$num
	lgr	%r0,$sp
	la	$rp,0($rp,$sp)
	la	$sp,0($j,$rp)	# alloca
	st${g}	%r0,0($sp)	# back chain

	sra	$num,3		# restore $num
	la	$bp,0($j,$bp)	# restore $bp
	ahi	$num,-1		# adjust $num for inner loop
	lg	$n0,0($n0)	# pull n0
	_dswap	$n0

	lg	$bi,0($bp)
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[0]
	lgr	$AHI,$ahi

	lgr	$mn0,$alo	# "tp[0]"*n0
	msgr	$mn0,$n0

	lg	$nlo,0($np)	#
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8		# j=1
	lr	$count,$num

.align	16
.L1st:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[0]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI	# +="tp[j]"
	algr	$nlo,$alo
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.L1st

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI	# upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)
	la	$bp,8($bp)	# bp++

.Louter:
	lg	$bi,0($bp)	# bp[i]
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[i]
	alg	$alo,$stdframe($sp)	# +=tp[0]
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lgr	$mn0,$alo
	msgr	$mn0,$n0	# tp[0]*n0

	lg	$nlo,0($np)	# np[0]
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8		# j=1
	lr	$count,$num

.align	16
.Linner:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[i]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$ahi,$AHI
	alg	$alo,$stdframe($j,$sp)# +=tp[j]
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI
	algr	$nlo,$alo	# +="tp[j]"
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.Linner

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI
	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
	lghi	$ahi,0
	alcgr	$AHI,$ahi	# new upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)

	la	$bp,8($bp)	# bp++
	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
	jne	.Louter

	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
	la	$ap,$stdframe($sp)
	ahi	$num,1		# restore $num, incidentally clears "borrow"

	la	$j,0
	lr	$count,$num
.Lsub:	lg	$alo,0($j,$ap)
	lg	$nlo,0($j,$np)
	_dswap	$nlo
	slbgr	$alo,$nlo
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lsub
	lghi	$ahi,0
	slbgr	$AHI,$ahi	# handle upmost carry
	lghi	$NHI,-1
	xgr	$NHI,$AHI

	la	$j,0
	lgr	$count,$num
.Lcopy:	lg	$ahi,$stdframe($j,$sp)	# conditional copy
	lg	$alo,0($j,$rp)
	ngr	$ahi,$AHI
	ngr	$alo,$NHI
	ogr	$alo,$ahi
	_dswap	$alo
	stg	$j,$stdframe($j,$sp)	# zap tp
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lcopy

	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
	lm${g}	%r6,%r15,0(%r1)
	lghi	%r2,1		# signal "processed"
	br	%r14
.size	bn_mul_mont,.-bn_mul_mont
.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate one backtick-quoted Perl expression taken from the assembly
# template and return its value. A failing expression aborts the run: the
# alternative would be an instruction with a silently missing operand.
sub eval_asm_expr {
	my ($expr) = @_;
	my $value = eval $expr;
	die "cannot evaluate `$expr` in assembly template: $@" if $@;
	return $value;
}

# Expand a single template line and return the result:
#   - every `expr` span is replaced by the value of the Perl expression;
#   - the first "_dswap reg" becomes "rllg reg,reg,32" when $size_t is 4
#     and is removed (leaving the surrounding whitespace) otherwise.
sub expand_asm_line {
	my ($line, $size_t) = @_;

	$line =~ s/\`([^\`]*)\`/eval_asm_expr($1)/ge;
	$line =~ s/_dswap\s+(%r[0-9]+)/$size_t==4 ? sprintf("rllg\t%s,%s,32",$1,$1) : ""/e;
	return $line;
}

# Emit the accumulated template line by line, then make sure everything
# actually reached the output: buffered write errors surface at close.
foreach (split("\n",$code)) {
	print expand_asm_line($_, $SIZE_T),"\n";
}
close STDOUT or die "error closing STDOUT: $!";