1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 |
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 |
|
---|
17 | # On PA-7100LC this module performs ~90-50% better, less for longer
|
---|
18 | # keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
|
---|
19 | # that compiler utilized xmpyu instruction to perform 32x32=64-bit
|
---|
20 | # multiplication, which in turn means that "baseline" performance was
|
---|
21 | # optimal in respect to instruction set capabilities. Fair comparison
|
---|
22 | # with vendor compiler is problematic, because OpenSSL doesn't define
|
---|
23 | # BN_LLONG [presumably] for historical reasons, which drives compiler
|
---|
24 | # toward 4 times 16x16=32-bit multiplications [plus complementary
|
---|
25 | # shifts and additions] instead. This means that you should observe
|
---|
26 | # several times improvement over code generated by vendor compiler
|
---|
27 | # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
|
---|
28 | # improvement coefficient was never collected on PA-7100LC, or any
|
---|
29 | # other 1.1 CPU, because I don't have access to such machine with
|
---|
30 | # vendor compiler. But to give you a taste, PA-RISC 1.1 code path
|
---|
31 | # reportedly outperformed code generated by cc +DA1.1 +O3 by factor
|
---|
32 | # of ~5x on PA-8600.
|
---|
33 | #
|
---|
34 | # On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
|
---|
35 | # reportedly ~2x faster than vendor compiler generated code [according
|
---|
36 | # to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
|
---|
37 | # this implementation is actually 32-bit one, in the sense that it
|
---|
38 | # operates on 32-bit values. But pa-risc2[W].s operates on arrays of
|
---|
39 | # 64-bit BN_LONGs... How do they interoperate then? No problem. This
|
---|
40 | # module picks halves of 64-bit values in reverse order and pretends
|
---|
41 | # they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
|
---|
42 | # 64-bit code such as pa-risc2[W].s then? Well, the thing is that
|
---|
43 | # 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
|
---|
44 | # i.e. there is no "wider" multiplication like on most other 64-bit
|
---|
45 | # platforms. This means that even being effectively 32-bit, this
|
---|
46 | # implementation performs "64-bit" computational task in same amount
|
---|
47 | # of arithmetic operations, most notably multiplications. It requires
|
---|
48 | # more memory references, most notably to tp[num], but this doesn't
|
---|
49 | # seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
|
---|
50 | # 2.0 code path provides virtually same performance as pa-risc2[W].s:
|
---|
51 | # it's ~10% better for shortest key length and ~10% worse for longest
|
---|
52 | # one.
|
---|
53 | #
|
---|
54 | # In case it wasn't clear. The module has two distinct code paths:
|
---|
55 | # PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
|
---|
56 | # additions and 64-bit integer loads, not to mention specific
|
---|
57 | # instruction scheduling. In 64-bit build naturally only 2.0 code path
|
---|
58 | # is assembled. In 32-bit application context both code paths are
|
---|
59 | # assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
|
---|
60 | # is taken automatically. Also, in 32-bit build the module imposes
|
---|
61 | # couple of limitations: vector lengths have to be even and vector
|
---|
62 | # addresses have to be 64-bit aligned. Normally neither is a problem:
|
---|
63 | # most common key lengths are even and vectors are commonly malloc-ed,
|
---|
64 | # which ensures alignment.
|
---|
65 | #
|
---|
66 | # Special thanks to polarhome.com for providing HP-UX account on
|
---|
67 | # PA-RISC 1.1 machine, and to correspondent who chose to remain
|
---|
68 | # anonymous for testing the code on PA-RISC 2.0 machine.
|
---|
69 | |
---|
70 |
|
---|
# Directory this script was invoked from (including the trailing path
# separator); it is used further down to locate opensslconf.h.  Falls back
# to the empty string when $0 carries no directory component, instead of
# relying on whatever $1 happens to hold after a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Redirect everything printed from here on into $output.  The three-argument
# form keeps characters in the file name from being interpreted as part of
# the open mode, and a failure is fatal rather than silently leaving STDOUT
# unusable (which would produce an empty or missing assembly file).
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
|
---|
79 |
|
---|
# Select ABI-dependent parameters from $flavour.  A flavour containing "64"
# selects the 64-bit (2.0W) ABI; anything else, including no flavour at all,
# selects the 32-bit ABI.
#   $LEVEL         argument of the .LEVEL assembler directive
#   $SIZE_T        size in bytes of one saved-register / pointer slot
#   $FRAME_MARKER  size in bytes of the ABI frame marker
#   $SAVED_RP      distance below %sp at which %r2 is saved in the prologue
#   $PUSH/$PUSHMA  store and store-with-modify mnemonics
#   $POP/$POPMB    load and load-with-modify mnemonics
#   $BN_SZ         size in bytes of one bignum word as seen by the caller
if (defined($flavour) && $flavour =~ /64/) {
    $LEVEL        = "2.0W";
    $SIZE_T       = 8;
    $FRAME_MARKER = 80;
    $SAVED_RP     = 16;
    $PUSH         = "std";
    $PUSHMA       = "std,ma";
    $POP          = "ldd";
    $POPMB        = "ldd,mb";
    $BN_SZ        = $SIZE_T;
} else {
    $LEVEL        = "1.1";      #$LEVEL.="\n\t.ALLOW\t2.0";
    $SIZE_T       = 4;
    $FRAME_MARKER = 48;
    $SAVED_RP     = 20;
    $PUSH         = "stw";
    $PUSHMA       = "stwm";
    $POP          = "ldw";
    $POPMB        = "ldwm";
    $BN_SZ        = $SIZE_T;

    # A 32-bit build can still be configured with 64-bit bignum words.  If
    # opensslconf.h (looked up relative to this script) has a #define whose
    # name starts with SIXTY_FOUR_BIT, switch to 8-byte words and assemble
    # for PA-RISC 2.0.  A missing header is not an error: the 32-bit
    # defaults above simply stay in effect.  A lexical handle and loop
    # variable are used so neither a global handle nor $_ is disturbed.
    if (open(my $conf, '<', "${dir}../../opensslconf.h")) {
        while (my $line = <$conf>) {
            next unless $line =~ m/#\s*define\s+SIXTY_FOUR_BIT/;
            $BN_SZ = 8;
            $LEVEL = "2.0";
            last;
        }
        close($conf);
    }
}
|
---|
111 |
|
---|
# Stack frame: eight saved-register slots, the ABI frame marker
# [+ argument transfer] and 32 bytes of local variables.  $LOCALS is the
# offset from the frame base at which the local variables start, i.e. the
# size of the saved-register area.
$LOCALS = 8*$SIZE_T;
$FRAME  = $LOCALS + $FRAME_MARKER + 32;

# Temporaries.
($tp, $ti1, $ti0) = ("%r31", "%r29", "%r28");

# Arguments and loop bookkeeping.
($rp, $ap, $bp, $np) = ("%r26", "%r25", "%r24", "%r23");
$n0  = "%r22";                  # passed through stack in 32-bit
$num = "%r21";                  # passed through stack in 32-bit
($idx, $arrsz) = ("%r20", "%r19");

# Product accumulators (callee-saved registers).
($nm1, $nm0, $ab1, $ab0) = ("%r7", "%r6", "%r5", "%r4");

# Frame base and carry words.
($fp, $hi1, $hi0) = ("%r3", "%r2", "%r1");

# The transfer-area pointer reuses the register that carried n0.
$xfer = $n0;                    # accommodates [-16..15] offset in fld[dw]s

# Floating-point registers; $fti shares its register with $fm0.
$fm0 = "%fr4";
$fti = $fm0;
($fbi, $fn0)         = ("%fr5L", "%fr5R");
($fai, $fab0, $fab1) = ("%fr6", "%fr7", "%fr8");
($fni, $fnm0, $fnm1) = ("%fr9", "%fr10", "%fr11");
|
---|
146 |
|
---|
# Start the generated assembly: .LEVEL/.SPACE/.SUBSPA directives, the exported
# bn_mul_mont entry point and its prologue.  The prologue stores %r2 at
# -$SAVED_RP(%sp), pushes %r3 while advancing %sp by $FRAME, stores %r4-%r10
# into the new frame and leaves the frame base in $fp.
# NOTE(review): the back-quoted arithmetic (e.g. `-$FRAME+1*$SIZE_T`) is
# presumably evaluated by a later pass over $code that is not visible in this
# part of the file -- confirm.
147 | $code=<<___;
|
---|
148 | .LEVEL $LEVEL
|
---|
149 | .SPACE \$TEXT\$
|
---|
150 | .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
---|
151 |
|
---|
152 | .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
|
---|
153 | .ALIGN 64
|
---|
154 | bn_mul_mont
|
---|
155 | .PROC
|
---|
156 | .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
|
---|
157 | .ENTRY
|
---|
158 | $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
---|
159 | $PUSHMA %r3,$FRAME(%sp)
|
---|
160 | $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
---|
161 | $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
---|
162 | $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
---|
163 | $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
|
---|
164 | $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
|
---|
165 | $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
|
---|
166 | $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
|
---|
167 | ldo -$FRAME(%sp),$fp
|
---|
168 | ___
|
---|
# 32-bit ABI only ($SIZE_T==4): $n0 and $num do not arrive in registers, so
# load them from the words just below the caller's frame marker, addressed
# relative to $fp.  The two nops only pad for alignment.
169 | $code.=<<___ if ($SIZE_T==4);
|
---|
170 | ldw `-$FRAME_MARKER-4`($fp),$n0
|
---|
171 | ldw `-$FRAME_MARKER-8`($fp),$num
|
---|
172 | nop
|
---|
173 | nop ; alignment
|
---|
174 | ___
|
---|
# 4-byte bignum words ($BN_SZ==4): input sanity checks followed by the first
# loads.  Control goes to L$abort (defined outside this part of the file)
# with %r28 = 0, i.e. "unhandled", unless the vectors are long enough (the
# length is compared against 6), $num is even, and the low three bits of
# $ap|$np are zero (64-bit alignment).  Then n0[0] is loaded into $fn0 and
# bp[0] into $fbi, post-incrementing $bp by 4.
175 | $code.=<<___ if ($BN_SZ==4);
|
---|
176 | comiclr,<= 6,$num,%r0 ; are vectors long enough?
|
---|
177 | b L\$abort
|
---|
178 | ldi 0,%r28 ; signal "unhandled"
|
---|
179 | add,ev %r0,$num,$num ; is $num even?
|
---|
180 | b L\$abort
|
---|
181 | nop
|
---|
182 | or $ap,$np,$ti1
|
---|
183 | extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
|
---|
184 | b L\$abort
|
---|
185 | nop
|
---|
186 | nop ; alignment
|
---|
187 | nop
|
---|
188 |
|
---|
189 | fldws 0($n0),${fn0}
|
---|
190 | fldws,ma 4($bp),${fbi} ; bp[0]
|
---|
191 | ___
|
---|
# 8-byte bignum words ($BN_SZ==8): branch to L$abort with %r28 = 0
# ("unhandled") when the vectors are too short (length compared against 3),
# then double $num because the core works on 32-bit halves.  n0 and bp[0] are
# read from offset 4 of their 64-bit words, i.e. the low half in the flipped
# word order described in the header comment.
192 | $code.=<<___ if ($BN_SZ==8);
|
---|
193 | comib,> 3,$num,L\$abort ; are vectors long enough?
|
---|
194 | ldi 0,%r28 ; signal "unhandled"
|
---|
195 | addl $num,$num,$num ; I operate on 32-bit values
|
---|
196 |
|
---|
197 | fldws 4($n0),${fn0} ; only low part of n0
|
---|
198 | fldws 4($bp),${fbi} ; bp[0] in flipped word order
|
---|
199 | ___
|
---|
# Setup shared by both code paths:
#  - load ap[0,1] and np[0,1];
#  - $arrsz = 4*$num (vector size in bytes); grow the stack by
#    ($arrsz+36) rounded down to a multiple of 32 to hold tp[num+1], and
#    save $fp just below the new %sp;
#  - point $xfer at the FPU<->integer transfer area ($LOCALS+16 from $fp)
#    and $tp at $LOCALS+32+4 from $fp;
#  - compute ap[0]*bp[0], ap[1]*bp[0] and m = n0 * low word of ap[0]*bp[0];
#  - move $ap/$np to the end of their vectors so that they can be indexed
#    with the negative, upward-counting byte offset $idx;
#  - compute np[0]*m and np[1]*m, park all four products in the transfer
#    area and preload ap[2,3]/np[2,3].
200 | $code.=<<___;
|
---|
201 | fldds 0($ap),${fai} ; ap[0,1]
|
---|
202 | fldds 0($np),${fni} ; np[0,1]
|
---|
203 |
|
---|
204 | sh2addl $num,%r0,$arrsz
|
---|
205 | ldi 31,$hi0
|
---|
206 | ldo 36($arrsz),$hi1 ; space for tp[num+1]
|
---|
207 | andcm $hi1,$hi0,$hi1 ; align
|
---|
208 | addl $hi1,%sp,%sp
|
---|
209 | $PUSH $fp,-$SIZE_T(%sp)
|
---|
210 |
|
---|
211 | ldo `$LOCALS+16`($fp),$xfer
|
---|
212 | ldo `$LOCALS+32+4`($fp),$tp
|
---|
213 |
|
---|
214 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
|
---|
215 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
|
---|
216 | xmpyu ${fn0},${fab0}R,${fm0}
|
---|
217 |
|
---|
218 | addl $arrsz,$ap,$ap ; point at the end
|
---|
219 | addl $arrsz,$np,$np
|
---|
220 | subi 0,$arrsz,$idx ; j=0
|
---|
221 | ldo 8($idx),$idx ; j++++
|
---|
222 |
|
---|
223 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
---|
224 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
---|
225 | fstds ${fab0},-16($xfer)
|
---|
226 | fstds ${fnm0},-8($xfer)
|
---|
227 | fstds ${fab1},0($xfer)
|
---|
228 | fstds ${fnm1},8($xfer)
|
---|
229 | flddx $idx($ap),${fai} ; ap[2,3]
|
---|
230 | flddx $idx($np),${fni} ; np[2,3]
|
---|
231 | ___
|
---|
# 4-byte bignum words only: run-time selection between the two code paths.
# 31 (still in $hi0) is written to %cr11, then a conditional 64-bit extract
# (extrd,u,*=) decides whether the following branch to L$parisc11 is taken.
# NOTE(review): presumably the branch is nullified only on PA-RISC 2.0 CPUs,
# so that 1.x CPUs fall into the PA-RISC 1.1 path -- confirm against the
# architecture manual.
232 | $code.=<<___ if ($BN_SZ==4);
|
---|
233 | mtctl $hi0,%cr11 ; $hi0 still holds 31
|
---|
234 | extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
|
---|
235 | b L\$parisc11
|
---|
236 | nop
|
---|
237 | ___
|
---|
# PA-RISC 2.0 path, first pass over the vectors (multiplication by bp[0]).
# The L$1st loop handles two 32-bit words per iteration: xmpyu products are
# written to the $xfer transfer area with fstds and read back into general
# registers with ldd, split into high/low halves with extrd,u, and summed
# with the running carries kept in $hi0 (ap[j]*bp[0] chain) and $hi1
# (np[j]*m chain); the low words are stored to tp[].  The very first low
# word is discarded.  After the loop the last products are folded in, $num
# is decremented (i--) and $idx is reset for the next pass.
238 | $code.=<<___; # PA-RISC 2.0 code-path
|
---|
239 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
---|
240 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
---|
241 | ldd -16($xfer),$ab0
|
---|
242 | fstds ${fab0},-16($xfer)
|
---|
243 |
|
---|
244 | extrd,u $ab0,31,32,$hi0
|
---|
245 | extrd,u $ab0,63,32,$ab0
|
---|
246 | ldd -8($xfer),$nm0
|
---|
247 | fstds ${fnm0},-8($xfer)
|
---|
248 | ldo 8($idx),$idx ; j++++
|
---|
249 | addl $ab0,$nm0,$nm0 ; low part is discarded
|
---|
250 | extrd,u $nm0,31,32,$hi1
|
---|
251 | |
---|
252 |
|
---|
253 | L\$1st
|
---|
254 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
|
---|
255 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
---|
256 | ldd 0($xfer),$ab1
|
---|
257 | fstds ${fab1},0($xfer)
|
---|
258 | addl $hi0,$ab1,$ab1
|
---|
259 | extrd,u $ab1,31,32,$hi0
|
---|
260 | ldd 8($xfer),$nm1
|
---|
261 | fstds ${fnm1},8($xfer)
|
---|
262 | extrd,u $ab1,63,32,$ab1
|
---|
263 | addl $hi1,$nm1,$nm1
|
---|
264 | flddx $idx($ap),${fai} ; ap[j,j+1]
|
---|
265 | flddx $idx($np),${fni} ; np[j,j+1]
|
---|
266 | addl $ab1,$nm1,$nm1
|
---|
267 | extrd,u $nm1,31,32,$hi1
|
---|
268 |
|
---|
269 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
---|
270 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
---|
271 | ldd -16($xfer),$ab0
|
---|
272 | fstds ${fab0},-16($xfer)
|
---|
273 | addl $hi0,$ab0,$ab0
|
---|
274 | extrd,u $ab0,31,32,$hi0
|
---|
275 | ldd -8($xfer),$nm0
|
---|
276 | fstds ${fnm0},-8($xfer)
|
---|
277 | extrd,u $ab0,63,32,$ab0
|
---|
278 | addl $hi1,$nm0,$nm0
|
---|
279 | stw $nm1,-4($tp) ; tp[j-1]
|
---|
280 | addl $ab0,$nm0,$nm0
|
---|
281 | stw,ma $nm0,8($tp) ; tp[j-1]
|
---|
282 | addib,<> 8,$idx,L\$1st ; j++++
|
---|
283 | extrd,u $nm0,31,32,$hi1
|
---|
284 |
|
---|
285 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
|
---|
286 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
---|
287 | ldd 0($xfer),$ab1
|
---|
288 | fstds ${fab1},0($xfer)
|
---|
289 | addl $hi0,$ab1,$ab1
|
---|
290 | extrd,u $ab1,31,32,$hi0
|
---|
291 | ldd 8($xfer),$nm1
|
---|
292 | fstds ${fnm1},8($xfer)
|
---|
293 | extrd,u $ab1,63,32,$ab1
|
---|
294 | addl $hi1,$nm1,$nm1
|
---|
295 | ldd -16($xfer),$ab0
|
---|
296 | addl $ab1,$nm1,$nm1
|
---|
297 | ldd -8($xfer),$nm0
|
---|
298 | extrd,u $nm1,31,32,$hi1
|
---|
299 |
|
---|
300 | addl $hi0,$ab0,$ab0
|
---|
301 | extrd,u $ab0,31,32,$hi0
|
---|
302 | stw $nm1,-4($tp) ; tp[j-1]
|
---|
303 | extrd,u $ab0,63,32,$ab0
|
---|
304 | addl $hi1,$nm0,$nm0
|
---|
305 | ldd 0($xfer),$ab1
|
---|
306 | addl $ab0,$nm0,$nm0
|
---|
307 | ldd,mb 8($xfer),$nm1
|
---|
308 | extrd,u $nm0,31,32,$hi1
|
---|
309 | stw,ma $nm0,8($tp) ; tp[j-1]
|
---|
310 |
|
---|
311 | ldo -1($num),$num ; i--
|
---|
312 | subi 0,$arrsz,$idx ; j=0
|
---|
313 | ___
|
---|
# 4-byte bignum words: fetch bp[1] into $fbi and post-increment $bp by 4.
314 | $code.=<<___ if ($BN_SZ==4);
|
---|
315 | fldws,ma 4($bp),${fbi} ; bp[1]
|
---|
316 | ___
|
---|
# 8-byte bignum words: the next 32-bit half of bp sits at offset 0 of the
# same 64-bit word (flipped word order), so $bp is not advanced here.
317 | $code.=<<___ if ($BN_SZ==8);
|
---|
318 | fldws 0($bp),${fbi} ; bp[1] in flipped word order
|
---|
319 | ___
|
---|
# PA-RISC 2.0 path: hand-over from the first pass, then the outer and inner
# loops.
#  - Before L$outer: finish the carries of the previous pass, store the two
#    top words of tp, and form ap[0]*bp[1] + tp[0] as a 33-bit value.  The
#    addition is done in the FPU (fcnvxf -> fadd -> fcnvfx) after the high
#    word of ap[0]*bp[1] has been saved separately in the transfer area.
#    m is then n0 times the low word of that sum.
#  - L$outer: start a pass over j for the current bp[i]; the saved high
#    word and the carry bit of the 33-bit value are combined into $hi0.
#  - L$inner: two 32-bit words per iteration, like L$1st but additionally
#    adding the previous tp[j] (loaded into $ti0/$ti1).
#  - The closing addib,= decrements $num and leaves through L$outerdone when
#    it reaches zero; $idx is reset in the instruction that follows it.
320 | $code.=<<___;
|
---|
321 | flddx $idx($ap),${fai} ; ap[0,1]
|
---|
322 | flddx $idx($np),${fni} ; np[0,1]
|
---|
323 | fldws 8($xfer),${fti}R ; tp[0]
|
---|
324 | addl $hi0,$ab1,$ab1
|
---|
325 | extrd,u $ab1,31,32,$hi0
|
---|
326 | extrd,u $ab1,63,32,$ab1
|
---|
327 | ldo 8($idx),$idx ; j++++
|
---|
328 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
|
---|
329 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
|
---|
330 | addl $hi1,$nm1,$nm1
|
---|
331 | addl $ab1,$nm1,$nm1
|
---|
332 | extrd,u $nm1,31,32,$hi1
|
---|
333 | fstws,mb ${fab0}L,-8($xfer) ; save high part
|
---|
334 | stw $nm1,-4($tp) ; tp[j-1]
|
---|
335 |
|
---|
336 | fcpy,sgl %fr0,${fti}L ; zero high part
|
---|
337 | fcpy,sgl %fr0,${fab0}L
|
---|
338 | addl $hi1,$hi0,$hi0
|
---|
339 | extrd,u $hi0,31,32,$hi1
|
---|
340 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
---|
341 | fcnvxf,dbl,dbl ${fab0},${fab0}
|
---|
342 | stw $hi0,0($tp)
|
---|
343 | stw $hi1,4($tp)
|
---|
344 |
|
---|
345 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
---|
346 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
---|
347 | xmpyu ${fn0},${fab0}R,${fm0}
|
---|
348 | ldo `$LOCALS+32+4`($fp),$tp
|
---|
349 | L\$outer
|
---|
350 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
---|
351 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
---|
352 | fstds ${fab0},-16($xfer) ; 33-bit value
|
---|
353 | fstds ${fnm0},-8($xfer)
|
---|
354 | flddx $idx($ap),${fai} ; ap[2]
|
---|
355 | flddx $idx($np),${fni} ; np[2]
|
---|
356 | ldo 8($idx),$idx ; j++++
|
---|
357 | ldd -16($xfer),$ab0 ; 33-bit value
|
---|
358 | ldd -8($xfer),$nm0
|
---|
359 | ldw 0($xfer),$hi0 ; high part
|
---|
360 |
|
---|
361 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
---|
362 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
---|
363 | extrd,u $ab0,31,32,$ti0 ; carry bit
|
---|
364 | extrd,u $ab0,63,32,$ab0
|
---|
365 | fstds ${fab1},0($xfer)
|
---|
366 | addl $ti0,$hi0,$hi0 ; account carry bit
|
---|
367 | fstds ${fnm1},8($xfer)
|
---|
368 | addl $ab0,$nm0,$nm0 ; low part is discarded
|
---|
369 | ldw 0($tp),$ti1 ; tp[1]
|
---|
370 | extrd,u $nm0,31,32,$hi1
|
---|
371 | fstds ${fab0},-16($xfer)
|
---|
372 | fstds ${fnm0},-8($xfer)
|
---|
373 | |
---|
374 |
|
---|
375 | L\$inner
|
---|
376 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
|
---|
377 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
---|
378 | ldd 0($xfer),$ab1
|
---|
379 | fstds ${fab1},0($xfer)
|
---|
380 | addl $hi0,$ti1,$ti1
|
---|
381 | addl $ti1,$ab1,$ab1
|
---|
382 | ldd 8($xfer),$nm1
|
---|
383 | fstds ${fnm1},8($xfer)
|
---|
384 | extrd,u $ab1,31,32,$hi0
|
---|
385 | extrd,u $ab1,63,32,$ab1
|
---|
386 | flddx $idx($ap),${fai} ; ap[j,j+1]
|
---|
387 | flddx $idx($np),${fni} ; np[j,j+1]
|
---|
388 | addl $hi1,$nm1,$nm1
|
---|
389 | addl $ab1,$nm1,$nm1
|
---|
390 | ldw 4($tp),$ti0 ; tp[j]
|
---|
391 | stw $nm1,-4($tp) ; tp[j-1]
|
---|
392 |
|
---|
393 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
---|
394 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
---|
395 | ldd -16($xfer),$ab0
|
---|
396 | fstds ${fab0},-16($xfer)
|
---|
397 | addl $hi0,$ti0,$ti0
|
---|
398 | addl $ti0,$ab0,$ab0
|
---|
399 | ldd -8($xfer),$nm0
|
---|
400 | fstds ${fnm0},-8($xfer)
|
---|
401 | extrd,u $ab0,31,32,$hi0
|
---|
402 | extrd,u $nm1,31,32,$hi1
|
---|
403 | ldw 8($tp),$ti1 ; tp[j]
|
---|
404 | extrd,u $ab0,63,32,$ab0
|
---|
405 | addl $hi1,$nm0,$nm0
|
---|
406 | addl $ab0,$nm0,$nm0
|
---|
407 | stw,ma $nm0,8($tp) ; tp[j-1]
|
---|
408 | addib,<> 8,$idx,L\$inner ; j++++
|
---|
409 | extrd,u $nm0,31,32,$hi1
|
---|
410 |
|
---|
411 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
|
---|
412 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
---|
413 | ldd 0($xfer),$ab1
|
---|
414 | fstds ${fab1},0($xfer)
|
---|
415 | addl $hi0,$ti1,$ti1
|
---|
416 | addl $ti1,$ab1,$ab1
|
---|
417 | ldd 8($xfer),$nm1
|
---|
418 | fstds ${fnm1},8($xfer)
|
---|
419 | extrd,u $ab1,31,32,$hi0
|
---|
420 | extrd,u $ab1,63,32,$ab1
|
---|
421 | ldw 4($tp),$ti0 ; tp[j]
|
---|
422 | addl $hi1,$nm1,$nm1
|
---|
423 | addl $ab1,$nm1,$nm1
|
---|
424 | ldd -16($xfer),$ab0
|
---|
425 | ldd -8($xfer),$nm0
|
---|
426 | extrd,u $nm1,31,32,$hi1
|
---|
427 |
|
---|
428 | addl $hi0,$ab0,$ab0
|
---|
429 | addl $ti0,$ab0,$ab0
|
---|
430 | stw $nm1,-4($tp) ; tp[j-1]
|
---|
431 | extrd,u $ab0,31,32,$hi0
|
---|
432 | ldw 8($tp),$ti1 ; tp[j]
|
---|
433 | extrd,u $ab0,63,32,$ab0
|
---|
434 | addl $hi1,$nm0,$nm0
|
---|
435 | ldd 0($xfer),$ab1
|
---|
436 | addl $ab0,$nm0,$nm0
|
---|
437 | ldd,mb 8($xfer),$nm1
|
---|
438 | extrd,u $nm0,31,32,$hi1
|
---|
439 | stw,ma $nm0,8($tp) ; tp[j-1]
|
---|
440 |
|
---|
441 | addib,= -1,$num,L\$outerdone ; i--
|
---|
442 | subi 0,$arrsz,$idx ; j=0
|
---|
443 | ___
|
---|
# 4-byte bignum words: fetch the next bp[i] into $fbi and post-increment $bp
# by 4.
444 | $code.=<<___ if ($BN_SZ==4);
|
---|
445 | fldws,ma 4($bp),${fbi} ; bp[i]
|
---|
446 | ___
|
---|
# 8-byte bignum words: step $bp to the next 32-bit half in flipped word
# order.  The step is either +12 or -4 bytes, chosen by the parity test on
# $num (addl,ev), after which the half is loaded into $fbi.
# NOTE(review): addl,ev presumably nullifies the following `ldi -4` when
# $num is even, leaving the +12 step in $ti0 -- confirm against the PA-RISC
# nullification rules.
447 | $code.=<<___ if ($BN_SZ==8);
|
---|
448 | ldi 12,$ti0 ; bp[i] in flipped word order
|
---|
449 | addl,ev %r0,$num,$num
|
---|
450 | ldi -4,$ti0
|
---|
451 | addl $ti0,$bp,$bp
|
---|
452 | fldws 0($bp),${fbi}
|
---|
453 | ___
|
---|
# PA-RISC 2.0 path: remainder of an outer-loop iteration and the loop exit.
#  - Up to `b L$outer`: start the products for the new bp[i], fold the
#    pending carries and tp[j] into the last words of the previous pass,
#    store the two top words of tp, form ap[0]*bp[i] + tp[0] as a 33-bit
#    value through the FPU and derive the new m from its low word.  $tp is
#    rewound by the instruction that follows the branch.
#  - L$outerdone: the same carry clean-up without starting another pass;
#    afterwards $tp is pointed at the start of the temporary vector
#    ($LOCALS+32 from $fp) and the borrow is cleared ahead of the final
#    subtraction.
454 | $code.=<<___;
|
---|
455 | flddx $idx($ap),${fai} ; ap[0]
|
---|
456 | addl $hi0,$ab1,$ab1
|
---|
457 | flddx $idx($np),${fni} ; np[0]
|
---|
458 | fldws 8($xfer),${fti}R ; tp[0]
|
---|
459 | addl $ti1,$ab1,$ab1
|
---|
460 | extrd,u $ab1,31,32,$hi0
|
---|
461 | extrd,u $ab1,63,32,$ab1
|
---|
462 |
|
---|
463 | ldo 8($idx),$idx ; j++++
|
---|
464 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
|
---|
465 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
|
---|
466 | ldw 4($tp),$ti0 ; tp[j]
|
---|
467 |
|
---|
468 | addl $hi1,$nm1,$nm1
|
---|
469 | fstws,mb ${fab0}L,-8($xfer) ; save high part
|
---|
470 | addl $ab1,$nm1,$nm1
|
---|
471 | extrd,u $nm1,31,32,$hi1
|
---|
472 | fcpy,sgl %fr0,${fti}L ; zero high part
|
---|
473 | fcpy,sgl %fr0,${fab0}L
|
---|
474 | stw $nm1,-4($tp) ; tp[j-1]
|
---|
475 |
|
---|
476 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
---|
477 | fcnvxf,dbl,dbl ${fab0},${fab0}
|
---|
478 | addl $hi1,$hi0,$hi0
|
---|
479 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
---|
480 | addl $ti0,$hi0,$hi0
|
---|
481 | extrd,u $hi0,31,32,$hi1
|
---|
482 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
---|
483 | stw $hi0,0($tp)
|
---|
484 | stw $hi1,4($tp)
|
---|
485 | xmpyu ${fn0},${fab0}R,${fm0}
|
---|
486 |
|
---|
487 | b L\$outer
|
---|
488 | ldo `$LOCALS+32+4`($fp),$tp
|
---|
489 | |
---|
490 |
|
---|
491 | L\$outerdone
|
---|
492 | addl $hi0,$ab1,$ab1
|
---|
493 | addl $ti1,$ab1,$ab1
|
---|
494 | extrd,u $ab1,31,32,$hi0
|
---|
495 | extrd,u $ab1,63,32,$ab1
|
---|
496 |
|
---|
497 | ldw 4($tp),$ti0 ; tp[j]
|
---|
498 |
|
---|
499 | addl $hi1,$nm1,$nm1
|
---|
500 | addl $ab1,$nm1,$nm1
|
---|
501 | extrd,u $nm1,31,32,$hi1
|
---|
502 | stw $nm1,-4($tp) ; tp[j-1]
|
---|
503 |
|
---|
504 | addl $hi1,$hi0,$hi0
|
---|
505 | addl $ti0,$hi0,$hi0
|
---|
506 | extrd,u $hi0,31,32,$hi1
|
---|
507 | stw $hi0,0($tp)
|
---|
508 | stw $hi1,4($tp)
|
---|
509 |
|
---|
510 | ldo `$LOCALS+32`($fp),$tp
|
---|
511 | sub %r0,%r0,%r0 ; clear borrow
|
---|
512 | ___
|
---|
# 4-byte bignum words: final subtraction rp[] = tp[] - np[], one 32-bit word
# per L$sub iteration with the borrow propagated by subb.  If $rp is not
# 64-bit aligned, control is diverted to L$sub_pa11 (the word-wise loop of
# the PA-RISC 1.1 path) instead.  After the loop the top word of tp is run
# through the borrow chain and the result is left in $hi1 for the copy step
# that follows.
513 | $code.=<<___ if ($BN_SZ==4);
|
---|
514 | ldws,ma 4($tp),$ti0
|
---|
515 | extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
|
---|
516 | b L\$sub_pa11
|
---|
517 | addl $tp,$arrsz,$tp
|
---|
518 | L\$sub
|
---|
519 | ldwx $idx($np),$hi0
|
---|
520 | subb $ti0,$hi0,$hi1
|
---|
521 | ldwx $idx($tp),$ti0
|
---|
522 | addib,<> 4,$idx,L\$sub
|
---|
523 | stws,ma $hi1,4($rp)
|
---|
524 |
|
---|
525 | subb $ti0,%r0,$hi1
|
---|
526 | ___
|
---|
# 8-byte bignum words: final subtraction rp[] = tp[] - np[], one 64-bit word
# per L$sub iteration.  Each tp word is rotated by 32 bits (shrpd) to undo
# the flipped word order and written back to tp in that form before np is
# subtracted with borrow (sub,db).  After the loop the top carry word is
# extracted and run through the borrow chain, leaving the result in $hi1
# for the copy step that follows.
527 | $code.=<<___ if ($BN_SZ==8);
|
---|
528 | ldd,ma 8($tp),$ti0
|
---|
529 | L\$sub
|
---|
530 | ldd $idx($np),$hi0
|
---|
531 | shrpd $ti0,$ti0,32,$ti0 ; flip word order
|
---|
532 | std $ti0,-8($tp) ; save flipped value
|
---|
533 | sub,db $ti0,$hi0,$hi1
|
---|
534 | ldd,ma 8($tp),$ti0
|
---|
535 | addib,<> 8,$idx,L\$sub
|
---|
536 | std,ma $hi1,8($rp)
|
---|
537 |
|
---|
538 | extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
|
---|
539 | sub,db $ti0,%r0,$hi1
|
---|
540 | ___
|
---|
# PA-RISC 2.0 path: select the final result, 64 bits at a time.  $tp and $rp
# are rewound to the start of their vectors; in L$copy each rp word (the
# subtraction result) is kept when $hi1 is zero and replaced by the
# corresponding tp word otherwise (comiclr,= skips the copy when $hi1 == 0).
# Every tp word is overwritten with zero as it is read.
541 | $code.=<<___;
|
---|
542 | ldo `$LOCALS+32`($fp),$tp
|
---|
543 | sub $rp,$arrsz,$rp ; rewind rp
|
---|
544 | subi 0,$arrsz,$idx
|
---|
545 | L\$copy
|
---|
546 | ldd 0($tp),$ti0
|
---|
547 | ldd 0($rp),$hi0
|
---|
548 | std,ma %r0,8($tp)
|
---|
549 | comiclr,= 0,$hi1,%r0
|
---|
550 | copy $ti0,$hi0
|
---|
551 | addib,<> 8,$idx,L\$copy
|
---|
552 | std,ma $hi0,8($rp)
|
---|
553 | ___
|
---|
554 |
|
---|
# PA-RISC 1.1 code path, emitted only for 4-byte bignum words.
#
# Register aliases: the 64-bit product registers of the 2.0 path are reused
# as 32-bit halves ($ablo/$abhi for ap[j]*bp[i], $nmlo0/$nmhi0 for one
# np[j]*m product) and %r9/%r8 hold the halves of the second np[j]*m
# product.
#
# The emitted code first branches over this path to L$done (this is where
# the PA-RISC 2.0 path ends up), then mirrors the 2.0 path label for label
# with a _pa11 suffix (L$1st_pa11, L$outer_pa11, L$inner_pa11,
# L$outerdone_pa11, L$sub_pa11, L$copy_pa11).  Differences visible here:
# products are read back from the transfer area as two 32-bit words (ldw)
# instead of one 64-bit load, carries are propagated with add/addc pairs
# instead of 64-bit additions, and the final subtract/copy loops work on
# one 32-bit word per iteration.  L$done marks the common exit.
555 | if ($BN_SZ==4) { # PA-RISC 1.1 code-path
|
---|
556 | $ablo=$ab0;
|
---|
557 | $abhi=$ab1;
|
---|
558 | $nmlo0=$nm0;
|
---|
559 | $nmhi0=$nm1;
|
---|
560 | $nmlo1="%r9";
|
---|
561 | $nmhi1="%r8";
|
---|
562 |
|
---|
563 | $code.=<<___;
|
---|
564 | b L\$done
|
---|
565 | nop
|
---|
566 |
|
---|
567 | .ALIGN 8
|
---|
568 | L\$parisc11
|
---|
569 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
---|
570 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
---|
571 | ldw -12($xfer),$ablo
|
---|
572 | ldw -16($xfer),$hi0
|
---|
573 | ldw -4($xfer),$nmlo0
|
---|
574 | ldw -8($xfer),$nmhi0
|
---|
575 | fstds ${fab0},-16($xfer)
|
---|
576 | fstds ${fnm0},-8($xfer)
|
---|
577 |
|
---|
578 | ldo 8($idx),$idx ; j++++
|
---|
579 | add $ablo,$nmlo0,$nmlo0 ; discarded
|
---|
580 | addc %r0,$nmhi0,$hi1
|
---|
581 | ldw 4($xfer),$ablo
|
---|
582 | ldw 0($xfer),$abhi
|
---|
583 | nop
|
---|
584 | |
---|
585 |
|
---|
586 | L\$1st_pa11
|
---|
587 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
|
---|
588 | flddx $idx($ap),${fai} ; ap[j,j+1]
|
---|
589 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
---|
590 | flddx $idx($np),${fni} ; np[j,j+1]
|
---|
591 | add $hi0,$ablo,$ablo
|
---|
592 | ldw 12($xfer),$nmlo1
|
---|
593 | addc %r0,$abhi,$hi0
|
---|
594 | ldw 8($xfer),$nmhi1
|
---|
595 | add $ablo,$nmlo1,$nmlo1
|
---|
596 | fstds ${fab1},0($xfer)
|
---|
597 | addc %r0,$nmhi1,$nmhi1
|
---|
598 | fstds ${fnm1},8($xfer)
|
---|
599 | add $hi1,$nmlo1,$nmlo1
|
---|
600 | ldw -12($xfer),$ablo
|
---|
601 | addc %r0,$nmhi1,$hi1
|
---|
602 | ldw -16($xfer),$abhi
|
---|
603 |
|
---|
604 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
---|
605 | ldw -4($xfer),$nmlo0
|
---|
606 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
---|
607 | ldw -8($xfer),$nmhi0
|
---|
608 | add $hi0,$ablo,$ablo
|
---|
609 | stw $nmlo1,-4($tp) ; tp[j-1]
|
---|
610 | addc %r0,$abhi,$hi0
|
---|
611 | fstds ${fab0},-16($xfer)
|
---|
612 | add $ablo,$nmlo0,$nmlo0
|
---|
613 | fstds ${fnm0},-8($xfer)
|
---|
614 | addc %r0,$nmhi0,$nmhi0
|
---|
615 | ldw 0($xfer),$abhi
|
---|
616 | add $hi1,$nmlo0,$nmlo0
|
---|
617 | ldw 4($xfer),$ablo
|
---|
618 | stws,ma $nmlo0,8($tp) ; tp[j-1]
|
---|
619 | addib,<> 8,$idx,L\$1st_pa11 ; j++++
|
---|
620 | addc %r0,$nmhi0,$hi1
|
---|
621 |
|
---|
622 | ldw 8($xfer),$nmhi1
|
---|
623 | ldw 12($xfer),$nmlo1
|
---|
624 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
|
---|
625 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
---|
626 | add $hi0,$ablo,$ablo
|
---|
627 | fstds ${fab1},0($xfer)
|
---|
628 | addc %r0,$abhi,$hi0
|
---|
629 | fstds ${fnm1},8($xfer)
|
---|
630 | add $ablo,$nmlo1,$nmlo1
|
---|
631 | ldw -16($xfer),$abhi
|
---|
632 | addc %r0,$nmhi1,$nmhi1
|
---|
633 | ldw -12($xfer),$ablo
|
---|
634 | add $hi1,$nmlo1,$nmlo1
|
---|
635 | ldw -8($xfer),$nmhi0
|
---|
636 | addc %r0,$nmhi1,$hi1
|
---|
637 | ldw -4($xfer),$nmlo0
|
---|
638 |
|
---|
639 | add $hi0,$ablo,$ablo
|
---|
640 | stw $nmlo1,-4($tp) ; tp[j-1]
|
---|
641 | addc %r0,$abhi,$hi0
|
---|
642 | ldw 0($xfer),$abhi
|
---|
643 | add $ablo,$nmlo0,$nmlo0
|
---|
644 | ldw 4($xfer),$ablo
|
---|
645 | addc %r0,$nmhi0,$nmhi0
|
---|
646 | ldws,mb 8($xfer),$nmhi1
|
---|
647 | add $hi1,$nmlo0,$nmlo0
|
---|
648 | ldw 4($xfer),$nmlo1
|
---|
649 | addc %r0,$nmhi0,$hi1
|
---|
650 | stws,ma $nmlo0,8($tp) ; tp[j-1]
|
---|
651 |
|
---|
652 | ldo -1($num),$num ; i--
|
---|
653 | subi 0,$arrsz,$idx ; j=0
|
---|
654 |
|
---|
655 | fldws,ma 4($bp),${fbi} ; bp[1]
|
---|
656 | flddx $idx($ap),${fai} ; ap[0,1]
|
---|
657 | flddx $idx($np),${fni} ; np[0,1]
|
---|
658 | fldws 8($xfer),${fti}R ; tp[0]
|
---|
659 | add $hi0,$ablo,$ablo
|
---|
660 | addc %r0,$abhi,$hi0
|
---|
661 | ldo 8($idx),$idx ; j++++
|
---|
662 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
|
---|
663 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
|
---|
664 | add $hi1,$nmlo1,$nmlo1
|
---|
665 | addc %r0,$nmhi1,$nmhi1
|
---|
666 | add $ablo,$nmlo1,$nmlo1
|
---|
667 | addc %r0,$nmhi1,$hi1
|
---|
668 | fstws,mb ${fab0}L,-8($xfer) ; save high part
|
---|
669 | stw $nmlo1,-4($tp) ; tp[j-1]
|
---|
670 |
|
---|
671 | fcpy,sgl %fr0,${fti}L ; zero high part
|
---|
672 | fcpy,sgl %fr0,${fab0}L
|
---|
673 | add $hi1,$hi0,$hi0
|
---|
674 | addc %r0,%r0,$hi1
|
---|
675 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
---|
676 | fcnvxf,dbl,dbl ${fab0},${fab0}
|
---|
677 | stw $hi0,0($tp)
|
---|
678 | stw $hi1,4($tp)
|
---|
679 |
|
---|
680 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
---|
681 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
---|
682 | xmpyu ${fn0},${fab0}R,${fm0}
|
---|
683 | ldo `$LOCALS+32+4`($fp),$tp
|
---|
684 | L\$outer_pa11
|
---|
685 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
---|
686 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
---|
687 | fstds ${fab0},-16($xfer) ; 33-bit value
|
---|
688 | fstds ${fnm0},-8($xfer)
|
---|
689 | flddx $idx($ap),${fai} ; ap[2,3]
|
---|
690 | flddx $idx($np),${fni} ; np[2,3]
|
---|
691 | ldw -16($xfer),$abhi ; carry bit actually
|
---|
692 | ldo 8($idx),$idx ; j++++
|
---|
693 | ldw -12($xfer),$ablo
|
---|
694 | ldw -8($xfer),$nmhi0
|
---|
695 | ldw -4($xfer),$nmlo0
|
---|
696 | ldw 0($xfer),$hi0 ; high part
|
---|
697 |
|
---|
698 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
---|
699 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
---|
700 | fstds ${fab1},0($xfer)
|
---|
701 | addl $abhi,$hi0,$hi0 ; account carry bit
|
---|
702 | fstds ${fnm1},8($xfer)
|
---|
703 | add $ablo,$nmlo0,$nmlo0 ; discarded
|
---|
704 | ldw 0($tp),$ti1 ; tp[1]
|
---|
705 | addc %r0,$nmhi0,$hi1
|
---|
706 | fstds ${fab0},-16($xfer)
|
---|
707 | fstds ${fnm0},-8($xfer)
|
---|
708 | ldw 4($xfer),$ablo
|
---|
709 | ldw 0($xfer),$abhi
|
---|
710 | |
---|
711 |
|
---|
712 | L\$inner_pa11
|
---|
713 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
|
---|
714 | flddx $idx($ap),${fai} ; ap[j,j+1]
|
---|
715 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
---|
716 | flddx $idx($np),${fni} ; np[j,j+1]
|
---|
717 | add $hi0,$ablo,$ablo
|
---|
718 | ldw 4($tp),$ti0 ; tp[j]
|
---|
719 | addc %r0,$abhi,$abhi
|
---|
720 | ldw 12($xfer),$nmlo1
|
---|
721 | add $ti1,$ablo,$ablo
|
---|
722 | ldw 8($xfer),$nmhi1
|
---|
723 | addc %r0,$abhi,$hi0
|
---|
724 | fstds ${fab1},0($xfer)
|
---|
725 | add $ablo,$nmlo1,$nmlo1
|
---|
726 | fstds ${fnm1},8($xfer)
|
---|
727 | addc %r0,$nmhi1,$nmhi1
|
---|
728 | ldw -12($xfer),$ablo
|
---|
729 | add $hi1,$nmlo1,$nmlo1
|
---|
730 | ldw -16($xfer),$abhi
|
---|
731 | addc %r0,$nmhi1,$hi1
|
---|
732 |
|
---|
733 | xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
---|
734 | ldw 8($tp),$ti1 ; tp[j]
|
---|
735 | xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
---|
736 | ldw -4($xfer),$nmlo0
|
---|
737 | add $hi0,$ablo,$ablo
|
---|
738 | ldw -8($xfer),$nmhi0
|
---|
739 | addc %r0,$abhi,$abhi
|
---|
740 | stw $nmlo1,-4($tp) ; tp[j-1]
|
---|
741 | add $ti0,$ablo,$ablo
|
---|
742 | fstds ${fab0},-16($xfer)
|
---|
743 | addc %r0,$abhi,$hi0
|
---|
744 | fstds ${fnm0},-8($xfer)
|
---|
745 | add $ablo,$nmlo0,$nmlo0
|
---|
746 | ldw 4($xfer),$ablo
|
---|
747 | addc %r0,$nmhi0,$nmhi0
|
---|
748 | ldw 0($xfer),$abhi
|
---|
749 | add $hi1,$nmlo0,$nmlo0
|
---|
750 | stws,ma $nmlo0,8($tp) ; tp[j-1]
|
---|
751 | addib,<> 8,$idx,L\$inner_pa11 ; j++++
|
---|
752 | addc %r0,$nmhi0,$hi1
|
---|
753 |
|
---|
754 | xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
|
---|
755 | ldw 12($xfer),$nmlo1
|
---|
756 | xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
---|
757 | ldw 8($xfer),$nmhi1
|
---|
758 | add $hi0,$ablo,$ablo
|
---|
759 | ldw 4($tp),$ti0 ; tp[j]
|
---|
760 | addc %r0,$abhi,$abhi
|
---|
761 | fstds ${fab1},0($xfer)
|
---|
762 | add $ti1,$ablo,$ablo
|
---|
763 | fstds ${fnm1},8($xfer)
|
---|
764 | addc %r0,$abhi,$hi0
|
---|
765 | ldw -16($xfer),$abhi
|
---|
766 | add $ablo,$nmlo1,$nmlo1
|
---|
767 | ldw -12($xfer),$ablo
|
---|
768 | addc %r0,$nmhi1,$nmhi1
|
---|
769 | ldw -8($xfer),$nmhi0
|
---|
770 | add $hi1,$nmlo1,$nmlo1
|
---|
771 | ldw -4($xfer),$nmlo0
|
---|
772 | addc %r0,$nmhi1,$hi1
|
---|
773 |
|
---|
774 | add $hi0,$ablo,$ablo
|
---|
775 | stw $nmlo1,-4($tp) ; tp[j-1]
|
---|
776 | addc %r0,$abhi,$abhi
|
---|
777 | add $ti0,$ablo,$ablo
|
---|
778 | ldw 8($tp),$ti1 ; tp[j]
|
---|
779 | addc %r0,$abhi,$hi0
|
---|
780 | ldw 0($xfer),$abhi
|
---|
781 | add $ablo,$nmlo0,$nmlo0
|
---|
782 | ldw 4($xfer),$ablo
|
---|
783 | addc %r0,$nmhi0,$nmhi0
|
---|
784 | ldws,mb 8($xfer),$nmhi1
|
---|
785 | add $hi1,$nmlo0,$nmlo0
|
---|
786 | ldw 4($xfer),$nmlo1
|
---|
787 | addc %r0,$nmhi0,$hi1
|
---|
788 | stws,ma $nmlo0,8($tp) ; tp[j-1]
|
---|
789 |
|
---|
790 | addib,= -1,$num,L\$outerdone_pa11; i--
|
---|
791 | subi 0,$arrsz,$idx ; j=0
|
---|
792 |
|
---|
793 | fldws,ma 4($bp),${fbi} ; bp[i]
|
---|
794 | flddx $idx($ap),${fai} ; ap[0]
|
---|
795 | add $hi0,$ablo,$ablo
|
---|
796 | addc %r0,$abhi,$abhi
|
---|
797 | flddx $idx($np),${fni} ; np[0]
|
---|
798 | fldws 8($xfer),${fti}R ; tp[0]
|
---|
799 | add $ti1,$ablo,$ablo
|
---|
800 | addc %r0,$abhi,$hi0
|
---|
801 |
|
---|
802 | ldo 8($idx),$idx ; j++++
|
---|
803 | xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
|
---|
804 | xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
|
---|
805 | ldw 4($tp),$ti0 ; tp[j]
|
---|
806 |
|
---|
807 | add $hi1,$nmlo1,$nmlo1
|
---|
808 | addc %r0,$nmhi1,$nmhi1
|
---|
809 | fstws,mb ${fab0}L,-8($xfer) ; save high part
|
---|
810 | add $ablo,$nmlo1,$nmlo1
|
---|
811 | addc %r0,$nmhi1,$hi1
|
---|
812 | fcpy,sgl %fr0,${fti}L ; zero high part
|
---|
813 | fcpy,sgl %fr0,${fab0}L
|
---|
814 | stw $nmlo1,-4($tp) ; tp[j-1]
|
---|
815 |
|
---|
816 | fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
---|
817 | fcnvxf,dbl,dbl ${fab0},${fab0}
|
---|
818 | add $hi1,$hi0,$hi0
|
---|
819 | addc %r0,%r0,$hi1
|
---|
820 | fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
---|
821 | add $ti0,$hi0,$hi0
|
---|
822 | addc %r0,$hi1,$hi1
|
---|
823 | fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
---|
824 | stw $hi0,0($tp)
|
---|
825 | stw $hi1,4($tp)
|
---|
826 | xmpyu ${fn0},${fab0}R,${fm0}
|
---|
827 |
|
---|
828 | b L\$outer_pa11
|
---|
829 | ldo `$LOCALS+32+4`($fp),$tp
|
---|
830 | |
---|
831 |
|
---|
832 | L\$outerdone_pa11
|
---|
833 | add $hi0,$ablo,$ablo
|
---|
834 | addc %r0,$abhi,$abhi
|
---|
835 | add $ti1,$ablo,$ablo
|
---|
836 | addc %r0,$abhi,$hi0
|
---|
837 |
|
---|
838 | ldw 4($tp),$ti0 ; tp[j]
|
---|
839 |
|
---|
840 | add $hi1,$nmlo1,$nmlo1
|
---|
841 | addc %r0,$nmhi1,$nmhi1
|
---|
842 | add $ablo,$nmlo1,$nmlo1
|
---|
843 | addc %r0,$nmhi1,$hi1
|
---|
844 | stw $nmlo1,-4($tp) ; tp[j-1]
|
---|
845 |
|
---|
846 | add $hi1,$hi0,$hi0
|
---|
847 | addc %r0,%r0,$hi1
|
---|
848 | add $ti0,$hi0,$hi0
|
---|
849 | addc %r0,$hi1,$hi1
|
---|
850 | stw $hi0,0($tp)
|
---|
851 | stw $hi1,4($tp)
|
---|
852 |
|
---|
853 | ldo `$LOCALS+32+4`($fp),$tp
|
---|
854 | sub %r0,%r0,%r0 ; clear borrow
|
---|
855 | ldw -4($tp),$ti0
|
---|
856 | addl $tp,$arrsz,$tp
|
---|
857 | L\$sub_pa11
|
---|
858 | ldwx $idx($np),$hi0
|
---|
859 | subb $ti0,$hi0,$hi1
|
---|
860 | ldwx $idx($tp),$ti0
|
---|
861 | addib,<> 4,$idx,L\$sub_pa11
|
---|
862 | stws,ma $hi1,4($rp)
|
---|
863 |
|
---|
864 | subb $ti0,%r0,$hi1
|
---|
865 |
|
---|
866 | ldo `$LOCALS+32`($fp),$tp
|
---|
867 | sub $rp,$arrsz,$rp ; rewind rp
|
---|
868 | subi 0,$arrsz,$idx
|
---|
869 | L\$copy_pa11
|
---|
870 | ldw 0($tp),$ti0
|
---|
871 | ldw 0($rp),$hi0
|
---|
872 | stws,ma %r0,4($tp)
|
---|
873 | comiclr,= 0,$hi1,%r0
|
---|
874 | copy $ti0,$hi0
|
---|
875 | addib,<> 4,$idx,L\$copy_pa11
|
---|
876 | stws,ma $hi0,4($rp)
|
---|
877 |
|
---|
878 | nop ; alignment
|
---|
879 | L\$done
|
---|
880 | ___
|
---|
881 | }
|
---|
882 | |
---|
883 |
|
---|
# Append the exit sequence to $code: load 1 into %r28 (the "handled"
# result), reset %sp from $fp, reload %r2 and %r4-%r10 from the frame and
# branch through %r2, reloading %r3 with $POPMB after the branch.  The
# L$abort label sits after the $POP reloads, so control entering there
# runs only the branch and the final $POPMB.
884 | $code.=<<___;
|
---|
885 | ldi 1,%r28 ; signal "handled"
|
---|
886 | ldo $FRAME($fp),%sp ; destroy tp[num+1]
|
---|
887 |
|
---|
888 | $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
|
---|
889 | $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
|
---|
890 | $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
|
---|
891 | $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
|
---|
892 | $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
|
---|
893 | $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
|
---|
894 | $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
|
---|
895 | $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
|
---|
896 | L\$abort
|
---|
897 | bv (%r2)
|
---|
898 | .EXIT
|
---|
899 | $POPMB -$FRAME(%sp),%r3
|
---|
900 | .PROCEND
|
---|
901 | .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
902 | ___
|
---|
903 | |
---|
904 |
|
---|
905 | # Explicitly encode PA-RISC 2.0 instructions used in this module, so
|
---|
906 | # that it can be compiled with .LEVEL 1.0. It should be noted that I
|
---|
907 | # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
|
---|
908 | # directive...
|
---|
909 |
|
---|
# Hand-assemble an "ldd" instruction into a raw .WORD directive.
#
#   $mod  - completer text that followed the mnemonic ("", ",ma", ",mb", ...)
#   $args - operand text, either "%rX(%rB),%rT"  (format 4) or
#           "disp(%rB),%rT" (format 5)
#
# Returns one line of assembler text: "\t.WORD\t0x........\t; <original>"
# when the operands can be encoded, otherwise the original instruction
# prefixed with a tab so that the assembler deals with it itself.
my $ldd = sub {
    my ($mod,$args) = @_;
    my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {	# format 4
        my $opcode = (0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    # Format 5 stores the displacement in five bits (four value bits plus
    # a sign bit), so only -16..15 can be represented.  Anything outside
    # that range used to be truncated silently into a different address;
    # such operands are now left for the assembler instead.
    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/	# format 5
        && $1 >= -16 && $1 <= 15) {
        my ($disp,$base,$target) = ($1,$2,$3);
        my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
        $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);	# encode offset
        $opcode |= (1<<5)  if ($mod =~ /^,m/);		# any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);		# ",mb" specifically
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};
|
---|
927 |
|
---|
# Hand-assemble a "std" instruction into a raw .WORD directive.
#
#   $mod  - completer text that followed the mnemonic ("", ",ma", ",mb", ...)
#   $args - operand text of the form "%rS,disp(%rB)" (format 6)
#
# Returns one line of assembler text: "\t.WORD\t0x........\t; <original>"
# when the operands can be encoded, otherwise the original instruction
# prefixed with a tab so that the assembler deals with it itself.
my $std = sub {
    my ($mod,$args) = @_;
    my $orig = "std$mod\t$args";

    # The displacement is stored in five bits (four value bits plus a
    # sign bit), so only -16..15 can be represented.  Anything outside
    # that range used to be truncated silently into a different address;
    # such operands are now left for the assembler instead.
    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/	# format 6
        && $2 >= -16 && $2 <= 15) {
        my ($source,$disp,$base) = ($1,$2,$3);
        my $opcode = (0x03<<26)|($base<<21)|($source<<16)|(1<<12)|(0xB<<6);
        $opcode |= (($disp&0xF)<<1)|(($disp&0x10)>>4);	# encode offset
        $opcode |= (1<<5)  if ($mod =~ /^,m/);		# any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);		# ",mb" specifically
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};
|
---|
941 |
|
---|
# Hand-assemble an "extrd" instruction into a raw .WORD directive.
#
#   $mod  - completer text that followed the mnemonic
#   $args - operand text, either "%rS,pos,len,%rT" (format 15) or
#           "%rS,%sar,len,%rT" (format 12)
#
# Returns "\t.WORD\t0x........\t; <original>" for recognized operands and
# the original instruction prefixed with a tab otherwise.
my $extrd = sub {
    my ($mod,$args) = @_;
    my $orig = "extrd$mod\t$args";
    my $word;

    # Only the ",u" completer is catered for; the encodings below imply it.
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 15
        my ($src,$pos,$width,$dst) = ($1,$2,$3,$4);
        my $len = 32-$width;
        $word  = (0x36<<26)|($src<<21)|($dst<<16);
        $word |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);	# encode pos
        $word |= (($len&0x20)<<7)|($len&0x1f);		# encode len
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {	# format 12
        my ($src,$width,$dst) = ($1,$2,$3);
        my $len = 32-$width;
        $word  = (0x34<<26)|($src<<21)|($dst<<16)|(2<<11)|(1<<9);
        $word |= (($len&0x20)<<3)|($len&0x1f);		# encode len
        $word |= (1<<13) if ($mod =~ /,\**=/);
    }
    else {
        return "\t".$orig;
    }

    return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
};
|
---|
963 |
|
---|
# Hand-assemble a "shrpd" instruction into a raw .WORD directive.
#
#   $mod  - completer text that followed the mnemonic
#   $args - operand text of the form "%rA,%rB,sa,%rT" (format 14)
#
# Returns "\t.WORD\t0x........\t; <original>" for recognized operands and
# the original instruction prefixed with a tab otherwise.
my $shrpd = sub {
    my ($mod,$args) = @_;
    my $orig = "shrpd$mod\t$args";

    # Anything but the fixed-shift-amount form is passed through untouched.
    return "\t".$orig
        unless ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/);	# format 14

    my ($first,$second,$shift,$target) = ($1,$2,$3,$4);
    my $cpos = 63-$shift;
    my $word = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
    $word |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);	# encode sa

    return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
};
|
---|
976 |
|
---|
# Hand-assemble "sub,db" into a raw .WORD directive.
#
#   $mod  - completer text that followed the mnemonic; only ",db" is encoded
#   $args - operand text of the form "%rA,%rB,%rT"
#
# Returns "\t.WORD\t0x........\t; <original>" for "sub,db" with three
# register operands; every other "sub" is returned unchanged behind a tab.
my $sub = sub {
    my ($mod,$args) = @_;
    my $orig = "sub$mod\t$args";

    my $encodable = ($mod eq ",db"
                     && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/);
    return "\t".$orig unless ($encodable);

    my $word = (0x02<<26)|($2<<21)|($1<<16)|$3
             | (1<<10)		# e1
             | (1<<8)		# e2
             | (1<<5);		# d

    return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
};
|
---|
990 |
|
---|
# Dispatch one instruction to its hand-encoder, if there is one.
#
#   $mnemonic - instruction name
#   $mod      - completer text attached to the mnemonic (may be empty)
#   $args     - operand text
#
# A scalar named after the mnemonic is looked up by name; when it holds a
# code reference that routine produces the output line, otherwise the
# instruction is reassembled as "\t<mnemonic><mod>\t<args>".
sub assemble {
    my ($mnemonic,$mod,$args)=@_;

    # String eval is what makes the by-name lookup of the encoder possible.
    my $encoder = eval("\$$mnemonic");

    return $encoder->($mod,$args) if (ref($encoder) eq 'CODE');
    return "\t$mnemonic$mod\t$args";
}
|
---|
997 |
|
---|
# Run the compiler driver named by $ENV{CC} with a verbose assembler flag
# on an empty input, and set $gnuas when the combined output mentions
# "GNU assembler".
$gnuas = 1
    if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler/);
|
---|
1002 |
|
---|
# Post-process the accumulated $code one line at a time and print it.
for my $line (split("\n",$code)) {
    # evaluate the expressions enclosed in backticks
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    if ($BN_SZ==8) {
        # flip word order in 64-bit mode...
        $line =~ s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e;
    }
    if ($BN_SZ==4) {
        # assemble 2.0 instructions in 32-bit mode...
        $line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
    }

    if ($gnuas && $SIZE_T==8) {
        # directive spellings rewritten when the GNU assembler was detected
        $line =~ s/(\.LEVEL\s+2\.0)W/$1w/;
        $line =~ s/\.SPACE\s+\$TEXT\$/.text/;
        $line =~ s/\.SUBSPA.*//;
    }
    if ($SIZE_T==8) {
        $line =~ s/\bbv\b/bve/;
    }

    print $line,"\n";
}

# Buffered write errors only surface here, so treat a failed close as fatal.
close STDOUT or die "error closing STDOUT: $!";
|
---|