VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/bn/asm/parisc-mont.pl@ 102334

最後變更:此檔案(檢視版本 102334)的最後變更為 101211,由 vboxsync 於 17 個月前提交

openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527

檔案大小: 27.2 KB
 
1#! /usr/bin/env perl
2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <[email protected]> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# On PA-7100LC this module performs ~90-50% better, less for longer
18# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
19# that compiler utilized xmpyu instruction to perform 32x32=64-bit
20# multiplication, which in turn means that "baseline" performance was
21# optimal in respect to instruction set capabilities. Fair comparison
22# with vendor compiler is problematic, because OpenSSL doesn't define
23# BN_LLONG [presumably] for historical reasons, which drives compiler
24# toward 4 times 16x16=32-bit multiplications [plus complementary
25# shifts and additions] instead. This means that you should observe
26# several times improvement over code generated by vendor compiler
27# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
28# improvement coefficient was never collected on PA-7100LC, or any
29# other 1.1 CPU, because I don't have access to such machine with
30# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
31# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
32# of ~5x on PA-8600.
33#
34# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
35# reportedly ~2x faster than vendor compiler generated code [according
36# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
37# this implementation is actually 32-bit one, in the sense that it
38# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
39# 64-bit BN_LONGs... How do they interoperate then? No problem. This
40# module picks halves of 64-bit values in reverse order and pretends
41# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
42# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
43# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
44# i.e. there is no "wider" multiplication like on most other 64-bit
45# platforms. This means that even being effectively 32-bit, this
46# implementation performs "64-bit" computational task in same amount
47# of arithmetic operations, most notably multiplications. It requires
48# more memory references, most notably to tp[num], but this doesn't
49# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
50# 2.0 code path provides virtually same performance as pa-risc2[W].s:
51# it's ~10% better for shortest key length and ~10% worse for longest
52# one.
53#
54# In case it wasn't clear. The module has two distinct code paths:
55# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
56# additions and 64-bit integer loads, not to mention specific
57# instruction scheduling. In 64-bit build naturally only 2.0 code path
58# is assembled. In 32-bit application context both code paths are
59# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
60# is taken automatically. Also, in 32-bit build the module imposes
61# couple of limitations: vector lengths has to be even and vector
62# addresses has to be 64-bit aligned. Normally neither is a problem:
63# most common key lengths are even and vectors are commonly malloc-ed,
64# which ensures alignment.
65#
66# Special thanks to polarhome.com for providing HP-UX account on
67# PA-RISC 1.1 machine, and to correspondent who chose to remain
68# anonymous for testing the code on PA-RISC 2.0 machine.
69
70
# Locate the directory this script lives in; it is used later to find
# ../../opensslconf.h.  NOTE(review): if $0 has no directory separator the
# match fails and $dir stays undefined, same as the original behaviour.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Redirect STDOUT to the requested output file.  Three-argument open avoids
# the 2-arg form's mode injection via the filename, and a failed open is now
# fatal instead of silently writing to the terminal.
if ($output) {
	open STDOUT, ">", $output or die "can't open $output: $!";
}
# Select ABI-specific mnemonics, stack-frame layout and BN limb size.
# "64" in $flavour selects the 64-bit (2.0W) ABI; anything else the 32-bit one.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	# Even in a 32-bit build the BN limb may be 64 bits (SIXTY_FOUR_BIT),
	# in which case the PA-RISC 2.0 code path is used.  Lexical handle and
	# 3-arg open replace the original bareword 2-arg open; a missing
	# config file is still non-fatal, as before.
	if (open my $conf, "<", "${dir}../../opensslconf.h") {
		while (<$conf>) {
			if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
				$BN_SZ=8;
				$LEVEL="2.0";
				last;
			}
		}
		close $conf;
	}
}
111
$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
				#  [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32;			# local variables

# Scratch registers used by the compute loops.
$tp="%r31";	# tp[] pointer
$ti1="%r29";
$ti0="%r28";

# %r26..%r23 carry the first four arguments (see ARGW0..ARGW3 in .EXPORT).
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22";	# passed through stack in 32-bit
$num="%r21";	# passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";

# Callee-saved registers holding the running ab/nm products.
$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";

$fp="%r3";	# frame pointer saved by the prologue
$hi1="%r2";
$hi0="%r1";

$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s

# FP registers: xmpyu multiplies 32-bit halves (L/R) of double registers.
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
146
# Module header and bn_mul_mont prologue: save callee-saved registers,
# establish $fp, and (32-bit only) fetch stack-passed n0/num.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
bn_mul_mont
	.PROC
	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	ldo	-$FRAME(%sp),$fp
___
# 32-bit ABI: 5th and 6th arguments (n0, num) arrive on the stack.
$code.=<<___ if ($SIZE_T==4);
	ldw	`-$FRAME_MARKER-4`($fp),$n0
	ldw	`-$FRAME_MARKER-8`($fp),$num
	nop
	nop				; alignment
___
# 32-bit limbs: reject short, odd-length or misaligned vectors by returning
# %r28=0 ("unhandled") so the caller falls back to the generic C path.
$code.=<<___ if ($BN_SZ==4);
	comiclr,<=	6,$num,%r0	; are vectors long enough?
	b		L\$abort
	ldi		0,%r28		; signal "unhandled"
	add,ev		%r0,$num,$num	; is $num even?
	b		L\$abort
	nop
	or		$ap,$np,$ti1
	extru,=		$ti1,31,3,%r0	; are ap and np 64-bit aligned?
	b		L\$abort
	nop
	nop				; alignment
	nop

	fldws		0($n0),${fn0}
	fldws,ma	4($bp),${fbi}	; bp[0]
___
# 64-bit limbs: each BN_ULONG is processed as two 32-bit halves, so num is
# doubled and word halves are picked in flipped order.
$code.=<<___ if ($BN_SZ==8);
	comib,>		3,$num,L\$abort	; are vectors long enough?
	ldi		0,%r28		; signal "unhandled"
	addl		$num,$num,$num	; I operate on 32-bit values

	fldws		4($n0),${fn0}	; only low part of n0
	fldws		4($bp),${fbi}	; bp[0] in flipped word order
___
# Common setup: allocate aligned tp[num+1] on the stack, prime the first
# products ap[0..1]*bp[0] and the Montgomery multiplier m = tp[0]*n0.
$code.=<<___;
	fldds		0($ap),${fai}	; ap[0,1]
	fldds		0($np),${fni}	; np[0,1]

	sh2addl		$num,%r0,$arrsz
	ldi		31,$hi0
	ldo		36($arrsz),$hi1	; space for tp[num+1]
	andcm		$hi1,$hi0,$hi1	; align
	addl		$hi1,%sp,%sp
	$PUSH		$fp,-$SIZE_T(%sp)

	ldo		`$LOCALS+16`($fp),$xfer
	ldo		`$LOCALS+32+4`($fp),$tp

	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
	xmpyu		${fn0},${fab0}R,${fm0}

	addl		$arrsz,$ap,$ap	; point at the end
	addl		$arrsz,$np,$np
	subi		0,$arrsz,$idx	; j=0
	ldo		8($idx),$idx	; j++++

	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	fstds		${fab1},0($xfer)
	fstds		${fnm1},8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
___
# Run-time CPU dispatch (32-bit build only): the extrd,u,*= below is a 2.0
# instruction that nullifies the branch on 2.0 CPUs and traps-as-nop-like
# behaviour is avoided by encoding it explicitly; on 1.x the branch to the
# PA-RISC 1.1 code path is taken.  NOTE(review): exact trap/nullify
# semantics per the architecture manual — grounded in the comment at the
# foot of this file about explicit 2.0 encodings.
$code.=<<___ if ($BN_SZ==4);
	mtctl		$hi0,%cr11	; $hi0 still holds 31
	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
	b		L\$parisc11
	nop
___
# PA-RISC 2.0 code path, first pass (i=0): compute tp[] = ap[]*bp[0] plus
# np[]*m, interleaving xmpyu (FP) with 64-bit integer accumulation through
# the $xfer transfer area.
$code.=<<___;				# PA-RISC 2.0 code-path
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)

	extrd,u		$ab0,31,32,$hi0
	extrd,u		$ab0,63,32,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	ldo		8($idx),$idx	; j++++
	addl		$ab0,$nm0,$nm0	; low part is discarded
	extrd,u		$nm0,31,32,$hi1

L\$1st
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	stw		$nm1,-4($tp)	; tp[j-1]
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$1st	; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	stw		$nm1,-4($tp)	; tp[j-1]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)	; tp[j-1]

	ldo		-1($num),$num	; i--
	subi		0,$arrsz,$idx	; j=0
___
# Load bp[1]; with 64-bit limbs the halves are consumed in flipped order.
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}	; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
	fldws		0($bp),${fbi}	; bp[1] in flipped word order
___
# Finish pass 0 (store the top words of tp[]), then the i-loop (L\$outer)
# and its j-loop (L\$inner): tp[] += ap[]*bp[i] + np[]*m.  The float path
# (fcnvxf/fadd/fcnvfx) folds tp[0] into ap[0]*bp[i] to form the 33-bit
# value from which m is derived.
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nm1,-4($tp)	; tp[j-1]

	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	addl		$hi1,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2]
	flddx		$idx($np),${fni}	; np[2]
	ldo		8($idx),$idx	; j++++
	ldd		-16($xfer),$ab0	; 33-bit value
	ldd		-8($xfer),$nm0
	ldw		0($xfer),$hi0	; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	extrd,u		$ab0,31,32,$ti0	; carry bit
	extrd,u		$ab0,63,32,$ab0
	fstds		${fab1},0($xfer)
	addl		$ti0,$hi0,$hi0	; account carry bit
	fstds		${fnm1},8($xfer)
	addl		$ab0,$nm0,$nm0	; low part is discarded
	ldw		0($tp),$ti1	; tp[1]
	extrd,u		$nm0,31,32,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

L\$inner
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldw		4($tp),$ti0	; tp[j]
	stw		$nm1,-4($tp)	; tp[j-1]

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ti0,$ti0
	addl		$ti0,$ab0,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,31,32,$hi0
	extrd,u		$nm1,31,32,$hi1
	ldw		8($tp),$ti1	; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$inner	; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldw		4($tp),$ti0	; tp[j]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	addl		$ti0,$ab0,$ab0
	stw		$nm1,-4($tp)	; tp[j-1]
	extrd,u		$ab0,31,32,$hi0
	ldw		8($tp),$ti1	; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)	; tp[j-1]

	addib,=		-1,$num,L\$outerdone	; i--
	subi		0,$arrsz,$idx	; j=0
___
# Advance bp to bp[i]; in 64-bit mode the pointer walks the flipped halves.
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}	; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
	ldi		12,$ti0		; bp[i] in flipped word order
	addl,ev		%r0,$num,$num
	ldi		-4,$ti0
	addl		$ti0,$bp,$bp
	fldws		0($bp),${fbi}
___
# Close one outer iteration: store the top limbs of tp[], fold tp[0] into
# ap[0]*bp[i] via the FP pipeline, derive the next m and loop to L\$outer.
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0]
	addl		$hi0,$ab1,$ab1
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0	; tp[j]

	addl		$hi1,$nm1,$nm1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nm1,-4($tp)	; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	addl		$hi1,$hi0,$hi0
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone
	addl		$hi0,$ab1,$ab1
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldw		4($tp),$ti0	; tp[j]

	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	stw		$nm1,-4($tp)	; tp[j-1]

	addl		$hi1,$hi0,$hi0
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32`($fp),$tp
	sub		%r0,%r0,%r0	; clear borrow
___
# Conditional final subtraction tp - np, 32-bit limb flavour.  If rp is not
# 64-bit aligned it falls through to the word-at-a-time L\$sub_pa11 loop.
$code.=<<___ if ($BN_SZ==4);
	ldws,ma		4($tp),$ti0
	extru,=		$rp,31,3,%r0	; is rp 64-bit aligned?
	b		L\$sub_pa11
	addl		$tp,$arrsz,$tp
L\$sub
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1
___
# Final subtraction, 64-bit limb flavour: word order is flipped back before
# subtracting, and the flipped value is saved for the copy loop below.
$code.=<<___ if ($BN_SZ==8);
	ldd,ma		8($tp),$ti0
L\$sub
	ldd		$idx($np),$hi0
	shrpd		$ti0,$ti0,32,$ti0	; flip word order
	std		$ti0,-8($tp)	; save flipped value
	sub,db		$ti0,$hi0,$hi1
	ldd,ma		8($tp),$ti0
	addib,<>	8,$idx,L\$sub
	std,ma		$hi1,8($rp)

	extrd,u		$ti0,31,32,$ti0	; carry in flipped word order
	sub,db		$ti0,%r0,$hi1
___
# Copy: select tp or the subtracted result depending on the final borrow,
# wiping tp[] as it goes.
$code.=<<___;
	ldo		`$LOCALS+32`($fp),$tp
	sub		$rp,$arrsz,$rp	; rewind rp
	subi		0,$arrsz,$idx
L\$copy
	ldd		0($tp),$ti0
	ldd		0($rp),$hi0
	std,ma		%r0,8($tp)
	comiclr,=	0,$hi1,%r0
	copy		$ti0,$hi0
	addib,<>	8,$idx,L\$copy
	std,ma		$hi0,8($rp)
___
554
# PA-RISC 1.1 code path (32-bit build only): same algorithm as the 2.0 path
# above, but 64-bit accumulation is done with 32-bit add/addc pairs, so each
# ab/nm value occupies a lo/hi register pair.
if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";

$code.=<<___;
	b		L\$done
	nop

	.ALIGN	8
L\$parisc11
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-12($xfer),$ablo
	ldw		-16($xfer),$hi0
	ldw		-4($xfer),$nmlo0
	ldw		-8($xfer),$nmhi0
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

	ldo		8($idx),$idx	; j++++
	add		$ablo,$nmlo0,$nmlo0	; discarded
	addc		%r0,$nmhi0,$hi1
	ldw		4($xfer),$ablo
	ldw		0($xfer),$abhi
	nop

L\$1st_pa11
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	flddx		$idx($np),${fni}	; np[j,j+1]
	add		$hi0,$ablo,$ablo
	ldw		12($xfer),$nmlo1
	addc		%r0,$abhi,$hi0
	ldw		8($xfer),$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	fstds		${fab1},0($xfer)
	addc		%r0,$nmhi1,$nmhi1
	fstds		${fnm1},8($xfer)
	add		$hi1,$nmlo1,$nmlo1
	ldw		-12($xfer),$ablo
	addc		%r0,$nmhi1,$hi1
	ldw		-16($xfer),$abhi

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	ldw		-4($xfer),$nmlo0
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-8($xfer),$nmhi0
	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)	; tp[j-1]
	addc		%r0,$abhi,$hi0
	fstds		${fab0},-16($xfer)
	add		$ablo,$nmlo0,$nmlo0
	fstds		${fnm0},-8($xfer)
	addc		%r0,$nmhi0,$nmhi0
	ldw		0($xfer),$abhi
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	stws,ma		$nmlo0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$1st_pa11	; j++++
	addc		%r0,$nmhi0,$hi1

	ldw		8($xfer),$nmhi1
	ldw		12($xfer),$nmlo1
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	add		$hi0,$ablo,$ablo
	fstds		${fab1},0($xfer)
	addc		%r0,$abhi,$hi0
	fstds		${fnm1},8($xfer)
	add		$ablo,$nmlo1,$nmlo1
	ldw		-16($xfer),$abhi
	addc		%r0,$nmhi1,$nmhi1
	ldw		-12($xfer),$ablo
	add		$hi1,$nmlo1,$nmlo1
	ldw		-8($xfer),$nmhi0
	addc		%r0,$nmhi1,$hi1
	ldw		-4($xfer),$nmlo0

	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)	; tp[j-1]
	addc		%r0,$abhi,$hi0
	ldw		0($xfer),$abhi
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldws,mb		8($xfer),$nmhi1
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$nmlo1
	addc		%r0,$nmhi0,$hi1
	stws,ma		$nmlo0,8($tp)	; tp[j-1]

	ldo		-1($num),$num	; i--
	subi		0,$arrsz,$idx	; j=0

	fldws,ma	4($bp),${fbi}	; bp[1]
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$hi0
	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nmlo1,-4($tp)	; tp[j-1]

	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer_pa11
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
	ldw		-16($xfer),$abhi	; carry bit actually
	ldo		8($idx),$idx	; j++++
	ldw		-12($xfer),$ablo
	ldw		-8($xfer),$nmhi0
	ldw		-4($xfer),$nmlo0
	ldw		0($xfer),$hi0	; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	fstds		${fab1},0($xfer)
	addl		$abhi,$hi0,$hi0	; account carry bit
	fstds		${fnm1},8($xfer)
	add		$ablo,$nmlo0,$nmlo0	; discarded
	ldw		0($tp),$ti1	; tp[1]
	addc		%r0,$nmhi0,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	ldw		4($xfer),$ablo
	ldw		0($xfer),$abhi

L\$inner_pa11
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	flddx		$idx($np),${fni}	; np[j,j+1]
	add		$hi0,$ablo,$ablo
	ldw		4($tp),$ti0	; tp[j]
	addc		%r0,$abhi,$abhi
	ldw		12($xfer),$nmlo1
	add		$ti1,$ablo,$ablo
	ldw		8($xfer),$nmhi1
	addc		%r0,$abhi,$hi0
	fstds		${fab1},0($xfer)
	add		$ablo,$nmlo1,$nmlo1
	fstds		${fnm1},8($xfer)
	addc		%r0,$nmhi1,$nmhi1
	ldw		-12($xfer),$ablo
	add		$hi1,$nmlo1,$nmlo1
	ldw		-16($xfer),$abhi
	addc		%r0,$nmhi1,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	ldw		8($tp),$ti1	; tp[j]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-4($xfer),$nmlo0
	add		$hi0,$ablo,$ablo
	ldw		-8($xfer),$nmhi0
	addc		%r0,$abhi,$abhi
	stw		$nmlo1,-4($tp)	; tp[j-1]
	add		$ti0,$ablo,$ablo
	fstds		${fab0},-16($xfer)
	addc		%r0,$abhi,$hi0
	fstds		${fnm0},-8($xfer)
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldw		0($xfer),$abhi
	add		$hi1,$nmlo0,$nmlo0
	stws,ma		$nmlo0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$inner_pa11	; j++++
	addc		%r0,$nmhi0,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	ldw		12($xfer),$nmlo1
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldw		8($xfer),$nmhi1
	add		$hi0,$ablo,$ablo
	ldw		4($tp),$ti0	; tp[j]
	addc		%r0,$abhi,$abhi
	fstds		${fab1},0($xfer)
	add		$ti1,$ablo,$ablo
	fstds		${fnm1},8($xfer)
	addc		%r0,$abhi,$hi0
	ldw		-16($xfer),$abhi
	add		$ablo,$nmlo1,$nmlo1
	ldw		-12($xfer),$ablo
	addc		%r0,$nmhi1,$nmhi1
	ldw		-8($xfer),$nmhi0
	add		$hi1,$nmlo1,$nmlo1
	ldw		-4($xfer),$nmlo0
	addc		%r0,$nmhi1,$hi1

	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)	; tp[j-1]
	addc		%r0,$abhi,$abhi
	add		$ti0,$ablo,$ablo
	ldw		8($tp),$ti1	; tp[j]
	addc		%r0,$abhi,$hi0
	ldw		0($xfer),$abhi
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldws,mb		8($xfer),$nmhi1
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$nmlo1
	addc		%r0,$nmhi0,$hi1
	stws,ma		$nmlo0,8($tp)	; tp[j-1]

	addib,=		-1,$num,L\$outerdone_pa11; i--
	subi		0,$arrsz,$idx	; j=0

	fldws,ma	4($bp),${fbi}	; bp[i]
	flddx		$idx($ap),${fai}	; ap[0]
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$abhi
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	add		$ti1,$ablo,$ablo
	addc		%r0,$abhi,$hi0

	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0	; tp[j]

	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nmlo1,-4($tp)	; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	add		$ti0,$hi0,$hi0
	addc		%r0,$hi1,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer_pa11
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone_pa11
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$abhi
	add		$ti1,$ablo,$ablo
	addc		%r0,$abhi,$hi0

	ldw		4($tp),$ti0	; tp[j]

	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	stw		$nmlo1,-4($tp)	; tp[j-1]

	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	add		$ti0,$hi0,$hi0
	addc		%r0,$hi1,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32+4`($fp),$tp
	sub		%r0,%r0,%r0	; clear borrow
	ldw		-4($tp),$ti0
	addl		$tp,$arrsz,$tp
L\$sub_pa11
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub_pa11
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1

	ldo		`$LOCALS+32`($fp),$tp
	sub		$rp,$arrsz,$rp	; rewind rp
	subi		0,$arrsz,$idx
L\$copy_pa11
	ldw		0($tp),$ti0
	ldw		0($rp),$hi0
	stws,ma		%r0,4($tp)
	comiclr,=	0,$hi1,%r0
	copy		$ti0,$hi0
	addib,<>	4,$idx,L\$copy_pa11
	stws,ma		$hi0,4($rp)

	nop				; alignment
L\$done
___
}
882
883
# Common epilogue: signal success in %r28, pop the tp[] allocation and the
# saved registers, and return.  L\$abort lands here past the %r28=1 store so
# the "unhandled" 0 set earlier is preserved.
$code.=<<___;
	ldi		1,%r28		; signal "handled"
	ldo		$FRAME($fp),%sp	; destroy tp[num+1]

	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
	bv		(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
903
904
905# Explicitly encode PA-RISC 2.0 instructions used in this module, so
906# that it can be compiled with .LEVEL 1.0. It should be noted that I
907# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
908# directive...
909
# ldd (64-bit load): emit a raw .WORD encoding for the two operand forms
# this module uses, so the file assembles under .LEVEL 1.1 (see the comment
# block above); any other form falls through as a textual mnemonic.
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    # indexed form: ldd %rX(%rB),%rT
    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    # short-displacement form: ldd disp(%rB),%rT
    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
	$opcode|=(1<<5) if ($mod =~ /^,m/);	# modify completer (",mb" matches too)
	$opcode|=(1<<13) if ($mod =~ /^,mb/);	# "before" bit on top of ",m"
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};
927
# std (64-bit store): raw .WORD encoding of the short-displacement form
# std %rS,disp(%rB); anything else passes through textually.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
    {	my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
	$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
	$opcode|=(1<<5) if ($mod =~ /^,m/);	# modify completer (",mb" matches too)
	$opcode|=(1<<13) if ($mod =~ /^,mb/);	# "before" bit on top of ",m"
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};
941
# extrd (64-bit bit-field extract): raw .WORD encodings for the fixed-position
# (format 15) and %sar-driven (format 12) forms used here.
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
	my $len=32-$3;
	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    # variable position taken from %sar; optional "=" nullification completer
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
	my $len=32-$2;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);		# nullify-if-zero
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};
963
# shrpd (64-bit shift-right pair): raw .WORD encoding of the fixed
# shift-amount form shrpd %r1,%r2,sa,%rt.
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
	my $cpos=63-$3;					# hardware takes the complement
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);	# encode sa
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};
976
# sub,db (subtract with decimal-borrow semantics used as 64-bit borrow
# propagation): only this exact completer/operand shape is encoded.
my $sub = sub {
  my ($mod,$args) = @_;
  my $orig = "sub$mod\t$args";

    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
	my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
	$opcode|=(1<<10);	# e1
	$opcode|=(1<<8);	# e2
	$opcode|=(1<<5);	# d
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
    }
    else { "\t".$orig; }
};
990
# Dispatch one instruction line to its encoder closure, if any.
# The string eval reaches the lexical encoder subs ($ldd, $std, $extrd,
# $shrpd, $sub) in scope above; mnemonics without an encoder are re-emitted
# verbatim for the assembler to handle.
sub assemble {
    my ($mnemonic,$mod,$args) = @_;

    my $handler = eval("\$$mnemonic");
    return &$handler($mod,$args) if (ref($handler) eq 'CODE');
    return "\t$mnemonic$mod\t$args";
}
997
# Detect whether the assembler behind $ENV{CC} is GNU as; $gnuas gates the
# GNU-specific directive rewrites in the output loop below.  Guard against
# CC being unset — the original unconditionally interpolated it into a shell
# command, executing a bogus "-Wa,-v ..." command line when CC was empty.
$gnuas = 0;
if ($ENV{CC}
    && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
       =~ /GNU assembler/) {
    $gnuas = 1;
}
1002
# Post-process the generated text line by line and print it.  Order matters:
# backtick arithmetic must be evaluated before the instruction-pattern
# substitutions see the line.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;	# evaluate `...` constant expressions
	# flip word order in 64-bit mode...
	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
	# assemble 2.0 instructions in 32-bit mode...
	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);

	# GNU as in 64-bit mode wants lowercase "2.0w" and ELF-style .text,
	# and has no use for the SOM .SUBSPA directive.
	s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
	s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
	s/\.SUBSPA.*//			if ($gnuas && $SIZE_T==8);
	s/\bbv\b/bve/			if ($SIZE_T==8);	# wide-mode return

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";
注意:請參閱 TracBrowser 說明,以協助您使用儲存庫瀏覽器。

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette