1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 |
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 |
|
---|
17 | # December 2007
|
---|
18 |
|
---|
19 | # The reason for the undertaken effort is basically the following. Even though
|
---|
20 | # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
|
---|
21 | # performance was observed to be less than impressive, essentially as
|
---|
22 | # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
|
---|
23 | # Well, it's not surprising that IBM had to make some sacrifices to
|
---|
24 | # boost the clock frequency that much, but no overall improvement?
|
---|
25 | # Having observed how much difference switching to FPU made on
|
---|
26 | # UltraSPARC, playing the same stunt on Power 6 appeared appropriate...
|
---|
27 | # Unfortunately the resulting performance improvement is not as
|
---|
28 | # impressive, ~30%, and in absolute terms is still very far from what
|
---|
29 | # one would expect from 4.7GHz CPU. There is a chance that I'm doing
|
---|
30 | # something wrong, but for lack of assembler level micro-profiling
|
---|
31 | # data or at least decent platform guide I can't tell... Or better
|
---|
32 | # results might be achieved with VMX... Anyway, this module provides
|
---|
33 | # *worse* performance on other PowerPC implementations, ~40-15% slower
|
---|
34 | # on PPC970 depending on key length and ~40% slower on Power 5 for all
|
---|
35 | # key lengths. As it's obviously inappropriate as "best all-round"
|
---|
36 | # alternative, it has to be complemented with run-time CPU family
|
---|
37 | # detection. Oh! It should also be noted that unlike other PowerPC
|
---|
38 | # implementations, the IALU ppc-mont.pl module performs *suboptimally* on
|
---|
39 | # >=1024-bit key lengths on Power 6. It should also be noted that
|
---|
40 | # *everything* said so far applies to 64-bit builds! As far as 32-bit
|
---|
41 | # application executed on 64-bit CPU goes, this module is likely to
|
---|
42 | # become preferred choice, because it's easy to adapt it for such
|
---|
43 | # case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
|
---|
44 |
|
---|
45 | # February 2008
|
---|
46 |
|
---|
47 | # Micro-profiling assisted optimization results in ~15% improvement
|
---|
48 | # over original ppc64-mont.pl version, or overall ~50% improvement
|
---|
49 | # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
|
---|
50 | # Power 6 CPU, this module is 5-150% faster depending on key length,
|
---|
51 | # [hereafter] more for longer keys. But if compared to ppc-mont.pl
|
---|
52 | # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
|
---|
53 | # in absolute terms, but it's apparently the way Power 6 is...
|
---|
54 |
|
---|
55 | # December 2009
|
---|
56 |
|
---|
57 | # Adapted for 32-bit build this module delivers 25-120%, yes, more
|
---|
58 | # than *twice* for longer keys, performance improvement over 32-bit
|
---|
59 | # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
|
---|
60 | # even 64-bit integer operations and the trouble is that most PPC
|
---|
61 | # operating systems don't preserve upper halves of general purpose
|
---|
62 | # registers upon 32-bit signal delivery. They do preserve them upon
|
---|
63 | # context switch, but not signalling:-( This means that asynchronous
|
---|
64 | # signals have to be blocked upon entry to this subroutine. Signal
|
---|
65 | # masking (and of course complementary unmasking) has quite an impact
|
---|
66 | # on performance, naturally larger for shorter keys. It's so severe
|
---|
67 | # that 512-bit key performance can be as low as 1/3 of expected one.
|
---|
68 | # This is why this routine can be engaged for longer key operations
|
---|
69 | # only on these OSes, see crypto/ppccap.c for further details. MacOS X
|
---|
70 | # is an exception from this and doesn't require signal masking, and
|
---|
71 | # that's where above improvement coefficients were collected. For
|
---|
72 | # others alternative would be to break dependence on upper halves of
|
---|
73 | # GPRs by sticking to 32-bit integer operations...
|
---|
74 |
|
---|
75 | # December 2012
|
---|
76 |
|
---|
77 | # Remove above mentioned dependence on GPRs' upper halves in 32-bit
|
---|
78 | # build. No signal masking overhead, but integer instructions are
|
---|
79 | # *more* numerous... It's still "universally" faster than 32-bit
|
---|
80 | # ppc-mont.pl, but improvement coefficient is not as impressive
|
---|
81 | # for longer keys...
|
---|
82 |
|
---|
# Command-line handling, following the perlasm calling convention:
#   $output  is the last argument if it looks like a file (it has an extension)
#   $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Per-flavour parameters:
#   $SIZE_T  size of BN_LONG / a saved GPR slot, in bytes
#   $RZONE   extra bytes added to the frame allocation; the prologue saves
#            r19-r31 and f20-f31 at negative offsets from the caller's stack
#            pointer (presumably this is the ABI red zone -- confirm)
#   $fname   name of the generated routine
#   $STUX/$PUSH/$POP  pointer-sized store-with-update, store and load mnemonics
if ($flavour =~ /32/) {
	$SIZE_T=4;
	$RZONE=	224;
	$fname=	"bn_mul_mont_fpu64";

	$STUX=	"stwux";	# store indexed and update
	$PUSH=	"stw";
	$POP=	"lwz";
} elsif ($flavour =~ /64/) {
	$SIZE_T=8;
	$RZONE=	288;
	$fname=	"bn_mul_mont_fpu64";

	# same as above, but 64-bit mnemonics...
	$STUX=	"stdux";	# store indexed and update
	$PUSH=	"std";
	$POP=	"ld";
} else { die "nonsense $flavour"; }

# 4 on little-endian flavours, 0 otherwise.  The generated code XORs this
# into the offsets of 32-bit loads so that the two halves of a 64-bit value
# are picked up in the right order on either endianness.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

# Locate ppc-xlate.pl next to this script or in ../../perlasm.  The directory
# prefix is taken from the capture only when the match actually succeeds, so
# a stale $1 left over from an earlier match can never leak into $dir.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The interpreter and script paths are quoted so that installation or source
# directories containing spaces survive the shell.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
|
---|
116 |
|
---|
# Fixed part of the stack frame, in bytes.
$FRAME    = 64;		# padded frame header
$TRANSFER = 16*8;	# sixteen 8-byte slots of the gpr<->fpr transfer zone

# Volatile registers r0-r12.  The result pointer arrives in r3; the prologue
# copies it to $rp (r9), which leaves r3 free to serve as $ovf and as the
# return value.
($carry,$sp,$toc,$ovf,$ap,$bp,$np,$n0,$num,$rp,$tp,$j,$i) =
	map("r$_",(0..12));

# Non-volatile registers r19-r31, saved by the prologue.
#   $nap_d    interleaved ap and np in double format
#   $a0       ap[0]
#   $t0-$t7   temporary registers
($c1,$n1,$a1,$nap_d,$a0,$t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7) =
	map("r$_",(19..31));
|
---|
147 |
|
---|
# PPC offers enough register bank capacity to unroll inner loops twice
#
#     ..A3A2A1A0
#           dcba
#    -----------
#            A0a
#           A0b
#          A0c
#         A0d
#          A1a
#         A1b
#        A1c
#       A1d
#        A2a
#       A2b
#      A2c
#     A2d
#      A3a
#     A3b
#    A3c
#   A3d
#    ..a
#   ..b
#
# Floating-point register allocation:
#   f0-f3    $ba..$bd     bp[i] as four 16-bit values
#   f4-f7    $na..$nd     (tp[0]*n0) as four 16-bit values
#   f8-f9    $dota,$dotb  partial products carried into the next iteration
#   f10-f13  $A0..$A3     a[j], a[j+1] as 32-bit words
#   f20-f23  $N0..$N3     n[j], n[j+1] as 32-bit words
#   f24-f31  $T0a..$T3b   accumulators
# f20-f31 are saved by the prologue.
($ba,$bb,$bc,$bd, $na,$nb,$nc,$nd, $dota,$dotb, $A0,$A1,$A2,$A3) =
	map("f$_",(0..13));
($N0,$N1,$N2,$N3, $T0a,$T0b, $T1a,$T1b, $T2a,$T2b, $T3a,$T3b) =
	map("f$_",(20..31));

# sp----------->+-------------------------------+
#               | saved sp                      |
#               +-------------------------------+
#               .                               .
#   +64         +-------------------------------+
#               | 16 gpr<->fpr transfer zone    |
#               .                               .
#               .                               .
#   +16*8       +-------------------------------+
#               | __int64 tmp[-1]               |
#               +-------------------------------+
#               | __int64 tmp[num]              |
#               .                               .
#               .                               .
#               .                               .
#   +(num+1)*8  +-------------------------------+
#               | padding to 64 byte boundary   |
#               .                               .
#   +X          +-------------------------------+
#               | double nap_d[4*num]           |
#               .                               .
#               .                               .
#               .                               .
#               +-------------------------------+
#               .                               .
#   -13*size_t  +-------------------------------+
#               | 13 saved gpr, r19-r31         |
#               .                               .
#               .                               .
#   -12*8       +-------------------------------+
#               | 12 saved fpr, f20-f31         |
#               .                               .
#               .                               .
#               +-------------------------------+
|
---|
217 | |
---|
218 |
|
---|
219 | $code=<<___;
|
---|
220 | .machine "any"
|
---|
221 | .text
|
---|
222 |
|
---|
223 | .globl .$fname
|
---|
224 | .align 5
|
---|
225 | .$fname:
|
---|
226 | cmpwi $num,`3*8/$SIZE_T`
|
---|
227 | mr $rp,r3 ; $rp is reassigned
|
---|
228 | li r3,0 ; possible "not handled" return code
|
---|
229 | bltlr-
|
---|
230 | andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
|
---|
231 | bnelr-
|
---|
232 |
|
---|
233 | slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
|
---|
234 | li $i,-4096
|
---|
235 | slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
|
---|
236 | add $tp,$tp,$num ; place for tp[num+1]
|
---|
237 | addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
|
---|
238 | subf $tp,$tp,$sp ; $sp-$tp
|
---|
239 | and $tp,$tp,$i ; minimize TLB usage
|
---|
240 | subf $tp,$sp,$tp ; $tp-$sp
|
---|
241 | mr $i,$sp
|
---|
242 | $STUX $sp,$sp,$tp ; alloca
|
---|
243 |
|
---|
244 | $PUSH r19,`-12*8-13*$SIZE_T`($i)
|
---|
245 | $PUSH r20,`-12*8-12*$SIZE_T`($i)
|
---|
246 | $PUSH r21,`-12*8-11*$SIZE_T`($i)
|
---|
247 | $PUSH r22,`-12*8-10*$SIZE_T`($i)
|
---|
248 | $PUSH r23,`-12*8-9*$SIZE_T`($i)
|
---|
249 | $PUSH r24,`-12*8-8*$SIZE_T`($i)
|
---|
250 | $PUSH r25,`-12*8-7*$SIZE_T`($i)
|
---|
251 | $PUSH r26,`-12*8-6*$SIZE_T`($i)
|
---|
252 | $PUSH r27,`-12*8-5*$SIZE_T`($i)
|
---|
253 | $PUSH r28,`-12*8-4*$SIZE_T`($i)
|
---|
254 | $PUSH r29,`-12*8-3*$SIZE_T`($i)
|
---|
255 | $PUSH r30,`-12*8-2*$SIZE_T`($i)
|
---|
256 | $PUSH r31,`-12*8-1*$SIZE_T`($i)
|
---|
257 | stfd f20,`-12*8`($i)
|
---|
258 | stfd f21,`-11*8`($i)
|
---|
259 | stfd f22,`-10*8`($i)
|
---|
260 | stfd f23,`-9*8`($i)
|
---|
261 | stfd f24,`-8*8`($i)
|
---|
262 | stfd f25,`-7*8`($i)
|
---|
263 | stfd f26,`-6*8`($i)
|
---|
264 | stfd f27,`-5*8`($i)
|
---|
265 | stfd f28,`-4*8`($i)
|
---|
266 | stfd f29,`-3*8`($i)
|
---|
267 | stfd f30,`-2*8`($i)
|
---|
268 | stfd f31,`-1*8`($i)
|
---|
269 |
|
---|
270 | addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
|
---|
271 | li $i,-64
|
---|
272 | add $nap_d,$tp,$num
|
---|
273 | and $nap_d,$nap_d,$i ; align to 64 bytes
|
---|
274 | ; nap_d is off by 1, because it's used with stfdu/lfdu
|
---|
275 | addi $nap_d,$nap_d,-8
|
---|
276 | srwi $j,$num,`3+1` ; counter register, num/2
|
---|
277 | addi $j,$j,-1
|
---|
278 | addi $tp,$sp,`$FRAME+$TRANSFER-8`
|
---|
279 | li $carry,0
|
---|
280 | mtctr $j
|
---|
281 | ___
|
---|
282 | |
---|
283 |
|
---|
284 | $code.=<<___ if ($SIZE_T==8);
|
---|
285 | ld $a0,0($ap) ; pull ap[0] value
|
---|
286 | ld $t3,0($bp) ; bp[0]
|
---|
287 | ld $n0,0($n0) ; pull n0[0] value
|
---|
288 |
|
---|
289 | mulld $t7,$a0,$t3 ; ap[0]*bp[0]
|
---|
290 | ; transfer bp[0] to FPU as 4x16-bit values
|
---|
291 | extrdi $t0,$t3,16,48
|
---|
292 | extrdi $t1,$t3,16,32
|
---|
293 | extrdi $t2,$t3,16,16
|
---|
294 | extrdi $t3,$t3,16,0
|
---|
295 | std $t0,`$FRAME+0`($sp)
|
---|
296 | std $t1,`$FRAME+8`($sp)
|
---|
297 | std $t2,`$FRAME+16`($sp)
|
---|
298 | std $t3,`$FRAME+24`($sp)
|
---|
299 |
|
---|
300 | mulld $t7,$t7,$n0 ; tp[0]*n0
|
---|
301 | ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
|
---|
302 | extrdi $t4,$t7,16,48
|
---|
303 | extrdi $t5,$t7,16,32
|
---|
304 | extrdi $t6,$t7,16,16
|
---|
305 | extrdi $t7,$t7,16,0
|
---|
306 | std $t4,`$FRAME+32`($sp)
|
---|
307 | std $t5,`$FRAME+40`($sp)
|
---|
308 | std $t6,`$FRAME+48`($sp)
|
---|
309 | std $t7,`$FRAME+56`($sp)
|
---|
310 |
|
---|
311 | extrdi $t0,$a0,32,32 ; lwz $t0,4($ap)
|
---|
312 | extrdi $t1,$a0,32,0 ; lwz $t1,0($ap)
|
---|
313 | lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair
|
---|
314 | lwz $t3,`8^$LITTLE_ENDIAN`($ap)
|
---|
315 | lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair
|
---|
316 | lwz $t5,`0^$LITTLE_ENDIAN`($np)
|
---|
317 | lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair
|
---|
318 | lwz $t7,`8^$LITTLE_ENDIAN`($np)
|
---|
319 | ___
|
---|
320 | $code.=<<___ if ($SIZE_T==4);
|
---|
321 | lwz $a0,0($ap) ; pull ap[0,1] value
|
---|
322 | mr $n1,$n0
|
---|
323 | lwz $a1,4($ap)
|
---|
324 | li $c1,0
|
---|
325 | lwz $t1,0($bp) ; bp[0,1]
|
---|
326 | lwz $t3,4($bp)
|
---|
327 | lwz $n0,0($n1) ; pull n0[0,1] value
|
---|
328 | lwz $n1,4($n1)
|
---|
329 |
|
---|
330 | mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0]
|
---|
331 | mulhwu $t5,$a0,$t1
|
---|
332 | mullw $t6,$a1,$t1
|
---|
333 | mullw $t7,$a0,$t3
|
---|
334 | add $t5,$t5,$t6
|
---|
335 | add $t5,$t5,$t7
|
---|
336 | ; transfer bp[0] to FPU as 4x16-bit values
|
---|
337 | extrwi $t0,$t1,16,16
|
---|
338 | extrwi $t1,$t1,16,0
|
---|
339 | extrwi $t2,$t3,16,16
|
---|
340 | extrwi $t3,$t3,16,0
|
---|
341 | std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
|
---|
342 | std $t1,`$FRAME+8`($sp)
|
---|
343 | std $t2,`$FRAME+16`($sp)
|
---|
344 | std $t3,`$FRAME+24`($sp)
|
---|
345 |
|
---|
346 | mullw $t0,$t4,$n0 ; mulld tp[0]*n0
|
---|
347 | mulhwu $t1,$t4,$n0
|
---|
348 | mullw $t2,$t5,$n0
|
---|
349 | mullw $t3,$t4,$n1
|
---|
350 | add $t1,$t1,$t2
|
---|
351 | add $t1,$t1,$t3
|
---|
352 | ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
|
---|
353 | extrwi $t4,$t0,16,16
|
---|
354 | extrwi $t5,$t0,16,0
|
---|
355 | extrwi $t6,$t1,16,16
|
---|
356 | extrwi $t7,$t1,16,0
|
---|
357 | std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
|
---|
358 | std $t5,`$FRAME+40`($sp)
|
---|
359 | std $t6,`$FRAME+48`($sp)
|
---|
360 | std $t7,`$FRAME+56`($sp)
|
---|
361 |
|
---|
362 | mr $t0,$a0 ; lwz $t0,0($ap)
|
---|
363 | mr $t1,$a1 ; lwz $t1,4($ap)
|
---|
364 | lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs
|
---|
365 | lwz $t3,12($ap)
|
---|
366 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
|
---|
367 | lwz $t5,4($np)
|
---|
368 | lwz $t6,8($np)
|
---|
369 | lwz $t7,12($np)
|
---|
370 | ___
|
---|
371 | $code.=<<___;
|
---|
372 | lfd $ba,`$FRAME+0`($sp)
|
---|
373 | lfd $bb,`$FRAME+8`($sp)
|
---|
374 | lfd $bc,`$FRAME+16`($sp)
|
---|
375 | lfd $bd,`$FRAME+24`($sp)
|
---|
376 | lfd $na,`$FRAME+32`($sp)
|
---|
377 | lfd $nb,`$FRAME+40`($sp)
|
---|
378 | lfd $nc,`$FRAME+48`($sp)
|
---|
379 | lfd $nd,`$FRAME+56`($sp)
|
---|
380 | std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
|
---|
381 | std $t1,`$FRAME+72`($sp)
|
---|
382 | std $t2,`$FRAME+80`($sp)
|
---|
383 | std $t3,`$FRAME+88`($sp)
|
---|
384 | std $t4,`$FRAME+96`($sp)
|
---|
385 | std $t5,`$FRAME+104`($sp)
|
---|
386 | std $t6,`$FRAME+112`($sp)
|
---|
387 | std $t7,`$FRAME+120`($sp)
|
---|
388 | fcfid $ba,$ba
|
---|
389 | fcfid $bb,$bb
|
---|
390 | fcfid $bc,$bc
|
---|
391 | fcfid $bd,$bd
|
---|
392 | fcfid $na,$na
|
---|
393 | fcfid $nb,$nb
|
---|
394 | fcfid $nc,$nc
|
---|
395 | fcfid $nd,$nd
|
---|
396 |
|
---|
397 | lfd $A0,`$FRAME+64`($sp)
|
---|
398 | lfd $A1,`$FRAME+72`($sp)
|
---|
399 | lfd $A2,`$FRAME+80`($sp)
|
---|
400 | lfd $A3,`$FRAME+88`($sp)
|
---|
401 | lfd $N0,`$FRAME+96`($sp)
|
---|
402 | lfd $N1,`$FRAME+104`($sp)
|
---|
403 | lfd $N2,`$FRAME+112`($sp)
|
---|
404 | lfd $N3,`$FRAME+120`($sp)
|
---|
405 | fcfid $A0,$A0
|
---|
406 | fcfid $A1,$A1
|
---|
407 | fcfid $A2,$A2
|
---|
408 | fcfid $A3,$A3
|
---|
409 | fcfid $N0,$N0
|
---|
410 | fcfid $N1,$N1
|
---|
411 | fcfid $N2,$N2
|
---|
412 | fcfid $N3,$N3
|
---|
413 | addi $ap,$ap,16
|
---|
414 | addi $np,$np,16
|
---|
415 |
|
---|
416 | fmul $T1a,$A1,$ba
|
---|
417 | fmul $T1b,$A1,$bb
|
---|
418 | stfd $A0,8($nap_d) ; save a[j] in double format
|
---|
419 | stfd $A1,16($nap_d)
|
---|
420 | fmul $T2a,$A2,$ba
|
---|
421 | fmul $T2b,$A2,$bb
|
---|
422 | stfd $A2,24($nap_d) ; save a[j+1] in double format
|
---|
423 | stfd $A3,32($nap_d)
|
---|
424 | fmul $T3a,$A3,$ba
|
---|
425 | fmul $T3b,$A3,$bb
|
---|
426 | stfd $N0,40($nap_d) ; save n[j] in double format
|
---|
427 | stfd $N1,48($nap_d)
|
---|
428 | fmul $T0a,$A0,$ba
|
---|
429 | fmul $T0b,$A0,$bb
|
---|
430 | stfd $N2,56($nap_d) ; save n[j+1] in double format
|
---|
431 | stfdu $N3,64($nap_d)
|
---|
432 |
|
---|
433 | fmadd $T1a,$A0,$bc,$T1a
|
---|
434 | fmadd $T1b,$A0,$bd,$T1b
|
---|
435 | fmadd $T2a,$A1,$bc,$T2a
|
---|
436 | fmadd $T2b,$A1,$bd,$T2b
|
---|
437 | fmadd $T3a,$A2,$bc,$T3a
|
---|
438 | fmadd $T3b,$A2,$bd,$T3b
|
---|
439 | fmul $dota,$A3,$bc
|
---|
440 | fmul $dotb,$A3,$bd
|
---|
441 |
|
---|
442 | fmadd $T1a,$N1,$na,$T1a
|
---|
443 | fmadd $T1b,$N1,$nb,$T1b
|
---|
444 | fmadd $T2a,$N2,$na,$T2a
|
---|
445 | fmadd $T2b,$N2,$nb,$T2b
|
---|
446 | fmadd $T3a,$N3,$na,$T3a
|
---|
447 | fmadd $T3b,$N3,$nb,$T3b
|
---|
448 | fmadd $T0a,$N0,$na,$T0a
|
---|
449 | fmadd $T0b,$N0,$nb,$T0b
|
---|
450 |
|
---|
451 | fmadd $T1a,$N0,$nc,$T1a
|
---|
452 | fmadd $T1b,$N0,$nd,$T1b
|
---|
453 | fmadd $T2a,$N1,$nc,$T2a
|
---|
454 | fmadd $T2b,$N1,$nd,$T2b
|
---|
455 | fmadd $T3a,$N2,$nc,$T3a
|
---|
456 | fmadd $T3b,$N2,$nd,$T3b
|
---|
457 | fmadd $dota,$N3,$nc,$dota
|
---|
458 | fmadd $dotb,$N3,$nd,$dotb
|
---|
459 |
|
---|
460 | fctid $T0a,$T0a
|
---|
461 | fctid $T0b,$T0b
|
---|
462 | fctid $T1a,$T1a
|
---|
463 | fctid $T1b,$T1b
|
---|
464 | fctid $T2a,$T2a
|
---|
465 | fctid $T2b,$T2b
|
---|
466 | fctid $T3a,$T3a
|
---|
467 | fctid $T3b,$T3b
|
---|
468 |
|
---|
469 | stfd $T0a,`$FRAME+0`($sp)
|
---|
470 | stfd $T0b,`$FRAME+8`($sp)
|
---|
471 | stfd $T1a,`$FRAME+16`($sp)
|
---|
472 | stfd $T1b,`$FRAME+24`($sp)
|
---|
473 | stfd $T2a,`$FRAME+32`($sp)
|
---|
474 | stfd $T2b,`$FRAME+40`($sp)
|
---|
475 | stfd $T3a,`$FRAME+48`($sp)
|
---|
476 | stfd $T3b,`$FRAME+56`($sp)
|
---|
477 | |
---|
478 |
|
---|
479 | .align 5
|
---|
480 | L1st:
|
---|
481 | ___
|
---|
482 | $code.=<<___ if ($SIZE_T==8);
|
---|
483 | lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair
|
---|
484 | lwz $t1,`0^$LITTLE_ENDIAN`($ap)
|
---|
485 | lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair
|
---|
486 | lwz $t3,`8^$LITTLE_ENDIAN`($ap)
|
---|
487 | lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair
|
---|
488 | lwz $t5,`0^$LITTLE_ENDIAN`($np)
|
---|
489 | lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair
|
---|
490 | lwz $t7,`8^$LITTLE_ENDIAN`($np)
|
---|
491 | ___
|
---|
492 | $code.=<<___ if ($SIZE_T==4);
|
---|
493 | lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
|
---|
494 | lwz $t1,4($ap)
|
---|
495 | lwz $t2,8($ap)
|
---|
496 | lwz $t3,12($ap)
|
---|
497 | lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
|
---|
498 | lwz $t5,4($np)
|
---|
499 | lwz $t6,8($np)
|
---|
500 | lwz $t7,12($np)
|
---|
501 | ___
|
---|
502 | $code.=<<___;
|
---|
503 | std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
|
---|
504 | std $t1,`$FRAME+72`($sp)
|
---|
505 | std $t2,`$FRAME+80`($sp)
|
---|
506 | std $t3,`$FRAME+88`($sp)
|
---|
507 | std $t4,`$FRAME+96`($sp)
|
---|
508 | std $t5,`$FRAME+104`($sp)
|
---|
509 | std $t6,`$FRAME+112`($sp)
|
---|
510 | std $t7,`$FRAME+120`($sp)
|
---|
511 | ___
|
---|
512 | if ($SIZE_T==8 or $flavour =~ /osx/) {
|
---|
513 | $code.=<<___;
|
---|
514 | ld $t0,`$FRAME+0`($sp)
|
---|
515 | ld $t1,`$FRAME+8`($sp)
|
---|
516 | ld $t2,`$FRAME+16`($sp)
|
---|
517 | ld $t3,`$FRAME+24`($sp)
|
---|
518 | ld $t4,`$FRAME+32`($sp)
|
---|
519 | ld $t5,`$FRAME+40`($sp)
|
---|
520 | ld $t6,`$FRAME+48`($sp)
|
---|
521 | ld $t7,`$FRAME+56`($sp)
|
---|
522 | ___
|
---|
523 | } else {
|
---|
524 | $code.=<<___;
|
---|
525 | lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
|
---|
526 | lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
|
---|
527 | lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
|
---|
528 | lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
|
---|
529 | lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
|
---|
530 | lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
|
---|
531 | lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
|
---|
532 | lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
|
---|
533 | ___
|
---|
534 | }
|
---|
535 | $code.=<<___;
|
---|
536 | lfd $A0,`$FRAME+64`($sp)
|
---|
537 | lfd $A1,`$FRAME+72`($sp)
|
---|
538 | lfd $A2,`$FRAME+80`($sp)
|
---|
539 | lfd $A3,`$FRAME+88`($sp)
|
---|
540 | lfd $N0,`$FRAME+96`($sp)
|
---|
541 | lfd $N1,`$FRAME+104`($sp)
|
---|
542 | lfd $N2,`$FRAME+112`($sp)
|
---|
543 | lfd $N3,`$FRAME+120`($sp)
|
---|
544 | fcfid $A0,$A0
|
---|
545 | fcfid $A1,$A1
|
---|
546 | fcfid $A2,$A2
|
---|
547 | fcfid $A3,$A3
|
---|
548 | fcfid $N0,$N0
|
---|
549 | fcfid $N1,$N1
|
---|
550 | fcfid $N2,$N2
|
---|
551 | fcfid $N3,$N3
|
---|
552 | addi $ap,$ap,16
|
---|
553 | addi $np,$np,16
|
---|
554 |
|
---|
555 | fmul $T1a,$A1,$ba
|
---|
556 | fmul $T1b,$A1,$bb
|
---|
557 | fmul $T2a,$A2,$ba
|
---|
558 | fmul $T2b,$A2,$bb
|
---|
559 | stfd $A0,8($nap_d) ; save a[j] in double format
|
---|
560 | stfd $A1,16($nap_d)
|
---|
561 | fmul $T3a,$A3,$ba
|
---|
562 | fmul $T3b,$A3,$bb
|
---|
563 | fmadd $T0a,$A0,$ba,$dota
|
---|
564 | fmadd $T0b,$A0,$bb,$dotb
|
---|
565 | stfd $A2,24($nap_d) ; save a[j+1] in double format
|
---|
566 | stfd $A3,32($nap_d)
|
---|
567 | ___
|
---|
568 | if ($SIZE_T==8 or $flavour =~ /osx/) {
|
---|
569 | $code.=<<___;
|
---|
570 | fmadd $T1a,$A0,$bc,$T1a
|
---|
571 | fmadd $T1b,$A0,$bd,$T1b
|
---|
572 | fmadd $T2a,$A1,$bc,$T2a
|
---|
573 | fmadd $T2b,$A1,$bd,$T2b
|
---|
574 | stfd $N0,40($nap_d) ; save n[j] in double format
|
---|
575 | stfd $N1,48($nap_d)
|
---|
576 | fmadd $T3a,$A2,$bc,$T3a
|
---|
577 | fmadd $T3b,$A2,$bd,$T3b
|
---|
578 | add $t0,$t0,$carry ; can not overflow
|
---|
579 | fmul $dota,$A3,$bc
|
---|
580 | fmul $dotb,$A3,$bd
|
---|
581 | stfd $N2,56($nap_d) ; save n[j+1] in double format
|
---|
582 | stfdu $N3,64($nap_d)
|
---|
583 | srdi $carry,$t0,16
|
---|
584 | add $t1,$t1,$carry
|
---|
585 | srdi $carry,$t1,16
|
---|
586 |
|
---|
587 | fmadd $T1a,$N1,$na,$T1a
|
---|
588 | fmadd $T1b,$N1,$nb,$T1b
|
---|
589 | insrdi $t0,$t1,16,32
|
---|
590 | fmadd $T2a,$N2,$na,$T2a
|
---|
591 | fmadd $T2b,$N2,$nb,$T2b
|
---|
592 | add $t2,$t2,$carry
|
---|
593 | fmadd $T3a,$N3,$na,$T3a
|
---|
594 | fmadd $T3b,$N3,$nb,$T3b
|
---|
595 | srdi $carry,$t2,16
|
---|
596 | fmadd $T0a,$N0,$na,$T0a
|
---|
597 | fmadd $T0b,$N0,$nb,$T0b
|
---|
598 | insrdi $t0,$t2,16,16
|
---|
599 | add $t3,$t3,$carry
|
---|
600 | srdi $carry,$t3,16
|
---|
601 |
|
---|
602 | fmadd $T1a,$N0,$nc,$T1a
|
---|
603 | fmadd $T1b,$N0,$nd,$T1b
|
---|
604 | insrdi $t0,$t3,16,0 ; 0..63 bits
|
---|
605 | fmadd $T2a,$N1,$nc,$T2a
|
---|
606 | fmadd $T2b,$N1,$nd,$T2b
|
---|
607 | add $t4,$t4,$carry
|
---|
608 | fmadd $T3a,$N2,$nc,$T3a
|
---|
609 | fmadd $T3b,$N2,$nd,$T3b
|
---|
610 | srdi $carry,$t4,16
|
---|
611 | fmadd $dota,$N3,$nc,$dota
|
---|
612 | fmadd $dotb,$N3,$nd,$dotb
|
---|
613 | add $t5,$t5,$carry
|
---|
614 | srdi $carry,$t5,16
|
---|
615 | insrdi $t4,$t5,16,32
|
---|
616 |
|
---|
617 | fctid $T0a,$T0a
|
---|
618 | fctid $T0b,$T0b
|
---|
619 | add $t6,$t6,$carry
|
---|
620 | fctid $T1a,$T1a
|
---|
621 | fctid $T1b,$T1b
|
---|
622 | srdi $carry,$t6,16
|
---|
623 | fctid $T2a,$T2a
|
---|
624 | fctid $T2b,$T2b
|
---|
625 | insrdi $t4,$t6,16,16
|
---|
626 | fctid $T3a,$T3a
|
---|
627 | fctid $T3b,$T3b
|
---|
628 | add $t7,$t7,$carry
|
---|
629 | insrdi $t4,$t7,16,0 ; 64..127 bits
|
---|
630 | srdi $carry,$t7,16 ; upper 33 bits
|
---|
631 |
|
---|
632 | stfd $T0a,`$FRAME+0`($sp)
|
---|
633 | stfd $T0b,`$FRAME+8`($sp)
|
---|
634 | stfd $T1a,`$FRAME+16`($sp)
|
---|
635 | stfd $T1b,`$FRAME+24`($sp)
|
---|
636 | stfd $T2a,`$FRAME+32`($sp)
|
---|
637 | stfd $T2b,`$FRAME+40`($sp)
|
---|
638 | stfd $T3a,`$FRAME+48`($sp)
|
---|
639 | stfd $T3b,`$FRAME+56`($sp)
|
---|
640 | std $t0,8($tp) ; tp[j-1]
|
---|
641 | stdu $t4,16($tp) ; tp[j]
|
---|
642 | ___
|
---|
643 | } else {
|
---|
644 | $code.=<<___;
|
---|
645 | fmadd $T1a,$A0,$bc,$T1a
|
---|
646 | fmadd $T1b,$A0,$bd,$T1b
|
---|
647 | addc $t0,$t0,$carry
|
---|
648 | adde $t1,$t1,$c1
|
---|
649 | srwi $carry,$t0,16
|
---|
650 | fmadd $T2a,$A1,$bc,$T2a
|
---|
651 | fmadd $T2b,$A1,$bd,$T2b
|
---|
652 | stfd $N0,40($nap_d) ; save n[j] in double format
|
---|
653 | stfd $N1,48($nap_d)
|
---|
654 | srwi $c1,$t1,16
|
---|
655 | insrwi $carry,$t1,16,0
|
---|
656 | fmadd $T3a,$A2,$bc,$T3a
|
---|
657 | fmadd $T3b,$A2,$bd,$T3b
|
---|
658 | addc $t2,$t2,$carry
|
---|
659 | adde $t3,$t3,$c1
|
---|
660 | srwi $carry,$t2,16
|
---|
661 | fmul $dota,$A3,$bc
|
---|
662 | fmul $dotb,$A3,$bd
|
---|
663 | stfd $N2,56($nap_d) ; save n[j+1] in double format
|
---|
664 | stfdu $N3,64($nap_d)
|
---|
665 | insrwi $t0,$t2,16,0 ; 0..31 bits
|
---|
666 | srwi $c1,$t3,16
|
---|
667 | insrwi $carry,$t3,16,0
|
---|
668 |
|
---|
669 | fmadd $T1a,$N1,$na,$T1a
|
---|
670 | fmadd $T1b,$N1,$nb,$T1b
|
---|
671 | lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
|
---|
672 | lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
|
---|
673 | addc $t4,$t4,$carry
|
---|
674 | adde $t5,$t5,$c1
|
---|
675 | srwi $carry,$t4,16
|
---|
676 | fmadd $T2a,$N2,$na,$T2a
|
---|
677 | fmadd $T2b,$N2,$nb,$T2b
|
---|
678 | srwi $c1,$t5,16
|
---|
679 | insrwi $carry,$t5,16,0
|
---|
680 | fmadd $T3a,$N3,$na,$T3a
|
---|
681 | fmadd $T3b,$N3,$nb,$T3b
|
---|
682 | addc $t6,$t6,$carry
|
---|
683 | adde $t7,$t7,$c1
|
---|
684 | srwi $carry,$t6,16
|
---|
685 | fmadd $T0a,$N0,$na,$T0a
|
---|
686 | fmadd $T0b,$N0,$nb,$T0b
|
---|
687 | insrwi $t4,$t6,16,0 ; 32..63 bits
|
---|
688 | srwi $c1,$t7,16
|
---|
689 | insrwi $carry,$t7,16,0
|
---|
690 |
|
---|
691 | fmadd $T1a,$N0,$nc,$T1a
|
---|
692 | fmadd $T1b,$N0,$nd,$T1b
|
---|
693 | lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
|
---|
694 | lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
|
---|
695 | addc $t2,$t2,$carry
|
---|
696 | adde $t3,$t3,$c1
|
---|
697 | srwi $carry,$t2,16
|
---|
698 | fmadd $T2a,$N1,$nc,$T2a
|
---|
699 | fmadd $T2b,$N1,$nd,$T2b
|
---|
700 | stw $t0,12($tp) ; tp[j-1]
|
---|
701 | stw $t4,8($tp)
|
---|
702 | srwi $c1,$t3,16
|
---|
703 | insrwi $carry,$t3,16,0
|
---|
704 | fmadd $T3a,$N2,$nc,$T3a
|
---|
705 | fmadd $T3b,$N2,$nd,$T3b
|
---|
706 | lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
|
---|
707 | lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
|
---|
708 | addc $t6,$t6,$carry
|
---|
709 | adde $t7,$t7,$c1
|
---|
710 | srwi $carry,$t6,16
|
---|
711 | fmadd $dota,$N3,$nc,$dota
|
---|
712 | fmadd $dotb,$N3,$nd,$dotb
|
---|
713 | insrwi $t2,$t6,16,0 ; 64..95 bits
|
---|
714 | srwi $c1,$t7,16
|
---|
715 | insrwi $carry,$t7,16,0
|
---|
716 |
|
---|
717 | fctid $T0a,$T0a
|
---|
718 | fctid $T0b,$T0b
|
---|
719 | lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
|
---|
720 | lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
|
---|
721 | addc $t0,$t0,$carry
|
---|
722 | adde $t1,$t1,$c1
|
---|
723 | srwi $carry,$t0,16
|
---|
724 | fctid $T1a,$T1a
|
---|
725 | fctid $T1b,$T1b
|
---|
726 | srwi $c1,$t1,16
|
---|
727 | insrwi $carry,$t1,16,0
|
---|
728 | fctid $T2a,$T2a
|
---|
729 | fctid $T2b,$T2b
|
---|
730 | addc $t4,$t4,$carry
|
---|
731 | adde $t5,$t5,$c1
|
---|
732 | srwi $carry,$t4,16
|
---|
733 | fctid $T3a,$T3a
|
---|
734 | fctid $T3b,$T3b
|
---|
735 | insrwi $t0,$t4,16,0 ; 96..127 bits
|
---|
736 | srwi $c1,$t5,16
|
---|
737 | insrwi $carry,$t5,16,0
|
---|
738 |
|
---|
739 | stfd $T0a,`$FRAME+0`($sp)
|
---|
740 | stfd $T0b,`$FRAME+8`($sp)
|
---|
741 | stfd $T1a,`$FRAME+16`($sp)
|
---|
742 | stfd $T1b,`$FRAME+24`($sp)
|
---|
743 | stfd $T2a,`$FRAME+32`($sp)
|
---|
744 | stfd $T2b,`$FRAME+40`($sp)
|
---|
745 | stfd $T3a,`$FRAME+48`($sp)
|
---|
746 | stfd $T3b,`$FRAME+56`($sp)
|
---|
747 | stw $t2,20($tp) ; tp[j]
|
---|
748 | stwu $t0,16($tp)
|
---|
749 | ___
|
---|
750 | }
|
---|
751 | $code.=<<___;
|
---|
752 | bdnz L1st
|
---|
753 | |
---|
754 |
|
---|
755 | fctid $dota,$dota
|
---|
756 | fctid $dotb,$dotb
|
---|
757 | ___
|
---|
758 | if ($SIZE_T==8 or $flavour =~ /osx/) {
|
---|
759 | $code.=<<___;
|
---|
760 | ld $t0,`$FRAME+0`($sp)
|
---|
761 | ld $t1,`$FRAME+8`($sp)
|
---|
762 | ld $t2,`$FRAME+16`($sp)
|
---|
763 | ld $t3,`$FRAME+24`($sp)
|
---|
764 | ld $t4,`$FRAME+32`($sp)
|
---|
765 | ld $t5,`$FRAME+40`($sp)
|
---|
766 | ld $t6,`$FRAME+48`($sp)
|
---|
767 | ld $t7,`$FRAME+56`($sp)
|
---|
768 | stfd $dota,`$FRAME+64`($sp)
|
---|
769 | stfd $dotb,`$FRAME+72`($sp)
|
---|
770 |
|
---|
771 | add $t0,$t0,$carry ; can not overflow
|
---|
772 | srdi $carry,$t0,16
|
---|
773 | add $t1,$t1,$carry
|
---|
774 | srdi $carry,$t1,16
|
---|
775 | insrdi $t0,$t1,16,32
|
---|
776 | add $t2,$t2,$carry
|
---|
777 | srdi $carry,$t2,16
|
---|
778 | insrdi $t0,$t2,16,16
|
---|
779 | add $t3,$t3,$carry
|
---|
780 | srdi $carry,$t3,16
|
---|
781 | insrdi $t0,$t3,16,0 ; 0..63 bits
|
---|
782 | add $t4,$t4,$carry
|
---|
783 | srdi $carry,$t4,16
|
---|
784 | add $t5,$t5,$carry
|
---|
785 | srdi $carry,$t5,16
|
---|
786 | insrdi $t4,$t5,16,32
|
---|
787 | add $t6,$t6,$carry
|
---|
788 | srdi $carry,$t6,16
|
---|
789 | insrdi $t4,$t6,16,16
|
---|
790 | add $t7,$t7,$carry
|
---|
791 | insrdi $t4,$t7,16,0 ; 64..127 bits
|
---|
792 | srdi $carry,$t7,16 ; upper 33 bits
|
---|
793 | ld $t6,`$FRAME+64`($sp)
|
---|
794 | ld $t7,`$FRAME+72`($sp)
|
---|
795 |
|
---|
796 | std $t0,8($tp) ; tp[j-1]
|
---|
797 | stdu $t4,16($tp) ; tp[j]
|
---|
798 |
|
---|
799 | add $t6,$t6,$carry ; can not overflow
|
---|
800 | srdi $carry,$t6,16
|
---|
801 | add $t7,$t7,$carry
|
---|
802 | insrdi $t6,$t7,48,0
|
---|
803 | srdi $ovf,$t7,48
|
---|
804 | std $t6,8($tp) ; tp[num-1]
|
---|
805 | ___
|
---|
806 | } else {
|
---|
807 | $code.=<<___;
|
---|
808 | lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
|
---|
809 | lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
|
---|
810 | lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
|
---|
811 | lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
|
---|
812 | lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
|
---|
813 | lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
|
---|
814 | lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
|
---|
815 | lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
|
---|
816 | stfd $dota,`$FRAME+64`($sp)
|
---|
817 | stfd $dotb,`$FRAME+72`($sp)
|
---|
818 |
|
---|
819 | addc $t0,$t0,$carry
|
---|
820 | adde $t1,$t1,$c1
|
---|
821 | srwi $carry,$t0,16
|
---|
822 | insrwi $carry,$t1,16,0
|
---|
823 | srwi $c1,$t1,16
|
---|
824 | addc $t2,$t2,$carry
|
---|
825 | adde $t3,$t3,$c1
|
---|
826 | srwi $carry,$t2,16
|
---|
827 | insrwi $t0,$t2,16,0 ; 0..31 bits
|
---|
828 | insrwi $carry,$t3,16,0
|
---|
829 | srwi $c1,$t3,16
|
---|
830 | addc $t4,$t4,$carry
|
---|
831 | adde $t5,$t5,$c1
|
---|
832 | srwi $carry,$t4,16
|
---|
833 | insrwi $carry,$t5,16,0
|
---|
834 | srwi $c1,$t5,16
|
---|
835 | addc $t6,$t6,$carry
|
---|
836 | adde $t7,$t7,$c1
|
---|
837 | srwi $carry,$t6,16
|
---|
838 | insrwi $t4,$t6,16,0 ; 32..63 bits
|
---|
839 | insrwi $carry,$t7,16,0
|
---|
840 | srwi $c1,$t7,16
|
---|
841 | stw $t0,12($tp) ; tp[j-1]
|
---|
842 | stw $t4,8($tp)
|
---|
843 |
|
---|
844 | lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
|
---|
845 | lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
|
---|
846 | lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
|
---|
847 | lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
|
---|
848 | lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
|
---|
849 | lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
|
---|
850 | lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
|
---|
851 | lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
|
---|
852 |
|
---|
853 | addc $t2,$t2,$carry
|
---|
854 | adde $t3,$t3,$c1
|
---|
855 | srwi $carry,$t2,16
|
---|
856 | insrwi $carry,$t3,16,0
|
---|
857 | srwi $c1,$t3,16
|
---|
858 | addc $t6,$t6,$carry
|
---|
859 | adde $t7,$t7,$c1
|
---|
860 | srwi $carry,$t6,16
|
---|
861 | insrwi $t2,$t6,16,0 ; 64..95 bits
|
---|
862 | insrwi $carry,$t7,16,0
|
---|
863 | srwi $c1,$t7,16
|
---|
864 | addc $t0,$t0,$carry
|
---|
865 | adde $t1,$t1,$c1
|
---|
866 | srwi $carry,$t0,16
|
---|
867 | insrwi $carry,$t1,16,0
|
---|
868 | srwi $c1,$t1,16
|
---|
869 | addc $t4,$t4,$carry
|
---|
870 | adde $t5,$t5,$c1
|
---|
871 | srwi $carry,$t4,16
|
---|
872 | insrwi $t0,$t4,16,0 ; 96..127 bits
|
---|
873 | insrwi $carry,$t5,16,0
|
---|
874 | srwi $c1,$t5,16
|
---|
875 | stw $t2,20($tp) ; tp[j]
|
---|
876 | stwu $t0,16($tp)
|
---|
877 |
|
---|
878 | lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
|
---|
879 | lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
|
---|
880 | lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
|
---|
881 | lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
|
---|
882 |
|
---|
883 | addc $t6,$t6,$carry
|
---|
884 | adde $t7,$t7,$c1
|
---|
885 | srwi $carry,$t6,16
|
---|
886 | insrwi $carry,$t7,16,0
|
---|
887 | srwi $c1,$t7,16
|
---|
888 | addc $t4,$t4,$carry
|
---|
889 | adde $t5,$t5,$c1
|
---|
890 |
|
---|
891 | insrwi $t6,$t4,16,0
|
---|
892 | srwi $t4,$t4,16
|
---|
893 | insrwi $t4,$t5,16,0
|
---|
894 | srwi $ovf,$t5,16
|
---|
895 | stw $t6,12($tp) ; tp[num-1]
|
---|
896 | stw $t4,8($tp)
|
---|
897 | ___
|
---|
898 | }
|
---|
899 | $code.=<<___;  # rewind $nap_d and open the outer loop (Louter), starting at i=1
|
---|
900 | slwi $t7,$num,2
|
---|
901 | subf $nap_d,$t7,$nap_d ; rewind pointer
|
---|
902 | |
---|
903 |
|
---|
904 | li $i,8 ; i=1
|
---|
905 | .align 5
|
---|
906 | Louter:
|
---|
907 | addi $tp,$sp,`$FRAME+$TRANSFER`
|
---|
908 | li $carry,0
|
---|
909 | mtctr $j
|
---|
910 | ___
|
---|
911 | $code.=<<___ if ($SIZE_T==8);  # 64-bit build: spill bp[i] and (ap[0]*bp[i]+tp[0])*n0 to the frame as 4x16-bit values each
|
---|
912 | ldx $t3,$bp,$i ; bp[i]
|
---|
913 |
|
---|
914 | ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
|
---|
915 | mulld $t7,$a0,$t3 ; ap[0]*bp[i]
|
---|
916 | add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
|
---|
917 | ; transfer bp[i] to FPU as 4x16-bit values
|
---|
918 | extrdi $t0,$t3,16,48
|
---|
919 | extrdi $t1,$t3,16,32
|
---|
920 | extrdi $t2,$t3,16,16
|
---|
921 | extrdi $t3,$t3,16,0
|
---|
922 | std $t0,`$FRAME+0`($sp)
|
---|
923 | std $t1,`$FRAME+8`($sp)
|
---|
924 | std $t2,`$FRAME+16`($sp)
|
---|
925 | std $t3,`$FRAME+24`($sp)
|
---|
926 |
|
---|
927 | mulld $t7,$t7,$n0 ; tp[0]*n0
|
---|
928 | ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
|
---|
929 | extrdi $t4,$t7,16,48
|
---|
930 | extrdi $t5,$t7,16,32
|
---|
931 | extrdi $t6,$t7,16,16
|
---|
932 | extrdi $t7,$t7,16,0
|
---|
933 | std $t4,`$FRAME+32`($sp)
|
---|
934 | std $t5,`$FRAME+40`($sp)
|
---|
935 | std $t6,`$FRAME+48`($sp)
|
---|
936 | std $t7,`$FRAME+56`($sp)
|
---|
937 | ___
|
---|
938 | $code.=<<___ if ($SIZE_T==4);  # 32-bit build: same spill of bp[i] and the n0 product, computed with mullw/mulhwu on 32-bit halves
|
---|
939 | add $t0,$bp,$i
|
---|
940 | li $c1,0
|
---|
941 | lwz $t1,0($t0) ; bp[i,i+1]
|
---|
942 | lwz $t3,4($t0)
|
---|
943 |
|
---|
944 | mullw $t4,$a0,$t1 ; ap[0]*bp[i]
|
---|
945 | lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0]
|
---|
946 | mulhwu $t5,$a0,$t1
|
---|
947 | lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
|
---|
948 | mullw $t6,$a1,$t1
|
---|
949 | mullw $t7,$a0,$t3
|
---|
950 | add $t5,$t5,$t6
|
---|
951 | add $t5,$t5,$t7
|
---|
952 | addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0]
|
---|
953 | adde $t5,$t5,$t2
|
---|
954 | ; transfer bp[i] to FPU as 4x16-bit values
|
---|
955 | extrwi $t0,$t1,16,16
|
---|
956 | extrwi $t1,$t1,16,0
|
---|
957 | extrwi $t2,$t3,16,16
|
---|
958 | extrwi $t3,$t3,16,0
|
---|
959 | std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
|
---|
960 | std $t1,`$FRAME+8`($sp)
|
---|
961 | std $t2,`$FRAME+16`($sp)
|
---|
962 | std $t3,`$FRAME+24`($sp)
|
---|
963 |
|
---|
964 | mullw $t0,$t4,$n0 ; mulld tp[0]*n0
|
---|
965 | mulhwu $t1,$t4,$n0
|
---|
966 | mullw $t2,$t5,$n0
|
---|
967 | mullw $t3,$t4,$n1
|
---|
968 | add $t1,$t1,$t2
|
---|
969 | add $t1,$t1,$t3
|
---|
970 | ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
|
---|
971 | extrwi $t4,$t0,16,16
|
---|
972 | extrwi $t5,$t0,16,0
|
---|
973 | extrwi $t6,$t1,16,16
|
---|
974 | extrwi $t7,$t1,16,0
|
---|
975 | std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
|
---|
976 | std $t5,`$FRAME+40`($sp)
|
---|
977 | std $t6,`$FRAME+48`($sp)
|
---|
978 | std $t7,`$FRAME+56`($sp)
|
---|
979 | ___
|
---|
980 | $code.=<<___;  # FPU setup for this outer iteration: load a[j]/n[j] doubles, fcfid the spilled values, form the first products, then open Linner
|
---|
981 | lfd $A0,8($nap_d) ; load a[j] in double format
|
---|
982 | lfd $A1,16($nap_d)
|
---|
983 | lfd $A2,24($nap_d) ; load a[j+1] in double format
|
---|
984 | lfd $A3,32($nap_d)
|
---|
985 | lfd $N0,40($nap_d) ; load n[j] in double format
|
---|
986 | lfd $N1,48($nap_d)
|
---|
987 | lfd $N2,56($nap_d) ; load n[j+1] in double format
|
---|
988 | lfdu $N3,64($nap_d)
|
---|
989 |
|
---|
990 | lfd $ba,`$FRAME+0`($sp)
|
---|
991 | lfd $bb,`$FRAME+8`($sp)
|
---|
992 | lfd $bc,`$FRAME+16`($sp)
|
---|
993 | lfd $bd,`$FRAME+24`($sp)
|
---|
994 | lfd $na,`$FRAME+32`($sp)
|
---|
995 | lfd $nb,`$FRAME+40`($sp)
|
---|
996 | lfd $nc,`$FRAME+48`($sp)
|
---|
997 | lfd $nd,`$FRAME+56`($sp)
|
---|
998 |
|
---|
999 | fcfid $ba,$ba
|
---|
1000 | fcfid $bb,$bb
|
---|
1001 | fcfid $bc,$bc
|
---|
1002 | fcfid $bd,$bd
|
---|
1003 | fcfid $na,$na
|
---|
1004 | fcfid $nb,$nb
|
---|
1005 | fcfid $nc,$nc
|
---|
1006 | fcfid $nd,$nd
|
---|
1007 |
|
---|
1008 | fmul $T1a,$A1,$ba
|
---|
1009 | fmul $T1b,$A1,$bb
|
---|
1010 | fmul $T2a,$A2,$ba
|
---|
1011 | fmul $T2b,$A2,$bb
|
---|
1012 | fmul $T3a,$A3,$ba
|
---|
1013 | fmul $T3b,$A3,$bb
|
---|
1014 | fmul $T0a,$A0,$ba
|
---|
1015 | fmul $T0b,$A0,$bb
|
---|
1016 |
|
---|
1017 | fmadd $T1a,$A0,$bc,$T1a
|
---|
1018 | fmadd $T1b,$A0,$bd,$T1b
|
---|
1019 | fmadd $T2a,$A1,$bc,$T2a
|
---|
1020 | fmadd $T2b,$A1,$bd,$T2b
|
---|
1021 | fmadd $T3a,$A2,$bc,$T3a
|
---|
1022 | fmadd $T3b,$A2,$bd,$T3b
|
---|
1023 | fmul $dota,$A3,$bc
|
---|
1024 | fmul $dotb,$A3,$bd
|
---|
1025 |
|
---|
1026 | fmadd $T1a,$N1,$na,$T1a
|
---|
1027 | fmadd $T1b,$N1,$nb,$T1b
|
---|
1028 | lfd $A0,8($nap_d) ; load a[j] in double format
|
---|
1029 | lfd $A1,16($nap_d)
|
---|
1030 | fmadd $T2a,$N2,$na,$T2a
|
---|
1031 | fmadd $T2b,$N2,$nb,$T2b
|
---|
1032 | lfd $A2,24($nap_d) ; load a[j+1] in double format
|
---|
1033 | lfd $A3,32($nap_d)
|
---|
1034 | fmadd $T3a,$N3,$na,$T3a
|
---|
1035 | fmadd $T3b,$N3,$nb,$T3b
|
---|
1036 | fmadd $T0a,$N0,$na,$T0a
|
---|
1037 | fmadd $T0b,$N0,$nb,$T0b
|
---|
1038 |
|
---|
1039 | fmadd $T1a,$N0,$nc,$T1a
|
---|
1040 | fmadd $T1b,$N0,$nd,$T1b
|
---|
1041 | fmadd $T2a,$N1,$nc,$T2a
|
---|
1042 | fmadd $T2b,$N1,$nd,$T2b
|
---|
1043 | fmadd $T3a,$N2,$nc,$T3a
|
---|
1044 | fmadd $T3b,$N2,$nd,$T3b
|
---|
1045 | fmadd $dota,$N3,$nc,$dota
|
---|
1046 | fmadd $dotb,$N3,$nd,$dotb
|
---|
1047 |
|
---|
1048 | fctid $T0a,$T0a
|
---|
1049 | fctid $T0b,$T0b
|
---|
1050 | fctid $T1a,$T1a
|
---|
1051 | fctid $T1b,$T1b
|
---|
1052 | fctid $T2a,$T2a
|
---|
1053 | fctid $T2b,$T2b
|
---|
1054 | fctid $T3a,$T3a
|
---|
1055 | fctid $T3b,$T3b
|
---|
1056 |
|
---|
1057 | stfd $T0a,`$FRAME+0`($sp)
|
---|
1058 | stfd $T0b,`$FRAME+8`($sp)
|
---|
1059 | stfd $T1a,`$FRAME+16`($sp)
|
---|
1060 | stfd $T1b,`$FRAME+24`($sp)
|
---|
1061 | stfd $T2a,`$FRAME+32`($sp)
|
---|
1062 | stfd $T2b,`$FRAME+40`($sp)
|
---|
1063 | stfd $T3a,`$FRAME+48`($sp)
|
---|
1064 | stfd $T3b,`$FRAME+56`($sp)
|
---|
1065 | |
---|
1066 |
|
---|
1067 | .align 5
|
---|
1068 | Linner:
|
---|
1069 | fmul $T1a,$A1,$ba
|
---|
1070 | fmul $T1b,$A1,$bb
|
---|
1071 | fmul $T2a,$A2,$ba
|
---|
1072 | fmul $T2b,$A2,$bb
|
---|
1073 | lfd $N0,40($nap_d) ; load n[j] in double format
|
---|
1074 | lfd $N1,48($nap_d)
|
---|
1075 | fmul $T3a,$A3,$ba
|
---|
1076 | fmul $T3b,$A3,$bb
|
---|
1077 | fmadd $T0a,$A0,$ba,$dota
|
---|
1078 | fmadd $T0b,$A0,$bb,$dotb
|
---|
1079 | lfd $N2,56($nap_d) ; load n[j+1] in double format
|
---|
1080 | lfdu $N3,64($nap_d)
|
---|
1081 |
|
---|
1082 | fmadd $T1a,$A0,$bc,$T1a
|
---|
1083 | fmadd $T1b,$A0,$bd,$T1b
|
---|
1084 | fmadd $T2a,$A1,$bc,$T2a
|
---|
1085 | fmadd $T2b,$A1,$bd,$T2b
|
---|
1086 | lfd $A0,8($nap_d) ; load a[j] in double format
|
---|
1087 | lfd $A1,16($nap_d)
|
---|
1088 | fmadd $T3a,$A2,$bc,$T3a
|
---|
1089 | fmadd $T3b,$A2,$bd,$T3b
|
---|
1090 | fmul $dota,$A3,$bc
|
---|
1091 | fmul $dotb,$A3,$bd
|
---|
1092 | lfd $A2,24($nap_d) ; load a[j+1] in double format
|
---|
1093 | lfd $A3,32($nap_d)
|
---|
1094 | ___
|
---|
1095 | if ($SIZE_T==8 or $flavour =~ /osx/) {  # Linner body variant built on 64-bit integer ops (ld/srdi/insrdi)
|
---|
1096 | $code.=<<___;
|
---|
1097 | fmadd $T1a,$N1,$na,$T1a
|
---|
1098 | fmadd $T1b,$N1,$nb,$T1b
|
---|
1099 | ld $t0,`$FRAME+0`($sp)
|
---|
1100 | ld $t1,`$FRAME+8`($sp)
|
---|
1101 | fmadd $T2a,$N2,$na,$T2a
|
---|
1102 | fmadd $T2b,$N2,$nb,$T2b
|
---|
1103 | ld $t2,`$FRAME+16`($sp)
|
---|
1104 | ld $t3,`$FRAME+24`($sp)
|
---|
1105 | fmadd $T3a,$N3,$na,$T3a
|
---|
1106 | fmadd $T3b,$N3,$nb,$T3b
|
---|
1107 | add $t0,$t0,$carry ; can not overflow
|
---|
1108 | ld $t4,`$FRAME+32`($sp)
|
---|
1109 | ld $t5,`$FRAME+40`($sp)
|
---|
1110 | fmadd $T0a,$N0,$na,$T0a
|
---|
1111 | fmadd $T0b,$N0,$nb,$T0b
|
---|
1112 | srdi $carry,$t0,16
|
---|
1113 | add $t1,$t1,$carry
|
---|
1114 | srdi $carry,$t1,16
|
---|
1115 | ld $t6,`$FRAME+48`($sp)
|
---|
1116 | ld $t7,`$FRAME+56`($sp)
|
---|
1117 |
|
---|
1118 | fmadd $T1a,$N0,$nc,$T1a
|
---|
1119 | fmadd $T1b,$N0,$nd,$T1b
|
---|
1120 | insrdi $t0,$t1,16,32
|
---|
1121 | ld $t1,8($tp) ; tp[j]
|
---|
1122 | fmadd $T2a,$N1,$nc,$T2a
|
---|
1123 | fmadd $T2b,$N1,$nd,$T2b
|
---|
1124 | add $t2,$t2,$carry
|
---|
1125 | fmadd $T3a,$N2,$nc,$T3a
|
---|
1126 | fmadd $T3b,$N2,$nd,$T3b
|
---|
1127 | srdi $carry,$t2,16
|
---|
1128 | insrdi $t0,$t2,16,16
|
---|
1129 | fmadd $dota,$N3,$nc,$dota
|
---|
1130 | fmadd $dotb,$N3,$nd,$dotb
|
---|
1131 | add $t3,$t3,$carry
|
---|
1132 | ldu $t2,16($tp) ; tp[j+1]
|
---|
1133 | srdi $carry,$t3,16
|
---|
1134 | insrdi $t0,$t3,16,0 ; 0..63 bits
|
---|
1135 | add $t4,$t4,$carry
|
---|
1136 |
|
---|
1137 | fctid $T0a,$T0a
|
---|
1138 | fctid $T0b,$T0b
|
---|
1139 | srdi $carry,$t4,16
|
---|
1140 | fctid $T1a,$T1a
|
---|
1141 | fctid $T1b,$T1b
|
---|
1142 | add $t5,$t5,$carry
|
---|
1143 | fctid $T2a,$T2a
|
---|
1144 | fctid $T2b,$T2b
|
---|
1145 | srdi $carry,$t5,16
|
---|
1146 | insrdi $t4,$t5,16,32
|
---|
1147 | fctid $T3a,$T3a
|
---|
1148 | fctid $T3b,$T3b
|
---|
1149 | add $t6,$t6,$carry
|
---|
1150 | srdi $carry,$t6,16
|
---|
1151 | insrdi $t4,$t6,16,16
|
---|
1152 |
|
---|
1153 | stfd $T0a,`$FRAME+0`($sp)
|
---|
1154 | stfd $T0b,`$FRAME+8`($sp)
|
---|
1155 | add $t7,$t7,$carry
|
---|
1156 | addc $t3,$t0,$t1
|
---|
1157 | ___
|
---|
1158 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
|
---|
1159 | extrdi $t0,$t0,32,0
|
---|
1160 | extrdi $t1,$t1,32,0
|
---|
1161 | adde $t0,$t0,$t1
|
---|
1162 | ___
|
---|
1163 | $code.=<<___;
|
---|
1164 | stfd $T1a,`$FRAME+16`($sp)
|
---|
1165 | stfd $T1b,`$FRAME+24`($sp)
|
---|
1166 | insrdi $t4,$t7,16,0 ; 64..127 bits
|
---|
1167 | srdi $carry,$t7,16 ; upper 33 bits
|
---|
1168 | stfd $T2a,`$FRAME+32`($sp)
|
---|
1169 | stfd $T2b,`$FRAME+40`($sp)
|
---|
1170 | adde $t5,$t4,$t2
|
---|
1171 | ___
|
---|
1172 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
|
---|
1173 | extrdi $t4,$t4,32,0
|
---|
1174 | extrdi $t2,$t2,32,0
|
---|
1175 | adde $t4,$t4,$t2
|
---|
1176 | ___
|
---|
1177 | $code.=<<___;
|
---|
1178 | stfd $T3a,`$FRAME+48`($sp)
|
---|
1179 | stfd $T3b,`$FRAME+56`($sp)
|
---|
1180 | addze $carry,$carry
|
---|
1181 | std $t3,-16($tp) ; tp[j-1]
|
---|
1182 | std $t5,-8($tp) ; tp[j]
|
---|
1183 | ___
|
---|
1184 | } else {  # Linner body variant built on 32-bit integer ops (lwz/srwi/insrwi)
|
---|
1185 | $code.=<<___;
|
---|
1186 | fmadd $T1a,$N1,$na,$T1a
|
---|
1187 | fmadd $T1b,$N1,$nb,$T1b
|
---|
1188 | lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
|
---|
1189 | lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
|
---|
1190 | fmadd $T2a,$N2,$na,$T2a
|
---|
1191 | fmadd $T2b,$N2,$nb,$T2b
|
---|
1192 | lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
|
---|
1193 | lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
|
---|
1194 | fmadd $T3a,$N3,$na,$T3a
|
---|
1195 | fmadd $T3b,$N3,$nb,$T3b
|
---|
1196 | lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
|
---|
1197 | lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
|
---|
1198 | addc $t0,$t0,$carry
|
---|
1199 | adde $t1,$t1,$c1
|
---|
1200 | srwi $carry,$t0,16
|
---|
1201 | fmadd $T0a,$N0,$na,$T0a
|
---|
1202 | fmadd $T0b,$N0,$nb,$T0b
|
---|
1203 | lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
|
---|
1204 | lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
|
---|
1205 | srwi $c1,$t1,16
|
---|
1206 | insrwi $carry,$t1,16,0
|
---|
1207 |
|
---|
1208 | fmadd $T1a,$N0,$nc,$T1a
|
---|
1209 | fmadd $T1b,$N0,$nd,$T1b
|
---|
1210 | addc $t2,$t2,$carry
|
---|
1211 | adde $t3,$t3,$c1
|
---|
1212 | srwi $carry,$t2,16
|
---|
1213 | fmadd $T2a,$N1,$nc,$T2a
|
---|
1214 | fmadd $T2b,$N1,$nd,$T2b
|
---|
1215 | insrwi $t0,$t2,16,0 ; 0..31 bits
|
---|
1216 | srwi $c1,$t3,16
|
---|
1217 | insrwi $carry,$t3,16,0
|
---|
1218 | fmadd $T3a,$N2,$nc,$T3a
|
---|
1219 | fmadd $T3b,$N2,$nd,$T3b
|
---|
1220 | lwz $t2,12($tp) ; tp[j]
|
---|
1221 | lwz $t3,8($tp)
|
---|
1222 | addc $t4,$t4,$carry
|
---|
1223 | adde $t5,$t5,$c1
|
---|
1224 | srwi $carry,$t4,16
|
---|
1225 | fmadd $dota,$N3,$nc,$dota
|
---|
1226 | fmadd $dotb,$N3,$nd,$dotb
|
---|
1227 | srwi $c1,$t5,16
|
---|
1228 | insrwi $carry,$t5,16,0
|
---|
1229 |
|
---|
1230 | fctid $T0a,$T0a
|
---|
1231 | addc $t6,$t6,$carry
|
---|
1232 | adde $t7,$t7,$c1
|
---|
1233 | srwi $carry,$t6,16
|
---|
1234 | fctid $T0b,$T0b
|
---|
1235 | insrwi $t4,$t6,16,0 ; 32..63 bits
|
---|
1236 | srwi $c1,$t7,16
|
---|
1237 | insrwi $carry,$t7,16,0
|
---|
1238 | fctid $T1a,$T1a
|
---|
1239 | addc $t0,$t0,$t2
|
---|
1240 | adde $t4,$t4,$t3
|
---|
1241 | lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
|
---|
1242 | lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
|
---|
1243 | fctid $T1b,$T1b
|
---|
1244 | addze $carry,$carry
|
---|
1245 | addze $c1,$c1
|
---|
1246 | stw $t0,4($tp) ; tp[j-1]
|
---|
1247 | stw $t4,0($tp)
|
---|
1248 | fctid $T2a,$T2a
|
---|
1249 | addc $t2,$t2,$carry
|
---|
1250 | adde $t3,$t3,$c1
|
---|
1251 | srwi $carry,$t2,16
|
---|
1252 | lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
|
---|
1253 | lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
|
---|
1254 | fctid $T2b,$T2b
|
---|
1255 | srwi $c1,$t3,16
|
---|
1256 | insrwi $carry,$t3,16,0
|
---|
1257 | lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
|
---|
1258 | lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
|
---|
1259 | fctid $T3a,$T3a
|
---|
1260 | addc $t6,$t6,$carry
|
---|
1261 | adde $t7,$t7,$c1
|
---|
1262 | srwi $carry,$t6,16
|
---|
1263 | lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
|
---|
1264 | lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
|
---|
1265 | fctid $T3b,$T3b
|
---|
1266 |
|
---|
1267 | insrwi $t2,$t6,16,0 ; 64..95 bits
|
---|
1268 | insrwi $carry,$t7,16,0
|
---|
1269 | srwi $c1,$t7,16
|
---|
1270 | lwz $t6,20($tp)
|
---|
1271 | lwzu $t7,16($tp)
|
---|
1272 | addc $t0,$t0,$carry
|
---|
1273 | stfd $T0a,`$FRAME+0`($sp)
|
---|
1274 | adde $t1,$t1,$c1
|
---|
1275 | srwi $carry,$t0,16
|
---|
1276 | stfd $T0b,`$FRAME+8`($sp)
|
---|
1277 | insrwi $carry,$t1,16,0
|
---|
1278 | srwi $c1,$t1,16
|
---|
1279 | addc $t4,$t4,$carry
|
---|
1280 | stfd $T1a,`$FRAME+16`($sp)
|
---|
1281 | adde $t5,$t5,$c1
|
---|
1282 | srwi $carry,$t4,16
|
---|
1283 | insrwi $t0,$t4,16,0 ; 96..127 bits
|
---|
1284 | stfd $T1b,`$FRAME+24`($sp)
|
---|
1285 | insrwi $carry,$t5,16,0
|
---|
1286 | srwi $c1,$t5,16
|
---|
1287 |
|
---|
1288 | addc $t2,$t2,$t6
|
---|
1289 | stfd $T2a,`$FRAME+32`($sp)
|
---|
1290 | adde $t0,$t0,$t7
|
---|
1291 | stfd $T2b,`$FRAME+40`($sp)
|
---|
1292 | addze $carry,$carry
|
---|
1293 | stfd $T3a,`$FRAME+48`($sp)
|
---|
1294 | addze $c1,$c1
|
---|
1295 | stfd $T3b,`$FRAME+56`($sp)
|
---|
1296 | stw $t2,-4($tp) ; tp[j]
|
---|
1297 | stw $t0,-8($tp)
|
---|
1298 | ___
|
---|
1299 | }
|
---|
1300 | $code.=<<___;  # close Linner (bdnz on CTR), then apply fctid to the $dota/$dotb accumulators
|
---|
1301 | bdnz Linner
|
---|
1302 | |
---|
1303 |
|
---|
1304 | fctid $dota,$dota
|
---|
1305 | fctid $dotb,$dotb
|
---|
1306 | ___
|
---|
1307 | if ($SIZE_T==8 or $flavour =~ /osx/) {  # post-Linner tail, 64-bit integer ops: store tp[j-1], tp[j], tp[num-1] and refresh $ovf
|
---|
1308 | $code.=<<___;
|
---|
1309 | ld $t0,`$FRAME+0`($sp)
|
---|
1310 | ld $t1,`$FRAME+8`($sp)
|
---|
1311 | ld $t2,`$FRAME+16`($sp)
|
---|
1312 | ld $t3,`$FRAME+24`($sp)
|
---|
1313 | ld $t4,`$FRAME+32`($sp)
|
---|
1314 | ld $t5,`$FRAME+40`($sp)
|
---|
1315 | ld $t6,`$FRAME+48`($sp)
|
---|
1316 | ld $t7,`$FRAME+56`($sp)
|
---|
1317 | stfd $dota,`$FRAME+64`($sp)
|
---|
1318 | stfd $dotb,`$FRAME+72`($sp)
|
---|
1319 |
|
---|
1320 | add $t0,$t0,$carry ; can not overflow
|
---|
1321 | srdi $carry,$t0,16
|
---|
1322 | add $t1,$t1,$carry
|
---|
1323 | srdi $carry,$t1,16
|
---|
1324 | insrdi $t0,$t1,16,32
|
---|
1325 | add $t2,$t2,$carry
|
---|
1326 | ld $t1,8($tp) ; tp[j]
|
---|
1327 | srdi $carry,$t2,16
|
---|
1328 | insrdi $t0,$t2,16,16
|
---|
1329 | add $t3,$t3,$carry
|
---|
1330 | ldu $t2,16($tp) ; tp[j+1]
|
---|
1331 | srdi $carry,$t3,16
|
---|
1332 | insrdi $t0,$t3,16,0 ; 0..63 bits
|
---|
1333 | add $t4,$t4,$carry
|
---|
1334 | srdi $carry,$t4,16
|
---|
1335 | add $t5,$t5,$carry
|
---|
1336 | srdi $carry,$t5,16
|
---|
1337 | insrdi $t4,$t5,16,32
|
---|
1338 | add $t6,$t6,$carry
|
---|
1339 | srdi $carry,$t6,16
|
---|
1340 | insrdi $t4,$t6,16,16
|
---|
1341 | add $t7,$t7,$carry
|
---|
1342 | insrdi $t4,$t7,16,0 ; 64..127 bits
|
---|
1343 | srdi $carry,$t7,16 ; upper 33 bits
|
---|
1344 | ld $t6,`$FRAME+64`($sp)
|
---|
1345 | ld $t7,`$FRAME+72`($sp)
|
---|
1346 |
|
---|
1347 | addc $t3,$t0,$t1
|
---|
1348 | ___
|
---|
1349 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
|
---|
1350 | extrdi $t0,$t0,32,0
|
---|
1351 | extrdi $t1,$t1,32,0
|
---|
1352 | adde $t0,$t0,$t1
|
---|
1353 | ___
|
---|
1354 | $code.=<<___;
|
---|
1355 | adde $t5,$t4,$t2
|
---|
1356 | ___
|
---|
1357 | $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
|
---|
1358 | extrdi $t4,$t4,32,0
|
---|
1359 | extrdi $t2,$t2,32,0
|
---|
1360 | adde $t4,$t4,$t2
|
---|
1361 | ___
|
---|
1362 | $code.=<<___;
|
---|
1363 | addze $carry,$carry
|
---|
1364 |
|
---|
1365 | std $t3,-16($tp) ; tp[j-1]
|
---|
1366 | std $t5,-8($tp) ; tp[j]
|
---|
1367 |
|
---|
1368 | add $carry,$carry,$ovf ; consume upmost overflow
|
---|
1369 | add $t6,$t6,$carry ; can not overflow
|
---|
1370 | srdi $carry,$t6,16
|
---|
1371 | add $t7,$t7,$carry
|
---|
1372 | insrdi $t6,$t7,48,0
|
---|
1373 | srdi $ovf,$t7,48
|
---|
1374 | std $t6,0($tp) ; tp[num-1]
|
---|
1375 | ___
|
---|
1376 | } else {  # same tail built on 32-bit integer ops (lwz/srwi/insrwi)
|
---|
1377 | $code.=<<___;
|
---|
1378 | lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
|
---|
1379 | lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
|
---|
1380 | lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
|
---|
1381 | lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
|
---|
1382 | lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
|
---|
1383 | lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
|
---|
1384 | lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
|
---|
1385 | lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
|
---|
1386 | stfd $dota,`$FRAME+64`($sp)
|
---|
1387 | stfd $dotb,`$FRAME+72`($sp)
|
---|
1388 |
|
---|
1389 | addc $t0,$t0,$carry
|
---|
1390 | adde $t1,$t1,$c1
|
---|
1391 | srwi $carry,$t0,16
|
---|
1392 | insrwi $carry,$t1,16,0
|
---|
1393 | srwi $c1,$t1,16
|
---|
1394 | addc $t2,$t2,$carry
|
---|
1395 | adde $t3,$t3,$c1
|
---|
1396 | srwi $carry,$t2,16
|
---|
1397 | insrwi $t0,$t2,16,0 ; 0..31 bits
|
---|
1398 | lwz $t2,12($tp) ; tp[j]
|
---|
1399 | insrwi $carry,$t3,16,0
|
---|
1400 | srwi $c1,$t3,16
|
---|
1401 | lwz $t3,8($tp)
|
---|
1402 | addc $t4,$t4,$carry
|
---|
1403 | adde $t5,$t5,$c1
|
---|
1404 | srwi $carry,$t4,16
|
---|
1405 | insrwi $carry,$t5,16,0
|
---|
1406 | srwi $c1,$t5,16
|
---|
1407 | addc $t6,$t6,$carry
|
---|
1408 | adde $t7,$t7,$c1
|
---|
1409 | srwi $carry,$t6,16
|
---|
1410 | insrwi $t4,$t6,16,0 ; 32..63 bits
|
---|
1411 | insrwi $carry,$t7,16,0
|
---|
1412 | srwi $c1,$t7,16
|
---|
1413 |
|
---|
1414 | addc $t0,$t0,$t2
|
---|
1415 | adde $t4,$t4,$t3
|
---|
1416 | addze $carry,$carry
|
---|
1417 | addze $c1,$c1
|
---|
1418 | stw $t0,4($tp) ; tp[j-1]
|
---|
1419 | stw $t4,0($tp)
|
---|
1420 |
|
---|
1421 | lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
|
---|
1422 | lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
|
---|
1423 | lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
|
---|
1424 | lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
|
---|
1425 | lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
|
---|
1426 | lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
|
---|
1427 | lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
|
---|
1428 | lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
|
---|
1429 |
|
---|
1430 | addc $t2,$t2,$carry
|
---|
1431 | adde $t3,$t3,$c1
|
---|
1432 | srwi $carry,$t2,16
|
---|
1433 | insrwi $carry,$t3,16,0
|
---|
1434 | srwi $c1,$t3,16
|
---|
1435 | addc $t6,$t6,$carry
|
---|
1436 | adde $t7,$t7,$c1
|
---|
1437 | srwi $carry,$t6,16
|
---|
1438 | insrwi $t2,$t6,16,0 ; 64..95 bits
|
---|
1439 | lwz $t6,20($tp)
|
---|
1440 | insrwi $carry,$t7,16,0
|
---|
1441 | srwi $c1,$t7,16
|
---|
1442 | lwzu $t7,16($tp)
|
---|
1443 | addc $t0,$t0,$carry
|
---|
1444 | adde $t1,$t1,$c1
|
---|
1445 | srwi $carry,$t0,16
|
---|
1446 | insrwi $carry,$t1,16,0
|
---|
1447 | srwi $c1,$t1,16
|
---|
1448 | addc $t4,$t4,$carry
|
---|
1449 | adde $t5,$t5,$c1
|
---|
1450 | srwi $carry,$t4,16
|
---|
1451 | insrwi $t0,$t4,16,0 ; 96..127 bits
|
---|
1452 | insrwi $carry,$t5,16,0
|
---|
1453 | srwi $c1,$t5,16
|
---|
1454 |
|
---|
1455 | addc $t2,$t2,$t6
|
---|
1456 | adde $t0,$t0,$t7
|
---|
1457 | lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
|
---|
1458 | lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
|
---|
1459 | addze $carry,$carry
|
---|
1460 | addze $c1,$c1
|
---|
1461 | lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
|
---|
1462 | lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
|
---|
1463 |
|
---|
1464 | addc $t6,$t6,$carry
|
---|
1465 | adde $t7,$t7,$c1
|
---|
1466 | stw $t2,-4($tp) ; tp[j]
|
---|
1467 | stw $t0,-8($tp)
|
---|
1468 | addc $t6,$t6,$ovf
|
---|
1469 | addze $t7,$t7
|
---|
1470 | srwi $carry,$t6,16
|
---|
1471 | insrwi $carry,$t7,16,0
|
---|
1472 | srwi $c1,$t7,16
|
---|
1473 | addc $t4,$t4,$carry
|
---|
1474 | adde $t5,$t5,$c1
|
---|
1475 |
|
---|
1476 | insrwi $t6,$t4,16,0
|
---|
1477 | srwi $t4,$t4,16
|
---|
1478 | insrwi $t4,$t5,16,0
|
---|
1479 | srwi $ovf,$t5,16
|
---|
1480 | stw $t6,4($tp) ; tp[num-1]
|
---|
1481 | stw $t4,0($tp)
|
---|
1482 | ___
|
---|
1483 | }
|
---|
1484 | $code.=<<___;  # outer-loop latch: i+=8, rewind $nap_d, branch back to Louter while i<num
|
---|
1485 | slwi $t7,$num,2
|
---|
1486 | addi $i,$i,8
|
---|
1487 | subf $nap_d,$t7,$nap_d ; rewind pointer
|
---|
1488 | cmpw $i,$num
|
---|
1489 | blt- Louter
|
---|
1490 | ___
|
---|
1491 | |
---|
1492 |
|
---|
1493 | $code.=<<___ if ($SIZE_T==8);  # 64-bit build: Lsub stores tp-np into rp; Lcopy then selects tp or rp through the $ovf mask and zaps tp and nap_d
|
---|
1494 | subf $np,$num,$np ; rewind np
|
---|
1495 | addi $j,$j,1 ; restore counter
|
---|
1496 | subfc $i,$i,$i ; j=0 and "clear" XER[CA]
|
---|
1497 | addi $tp,$sp,`$FRAME+$TRANSFER+8`
|
---|
1498 | addi $t4,$sp,`$FRAME+$TRANSFER+16`
|
---|
1499 | addi $t5,$np,8
|
---|
1500 | addi $t6,$rp,8
|
---|
1501 | mtctr $j
|
---|
1502 |
|
---|
1503 | .align 4
|
---|
1504 | Lsub: ldx $t0,$tp,$i
|
---|
1505 | ldx $t1,$np,$i
|
---|
1506 | ldx $t2,$t4,$i
|
---|
1507 | ldx $t3,$t5,$i
|
---|
1508 | subfe $t0,$t1,$t0 ; tp[j]-np[j]
|
---|
1509 | subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
|
---|
1510 | stdx $t0,$rp,$i
|
---|
1511 | stdx $t2,$t6,$i
|
---|
1512 | addi $i,$i,16
|
---|
1513 | bdnz Lsub
|
---|
1514 |
|
---|
1515 | li $i,0
|
---|
1516 | subfe $ovf,$i,$ovf ; handle upmost overflow bit
|
---|
1517 | mtctr $j
|
---|
1518 |
|
---|
1519 | .align 4
|
---|
1520 | Lcopy: ; conditional copy
|
---|
1521 | ldx $t0,$tp,$i
|
---|
1522 | ldx $t1,$t4,$i
|
---|
1523 | ldx $t2,$rp,$i
|
---|
1524 | ldx $t3,$t6,$i
|
---|
1525 | std $i,8($nap_d) ; zap nap_d
|
---|
1526 | std $i,16($nap_d)
|
---|
1527 | std $i,24($nap_d)
|
---|
1528 | std $i,32($nap_d)
|
---|
1529 | std $i,40($nap_d)
|
---|
1530 | std $i,48($nap_d)
|
---|
1531 | std $i,56($nap_d)
|
---|
1532 | stdu $i,64($nap_d)
|
---|
1533 | and $t0,$t0,$ovf
|
---|
1534 | and $t1,$t1,$ovf
|
---|
1535 | andc $t2,$t2,$ovf
|
---|
1536 | andc $t3,$t3,$ovf
|
---|
1537 | or $t0,$t0,$t2
|
---|
1538 | or $t1,$t1,$t3
|
---|
1539 | stdx $t0,$rp,$i
|
---|
1540 | stdx $t1,$t6,$i
|
---|
1541 | stdx $i,$tp,$i ; zap tp at once
|
---|
1542 | stdx $i,$t4,$i
|
---|
1543 | addi $i,$i,16
|
---|
1544 | bdnz Lcopy
|
---|
1545 | ___
|
---|
1546 | $code.=<<___ if ($SIZE_T==4);  # 32-bit build: Lsub stores tp-np into rp and re-saves tp in 32-bit word order; Lcopy selects via the $ovf mask and zaps tp and nap_d
|
---|
1547 | subf $np,$num,$np ; rewind np
|
---|
1548 | addi $j,$j,1 ; restore counter
|
---|
1549 | subfc $i,$i,$i ; j=0 and "clear" XER[CA]
|
---|
1550 | addi $tp,$sp,`$FRAME+$TRANSFER`
|
---|
1551 | addi $np,$np,-4
|
---|
1552 | addi $rp,$rp,-4
|
---|
1553 | addi $ap,$sp,`$FRAME+$TRANSFER+4`
|
---|
1554 | mtctr $j
|
---|
1555 |
|
---|
1556 | .align 4
|
---|
1557 | Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order
|
---|
1558 | lwz $t1,8($tp)
|
---|
1559 | lwz $t2,20($tp)
|
---|
1560 | lwzu $t3,16($tp)
|
---|
1561 | lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
|
---|
1562 | lwz $t5,8($np)
|
---|
1563 | lwz $t6,12($np)
|
---|
1564 | lwzu $t7,16($np)
|
---|
1565 | subfe $t4,$t4,$t0 ; tp[j]-np[j]
|
---|
1566 | stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
|
---|
1567 | subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
|
---|
1568 | stw $t1,8($ap)
|
---|
1569 | subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
|
---|
1570 | stw $t2,12($ap)
|
---|
1571 | subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
|
---|
1572 | stwu $t3,16($ap)
|
---|
1573 | stw $t4,4($rp)
|
---|
1574 | stw $t5,8($rp)
|
---|
1575 | stw $t6,12($rp)
|
---|
1576 | stwu $t7,16($rp)
|
---|
1577 | bdnz Lsub
|
---|
1578 |
|
---|
1579 | li $i,0
|
---|
1580 | subfe $ovf,$i,$ovf ; handle upmost overflow bit
|
---|
1581 | addi $ap,$sp,`$FRAME+$TRANSFER+4`
|
---|
1582 | subf $rp,$num,$rp ; rewind rp
|
---|
1583 | addi $tp,$sp,`$FRAME+$TRANSFER`
|
---|
1584 | mtctr $j
|
---|
1585 |
|
---|
1586 | .align 4
|
---|
1587 | Lcopy: ; conditional copy
|
---|
1588 | lwz $t0,4($ap)
|
---|
1589 | lwz $t1,8($ap)
|
---|
1590 | lwz $t2,12($ap)
|
---|
1591 | lwzu $t3,16($ap)
|
---|
1592 | lwz $t4,4($rp)
|
---|
1593 | lwz $t5,8($rp)
|
---|
1594 | lwz $t6,12($rp)
|
---|
1595 | lwz $t7,16($rp)
|
---|
1596 | std $i,8($nap_d) ; zap nap_d
|
---|
1597 | std $i,16($nap_d)
|
---|
1598 | std $i,24($nap_d)
|
---|
1599 | std $i,32($nap_d)
|
---|
1600 | std $i,40($nap_d)
|
---|
1601 | std $i,48($nap_d)
|
---|
1602 | std $i,56($nap_d)
|
---|
1603 | stdu $i,64($nap_d)
|
---|
1604 | and $t0,$t0,$ovf
|
---|
1605 | and $t1,$t1,$ovf
|
---|
1606 | and $t2,$t2,$ovf
|
---|
1607 | and $t3,$t3,$ovf
|
---|
1608 | andc $t4,$t4,$ovf
|
---|
1609 | andc $t5,$t5,$ovf
|
---|
1610 | andc $t6,$t6,$ovf
|
---|
1611 | andc $t7,$t7,$ovf
|
---|
1612 | or $t0,$t0,$t4
|
---|
1613 | or $t1,$t1,$t5
|
---|
1614 | or $t2,$t2,$t6
|
---|
1615 | or $t3,$t3,$t7
|
---|
1616 | stw $t0,4($rp)
|
---|
1617 | stw $t1,8($rp)
|
---|
1618 | stw $t2,12($rp)
|
---|
1619 | stwu $t3,16($rp)
|
---|
1620 | std $i,8($tp) ; zap tp at once
|
---|
1621 | stdu $i,16($tp)
|
---|
1622 | bdnz Lcopy
|
---|
1623 | ___
|
---|
1624 | |
---|
1625 |
|
---|
1626 | $code.=<<___;  # epilogue: r3=1 ("handled"), reload r19-r31 and f20-f31, restore $sp and return; then the function trailer and ID string
|
---|
1627 | $POP $i,0($sp)
|
---|
1628 | li r3,1 ; signal "handled"
|
---|
1629 | $POP r19,`-12*8-13*$SIZE_T`($i)
|
---|
1630 | $POP r20,`-12*8-12*$SIZE_T`($i)
|
---|
1631 | $POP r21,`-12*8-11*$SIZE_T`($i)
|
---|
1632 | $POP r22,`-12*8-10*$SIZE_T`($i)
|
---|
1633 | $POP r23,`-12*8-9*$SIZE_T`($i)
|
---|
1634 | $POP r24,`-12*8-8*$SIZE_T`($i)
|
---|
1635 | $POP r25,`-12*8-7*$SIZE_T`($i)
|
---|
1636 | $POP r26,`-12*8-6*$SIZE_T`($i)
|
---|
1637 | $POP r27,`-12*8-5*$SIZE_T`($i)
|
---|
1638 | $POP r28,`-12*8-4*$SIZE_T`($i)
|
---|
1639 | $POP r29,`-12*8-3*$SIZE_T`($i)
|
---|
1640 | $POP r30,`-12*8-2*$SIZE_T`($i)
|
---|
1641 | $POP r31,`-12*8-1*$SIZE_T`($i)
|
---|
1642 | lfd f20,`-12*8`($i)
|
---|
1643 | lfd f21,`-11*8`($i)
|
---|
1644 | lfd f22,`-10*8`($i)
|
---|
1645 | lfd f23,`-9*8`($i)
|
---|
1646 | lfd f24,`-8*8`($i)
|
---|
1647 | lfd f25,`-7*8`($i)
|
---|
1648 | lfd f26,`-6*8`($i)
|
---|
1649 | lfd f27,`-5*8`($i)
|
---|
1650 | lfd f28,`-4*8`($i)
|
---|
1651 | lfd f29,`-3*8`($i)
|
---|
1652 | lfd f30,`-2*8`($i)
|
---|
1653 | lfd f31,`-1*8`($i)
|
---|
1654 | mr $sp,$i
|
---|
1655 | blr
|
---|
1656 | .long 0
|
---|
1657 | .byte 0,12,4,0,0x8c,13,6,0
|
---|
1658 | .long 0
|
---|
1659 | .size .$fname,.-.$fname
|
---|
1660 |
|
---|
1661 | .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
|
---|
1662 | ___
|
---|
1663 |
|
---|
1664 | $code =~ s/\`([^\`]*)\`/eval $1/gem;  # replace each backtick-quoted span in $code with the result of eval'ing it
|
---|
1665 | print $code;  # write the generated text to STDOUT
|
---|
1666 | close STDOUT or die "error closing STDOUT: $!";  # die if closing STDOUT fails
|
---|