VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.0/crypto/bn/asm/ppc-mont.pl@99371

Last change on this file since 99371 was 99366, checked in by vboxsync, 23 months ago

openssl-3.1.0: Applied and adjusted our OpenSSL changes to 3.0.7. bugref:10418

File size: 47.9 KB
 
1#! /usr/bin/env perl
2# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# April 2006
18
19# "Teaser" Montgomery multiplication module for PowerPC. It's possible
20# to gain a bit more by modulo-scheduling outer loop, then dedicated
21# squaring procedure should give further 20% and code can be adapted
22# for 32-bit application running on 64-bit CPU. As for the latter.
23# It won't be able to achieve "native" 64-bit performance, because in
24# 32-bit application context every addc instruction will have to be
25# expanded as addc, twice right shift by 32 and finally adde, etc.
26# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
27# for 64-bit application running on PPC970/G5 is:
28#
29# 512-bit +65%
30# 1024-bit +35%
31# 2048-bit +18%
32# 4096-bit +4%
33
34# September 2016
35#
36# Add a multiplication procedure operating on lengths divisible by 4
37# and a squaring procedure operating on lengths divisible by 8. Length
38# is expressed in number of limbs. RSA private key operations are
39# ~35-50% faster (more for longer keys) on contemporary high-end POWER
40# processors in 64-bit builds and, mysteriously enough, even more so in
41# 32-bit builds. On low-end 32-bit processors the performance
42# improvement turned out to be marginal...
43
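# A sketch of the word-level Montgomery step the routines below
# implement (numbers illustrative only): let B = 2^BITS and
# n0 = -N^{-1} mod B (the caller passes n0[0]). Each iteration computes
# m = (t*n0) mod B, which makes t + m*N divisible by B, so the running
# sum can be shifted down by one limb. E.g. with B = 16 and N = 13,
# n0 = 11 (13*11 = 143 = 9*16 - 1); for t = 7, m = 7*11 mod 16 = 13 and
# t + m*N = 7 + 169 = 176 = 11*16.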
44# $output is the last argument if it looks like a file (it has an extension)
45# $flavour is the first argument if it doesn't look like a file
46$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
47$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
48
49if ($flavour =~ /32/) {
50 $BITS= 32;
51 $BNSZ= $BITS/8;
52 $SIZE_T=4;
53 $RZONE= 224;
54
55 $LD= "lwz"; # load
56 $LDU= "lwzu"; # load and update
57 $LDX= "lwzx"; # load indexed
58 $ST= "stw"; # store
59 $STU= "stwu"; # store and update
60 $STX= "stwx"; # store indexed
61 $STUX= "stwux"; # store indexed and update
62 $UMULL= "mullw"; # unsigned multiply low
63 $UMULH= "mulhwu"; # unsigned multiply high
64 $UCMP= "cmplw"; # unsigned compare
65 $SHRI= "srwi"; # unsigned shift right by immediate
66 $SHLI= "slwi"; # unsigned shift left by immediate
67 $PUSH= $ST;
68 $POP= $LD;
69} elsif ($flavour =~ /64/) {
70 $BITS= 64;
71 $BNSZ= $BITS/8;
72 $SIZE_T=8;
73 $RZONE= 288;
74
75 # same as above, but 64-bit mnemonics...
76 $LD= "ld"; # load
77 $LDU= "ldu"; # load and update
78 $LDX= "ldx"; # load indexed
79 $ST= "std"; # store
80 $STU= "stdu"; # store and update
81 $STX= "stdx"; # store indexed
82 $STUX= "stdux"; # store indexed and update
83 $UMULL= "mulld"; # unsigned multiply low
84 $UMULH= "mulhdu"; # unsigned multiply high
85 $UCMP= "cmpld"; # unsigned compare
86 $SHRI= "srdi"; # unsigned shift right by immediate
87 $SHLI= "sldi"; # unsigned shift left by immediate
88 $PUSH= $ST;
89 $POP= $LD;
90} else { die "nonsense $flavour"; }
91
92$FRAME=8*$SIZE_T+$RZONE;
93$LOCALS=8*$SIZE_T;
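# $FRAME is eight pointer-sized slots plus $RZONE, the latter presumably
# sized for the ABI red zone below the stack pointer (288 bytes for the
# 64-bit ABIs, 224 covering the 32-bit case); $LOCALS is the offset at
# which the tp[] scratch area starts.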
94
95$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
96( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
97( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
98die "can't locate ppc-xlate.pl";
99
100open STDOUT,"| $^X $xlate $flavour \"$output\""
101 or die "can't call $xlate: $!";
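# Typical invocation (file name illustrative): "perl ppc-mont.pl linux64
# mont.s" selects the 64-bit mnemonics above (only the "32"/"64"
# substring of the flavour is examined here) and pipes the generated
# code through ppc-xlate.pl into mont.s; without an output argument the
# result goes to standard output.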
102
103$sp="r1";
104$toc="r2";
105$rp="r3";
106$ap="r4";
107$bp="r5";
108$np="r6";
109$n0="r7";
110$num="r8";
111
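# The register assignment above mirrors the C prototype
#   bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#               const BN_ULONG *np, const BN_ULONG *n0, int num)
# with the six arguments arriving in r3..r8 per the PPC calling
# convention.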
112{
113my $ovf=$rp;
114my $rp="r9"; # $rp is reassigned
115my $aj="r10";
116my $nj="r11";
117my $tj="r12";
118# non-volatile registers
119my $i="r20";
120my $j="r21";
121my $tp="r22";
122my $m0="r23";
123my $m1="r24";
124my $lo0="r25";
125my $hi0="r26";
126my $lo1="r27";
127my $hi1="r28";
128my $alo="r29";
129my $ahi="r30";
130my $nlo="r31";
131#
132my $nhi="r0";
133
134$code=<<___;
135.machine "any"
136.text
137
138.globl .bn_mul_mont_int
139.align 5
140.bn_mul_mont_int:
141 mr $rp,r3 ; $rp is reassigned
142 li r3,0
143___
144$code.=<<___ if ($BNSZ==4);
145 cmpwi $num,32 ; longer key performance is not better
146 bgelr
147___
148$code.=<<___;
149 slwi $num,$num,`log($BNSZ)/log(2)`
150 li $tj,-4096
151 addi $ovf,$num,$FRAME
152 subf $ovf,$ovf,$sp ; $sp-$ovf
153 and $ovf,$ovf,$tj ; minimize TLB usage
154 subf $ovf,$sp,$ovf ; $ovf-$sp
155 mr $tj,$sp
156 srwi $num,$num,`log($BNSZ)/log(2)`
157 $STUX $sp,$sp,$ovf
158
159 $PUSH r20,`-12*$SIZE_T`($tj)
160 $PUSH r21,`-11*$SIZE_T`($tj)
161 $PUSH r22,`-10*$SIZE_T`($tj)
162 $PUSH r23,`-9*$SIZE_T`($tj)
163 $PUSH r24,`-8*$SIZE_T`($tj)
164 $PUSH r25,`-7*$SIZE_T`($tj)
165 $PUSH r26,`-6*$SIZE_T`($tj)
166 $PUSH r27,`-5*$SIZE_T`($tj)
167 $PUSH r28,`-4*$SIZE_T`($tj)
168 $PUSH r29,`-3*$SIZE_T`($tj)
169 $PUSH r30,`-2*$SIZE_T`($tj)
170 $PUSH r31,`-1*$SIZE_T`($tj)
171
172 $LD $n0,0($n0) ; pull n0[0] value
173 addi $num,$num,-2 ; adjust $num for counter register
174
175
176 $LD $m0,0($bp) ; m0=bp[0]
177 $LD $aj,0($ap) ; ap[0]
178 addi $tp,$sp,$LOCALS
179 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
180 $UMULH $hi0,$aj,$m0
181
182 $LD $aj,$BNSZ($ap) ; ap[1]
183 $LD $nj,0($np) ; np[0]
184
185 $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
186
187 $UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
188 $UMULH $ahi,$aj,$m0
189
190 $UMULL $lo1,$nj,$m1 ; np[0]*m1
191 $UMULH $hi1,$nj,$m1
192 $LD $nj,$BNSZ($np) ; np[1]
193 addc $lo1,$lo1,$lo0
194 addze $hi1,$hi1
195
196 $UMULL $nlo,$nj,$m1 ; np[1]*m1
197 $UMULH $nhi,$nj,$m1
198
199 mtctr $num
200 li $j,`2*$BNSZ`
201.align 4
202L1st:
203 $LDX $aj,$ap,$j ; ap[j]
204 addc $lo0,$alo,$hi0
205 $LDX $nj,$np,$j ; np[j]
206 addze $hi0,$ahi
207 $UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
208 addc $lo1,$nlo,$hi1
209 $UMULH $ahi,$aj,$m0
210 addze $hi1,$nhi
211 $UMULL $nlo,$nj,$m1 ; np[j]*m1
212 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
213 $UMULH $nhi,$nj,$m1
214 addze $hi1,$hi1
215 $ST $lo1,0($tp) ; tp[j-1]
216
217 addi $j,$j,$BNSZ ; j++
218 addi $tp,$tp,$BNSZ ; tp++
219 bdnz L1st
220;L1st
221 addc $lo0,$alo,$hi0
222 addze $hi0,$ahi
223
224 addc $lo1,$nlo,$hi1
225 addze $hi1,$nhi
226 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
227 addze $hi1,$hi1
228 $ST $lo1,0($tp) ; tp[j-1]
229
230 li $ovf,0
231 addc $hi1,$hi1,$hi0
232 addze $ovf,$ovf ; upmost overflow bit
233 $ST $hi1,$BNSZ($tp)
234
235
236 li $i,$BNSZ
237.align 4
238Louter:
239 $LDX $m0,$bp,$i ; m0=bp[i]
240 $LD $aj,0($ap) ; ap[0]
241 addi $tp,$sp,$LOCALS
242 $LD $tj,$LOCALS($sp); tp[0]
243 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
244 $UMULH $hi0,$aj,$m0
245 $LD $aj,$BNSZ($ap) ; ap[1]
246 $LD $nj,0($np) ; np[0]
247 addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
248 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
249 addze $hi0,$hi0
250 $UMULL $m1,$lo0,$n0 ; tp[0]*n0
251 $UMULH $ahi,$aj,$m0
252 $UMULL $lo1,$nj,$m1 ; np[0]*m1
253 $UMULH $hi1,$nj,$m1
254 $LD $nj,$BNSZ($np) ; np[1]
255 addc $lo1,$lo1,$lo0
256 $UMULL $nlo,$nj,$m1 ; np[1]*m1
257 addze $hi1,$hi1
258 $UMULH $nhi,$nj,$m1
259
260
261 mtctr $num
262 li $j,`2*$BNSZ`
263.align 4
264Linner:
265 $LDX $aj,$ap,$j ; ap[j]
266 addc $lo0,$alo,$hi0
267 $LD $tj,$BNSZ($tp) ; tp[j]
268 addze $hi0,$ahi
269 $LDX $nj,$np,$j ; np[j]
270 addc $lo1,$nlo,$hi1
271 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
272 addze $hi1,$nhi
273 $UMULH $ahi,$aj,$m0
274 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
275 $UMULL $nlo,$nj,$m1 ; np[j]*m1
276 addze $hi0,$hi0
277 $UMULH $nhi,$nj,$m1
278 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
279 addi $j,$j,$BNSZ ; j++
280 addze $hi1,$hi1
281 $ST $lo1,0($tp) ; tp[j-1]
282 addi $tp,$tp,$BNSZ ; tp++
283 bdnz Linner
284;Linner
285 $LD $tj,$BNSZ($tp) ; tp[j]
286 addc $lo0,$alo,$hi0
287 addze $hi0,$ahi
288 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
289 addze $hi0,$hi0
290
291 addc $lo1,$nlo,$hi1
292 addze $hi1,$nhi
293 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
294 addze $hi1,$hi1
295 $ST $lo1,0($tp) ; tp[j-1]
296
297 addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
298 li $ovf,0
299 adde $hi1,$hi1,$hi0
300 addze $ovf,$ovf
301 $ST $hi1,$BNSZ($tp)
302;
303 slwi $tj,$num,`log($BNSZ)/log(2)`
304 $UCMP $i,$tj
305 addi $i,$i,$BNSZ
306 ble Louter
307
308
309 addi $num,$num,2 ; restore $num
310 subfc $j,$j,$j ; j=0 and "clear" XER[CA]
311 addi $tp,$sp,$LOCALS
312 mtctr $num
313
314.align 4
315Lsub: $LDX $tj,$tp,$j
316 $LDX $nj,$np,$j
317 subfe $aj,$nj,$tj ; tp[j]-np[j]
318 $STX $aj,$rp,$j
319 addi $j,$j,$BNSZ
320 bdnz Lsub
321
322 li $j,0
323 mtctr $num
324 subfe $ovf,$j,$ovf ; handle upmost overflow bit
325
326.align 4
327Lcopy: ; conditional copy
328 $LDX $tj,$tp,$j
329 $LDX $aj,$rp,$j
330 and $tj,$tj,$ovf
331 andc $aj,$aj,$ovf
332 $STX $j,$tp,$j ; zap at once
333 or $aj,$aj,$tj
334 $STX $aj,$rp,$j
335 addi $j,$j,$BNSZ
336 bdnz Lcopy
337
338 $POP $tj,0($sp)
339 li r3,1
340 $POP r20,`-12*$SIZE_T`($tj)
341 $POP r21,`-11*$SIZE_T`($tj)
342 $POP r22,`-10*$SIZE_T`($tj)
343 $POP r23,`-9*$SIZE_T`($tj)
344 $POP r24,`-8*$SIZE_T`($tj)
345 $POP r25,`-7*$SIZE_T`($tj)
346 $POP r26,`-6*$SIZE_T`($tj)
347 $POP r27,`-5*$SIZE_T`($tj)
348 $POP r28,`-4*$SIZE_T`($tj)
349 $POP r29,`-3*$SIZE_T`($tj)
350 $POP r30,`-2*$SIZE_T`($tj)
351 $POP r31,`-1*$SIZE_T`($tj)
352 mr $sp,$tj
353 blr
354 .long 0
355 .byte 0,12,4,0,0x80,12,6,0
356 .long 0
357.size .bn_mul_mont_int,.-.bn_mul_mont_int
358___
359}
360if (1) {
361my ($a0,$a1,$a2,$a3,
362 $t0,$t1,$t2,$t3,
363 $m0,$m1,$m2,$m3,
364 $acc0,$acc1,$acc2,$acc3,$acc4,
365 $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31));
366my ($carry,$zero) = ($rp,"r0");
367
368# sp----------->+-------------------------------+
369# | saved sp |
370# +-------------------------------+
371# . .
372# +8*size_t +-------------------------------+
373# | 4 "n0*t0" |
374# . .
375# . .
376# +12*size_t +-------------------------------+
377# | size_t tmp[num] |
378# . .
379# . .
380# . .
381# +-------------------------------+
382# | topmost carry |
383# . .
384# -18*size_t +-------------------------------+
385# | 18 saved gpr, r14-r31 |
386# . .
387# . .
388# +-------------------------------+
389$code.=<<___;
390.globl .bn_mul4x_mont_int
391.align 5
392.bn_mul4x_mont_int:
393 andi. r0,$num,7
394 bne .Lmul4x_do
395 $UCMP $ap,$bp
396 bne .Lmul4x_do
397 b .Lsqr8x_do
398.Lmul4x_do:
399 slwi $num,$num,`log($SIZE_T)/log(2)`
400 mr $a0,$sp
401 li $a1,-32*$SIZE_T
402 sub $a1,$a1,$num
403 $STUX $sp,$sp,$a1 # alloca
404
405 $PUSH r14,-$SIZE_T*18($a0)
406 $PUSH r15,-$SIZE_T*17($a0)
407 $PUSH r16,-$SIZE_T*16($a0)
408 $PUSH r17,-$SIZE_T*15($a0)
409 $PUSH r18,-$SIZE_T*14($a0)
410 $PUSH r19,-$SIZE_T*13($a0)
411 $PUSH r20,-$SIZE_T*12($a0)
412 $PUSH r21,-$SIZE_T*11($a0)
413 $PUSH r22,-$SIZE_T*10($a0)
414 $PUSH r23,-$SIZE_T*9($a0)
415 $PUSH r24,-$SIZE_T*8($a0)
416 $PUSH r25,-$SIZE_T*7($a0)
417 $PUSH r26,-$SIZE_T*6($a0)
418 $PUSH r27,-$SIZE_T*5($a0)
419 $PUSH r28,-$SIZE_T*4($a0)
420 $PUSH r29,-$SIZE_T*3($a0)
421 $PUSH r30,-$SIZE_T*2($a0)
422 $PUSH r31,-$SIZE_T*1($a0)
423
424 subi $ap,$ap,$SIZE_T # bias by -1
425 subi $np,$np,$SIZE_T # bias by -1
426 subi $rp,$rp,$SIZE_T # bias by -1
427 $LD $n0,0($n0) # *n0
428
429 add $t0,$bp,$num
430 add $ap_end,$ap,$num
431 subi $t0,$t0,$SIZE_T*4 # &b[num-4]
432
433 $LD $bi,$SIZE_T*0($bp) # b[0]
434 li $acc0,0
435 $LD $a0,$SIZE_T*1($ap) # a[0..3]
436 li $acc1,0
437 $LD $a1,$SIZE_T*2($ap)
438 li $acc2,0
439 $LD $a2,$SIZE_T*3($ap)
440 li $acc3,0
441 $LDU $a3,$SIZE_T*4($ap)
442 $LD $m0,$SIZE_T*1($np) # n[0..3]
443 $LD $m1,$SIZE_T*2($np)
444 $LD $m2,$SIZE_T*3($np)
445 $LDU $m3,$SIZE_T*4($np)
446
447 $PUSH $rp,$SIZE_T*6($sp) # offload rp and &b[num-4]
448 $PUSH $t0,$SIZE_T*7($sp)
449 li $carry,0
450 addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit
451 li $cnt,0
452 li $zero,0
453 b .Loop_mul4x_1st_reduction
454
455.align 5
456.Loop_mul4x_1st_reduction:
457 $UMULL $t0,$a0,$bi # lo(a[0..3]*b[0])
458 addze $carry,$carry # modulo-scheduled
459 $UMULL $t1,$a1,$bi
460 addi $cnt,$cnt,$SIZE_T
461 $UMULL $t2,$a2,$bi
462 andi. $cnt,$cnt,$SIZE_T*4-1
463 $UMULL $t3,$a3,$bi
464 addc $acc0,$acc0,$t0
465 $UMULH $t0,$a0,$bi # hi(a[0..3]*b[0])
466 adde $acc1,$acc1,$t1
467 $UMULH $t1,$a1,$bi
468 adde $acc2,$acc2,$t2
469 $UMULL $mi,$acc0,$n0 # t[0]*n0
470 adde $acc3,$acc3,$t3
471 $UMULH $t2,$a2,$bi
472 addze $acc4,$zero
473 $UMULH $t3,$a3,$bi
474 $LDX $bi,$bp,$cnt # next b[i] (or b[0])
475 addc $acc1,$acc1,$t0
476 # (*) mul $t0,$m0,$mi # lo(n[0..3]*t[0]*n0)
477 $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
478 adde $acc2,$acc2,$t1
479 $UMULL $t1,$m1,$mi
480 adde $acc3,$acc3,$t2
481 $UMULL $t2,$m2,$mi
482 adde $acc4,$acc4,$t3 # can't overflow
483 $UMULL $t3,$m3,$mi
484 # (*) addc $acc0,$acc0,$t0
485 # (*) As for the removal of the first multiplication and
486 # addition instructions: the outcome of the first addition
487 # is guaranteed to be zero, which leaves two computationally
488 # significant outcomes: it either carries or it doesn't. So
489 # when does it carry? Is there an alternative way to deduce
490 # it? If you follow the operations, you can observe that the
491 # condition for carry is quite simple: $acc0 being non-zero.
492 # So the carry can be calculated by adding -1 to $acc0.
493 # That's what the next instruction does.
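 # (Concretely: $mi = $acc0*n0 mod B while n[0]*n0 == -1 mod B,
 # so lo(n[0]*$mi) == (B - $acc0) mod B; the discarded addc
 # would carry exactly when $acc0 != 0, and $acc0 + (B-1) wraps
 # past zero, setting XER[CA], under the same condition.)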
494 addic $acc0,$acc0,-1 # (*), discarded
495 $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0)
496 adde $acc0,$acc1,$t1
497 $UMULH $t1,$m1,$mi
498 adde $acc1,$acc2,$t2
499 $UMULH $t2,$m2,$mi
500 adde $acc2,$acc3,$t3
501 $UMULH $t3,$m3,$mi
502 adde $acc3,$acc4,$carry
503 addze $carry,$zero
504 addc $acc0,$acc0,$t0
505 adde $acc1,$acc1,$t1
506 adde $acc2,$acc2,$t2
507 adde $acc3,$acc3,$t3
508 #addze $carry,$carry
509 bne .Loop_mul4x_1st_reduction
510
511 $UCMP $ap_end,$ap
512 beq .Lmul4x4_post_condition
513
514 $LD $a0,$SIZE_T*1($ap) # a[4..7]
515 $LD $a1,$SIZE_T*2($ap)
516 $LD $a2,$SIZE_T*3($ap)
517 $LDU $a3,$SIZE_T*4($ap)
518 $LD $mi,$SIZE_T*8($sp) # a[0]*n0
519 $LD $m0,$SIZE_T*1($np) # n[4..7]
520 $LD $m1,$SIZE_T*2($np)
521 $LD $m2,$SIZE_T*3($np)
522 $LDU $m3,$SIZE_T*4($np)
523 b .Loop_mul4x_1st_tail
524
525.align 5
526.Loop_mul4x_1st_tail:
527 $UMULL $t0,$a0,$bi # lo(a[4..7]*b[i])
528 addze $carry,$carry # modulo-scheduled
529 $UMULL $t1,$a1,$bi
530 addi $cnt,$cnt,$SIZE_T
531 $UMULL $t2,$a2,$bi
532 andi. $cnt,$cnt,$SIZE_T*4-1
533 $UMULL $t3,$a3,$bi
534 addc $acc0,$acc0,$t0
535 $UMULH $t0,$a0,$bi # hi(a[4..7]*b[i])
536 adde $acc1,$acc1,$t1
537 $UMULH $t1,$a1,$bi
538 adde $acc2,$acc2,$t2
539 $UMULH $t2,$a2,$bi
540 adde $acc3,$acc3,$t3
541 $UMULH $t3,$a3,$bi
542 addze $acc4,$zero
543 $LDX $bi,$bp,$cnt # next b[i] (or b[0])
544 addc $acc1,$acc1,$t0
545 $UMULL $t0,$m0,$mi # lo(n[4..7]*a[0]*n0)
546 adde $acc2,$acc2,$t1
547 $UMULL $t1,$m1,$mi
548 adde $acc3,$acc3,$t2
549 $UMULL $t2,$m2,$mi
550 adde $acc4,$acc4,$t3 # can't overflow
551 $UMULL $t3,$m3,$mi
552 addc $acc0,$acc0,$t0
553 $UMULH $t0,$m0,$mi # hi(n[4..7]*a[0]*n0)
554 adde $acc1,$acc1,$t1
555 $UMULH $t1,$m1,$mi
556 adde $acc2,$acc2,$t2
557 $UMULH $t2,$m2,$mi
558 adde $acc3,$acc3,$t3
559 adde $acc4,$acc4,$carry
560 $UMULH $t3,$m3,$mi
561 addze $carry,$zero
562 addi $mi,$sp,$SIZE_T*8
563 $LDX $mi,$mi,$cnt # next t[0]*n0
564 $STU $acc0,$SIZE_T($tp) # word of result
565 addc $acc0,$acc1,$t0
566 adde $acc1,$acc2,$t1
567 adde $acc2,$acc3,$t2
568 adde $acc3,$acc4,$t3
569 #addze $carry,$carry
570 bne .Loop_mul4x_1st_tail
571
572 sub $t1,$ap_end,$num # rewound $ap
573 $UCMP $ap_end,$ap # done yet?
574 beq .Lmul4x_proceed
575
576 $LD $a0,$SIZE_T*1($ap)
577 $LD $a1,$SIZE_T*2($ap)
578 $LD $a2,$SIZE_T*3($ap)
579 $LDU $a3,$SIZE_T*4($ap)
580 $LD $m0,$SIZE_T*1($np)
581 $LD $m1,$SIZE_T*2($np)
582 $LD $m2,$SIZE_T*3($np)
583 $LDU $m3,$SIZE_T*4($np)
584 b .Loop_mul4x_1st_tail
585
586.align 5
587.Lmul4x_proceed:
588 $LDU $bi,$SIZE_T*4($bp) # *++b
589 addze $carry,$carry # topmost carry
590 $LD $a0,$SIZE_T*1($t1)
591 $LD $a1,$SIZE_T*2($t1)
592 $LD $a2,$SIZE_T*3($t1)
593 $LD $a3,$SIZE_T*4($t1)
594 addi $ap,$t1,$SIZE_T*4
595 sub $np,$np,$num # rewind np
596
597 $ST $acc0,$SIZE_T*1($tp) # result
598 $ST $acc1,$SIZE_T*2($tp)
599 $ST $acc2,$SIZE_T*3($tp)
600 $ST $acc3,$SIZE_T*4($tp)
601 $ST $carry,$SIZE_T*5($tp) # save topmost carry
602 $LD $acc0,$SIZE_T*12($sp) # t[0..3]
603 $LD $acc1,$SIZE_T*13($sp)
604 $LD $acc2,$SIZE_T*14($sp)
605 $LD $acc3,$SIZE_T*15($sp)
606
607 $LD $m0,$SIZE_T*1($np) # n[0..3]
608 $LD $m1,$SIZE_T*2($np)
609 $LD $m2,$SIZE_T*3($np)
610 $LDU $m3,$SIZE_T*4($np)
611 addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit
612 li $carry,0
613 b .Loop_mul4x_reduction
614
615.align 5
616.Loop_mul4x_reduction:
617 $UMULL $t0,$a0,$bi # lo(a[0..3]*b[4])
618 addze $carry,$carry # modulo-scheduled
619 $UMULL $t1,$a1,$bi
620 addi $cnt,$cnt,$SIZE_T
621 $UMULL $t2,$a2,$bi
622 andi. $cnt,$cnt,$SIZE_T*4-1
623 $UMULL $t3,$a3,$bi
624 addc $acc0,$acc0,$t0
625 $UMULH $t0,$a0,$bi # hi(a[0..3]*b[4])
626 adde $acc1,$acc1,$t1
627 $UMULH $t1,$a1,$bi
628 adde $acc2,$acc2,$t2
629 $UMULL $mi,$acc0,$n0 # t[0]*n0
630 adde $acc3,$acc3,$t3
631 $UMULH $t2,$a2,$bi
632 addze $acc4,$zero
633 $UMULH $t3,$a3,$bi
634 $LDX $bi,$bp,$cnt # next b[i]
635 addc $acc1,$acc1,$t0
636 # (*) mul $t0,$m0,$mi
637 $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
638 adde $acc2,$acc2,$t1
639 $UMULL $t1,$m1,$mi # lo(n[0..3]*t[0]*n0)
640 adde $acc3,$acc3,$t2
641 $UMULL $t2,$m2,$mi
642 adde $acc4,$acc4,$t3 # can't overflow
643 $UMULL $t3,$m3,$mi
644 # (*) addc $acc0,$acc0,$t0
645 addic $acc0,$acc0,-1 # (*), discarded
646 $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0)
647 adde $acc0,$acc1,$t1
648 $UMULH $t1,$m1,$mi
649 adde $acc1,$acc2,$t2
650 $UMULH $t2,$m2,$mi
651 adde $acc2,$acc3,$t3
652 $UMULH $t3,$m3,$mi
653 adde $acc3,$acc4,$carry
654 addze $carry,$zero
655 addc $acc0,$acc0,$t0
656 adde $acc1,$acc1,$t1
657 adde $acc2,$acc2,$t2
658 adde $acc3,$acc3,$t3
659 #addze $carry,$carry
660 bne .Loop_mul4x_reduction
661
662 $LD $t0,$SIZE_T*5($tp) # t[4..7]
663 addze $carry,$carry
664 $LD $t1,$SIZE_T*6($tp)
665 $LD $t2,$SIZE_T*7($tp)
666 $LD $t3,$SIZE_T*8($tp)
667 $LD $a0,$SIZE_T*1($ap) # a[4..7]
668 $LD $a1,$SIZE_T*2($ap)
669 $LD $a2,$SIZE_T*3($ap)
670 $LDU $a3,$SIZE_T*4($ap)
671 addc $acc0,$acc0,$t0
672 adde $acc1,$acc1,$t1
673 adde $acc2,$acc2,$t2
674 adde $acc3,$acc3,$t3
675 #addze $carry,$carry
676
677 $LD $mi,$SIZE_T*8($sp) # t[0]*n0
678 $LD $m0,$SIZE_T*1($np) # n[4..7]
679 $LD $m1,$SIZE_T*2($np)
680 $LD $m2,$SIZE_T*3($np)
681 $LDU $m3,$SIZE_T*4($np)
682 b .Loop_mul4x_tail
683
684.align 5
685.Loop_mul4x_tail:
686 $UMULL $t0,$a0,$bi # lo(a[4..7]*b[4])
687 addze $carry,$carry # modulo-scheduled
688 $UMULL $t1,$a1,$bi
689 addi $cnt,$cnt,$SIZE_T
690 $UMULL $t2,$a2,$bi
691 andi. $cnt,$cnt,$SIZE_T*4-1
692 $UMULL $t3,$a3,$bi
693 addc $acc0,$acc0,$t0
694 $UMULH $t0,$a0,$bi # hi(a[4..7]*b[4])
695 adde $acc1,$acc1,$t1
696 $UMULH $t1,$a1,$bi
697 adde $acc2,$acc2,$t2
698 $UMULH $t2,$a2,$bi
699 adde $acc3,$acc3,$t3
700 $UMULH $t3,$a3,$bi
701 addze $acc4,$zero
702 $LDX $bi,$bp,$cnt # next b[i]
703 addc $acc1,$acc1,$t0
704 $UMULL $t0,$m0,$mi # lo(n[4..7]*t[0]*n0)
705 adde $acc2,$acc2,$t1
706 $UMULL $t1,$m1,$mi
707 adde $acc3,$acc3,$t2
708 $UMULL $t2,$m2,$mi
709 adde $acc4,$acc4,$t3 # can't overflow
710 $UMULL $t3,$m3,$mi
711 addc $acc0,$acc0,$t0
712 $UMULH $t0,$m0,$mi # hi(n[4..7]*t[0]*n0)
713 adde $acc1,$acc1,$t1
714 $UMULH $t1,$m1,$mi
715 adde $acc2,$acc2,$t2
716 $UMULH $t2,$m2,$mi
717 adde $acc3,$acc3,$t3
718 $UMULH $t3,$m3,$mi
719 adde $acc4,$acc4,$carry
720 addi $mi,$sp,$SIZE_T*8
721 $LDX $mi,$mi,$cnt # next a[0]*n0
722 addze $carry,$zero
723 $STU $acc0,$SIZE_T($tp) # word of result
724 addc $acc0,$acc1,$t0
725 adde $acc1,$acc2,$t1
726 adde $acc2,$acc3,$t2
727 adde $acc3,$acc4,$t3
728 #addze $carry,$carry
729 bne .Loop_mul4x_tail
730
731 $LD $t0,$SIZE_T*5($tp) # next t[i] or topmost carry
732 sub $t1,$np,$num # rewound np?
733 addze $carry,$carry
734 $UCMP $ap_end,$ap # done yet?
735 beq .Loop_mul4x_break
736
737 $LD $t1,$SIZE_T*6($tp)
738 $LD $t2,$SIZE_T*7($tp)
739 $LD $t3,$SIZE_T*8($tp)
740 $LD $a0,$SIZE_T*1($ap)
741 $LD $a1,$SIZE_T*2($ap)
742 $LD $a2,$SIZE_T*3($ap)
743 $LDU $a3,$SIZE_T*4($ap)
744 addc $acc0,$acc0,$t0
745 adde $acc1,$acc1,$t1
746 adde $acc2,$acc2,$t2
747 adde $acc3,$acc3,$t3
748 #addze $carry,$carry
749
750 $LD $m0,$SIZE_T*1($np) # n[4..7]
751 $LD $m1,$SIZE_T*2($np)
752 $LD $m2,$SIZE_T*3($np)
753 $LDU $m3,$SIZE_T*4($np)
754 b .Loop_mul4x_tail
755
756.align 5
757.Loop_mul4x_break:
758 $POP $t2,$SIZE_T*6($sp) # pull rp and &b[num-4]
759 $POP $t3,$SIZE_T*7($sp)
760 addc $a0,$acc0,$t0 # accumulate topmost carry
761 $LD $acc0,$SIZE_T*12($sp) # t[0..3]
762 addze $a1,$acc1
763 $LD $acc1,$SIZE_T*13($sp)
764 addze $a2,$acc2
765 $LD $acc2,$SIZE_T*14($sp)
766 addze $a3,$acc3
767 $LD $acc3,$SIZE_T*15($sp)
768 addze $carry,$carry # topmost carry
769 $ST $a0,$SIZE_T*1($tp) # result
770 sub $ap,$ap_end,$num # rewind ap
771 $ST $a1,$SIZE_T*2($tp)
772 $ST $a2,$SIZE_T*3($tp)
773 $ST $a3,$SIZE_T*4($tp)
774 $ST $carry,$SIZE_T*5($tp) # store topmost carry
775
776 $LD $m0,$SIZE_T*1($t1) # n[0..3]
777 $LD $m1,$SIZE_T*2($t1)
778 $LD $m2,$SIZE_T*3($t1)
779 $LD $m3,$SIZE_T*4($t1)
780 addi $np,$t1,$SIZE_T*4
781 $UCMP $bp,$t3 # done yet?
782 beq .Lmul4x_post
783
784 $LDU $bi,$SIZE_T*4($bp)
785 $LD $a0,$SIZE_T*1($ap) # a[0..3]
786 $LD $a1,$SIZE_T*2($ap)
787 $LD $a2,$SIZE_T*3($ap)
788 $LDU $a3,$SIZE_T*4($ap)
789 li $carry,0
790 addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit
791 b .Loop_mul4x_reduction
792
793.align 5
794.Lmul4x_post:
795 # Final step. We see if the result is larger than the modulus,
796 # and if it is, subtract the modulus. But comparison implies
797 # subtraction, so we subtract the modulus, see if it borrowed,
798 # and conditionally copy the original value back.
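 # ($carry below ends up 0 when the difference is kept and
 # all-ones when the subtraction borrowed; .Lmul4x_cond_copy
 # then selects (t[i] & mask) | (diff[i] & ~mask) without
 # branching.)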
799 srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
800 mr $bp,$t2 # &rp[-1]
801 subi $cnt,$cnt,1
802 mr $ap_end,$t2 # &rp[-1] copy
803 subfc $t0,$m0,$acc0
804 addi $tp,$sp,$SIZE_T*15
805 subfe $t1,$m1,$acc1
806
807 mtctr $cnt
808.Lmul4x_sub:
809 $LD $m0,$SIZE_T*1($np)
810 $LD $acc0,$SIZE_T*1($tp)
811 subfe $t2,$m2,$acc2
812 $LD $m1,$SIZE_T*2($np)
813 $LD $acc1,$SIZE_T*2($tp)
814 subfe $t3,$m3,$acc3
815 $LD $m2,$SIZE_T*3($np)
816 $LD $acc2,$SIZE_T*3($tp)
817 $LDU $m3,$SIZE_T*4($np)
818 $LDU $acc3,$SIZE_T*4($tp)
819 $ST $t0,$SIZE_T*1($bp)
820 $ST $t1,$SIZE_T*2($bp)
821 subfe $t0,$m0,$acc0
822 $ST $t2,$SIZE_T*3($bp)
823 $STU $t3,$SIZE_T*4($bp)
824 subfe $t1,$m1,$acc1
825 bdnz .Lmul4x_sub
826
827 $LD $a0,$SIZE_T*1($ap_end)
828 $ST $t0,$SIZE_T*1($bp)
829 $LD $t0,$SIZE_T*12($sp)
830 subfe $t2,$m2,$acc2
831 $LD $a1,$SIZE_T*2($ap_end)
832 $ST $t1,$SIZE_T*2($bp)
833 $LD $t1,$SIZE_T*13($sp)
834 subfe $t3,$m3,$acc3
835 subfe $carry,$zero,$carry # did it borrow?
836 addi $tp,$sp,$SIZE_T*12
837 $LD $a2,$SIZE_T*3($ap_end)
838 $ST $t2,$SIZE_T*3($bp)
839 $LD $t2,$SIZE_T*14($sp)
840 $LD $a3,$SIZE_T*4($ap_end)
841 $ST $t3,$SIZE_T*4($bp)
842 $LD $t3,$SIZE_T*15($sp)
843
844 mtctr $cnt
845.Lmul4x_cond_copy:
846 and $t0,$t0,$carry
847 andc $a0,$a0,$carry
848 $ST $zero,$SIZE_T*0($tp) # wipe stack clean
849 and $t1,$t1,$carry
850 andc $a1,$a1,$carry
851 $ST $zero,$SIZE_T*1($tp)
852 and $t2,$t2,$carry
853 andc $a2,$a2,$carry
854 $ST $zero,$SIZE_T*2($tp)
855 and $t3,$t3,$carry
856 andc $a3,$a3,$carry
857 $ST $zero,$SIZE_T*3($tp)
858 or $acc0,$t0,$a0
859 $LD $a0,$SIZE_T*5($ap_end)
860 $LD $t0,$SIZE_T*4($tp)
861 or $acc1,$t1,$a1
862 $LD $a1,$SIZE_T*6($ap_end)
863 $LD $t1,$SIZE_T*5($tp)
864 or $acc2,$t2,$a2
865 $LD $a2,$SIZE_T*7($ap_end)
866 $LD $t2,$SIZE_T*6($tp)
867 or $acc3,$t3,$a3
868 $LD $a3,$SIZE_T*8($ap_end)
869 $LD $t3,$SIZE_T*7($tp)
870 addi $tp,$tp,$SIZE_T*4
871 $ST $acc0,$SIZE_T*1($ap_end)
872 $ST $acc1,$SIZE_T*2($ap_end)
873 $ST $acc2,$SIZE_T*3($ap_end)
874 $STU $acc3,$SIZE_T*4($ap_end)
875 bdnz .Lmul4x_cond_copy
876
877 $POP $bp,0($sp) # pull saved sp
878 and $t0,$t0,$carry
879 andc $a0,$a0,$carry
880 $ST $zero,$SIZE_T*0($tp)
881 and $t1,$t1,$carry
882 andc $a1,$a1,$carry
883 $ST $zero,$SIZE_T*1($tp)
884 and $t2,$t2,$carry
885 andc $a2,$a2,$carry
886 $ST $zero,$SIZE_T*2($tp)
887 and $t3,$t3,$carry
888 andc $a3,$a3,$carry
889 $ST $zero,$SIZE_T*3($tp)
890 or $acc0,$t0,$a0
891 or $acc1,$t1,$a1
892 $ST $zero,$SIZE_T*4($tp)
893 or $acc2,$t2,$a2
894 or $acc3,$t3,$a3
895 $ST $acc0,$SIZE_T*1($ap_end)
896 $ST $acc1,$SIZE_T*2($ap_end)
897 $ST $acc2,$SIZE_T*3($ap_end)
898 $ST $acc3,$SIZE_T*4($ap_end)
899
900 b .Lmul4x_done
901
902.align 4
903.Lmul4x4_post_condition:
904 $POP $ap,$SIZE_T*6($sp) # pull &rp[-1]
905 $POP $bp,0($sp) # pull saved sp
906 addze $carry,$carry # modulo-scheduled
907 # $acc0-3,$carry hold result, $m0-3 hold modulus
908 subfc $a0,$m0,$acc0
909 subfe $a1,$m1,$acc1
910 subfe $a2,$m2,$acc2
911 subfe $a3,$m3,$acc3
912 subfe $carry,$zero,$carry # did it borrow?
913
914 and $m0,$m0,$carry
915 and $m1,$m1,$carry
916 addc $a0,$a0,$m0
917 and $m2,$m2,$carry
918 adde $a1,$a1,$m1
919 and $m3,$m3,$carry
920 adde $a2,$a2,$m2
921 adde $a3,$a3,$m3
922
923 $ST $a0,$SIZE_T*1($ap) # write result
924 $ST $a1,$SIZE_T*2($ap)
925 $ST $a2,$SIZE_T*3($ap)
926 $ST $a3,$SIZE_T*4($ap)
927
928.Lmul4x_done:
929 $ST $zero,$SIZE_T*8($sp) # wipe stack clean
930 $ST $zero,$SIZE_T*9($sp)
931 $ST $zero,$SIZE_T*10($sp)
932 $ST $zero,$SIZE_T*11($sp)
933 li r3,1 # signal "done"
934 $POP r14,-$SIZE_T*18($bp)
935 $POP r15,-$SIZE_T*17($bp)
936 $POP r16,-$SIZE_T*16($bp)
937 $POP r17,-$SIZE_T*15($bp)
938 $POP r18,-$SIZE_T*14($bp)
939 $POP r19,-$SIZE_T*13($bp)
940 $POP r20,-$SIZE_T*12($bp)
941 $POP r21,-$SIZE_T*11($bp)
942 $POP r22,-$SIZE_T*10($bp)
943 $POP r23,-$SIZE_T*9($bp)
944 $POP r24,-$SIZE_T*8($bp)
945 $POP r25,-$SIZE_T*7($bp)
946 $POP r26,-$SIZE_T*6($bp)
947 $POP r27,-$SIZE_T*5($bp)
948 $POP r28,-$SIZE_T*4($bp)
949 $POP r29,-$SIZE_T*3($bp)
950 $POP r30,-$SIZE_T*2($bp)
951 $POP r31,-$SIZE_T*1($bp)
952 mr $sp,$bp
953 blr
954 .long 0
955 .byte 0,12,4,0x20,0x80,18,6,0
956 .long 0
957.size .bn_mul4x_mont_int,.-.bn_mul4x_mont_int
958___
959}
960
961if (1) {
962########################################################################
963# Following is PPC adaptation of sqrx8x_mont from x86_64-mont5 module.
964
965my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17));
966my ($t0,$t1,$t2,$t3)=map("r$_",(18..21));
967my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29));
968my ($cnt,$carry,$zero)=("r30","r31","r0");
969my ($tp,$ap_end,$na0)=($bp,$np,$carry);
970
971# sp----------->+-------------------------------+
972# | saved sp |
973# +-------------------------------+
974# . .
975# +12*size_t +-------------------------------+
976# | size_t tmp[2*num] |
977# . .
978# . .
979# . .
980# +-------------------------------+
981# . .
982# -18*size_t +-------------------------------+
983# | 18 saved gpr, r14-r31 |
984# . .
985# . .
986# +-------------------------------+
987$code.=<<___;
988.align 5
989__bn_sqr8x_mont:
990.Lsqr8x_do:
991 mr $a0,$sp
992 slwi $a1,$num,`log($SIZE_T)/log(2)+1`
993 li $a2,-32*$SIZE_T
994 sub $a1,$a2,$a1
995 slwi $num,$num,`log($SIZE_T)/log(2)`
996 $STUX $sp,$sp,$a1 # alloca
997
998 $PUSH r14,-$SIZE_T*18($a0)
999 $PUSH r15,-$SIZE_T*17($a0)
1000 $PUSH r16,-$SIZE_T*16($a0)
1001 $PUSH r17,-$SIZE_T*15($a0)
1002 $PUSH r18,-$SIZE_T*14($a0)
1003 $PUSH r19,-$SIZE_T*13($a0)
1004 $PUSH r20,-$SIZE_T*12($a0)
1005 $PUSH r21,-$SIZE_T*11($a0)
1006 $PUSH r22,-$SIZE_T*10($a0)
1007 $PUSH r23,-$SIZE_T*9($a0)
1008 $PUSH r24,-$SIZE_T*8($a0)
1009 $PUSH r25,-$SIZE_T*7($a0)
1010 $PUSH r26,-$SIZE_T*6($a0)
1011 $PUSH r27,-$SIZE_T*5($a0)
1012 $PUSH r28,-$SIZE_T*4($a0)
1013 $PUSH r29,-$SIZE_T*3($a0)
1014 $PUSH r30,-$SIZE_T*2($a0)
1015 $PUSH r31,-$SIZE_T*1($a0)
1016
1017 subi $ap,$ap,$SIZE_T # bias by -1
1018 subi $t0,$np,$SIZE_T # bias by -1
1019 subi $rp,$rp,$SIZE_T # bias by -1
1020 $LD $n0,0($n0) # *n0
1021 li $zero,0
1022
1023 add $ap_end,$ap,$num
1024 $LD $a0,$SIZE_T*1($ap)
1025 #li $acc0,0
1026 $LD $a1,$SIZE_T*2($ap)
1027 li $acc1,0
1028 $LD $a2,$SIZE_T*3($ap)
1029 li $acc2,0
1030 $LD $a3,$SIZE_T*4($ap)
1031 li $acc3,0
1032 $LD $a4,$SIZE_T*5($ap)
1033 li $acc4,0
1034 $LD $a5,$SIZE_T*6($ap)
1035 li $acc5,0
1036 $LD $a6,$SIZE_T*7($ap)
1037 li $acc6,0
1038 $LDU $a7,$SIZE_T*8($ap)
1039 li $acc7,0
1040
1041 addi $tp,$sp,$SIZE_T*11 # &tp[-1]
1042 subic. $cnt,$num,$SIZE_T*8
1043 b .Lsqr8x_zero_start
1044
1045.align 5
1046.Lsqr8x_zero:
1047 subic. $cnt,$cnt,$SIZE_T*8
1048 $ST $zero,$SIZE_T*1($tp)
1049 $ST $zero,$SIZE_T*2($tp)
1050 $ST $zero,$SIZE_T*3($tp)
1051 $ST $zero,$SIZE_T*4($tp)
1052 $ST $zero,$SIZE_T*5($tp)
1053 $ST $zero,$SIZE_T*6($tp)
1054 $ST $zero,$SIZE_T*7($tp)
1055 $ST $zero,$SIZE_T*8($tp)
1056.Lsqr8x_zero_start:
1057 $ST $zero,$SIZE_T*9($tp)
1058 $ST $zero,$SIZE_T*10($tp)
1059 $ST $zero,$SIZE_T*11($tp)
1060 $ST $zero,$SIZE_T*12($tp)
1061 $ST $zero,$SIZE_T*13($tp)
1062 $ST $zero,$SIZE_T*14($tp)
1063 $ST $zero,$SIZE_T*15($tp)
1064 $STU $zero,$SIZE_T*16($tp)
1065 bne .Lsqr8x_zero
1066
1067 $PUSH $rp,$SIZE_T*6($sp) # offload &rp[-1]
1068 $PUSH $t0,$SIZE_T*7($sp) # offload &np[-1]
1069 $PUSH $n0,$SIZE_T*8($sp) # offload n0
1070 $PUSH $tp,$SIZE_T*9($sp) # &tp[2*num-1]
1071 $PUSH $zero,$SIZE_T*10($sp) # initial top-most carry
1072 addi $tp,$sp,$SIZE_T*11 # &tp[-1]
1073
1074 # Multiply everything but a[i]*a[i]
1075.align 5
1076.Lsqr8x_outer_loop:
1077 # a[1]a[0] (i)
1078 # a[2]a[0]
1079 # a[3]a[0]
1080 # a[4]a[0]
1081 # a[5]a[0]
1082 # a[6]a[0]
1083 # a[7]a[0]
1084 # a[2]a[1] (ii)
1085 # a[3]a[1]
1086 # a[4]a[1]
1087 # a[5]a[1]
1088 # a[6]a[1]
1089 # a[7]a[1]
1090 # a[3]a[2] (iii)
1091 # a[4]a[2]
1092 # a[5]a[2]
1093 # a[6]a[2]
1094 # a[7]a[2]
1095 # a[4]a[3] (iv)
1096 # a[5]a[3]
1097 # a[6]a[3]
1098 # a[7]a[3]
1099 # a[5]a[4] (v)
1100 # a[6]a[4]
1101 # a[7]a[4]
1102 # a[6]a[5] (vi)
1103 # a[7]a[5]
1104 # a[7]a[6] (vii)
1105
1106 $UMULL $t0,$a1,$a0 # lo(a[1..7]*a[0]) (i)
1107 $UMULL $t1,$a2,$a0
1108 $UMULL $t2,$a3,$a0
1109 $UMULL $t3,$a4,$a0
1110 addc $acc1,$acc1,$t0 # t[1]+lo(a[1]*a[0])
1111 $UMULL $t0,$a5,$a0
1112 adde $acc2,$acc2,$t1
1113 $UMULL $t1,$a6,$a0
1114 adde $acc3,$acc3,$t2
1115 $UMULL $t2,$a7,$a0
1116 adde $acc4,$acc4,$t3
1117 $UMULH $t3,$a1,$a0 # hi(a[1..7]*a[0])
1118 adde $acc5,$acc5,$t0
1119 $UMULH $t0,$a2,$a0
1120 adde $acc6,$acc6,$t1
1121 $UMULH $t1,$a3,$a0
1122 adde $acc7,$acc7,$t2
1123 $UMULH $t2,$a4,$a0
1124 $ST $acc0,$SIZE_T*1($tp) # t[0]
1125 addze $acc0,$zero # t[8]
1126 $ST $acc1,$SIZE_T*2($tp) # t[1]
1127 addc $acc2,$acc2,$t3 # t[2]+lo(a[1]*a[0])
1128 $UMULH $t3,$a5,$a0
1129 adde $acc3,$acc3,$t0
1130 $UMULH $t0,$a6,$a0
1131 adde $acc4,$acc4,$t1
1132 $UMULH $t1,$a7,$a0
1133 adde $acc5,$acc5,$t2
1134 $UMULL $t2,$a2,$a1 # lo(a[2..7]*a[1]) (ii)
1135 adde $acc6,$acc6,$t3
1136 $UMULL $t3,$a3,$a1
1137 adde $acc7,$acc7,$t0
1138 $UMULL $t0,$a4,$a1
1139 adde $acc0,$acc0,$t1
1140
1141 $UMULL $t1,$a5,$a1
1142 addc $acc3,$acc3,$t2
1143 $UMULL $t2,$a6,$a1
1144 adde $acc4,$acc4,$t3
1145 $UMULL $t3,$a7,$a1
1146 adde $acc5,$acc5,$t0
1147 $UMULH $t0,$a2,$a1 # hi(a[2..7]*a[1])
1148 adde $acc6,$acc6,$t1
1149 $UMULH $t1,$a3,$a1
1150 adde $acc7,$acc7,$t2
1151 $UMULH $t2,$a4,$a1
1152 adde $acc0,$acc0,$t3
1153 $UMULH $t3,$a5,$a1
1154 $ST $acc2,$SIZE_T*3($tp) # t[2]
1155 addze $acc1,$zero # t[9]
1156 $ST $acc3,$SIZE_T*4($tp) # t[3]
1157 addc $acc4,$acc4,$t0
1158 $UMULH $t0,$a6,$a1
1159 adde $acc5,$acc5,$t1
1160 $UMULH $t1,$a7,$a1
1161 adde $acc6,$acc6,$t2
1162 $UMULL $t2,$a3,$a2 # lo(a[3..7]*a[2]) (iii)
1163 adde $acc7,$acc7,$t3
1164 $UMULL $t3,$a4,$a2
1165 adde $acc0,$acc0,$t0
1166 $UMULL $t0,$a5,$a2
1167 adde $acc1,$acc1,$t1
1168
1169 $UMULL $t1,$a6,$a2
1170 addc $acc5,$acc5,$t2
1171 $UMULL $t2,$a7,$a2
1172 adde $acc6,$acc6,$t3
1173 $UMULH $t3,$a3,$a2 # hi(a[3..7]*a[2])
1174 adde $acc7,$acc7,$t0
1175 $UMULH $t0,$a4,$a2
1176 adde $acc0,$acc0,$t1
1177 $UMULH $t1,$a5,$a2
1178 adde $acc1,$acc1,$t2
1179 $UMULH $t2,$a6,$a2
1180 $ST $acc4,$SIZE_T*5($tp) # t[4]
1181 addze $acc2,$zero # t[10]
1182 $ST $acc5,$SIZE_T*6($tp) # t[5]
1183 addc $acc6,$acc6,$t3
1184 $UMULH $t3,$a7,$a2
1185 adde $acc7,$acc7,$t0
1186 $UMULL $t0,$a4,$a3 # lo(a[4..7]*a[3]) (iv)
1187 adde $acc0,$acc0,$t1
1188 $UMULL $t1,$a5,$a3
1189 adde $acc1,$acc1,$t2
1190 $UMULL $t2,$a6,$a3
1191 adde $acc2,$acc2,$t3
1192
1193 $UMULL $t3,$a7,$a3
1194 addc $acc7,$acc7,$t0
1195 $UMULH $t0,$a4,$a3 # hi(a[4..7]*a[3])
1196 adde $acc0,$acc0,$t1
1197 $UMULH $t1,$a5,$a3
1198 adde $acc1,$acc1,$t2
1199 $UMULH $t2,$a6,$a3
1200 adde $acc2,$acc2,$t3
1201 $UMULH $t3,$a7,$a3
1202 $ST $acc6,$SIZE_T*7($tp) # t[6]
1203 addze $acc3,$zero # t[11]
1204 $STU $acc7,$SIZE_T*8($tp) # t[7]
1205 addc $acc0,$acc0,$t0
1206 $UMULL $t0,$a5,$a4 # lo(a[5..7]*a[4]) (v)
1207 adde $acc1,$acc1,$t1
1208 $UMULL $t1,$a6,$a4
1209 adde $acc2,$acc2,$t2
1210 $UMULL $t2,$a7,$a4
1211 adde $acc3,$acc3,$t3
1212
1213 $UMULH $t3,$a5,$a4 # hi(a[5..7]*a[4])
1214 addc $acc1,$acc1,$t0
1215 $UMULH $t0,$a6,$a4
1216 adde $acc2,$acc2,$t1
1217 $UMULH $t1,$a7,$a4
1218 adde $acc3,$acc3,$t2
1219 $UMULL $t2,$a6,$a5 # lo(a[6..7]*a[5]) (vi)
1220 addze $acc4,$zero # t[12]
1221 addc $acc2,$acc2,$t3
1222 $UMULL $t3,$a7,$a5
1223 adde $acc3,$acc3,$t0
1224 $UMULH $t0,$a6,$a5 # hi(a[6..7]*a[5])
1225 adde $acc4,$acc4,$t1
1226
1227 $UMULH $t1,$a7,$a5
1228 addc $acc3,$acc3,$t2
1229 $UMULL $t2,$a7,$a6 # lo(a[7]*a[6]) (vii)
1230 adde $acc4,$acc4,$t3
1231 $UMULH $t3,$a7,$a6 # hi(a[7]*a[6])
1232 addze $acc5,$zero # t[13]
1233 addc $acc4,$acc4,$t0
1234 $UCMP $ap_end,$ap # done yet?
1235 adde $acc5,$acc5,$t1
1236
1237 addc $acc5,$acc5,$t2
1238 sub $t0,$ap_end,$num # rewound ap
1239 addze $acc6,$zero # t[14]
1240 add $acc6,$acc6,$t3
1241
1242 beq .Lsqr8x_outer_break
1243
1244 mr $n0,$a0
1245 $LD $a0,$SIZE_T*1($tp)
1246 $LD $a1,$SIZE_T*2($tp)
1247 $LD $a2,$SIZE_T*3($tp)
1248 $LD $a3,$SIZE_T*4($tp)
1249 $LD $a4,$SIZE_T*5($tp)
1250 $LD $a5,$SIZE_T*6($tp)
1251 $LD $a6,$SIZE_T*7($tp)
1252 $LD $a7,$SIZE_T*8($tp)
1253 addc $acc0,$acc0,$a0
1254 $LD $a0,$SIZE_T*1($ap)
1255 adde $acc1,$acc1,$a1
1256 $LD $a1,$SIZE_T*2($ap)
1257 adde $acc2,$acc2,$a2
1258 $LD $a2,$SIZE_T*3($ap)
1259 adde $acc3,$acc3,$a3
1260 $LD $a3,$SIZE_T*4($ap)
1261 adde $acc4,$acc4,$a4
1262 $LD $a4,$SIZE_T*5($ap)
1263 adde $acc5,$acc5,$a5
1264 $LD $a5,$SIZE_T*6($ap)
1265 adde $acc6,$acc6,$a6
1266 $LD $a6,$SIZE_T*7($ap)
1267 subi $rp,$ap,$SIZE_T*7
1268 addze $acc7,$a7
1269 $LDU $a7,$SIZE_T*8($ap)
1270 #addze $carry,$zero # moved below
1271 li $cnt,0
1272 b .Lsqr8x_mul
1273
1274 # a[8]a[0]
1275 # a[9]a[0]
1276 # a[a]a[0]
1277 # a[b]a[0]
1278 # a[c]a[0]
1279 # a[d]a[0]
1280 # a[e]a[0]
1281 # a[f]a[0]
1282 # a[8]a[1]
1283 # a[f]a[1]........................
1284 # a[8]a[2]
1285 # a[f]a[2]........................
1286 # a[8]a[3]
1287 # a[f]a[3]........................
1288 # a[8]a[4]
1289 # a[f]a[4]........................
1290 # a[8]a[5]
1291 # a[f]a[5]........................
1292 # a[8]a[6]
1293 # a[f]a[6]........................
1294 # a[8]a[7]
1295 # a[f]a[7]........................
1296.align 5
1297.Lsqr8x_mul:
1298 $UMULL $t0,$a0,$n0
1299 addze $carry,$zero # carry bit, modulo-scheduled
1300 $UMULL $t1,$a1,$n0
1301 addi $cnt,$cnt,$SIZE_T
1302 $UMULL $t2,$a2,$n0
1303 andi. $cnt,$cnt,$SIZE_T*8-1
1304 $UMULL $t3,$a3,$n0
1305 addc $acc0,$acc0,$t0
1306 $UMULL $t0,$a4,$n0
1307 adde $acc1,$acc1,$t1
1308 $UMULL $t1,$a5,$n0
1309 adde $acc2,$acc2,$t2
1310 $UMULL $t2,$a6,$n0
1311 adde $acc3,$acc3,$t3
1312 $UMULL $t3,$a7,$n0
1313 adde $acc4,$acc4,$t0
1314 $UMULH $t0,$a0,$n0
1315 adde $acc5,$acc5,$t1
1316 $UMULH $t1,$a1,$n0
1317 adde $acc6,$acc6,$t2
1318 $UMULH $t2,$a2,$n0
1319 adde $acc7,$acc7,$t3
1320 $UMULH $t3,$a3,$n0
1321 addze $carry,$carry
1322 $STU $acc0,$SIZE_T($tp)
1323 addc $acc0,$acc1,$t0
1324 $UMULH $t0,$a4,$n0
1325 adde $acc1,$acc2,$t1
1326 $UMULH $t1,$a5,$n0
1327 adde $acc2,$acc3,$t2
1328 $UMULH $t2,$a6,$n0
1329 adde $acc3,$acc4,$t3
1330 $UMULH $t3,$a7,$n0
1331 $LDX $n0,$rp,$cnt
1332 adde $acc4,$acc5,$t0
1333 adde $acc5,$acc6,$t1
1334 adde $acc6,$acc7,$t2
1335 adde $acc7,$carry,$t3
1336 #addze $carry,$zero # moved above
1337 bne .Lsqr8x_mul
1338 # note that carry flag is guaranteed
1339 # to be zero at this point
1340 $UCMP $ap,$ap_end # done yet?
1341 beq .Lsqr8x_break
1342
1343 $LD $a0,$SIZE_T*1($tp)
1344 $LD $a1,$SIZE_T*2($tp)
1345 $LD $a2,$SIZE_T*3($tp)
1346 $LD $a3,$SIZE_T*4($tp)
1347 $LD $a4,$SIZE_T*5($tp)
1348 $LD $a5,$SIZE_T*6($tp)
1349 $LD $a6,$SIZE_T*7($tp)
1350 $LD $a7,$SIZE_T*8($tp)
1351 addc $acc0,$acc0,$a0
1352 $LD $a0,$SIZE_T*1($ap)
1353 adde $acc1,$acc1,$a1
1354 $LD $a1,$SIZE_T*2($ap)
1355 adde $acc2,$acc2,$a2
1356 $LD $a2,$SIZE_T*3($ap)
1357 adde $acc3,$acc3,$a3
1358 $LD $a3,$SIZE_T*4($ap)
1359 adde $acc4,$acc4,$a4
1360 $LD $a4,$SIZE_T*5($ap)
1361 adde $acc5,$acc5,$a5
1362 $LD $a5,$SIZE_T*6($ap)
1363 adde $acc6,$acc6,$a6
1364 $LD $a6,$SIZE_T*7($ap)
1365 adde $acc7,$acc7,$a7
1366 $LDU $a7,$SIZE_T*8($ap)
1367 #addze $carry,$zero # moved above
1368 b .Lsqr8x_mul
1369
1370.align 5
1371.Lsqr8x_break:
1372 $LD $a0,$SIZE_T*8($rp)
1373 addi $ap,$rp,$SIZE_T*15
1374 $LD $a1,$SIZE_T*9($rp)
1375 sub. $t0,$ap_end,$ap # is it last iteration?
1376 $LD $a2,$SIZE_T*10($rp)
1377 sub $t1,$tp,$t0
1378 $LD $a3,$SIZE_T*11($rp)
1379 $LD $a4,$SIZE_T*12($rp)
1380 $LD $a5,$SIZE_T*13($rp)
1381 $LD $a6,$SIZE_T*14($rp)
1382 $LD $a7,$SIZE_T*15($rp)
1383 beq .Lsqr8x_outer_loop
1384
1385 $ST $acc0,$SIZE_T*1($tp)
1386 $LD $acc0,$SIZE_T*1($t1)
1387 $ST $acc1,$SIZE_T*2($tp)
1388 $LD $acc1,$SIZE_T*2($t1)
1389 $ST $acc2,$SIZE_T*3($tp)
1390 $LD $acc2,$SIZE_T*3($t1)
1391 $ST $acc3,$SIZE_T*4($tp)
1392 $LD $acc3,$SIZE_T*4($t1)
1393 $ST $acc4,$SIZE_T*5($tp)
1394 $LD $acc4,$SIZE_T*5($t1)
1395 $ST $acc5,$SIZE_T*6($tp)
1396 $LD $acc5,$SIZE_T*6($t1)
1397 $ST $acc6,$SIZE_T*7($tp)
1398 $LD $acc6,$SIZE_T*7($t1)
1399 $ST $acc7,$SIZE_T*8($tp)
1400 $LD $acc7,$SIZE_T*8($t1)
1401 mr $tp,$t1
1402 b .Lsqr8x_outer_loop
1403
1404.align 5
1405.Lsqr8x_outer_break:
1406 ####################################################################
1407 # Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
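 # (That is, a^2 = sum_i a[i]^2*B^(2i) + 2*sum_{i<j} a[i]*a[j]*B^(i+j):
 # the cross products accumulated so far are doubled by the add/$SHRI
 # pairs below, each $SHRI carrying the shifted-out top bit into the
 # next word, and the diagonal a[i]^2 terms are then added in.)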
1408 $LD $a1,$SIZE_T*1($t0) # recall that $t0 is &a[-1]
1409 $LD $a3,$SIZE_T*2($t0)
1410 $LD $a5,$SIZE_T*3($t0)
1411 $LD $a7,$SIZE_T*4($t0)
1412 addi $ap,$t0,$SIZE_T*4
1413 # "tp[x]" comments are for num==8 case
1414 $LD $t1,$SIZE_T*13($sp) # =tp[1], t[0] is not interesting
1415 $LD $t2,$SIZE_T*14($sp)
1416 $LD $t3,$SIZE_T*15($sp)
1417 $LD $t0,$SIZE_T*16($sp)
1418
1419 $ST $acc0,$SIZE_T*1($tp) # tp[8]=
1420 srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
1421 $ST $acc1,$SIZE_T*2($tp)
1422 subi $cnt,$cnt,1
1423 $ST $acc2,$SIZE_T*3($tp)
1424 $ST $acc3,$SIZE_T*4($tp)
1425 $ST $acc4,$SIZE_T*5($tp)
1426 $ST $acc5,$SIZE_T*6($tp)
1427 $ST $acc6,$SIZE_T*7($tp)
1428 #$ST $acc7,$SIZE_T*8($tp) # tp[15] is not interesting
1429 addi $tp,$sp,$SIZE_T*11 # &tp[-1]
1430 $UMULL $acc0,$a1,$a1
1431 $UMULH $a1,$a1,$a1
1432 add $acc1,$t1,$t1 # <<1
1433 $SHRI $t1,$t1,$BITS-1
1434 $UMULL $a2,$a3,$a3
1435 $UMULH $a3,$a3,$a3
1436 addc $acc1,$acc1,$a1
1437 add $acc2,$t2,$t2
1438 $SHRI $t2,$t2,$BITS-1
1439 add $acc3,$t3,$t3
1440 $SHRI $t3,$t3,$BITS-1
1441 or $acc2,$acc2,$t1
1442
1443 mtctr $cnt
1444.Lsqr4x_shift_n_add:
1445 $UMULL $a4,$a5,$a5
1446 $UMULH $a5,$a5,$a5
1447 $LD $t1,$SIZE_T*6($tp) # =tp[5]
1448 $LD $a1,$SIZE_T*1($ap)
1449 adde $acc2,$acc2,$a2
1450 add $acc4,$t0,$t0
1451 $SHRI $t0,$t0,$BITS-1
1452 or $acc3,$acc3,$t2
1453 $LD $t2,$SIZE_T*7($tp) # =tp[6]
1454 adde $acc3,$acc3,$a3
1455 $LD $a3,$SIZE_T*2($ap)
1456 add $acc5,$t1,$t1
1457 $SHRI $t1,$t1,$BITS-1
1458 or $acc4,$acc4,$t3
1459 $LD $t3,$SIZE_T*8($tp) # =tp[7]
1460 $UMULL $a6,$a7,$a7
1461 $UMULH $a7,$a7,$a7
1462 adde $acc4,$acc4,$a4
1463 add $acc6,$t2,$t2
1464 $SHRI $t2,$t2,$BITS-1
1465 or $acc5,$acc5,$t0
1466 $LD $t0,$SIZE_T*9($tp) # =tp[8]
1467 adde $acc5,$acc5,$a5
1468 $LD $a5,$SIZE_T*3($ap)
1469 add $acc7,$t3,$t3
1470 $SHRI $t3,$t3,$BITS-1
1471 or $acc6,$acc6,$t1
1472 $LD $t1,$SIZE_T*10($tp) # =tp[9]
1473 $UMULL $a0,$a1,$a1
1474 $UMULH $a1,$a1,$a1
1475 adde $acc6,$acc6,$a6
1476 $ST $acc0,$SIZE_T*1($tp) # tp[0]=
1477 add $acc0,$t0,$t0
1478 $SHRI $t0,$t0,$BITS-1
1479 or $acc7,$acc7,$t2
1480 $LD $t2,$SIZE_T*11($tp) # =tp[10]
1481 adde $acc7,$acc7,$a7
1482 $LDU $a7,$SIZE_T*4($ap)
1483 $ST $acc1,$SIZE_T*2($tp) # tp[1]=
1484 add $acc1,$t1,$t1
1485 $SHRI $t1,$t1,$BITS-1
1486 or $acc0,$acc0,$t3
1487 $LD $t3,$SIZE_T*12($tp) # =tp[11]
1488 $UMULL $a2,$a3,$a3
1489 $UMULH $a3,$a3,$a3
1490 adde $acc0,$acc0,$a0
1491 $ST $acc2,$SIZE_T*3($tp) # tp[2]=
1492 add $acc2,$t2,$t2
1493 $SHRI $t2,$t2,$BITS-1
1494 or $acc1,$acc1,$t0
1495 $LD $t0,$SIZE_T*13($tp) # =tp[12]
1496 adde $acc1,$acc1,$a1
1497 $ST $acc3,$SIZE_T*4($tp) # tp[3]=
1498 $ST $acc4,$SIZE_T*5($tp) # tp[4]=
1499 $ST $acc5,$SIZE_T*6($tp) # tp[5]=
1500 $ST $acc6,$SIZE_T*7($tp) # tp[6]=
1501 $STU $acc7,$SIZE_T*8($tp) # tp[7]=
1502 add $acc3,$t3,$t3
1503 $SHRI $t3,$t3,$BITS-1
1504 or $acc2,$acc2,$t1
1505 bdnz .Lsqr4x_shift_n_add
1506___
1507my ($np,$np_end)=($ap,$ap_end);
1508$code.=<<___;
1509 $POP $np,$SIZE_T*7($sp) # pull &np[-1] and n0
1510 $POP $n0,$SIZE_T*8($sp)
1511
1512 $UMULL $a4,$a5,$a5
1513 $UMULH $a5,$a5,$a5
1514 $ST $acc0,$SIZE_T*1($tp) # tp[8]=
1515 $LD $acc0,$SIZE_T*12($sp) # =tp[0]
1516 $LD $t1,$SIZE_T*6($tp) # =tp[13]
1517 adde $acc2,$acc2,$a2
1518 add $acc4,$t0,$t0
1519 $SHRI $t0,$t0,$BITS-1
1520 or $acc3,$acc3,$t2
1521 $LD $t2,$SIZE_T*7($tp) # =tp[14]
1522 adde $acc3,$acc3,$a3
1523 add $acc5,$t1,$t1
1524 $SHRI $t1,$t1,$BITS-1
1525 or $acc4,$acc4,$t3
1526 $UMULL $a6,$a7,$a7
1527 $UMULH $a7,$a7,$a7
1528 adde $acc4,$acc4,$a4
1529 add $acc6,$t2,$t2
1530 $SHRI $t2,$t2,$BITS-1
1531 or $acc5,$acc5,$t0
1532 $ST $acc1,$SIZE_T*2($tp) # tp[9]=
1533 $LD $acc1,$SIZE_T*13($sp) # =tp[1]
1534 adde $acc5,$acc5,$a5
1535 or $acc6,$acc6,$t1
1536 $LD $a0,$SIZE_T*1($np)
1537 $LD $a1,$SIZE_T*2($np)
1538 adde $acc6,$acc6,$a6
1539 $LD $a2,$SIZE_T*3($np)
1540 $LD $a3,$SIZE_T*4($np)
1541 adde $acc7,$a7,$t2
1542 $LD $a4,$SIZE_T*5($np)
1543 $LD $a5,$SIZE_T*6($np)
1544
1545 ################################################################
1546 # Reduce by 8 limbs per iteration
1547 $UMULL $na0,$n0,$acc0 # t[0]*n0
1548 li $cnt,8
1549 $LD $a6,$SIZE_T*7($np)
1550 add $np_end,$np,$num
1551 $LDU $a7,$SIZE_T*8($np)
1552 $ST $acc2,$SIZE_T*3($tp) # tp[10]=
1553 $LD $acc2,$SIZE_T*14($sp)
1554 $ST $acc3,$SIZE_T*4($tp) # tp[11]=
1555 $LD $acc3,$SIZE_T*15($sp)
1556 $ST $acc4,$SIZE_T*5($tp) # tp[12]=
1557 $LD $acc4,$SIZE_T*16($sp)
1558 $ST $acc5,$SIZE_T*6($tp) # tp[13]=
1559 $LD $acc5,$SIZE_T*17($sp)
1560 $ST $acc6,$SIZE_T*7($tp) # tp[14]=
1561 $LD $acc6,$SIZE_T*18($sp)
1562 $ST $acc7,$SIZE_T*8($tp) # tp[15]=
1563 $LD $acc7,$SIZE_T*19($sp)
1564 addi $tp,$sp,$SIZE_T*11 # &tp[-1]
1565 mtctr $cnt
1566 b .Lsqr8x_reduction
1567
1568.align 5
1569.Lsqr8x_reduction:
1570 # (*) $UMULL $t0,$a0,$na0 # lo(n[0-7])*lo(t[0]*n0)
1571 $UMULL $t1,$a1,$na0
1572 $UMULL $t2,$a2,$na0
1573 $STU $na0,$SIZE_T($tp) # put aside t[0]*n0 for tail processing
1574 $UMULL $t3,$a3,$na0
1575 # (*) addc $acc0,$acc0,$t0
1576 addic $acc0,$acc0,-1 # (*)
1577 $UMULL $t0,$a4,$na0
1578 adde $acc0,$acc1,$t1
1579 $UMULL $t1,$a5,$na0
1580 adde $acc1,$acc2,$t2
1581 $UMULL $t2,$a6,$na0
1582 adde $acc2,$acc3,$t3
1583 $UMULL $t3,$a7,$na0
1584 adde $acc3,$acc4,$t0
1585 $UMULH $t0,$a0,$na0 # hi(n[0-7])*lo(t[0]*n0)
1586 adde $acc4,$acc5,$t1
1587 $UMULH $t1,$a1,$na0
1588 adde $acc5,$acc6,$t2
1589 $UMULH $t2,$a2,$na0
1590 adde $acc6,$acc7,$t3
1591 $UMULH $t3,$a3,$na0
1592 addze $acc7,$zero
1593 addc $acc0,$acc0,$t0
1594 $UMULH $t0,$a4,$na0
1595 adde $acc1,$acc1,$t1
1596 $UMULH $t1,$a5,$na0
1597 adde $acc2,$acc2,$t2
1598 $UMULH $t2,$a6,$na0
1599 adde $acc3,$acc3,$t3
1600 $UMULH $t3,$a7,$na0
1601 $UMULL $na0,$n0,$acc0 # next t[0]*n0
1602 adde $acc4,$acc4,$t0
1603 adde $acc5,$acc5,$t1
1604 adde $acc6,$acc6,$t2
1605 adde $acc7,$acc7,$t3
1606 bdnz .Lsqr8x_reduction
1607
1608 $LD $t0,$SIZE_T*1($tp)
1609 $LD $t1,$SIZE_T*2($tp)
1610 $LD $t2,$SIZE_T*3($tp)
1611 $LD $t3,$SIZE_T*4($tp)
1612 subi $rp,$tp,$SIZE_T*7
1613 $UCMP $np_end,$np # done yet?
1614 addc $acc0,$acc0,$t0
1615 $LD $t0,$SIZE_T*5($tp)
1616 adde $acc1,$acc1,$t1
1617 $LD $t1,$SIZE_T*6($tp)
1618 adde $acc2,$acc2,$t2
1619 $LD $t2,$SIZE_T*7($tp)
1620 adde $acc3,$acc3,$t3
1621 $LD $t3,$SIZE_T*8($tp)
1622 adde $acc4,$acc4,$t0
1623 adde $acc5,$acc5,$t1
1624 adde $acc6,$acc6,$t2
1625 adde $acc7,$acc7,$t3
1626 #addze $carry,$zero # moved below
1627 beq .Lsqr8x8_post_condition
1628
1629 $LD $n0,$SIZE_T*0($rp)
1630 $LD $a0,$SIZE_T*1($np)
1631 $LD $a1,$SIZE_T*2($np)
1632 $LD $a2,$SIZE_T*3($np)
1633 $LD $a3,$SIZE_T*4($np)
1634 $LD $a4,$SIZE_T*5($np)
1635 $LD $a5,$SIZE_T*6($np)
1636 $LD $a6,$SIZE_T*7($np)
1637 $LDU $a7,$SIZE_T*8($np)
1638 li $cnt,0
1639
1640.align 5
1641.Lsqr8x_tail:
1642 $UMULL $t0,$a0,$n0
1643 addze $carry,$zero # carry bit, modulo-scheduled
1644 $UMULL $t1,$a1,$n0
1645 addi $cnt,$cnt,$SIZE_T
1646 $UMULL $t2,$a2,$n0
1647 andi. $cnt,$cnt,$SIZE_T*8-1
1648 $UMULL $t3,$a3,$n0
1649 addc $acc0,$acc0,$t0
1650 $UMULL $t0,$a4,$n0
1651 adde $acc1,$acc1,$t1
1652 $UMULL $t1,$a5,$n0
1653 adde $acc2,$acc2,$t2
1654 $UMULL $t2,$a6,$n0
1655 adde $acc3,$acc3,$t3
1656 $UMULL $t3,$a7,$n0
1657 adde $acc4,$acc4,$t0
1658 $UMULH $t0,$a0,$n0
1659 adde $acc5,$acc5,$t1
1660 $UMULH $t1,$a1,$n0
1661 adde $acc6,$acc6,$t2
1662 $UMULH $t2,$a2,$n0
1663 adde $acc7,$acc7,$t3
1664 $UMULH $t3,$a3,$n0
1665 addze $carry,$carry
1666 $STU $acc0,$SIZE_T($tp)
1667 addc $acc0,$acc1,$t0
1668 $UMULH $t0,$a4,$n0
1669 adde $acc1,$acc2,$t1
1670 $UMULH $t1,$a5,$n0
1671 adde $acc2,$acc3,$t2
1672 $UMULH $t2,$a6,$n0
1673 adde $acc3,$acc4,$t3
1674 $UMULH $t3,$a7,$n0
1675 $LDX $n0,$rp,$cnt
1676 adde $acc4,$acc5,$t0
1677 adde $acc5,$acc6,$t1
1678 adde $acc6,$acc7,$t2
1679 adde $acc7,$carry,$t3
1680 #addze $carry,$zero # moved above
1681 bne .Lsqr8x_tail
1682 # note that carry flag is guaranteed
1683 # to be zero at this point
1684 $LD $a0,$SIZE_T*1($tp)
1685 $POP $carry,$SIZE_T*10($sp) # pull top-most carry in case we break
1686 $UCMP $np_end,$np # done yet?
1687 $LD $a1,$SIZE_T*2($tp)
1688 sub $t2,$np_end,$num # rewound np
1689 $LD $a2,$SIZE_T*3($tp)
1690 $LD $a3,$SIZE_T*4($tp)
1691 $LD $a4,$SIZE_T*5($tp)
1692 $LD $a5,$SIZE_T*6($tp)
1693 $LD $a6,$SIZE_T*7($tp)
1694 $LD $a7,$SIZE_T*8($tp)
1695 beq .Lsqr8x_tail_break
1696
1697 addc $acc0,$acc0,$a0
1698 $LD $a0,$SIZE_T*1($np)
1699 adde $acc1,$acc1,$a1
1700 $LD $a1,$SIZE_T*2($np)
1701 adde $acc2,$acc2,$a2
1702 $LD $a2,$SIZE_T*3($np)
1703 adde $acc3,$acc3,$a3
1704 $LD $a3,$SIZE_T*4($np)
1705 adde $acc4,$acc4,$a4
1706 $LD $a4,$SIZE_T*5($np)
1707 adde $acc5,$acc5,$a5
1708 $LD $a5,$SIZE_T*6($np)
1709 adde $acc6,$acc6,$a6
1710 $LD $a6,$SIZE_T*7($np)
1711 adde $acc7,$acc7,$a7
1712 $LDU $a7,$SIZE_T*8($np)
1713 #addze $carry,$zero # moved above
1714 b .Lsqr8x_tail
1715
1716.align 5
1717.Lsqr8x_tail_break:
1718 $POP $n0,$SIZE_T*8($sp) # pull n0
1719 $POP $t3,$SIZE_T*9($sp) # &tp[2*num-1]
1720 addi $cnt,$tp,$SIZE_T*8 # end of current t[num] window
1721
1722 addic $carry,$carry,-1 # "move" top-most carry to carry bit
1723 adde $t0,$acc0,$a0
1724 $LD $acc0,$SIZE_T*8($rp)
1725 $LD $a0,$SIZE_T*1($t2) # recall that $t2 is &n[-1]
1726 adde $t1,$acc1,$a1
1727 $LD $acc1,$SIZE_T*9($rp)
1728 $LD $a1,$SIZE_T*2($t2)
1729 adde $acc2,$acc2,$a2
1730 $LD $a2,$SIZE_T*3($t2)
1731 adde $acc3,$acc3,$a3
1732 $LD $a3,$SIZE_T*4($t2)
1733 adde $acc4,$acc4,$a4
1734 $LD $a4,$SIZE_T*5($t2)
1735 adde $acc5,$acc5,$a5
1736 $LD $a5,$SIZE_T*6($t2)
1737 adde $acc6,$acc6,$a6
1738 $LD $a6,$SIZE_T*7($t2)
1739 adde $acc7,$acc7,$a7
1740 $LD $a7,$SIZE_T*8($t2)
1741 addi $np,$t2,$SIZE_T*8
1742 addze $t2,$zero # top-most carry
1743 $UMULL $na0,$n0,$acc0
1744 $ST $t0,$SIZE_T*1($tp)
1745 $UCMP $cnt,$t3 # did we hit the bottom?
1746 $ST $t1,$SIZE_T*2($tp)
1747 li $cnt,8
1748 $ST $acc2,$SIZE_T*3($tp)
1749 $LD $acc2,$SIZE_T*10($rp)
1750 $ST $acc3,$SIZE_T*4($tp)
1751 $LD $acc3,$SIZE_T*11($rp)
1752 $ST $acc4,$SIZE_T*5($tp)
1753 $LD $acc4,$SIZE_T*12($rp)
1754 $ST $acc5,$SIZE_T*6($tp)
1755 $LD $acc5,$SIZE_T*13($rp)
1756 $ST $acc6,$SIZE_T*7($tp)
1757 $LD $acc6,$SIZE_T*14($rp)
1758 $ST $acc7,$SIZE_T*8($tp)
1759 $LD $acc7,$SIZE_T*15($rp)
1760 $PUSH $t2,$SIZE_T*10($sp) # off-load top-most carry
1761 addi $tp,$rp,$SIZE_T*7 # slide the window
1762 mtctr $cnt
1763 bne .Lsqr8x_reduction
1764
1765 ################################################################
1766 # Final step. We see if the result is larger than the modulus,
1767 # and if it is, subtract the modulus. But comparison implies
1768 # subtraction, so we subtract the modulus, see if it borrowed,
1769 # and conditionally copy the original value back.
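 # (Same idiom as in .Lmul4x_post: the borrow turns $carry into
 # an all-ones mask, and .Lsqr4x_cond_copy below keeps the
 # unsubtracted value in that case.)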
1770 $POP $rp,$SIZE_T*6($sp) # pull &rp[-1]
1771 srwi $cnt,$num,`log($SIZE_T)/log(2)+3`
1772 mr $n0,$tp # put tp aside
1773 addi $tp,$tp,$SIZE_T*8
1774 subi $cnt,$cnt,1
1775 subfc $t0,$a0,$acc0
1776 subfe $t1,$a1,$acc1
1777 mr $carry,$t2
1778 mr $ap_end,$rp # $rp copy
1779
1780 mtctr $cnt
1781 b .Lsqr8x_sub
1782
1783.align 5
1784.Lsqr8x_sub:
1785 $LD $a0,$SIZE_T*1($np)
1786 $LD $acc0,$SIZE_T*1($tp)
1787 $LD $a1,$SIZE_T*2($np)
1788 $LD $acc1,$SIZE_T*2($tp)
1789 subfe $t2,$a2,$acc2
1790 $LD $a2,$SIZE_T*3($np)
1791 $LD $acc2,$SIZE_T*3($tp)
1792 subfe $t3,$a3,$acc3
1793 $LD $a3,$SIZE_T*4($np)
1794 $LD $acc3,$SIZE_T*4($tp)
1795 $ST $t0,$SIZE_T*1($rp)
1796 subfe $t0,$a4,$acc4
1797 $LD $a4,$SIZE_T*5($np)
1798 $LD $acc4,$SIZE_T*5($tp)
1799 $ST $t1,$SIZE_T*2($rp)
1800 subfe $t1,$a5,$acc5
1801 $LD $a5,$SIZE_T*6($np)
1802 $LD $acc5,$SIZE_T*6($tp)
1803 $ST $t2,$SIZE_T*3($rp)
1804 subfe $t2,$a6,$acc6
1805 $LD $a6,$SIZE_T*7($np)
1806 $LD $acc6,$SIZE_T*7($tp)
1807 $ST $t3,$SIZE_T*4($rp)
1808 subfe $t3,$a7,$acc7
1809 $LDU $a7,$SIZE_T*8($np)
1810 $LDU $acc7,$SIZE_T*8($tp)
1811 $ST $t0,$SIZE_T*5($rp)
1812 subfe $t0,$a0,$acc0
1813 $ST $t1,$SIZE_T*6($rp)
1814 subfe $t1,$a1,$acc1
1815 $ST $t2,$SIZE_T*7($rp)
1816 $STU $t3,$SIZE_T*8($rp)
1817 bdnz .Lsqr8x_sub
1818
1819 srwi $cnt,$num,`log($SIZE_T)/log(2)+2`
1820 $LD $a0,$SIZE_T*1($ap_end) # original $rp
1821 $LD $acc0,$SIZE_T*1($n0) # original $tp
1822 subi $cnt,$cnt,1
1823 $LD $a1,$SIZE_T*2($ap_end)
1824 $LD $acc1,$SIZE_T*2($n0)
1825 subfe $t2,$a2,$acc2
1826 $LD $a2,$SIZE_T*3($ap_end)
1827 $LD $acc2,$SIZE_T*3($n0)
1828 subfe $t3,$a3,$acc3
1829 $LD $a3,$SIZE_T*4($ap_end)
1830 $LDU $acc3,$SIZE_T*4($n0)
1831 $ST $t0,$SIZE_T*1($rp)
1832 subfe $t0,$a4,$acc4
1833 $ST $t1,$SIZE_T*2($rp)
1834 subfe $t1,$a5,$acc5
1835 $ST $t2,$SIZE_T*3($rp)
1836 subfe $t2,$a6,$acc6
1837 $ST $t3,$SIZE_T*4($rp)
1838 subfe $t3,$a7,$acc7
1839 $ST $t0,$SIZE_T*5($rp)
1840 subfe $carry,$zero,$carry # did it borrow?
1841 $ST $t1,$SIZE_T*6($rp)
1842 $ST $t2,$SIZE_T*7($rp)
1843 $ST $t3,$SIZE_T*8($rp)
1844
1845 addi $tp,$sp,$SIZE_T*11
1846 mtctr $cnt
1847
1848.Lsqr4x_cond_copy:
1849 andc $a0,$a0,$carry
1850 $ST $zero,-$SIZE_T*3($n0) # wipe stack clean
1851 and $acc0,$acc0,$carry
1852 $ST $zero,-$SIZE_T*2($n0)
1853 andc $a1,$a1,$carry
1854 $ST $zero,-$SIZE_T*1($n0)
1855 and $acc1,$acc1,$carry
1856 $ST $zero,-$SIZE_T*0($n0)
1857 andc $a2,$a2,$carry
1858 $ST $zero,$SIZE_T*1($tp)
1859 and $acc2,$acc2,$carry
1860 $ST $zero,$SIZE_T*2($tp)
1861 andc $a3,$a3,$carry
1862 $ST $zero,$SIZE_T*3($tp)
1863 and $acc3,$acc3,$carry
1864 $STU $zero,$SIZE_T*4($tp)
1865 or $t0,$a0,$acc0
1866 $LD $a0,$SIZE_T*5($ap_end)
1867 $LD $acc0,$SIZE_T*1($n0)
1868 or $t1,$a1,$acc1
1869 $LD $a1,$SIZE_T*6($ap_end)
1870 $LD $acc1,$SIZE_T*2($n0)
1871 or $t2,$a2,$acc2
1872 $LD $a2,$SIZE_T*7($ap_end)
1873 $LD $acc2,$SIZE_T*3($n0)
1874 or $t3,$a3,$acc3
1875 $LD $a3,$SIZE_T*8($ap_end)
1876 $LDU $acc3,$SIZE_T*4($n0)
1877 $ST $t0,$SIZE_T*1($ap_end)
1878 $ST $t1,$SIZE_T*2($ap_end)
1879 $ST $t2,$SIZE_T*3($ap_end)
1880 $STU $t3,$SIZE_T*4($ap_end)
1881 bdnz .Lsqr4x_cond_copy
1882
1883 $POP $ap,0($sp) # pull saved sp
1884 andc $a0,$a0,$carry
1885 and $acc0,$acc0,$carry
1886 andc $a1,$a1,$carry
1887 and $acc1,$acc1,$carry
1888 andc $a2,$a2,$carry
1889 and $acc2,$acc2,$carry
1890 andc $a3,$a3,$carry
1891 and $acc3,$acc3,$carry
1892 or $t0,$a0,$acc0
1893 or $t1,$a1,$acc1
1894 or $t2,$a2,$acc2
1895 or $t3,$a3,$acc3
1896 $ST $t0,$SIZE_T*1($ap_end)
1897 $ST $t1,$SIZE_T*2($ap_end)
1898 $ST $t2,$SIZE_T*3($ap_end)
1899 $ST $t3,$SIZE_T*4($ap_end)
1900
1901 b .Lsqr8x_done
1902
1903.align 5
1904.Lsqr8x8_post_condition:
1905 $POP $rp,$SIZE_T*6($sp) # pull rp
1906 $POP $ap,0($sp) # pull saved sp
1907 addze $carry,$zero
1908
1909 # $acc0-7,$carry hold result, $a0-7 hold modulus
1910 subfc $acc0,$a0,$acc0
1911 subfe $acc1,$a1,$acc1
1912 $ST $zero,$SIZE_T*12($sp) # wipe stack clean
1913 $ST $zero,$SIZE_T*13($sp)
1914 subfe $acc2,$a2,$acc2
1915 $ST $zero,$SIZE_T*14($sp)
1916 $ST $zero,$SIZE_T*15($sp)
1917 subfe $acc3,$a3,$acc3
1918 $ST $zero,$SIZE_T*16($sp)
1919 $ST $zero,$SIZE_T*17($sp)
1920 subfe $acc4,$a4,$acc4
1921 $ST $zero,$SIZE_T*18($sp)
1922 $ST $zero,$SIZE_T*19($sp)
1923 subfe $acc5,$a5,$acc5
1924 $ST $zero,$SIZE_T*20($sp)
1925 $ST $zero,$SIZE_T*21($sp)
1926 subfe $acc6,$a6,$acc6
1927 $ST $zero,$SIZE_T*22($sp)
1928 $ST $zero,$SIZE_T*23($sp)
1929 subfe $acc7,$a7,$acc7
1930 $ST $zero,$SIZE_T*24($sp)
1931 $ST $zero,$SIZE_T*25($sp)
1932 subfe $carry,$zero,$carry # did it borrow?
1933 $ST $zero,$SIZE_T*26($sp)
1934 $ST $zero,$SIZE_T*27($sp)
1935
1936 and $a0,$a0,$carry
1937 and $a1,$a1,$carry
1938 addc $acc0,$acc0,$a0 # add modulus back if borrowed
1939 and $a2,$a2,$carry
1940 adde $acc1,$acc1,$a1
1941 and $a3,$a3,$carry
1942 adde $acc2,$acc2,$a2
1943 and $a4,$a4,$carry
1944 adde $acc3,$acc3,$a3
1945 and $a5,$a5,$carry
1946 adde $acc4,$acc4,$a4
1947 and $a6,$a6,$carry
1948 adde $acc5,$acc5,$a5
1949 and $a7,$a7,$carry
1950 adde $acc6,$acc6,$a6
1951 adde $acc7,$acc7,$a7
1952 $ST $acc0,$SIZE_T*1($rp)
1953 $ST $acc1,$SIZE_T*2($rp)
1954 $ST $acc2,$SIZE_T*3($rp)
1955 $ST $acc3,$SIZE_T*4($rp)
1956 $ST $acc4,$SIZE_T*5($rp)
1957 $ST $acc5,$SIZE_T*6($rp)
1958 $ST $acc6,$SIZE_T*7($rp)
1959 $ST $acc7,$SIZE_T*8($rp)
1960
1961.Lsqr8x_done:
1962 $PUSH $zero,$SIZE_T*8($sp)
1963 $PUSH $zero,$SIZE_T*10($sp)
1964
1965 $POP r14,-$SIZE_T*18($ap)
1966 li r3,1 # signal "done"
1967 $POP r15,-$SIZE_T*17($ap)
1968 $POP r16,-$SIZE_T*16($ap)
1969 $POP r17,-$SIZE_T*15($ap)
1970 $POP r18,-$SIZE_T*14($ap)
1971 $POP r19,-$SIZE_T*13($ap)
1972 $POP r20,-$SIZE_T*12($ap)
1973 $POP r21,-$SIZE_T*11($ap)
1974 $POP r22,-$SIZE_T*10($ap)
1975 $POP r23,-$SIZE_T*9($ap)
1976 $POP r24,-$SIZE_T*8($ap)
1977 $POP r25,-$SIZE_T*7($ap)
1978 $POP r26,-$SIZE_T*6($ap)
1979 $POP r27,-$SIZE_T*5($ap)
1980 $POP r28,-$SIZE_T*4($ap)
1981 $POP r29,-$SIZE_T*3($ap)
1982 $POP r30,-$SIZE_T*2($ap)
1983 $POP r31,-$SIZE_T*1($ap)
1984 mr $sp,$ap
1985 blr
1986 .long 0
1987 .byte 0,12,4,0x20,0x80,18,6,0
1988 .long 0
1989.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1990___
1991}
1992$code.=<<___;
1993.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1994___
1995
1996$code =~ s/\`([^\`]*)\`/eval $1/gem;
1997print $code;
1998close STDOUT or die "error closing STDOUT: $!";