parisc-mont.pl@ 102334

最後變更在這個檔案從102334是 101211,由 vboxsync 提交於 17 月前
openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527
檔案大小: 27.2 KB

行
1	#! /usr/bin/env perl
2	# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the Apache License 2.0 (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	# ====================================================================
16
17	# On PA-7100LC this module performs ~90-50% better, less for longer
18	# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
19	# that compiler utilized xmpyu instruction to perform 32x32=64-bit
20	# multiplication, which in turn means that "baseline" performance was
21	# optimal in respect to instruction set capabilities. Fair comparison
22	# with vendor compiler is problematic, because OpenSSL doesn't define
23	# BN_LLONG [presumably] for historical reasons, which drives compiler
24	# toward 4 times 16x16=32-bit multiplications [plus complementary
25	# shifts and additions] instead. This means that you should observe
26	# several times improvement over code generated by vendor compiler
27	# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
28	# improvement coefficient was never collected on PA-7100LC, or any
29	# other 1.1 CPU, because I don't have access to such machine with
30	# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
31	# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
32	# of ~5x on PA-8600.
33	#
34	# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
35	# reportedly ~2x faster than vendor compiler generated code [according
36	# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
37	# this implementation is actually 32-bit one, in the sense that it
38	# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
39	# 64-bit BN_LONGs... How do they interoperate then? No problem. This
40	# module picks halves of 64-bit values in reverse order and pretends
41	# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
42	# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
43	# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
44	# i.e. there is no "wider" multiplication like on most other 64-bit
45	# platforms. This means that even being effectively 32-bit, this
46	# implementation performs "64-bit" computational task in same amount
47	# of arithmetic operations, most notably multiplications. It requires
48	# more memory references, most notably to tp[num], but this doesn't
49	# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
50	# 2.0 code path provides virtually same performance as pa-risc2[W].s:
51	# it's ~10% better for shortest key length and ~10% worse for longest
52	# one.
53	#
54	# In case it wasn't clear. The module has two distinct code paths:
55	# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
56	# additions and 64-bit integer loads, not to mention specific
57	# instruction scheduling. In 64-bit build naturally only 2.0 code path
58	# is assembled. In 32-bit application context both code paths are
59	# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
60	# is taken automatically. Also, in 32-bit build the module imposes
61	# couple of limitations: vector lengths have to be even and vector
62	# addresses have to be 64-bit aligned. Normally neither is a problem:
63	# most common key lengths are even and vectors are commonly malloc-ed,
64	# which ensures alignment.
65	#
66	# Special thanks to polarhome.com for providing HP-UX account on
67	# PA-RISC 1.1 machine, and to correspondent who chose to remain
68	# anonymous for testing the code on PA-RISC 2.0 machine.
69
70
# Directory part of the script's own path, including the trailing
# separator; empty string when $0 carries no directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Send the generated assembly to the requested file.  Fail loudly instead
# of silently falling back to the inherited STDOUT when it can't be opened.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
79
# Select the ABI-dependent parameters used throughout the template:
# assembler level, pointer size, frame-marker size, return-pointer slot
# and the store/load mnemonics used by prologue and epilogue.
if ($flavour =~ /64/) {
    $LEVEL        = "2.0W";
    $SIZE_T       = 8;
    $FRAME_MARKER = 80;
    $SAVED_RP     = 16;
    $PUSH         = "std";
    $PUSHMA       = "std,ma";
    $POP          = "ldd";
    $POPMB        = "ldd,mb";
    $BN_SZ        = $SIZE_T;
} else {
    $LEVEL        = "1.1";    #$LEVEL.="\n\t.ALLOW\t2.0";
    $SIZE_T       = 4;
    $FRAME_MARKER = 48;
    $SAVED_RP     = 20;
    $PUSH         = "stw";
    $PUSHMA       = "stwm";
    $POP          = "ldw";
    $POPMB        = "ldwm";
    $BN_SZ        = $SIZE_T;

    # If opensslconf.h (two directories above this script) defines
    # SIXTY_FOUR_BIT, switch to 8-byte BN words and .LEVEL 2.0.  The
    # header is optional: when it can't be opened the defaults stand.
    if (open my $conf, '<', "${dir}../../opensslconf.h") {
        while (my $line = <$conf>) {
            if ($line =~ m/#\s*define\s+SIXTY_FOUR_BIT/) {
                $BN_SZ = 8;
                $LEVEL = "2.0";
                last;
            }
        }
        close $conf;
    }
}
111
# Stack frame layout: 8 saved registers + frame marker [+ argument
# transfer], followed by 32 bytes of local variables.  $LOCALS is the
# size of the saved-register area alone.
$LOCALS = 8*$SIZE_T;
$FRAME  = $LOCALS + $FRAME_MARKER + 32;

# Scratch registers for tp[] handling.
($tp, $ti1, $ti0) = ("%r31", "%r29", "%r28");

# Argument registers.
($rp, $ap, $bp, $np) = ("%r26", "%r25", "%r24", "%r23");
($n0, $num)          = ("%r22", "%r21");    # passed through stack in 32-bit
($idx, $arrsz)       = ("%r20", "%r19");

# Integer accumulators for the ap[]*bp[] and np[]*m partial products.
($nm1, $nm0, $ab1, $ab0) = ("%r7", "%r6", "%r5", "%r4");

($fp, $hi1, $hi0) = ("%r3", "%r2", "%r1");

$xfer = $n0;    # accommodates [-16..15] offset in fld[dw]s

# Floating-point registers used by the xmpyu multiplications.
$fm0 = "%fr4";
$fti = $fm0;
($fbi, $fn0)          = ("%fr5L", "%fr5R");
($fai, $fab0, $fab1)  = ("%fr6", "%fr7", "%fr8");
($fni, $fnm0, $fnm1)  = ("%fr9", "%fr10", "%fr11");
146
147	$code=<<___;
148	.LEVEL $LEVEL
149	.SPACE \$TEXT\$
150	.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
151
152	.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
153	.ALIGN 64
154	bn_mul_mont
155	.PROC
156	.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
157	.ENTRY
158	$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
159	$PUSHMA %r3,$FRAME(%sp)
160	$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
161	$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
162	$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
163	$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
164	$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
165	$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
166	$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
167	ldo -$FRAME(%sp),$fp
168	___
169	$code.=<<___ if ($SIZE_T==4);
170	ldw `-$FRAME_MARKER-4`($fp),$n0
171	ldw `-$FRAME_MARKER-8`($fp),$num
172	nop
173	nop ; alignment
174	___
175	$code.=<<___ if ($BN_SZ==4);
176	comiclr,<= 6,$num,%r0 ; are vectors long enough?
177	b L\$abort
178	ldi 0,%r28 ; signal "unhandled"
179	add,ev %r0,$num,$num ; is $num even?
180	b L\$abort
181	nop
182	or $ap,$np,$ti1
183	extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
184	b L\$abort
185	nop
186	nop ; alignment
187	nop
188
189	fldws 0($n0),${fn0}
190	fldws,ma 4($bp),${fbi} ; bp[0]
191	___
192	$code.=<<___ if ($BN_SZ==8);
193	comib,> 3,$num,L\$abort ; are vectors long enough?
194	ldi 0,%r28 ; signal "unhandled"
195	addl $num,$num,$num ; I operate on 32-bit values
196
197	fldws 4($n0),${fn0} ; only low part of n0
198	fldws 4($bp),${fbi} ; bp[0] in flipped word order
199	___
200	$code.=<<___;
201	fldds 0($ap),${fai} ; ap[0,1]
202	fldds 0($np),${fni} ; np[0,1]
203
204	sh2addl $num,%r0,$arrsz
205	ldi 31,$hi0
206	ldo 36($arrsz),$hi1 ; space for tp[num+1]
207	andcm $hi1,$hi0,$hi1 ; align
208	addl $hi1,%sp,%sp
209	$PUSH $fp,-$SIZE_T(%sp)
210
211	ldo `$LOCALS+16`($fp),$xfer
212	ldo `$LOCALS+32+4`($fp),$tp
213
214	xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
215	xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
216	xmpyu ${fn0},${fab0}R,${fm0}
217
218	addl $arrsz,$ap,$ap ; point at the end
219	addl $arrsz,$np,$np
220	subi 0,$arrsz,$idx ; j=0
221	ldo 8($idx),$idx ; j++++
222
223	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
224	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
225	fstds ${fab0},-16($xfer)
226	fstds ${fnm0},-8($xfer)
227	fstds ${fab1},0($xfer)
228	fstds ${fnm1},8($xfer)
229	flddx $idx($ap),${fai} ; ap[2,3]
230	flddx $idx($np),${fni} ; np[2,3]
231	___
232	$code.=<<___ if ($BN_SZ==4);
233	mtctl $hi0,%cr11 ; $hi0 still holds 31
234	extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
235	b L\$parisc11
236	nop
237	___
238	$code.=<<___; # PA-RISC 2.0 code-path
239	xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
240	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
241	ldd -16($xfer),$ab0
242	fstds ${fab0},-16($xfer)
243
244	extrd,u $ab0,31,32,$hi0
245	extrd,u $ab0,63,32,$ab0
246	ldd -8($xfer),$nm0
247	fstds ${fnm0},-8($xfer)
248	ldo 8($idx),$idx ; j++++
249	addl $ab0,$nm0,$nm0 ; low part is discarded
250	extrd,u $nm0,31,32,$hi1
251
252
253	L\$1st
254	xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
255	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
256	ldd 0($xfer),$ab1
257	fstds ${fab1},0($xfer)
258	addl $hi0,$ab1,$ab1
259	extrd,u $ab1,31,32,$hi0
260	ldd 8($xfer),$nm1
261	fstds ${fnm1},8($xfer)
262	extrd,u $ab1,63,32,$ab1
263	addl $hi1,$nm1,$nm1
264	flddx $idx($ap),${fai} ; ap[j,j+1]
265	flddx $idx($np),${fni} ; np[j,j+1]
266	addl $ab1,$nm1,$nm1
267	extrd,u $nm1,31,32,$hi1
268
269	xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
270	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
271	ldd -16($xfer),$ab0
272	fstds ${fab0},-16($xfer)
273	addl $hi0,$ab0,$ab0
274	extrd,u $ab0,31,32,$hi0
275	ldd -8($xfer),$nm0
276	fstds ${fnm0},-8($xfer)
277	extrd,u $ab0,63,32,$ab0
278	addl $hi1,$nm0,$nm0
279	stw $nm1,-4($tp) ; tp[j-1]
280	addl $ab0,$nm0,$nm0
281	stw,ma $nm0,8($tp) ; tp[j-1]
282	addib,<> 8,$idx,L\$1st ; j++++
283	extrd,u $nm0,31,32,$hi1
284
285	xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
286	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
287	ldd 0($xfer),$ab1
288	fstds ${fab1},0($xfer)
289	addl $hi0,$ab1,$ab1
290	extrd,u $ab1,31,32,$hi0
291	ldd 8($xfer),$nm1
292	fstds ${fnm1},8($xfer)
293	extrd,u $ab1,63,32,$ab1
294	addl $hi1,$nm1,$nm1
295	ldd -16($xfer),$ab0
296	addl $ab1,$nm1,$nm1
297	ldd -8($xfer),$nm0
298	extrd,u $nm1,31,32,$hi1
299
300	addl $hi0,$ab0,$ab0
301	extrd,u $ab0,31,32,$hi0
302	stw $nm1,-4($tp) ; tp[j-1]
303	extrd,u $ab0,63,32,$ab0
304	addl $hi1,$nm0,$nm0
305	ldd 0($xfer),$ab1
306	addl $ab0,$nm0,$nm0
307	ldd,mb 8($xfer),$nm1
308	extrd,u $nm0,31,32,$hi1
309	stw,ma $nm0,8($tp) ; tp[j-1]
310
311	ldo -1($num),$num ; i--
312	subi 0,$arrsz,$idx ; j=0
313	___
314	$code.=<<___ if ($BN_SZ==4);
315	fldws,ma 4($bp),${fbi} ; bp[1]
316	___
317	$code.=<<___ if ($BN_SZ==8);
318	fldws 0($bp),${fbi} ; bp[1] in flipped word order
319	___
320	$code.=<<___;
321	flddx $idx($ap),${fai} ; ap[0,1]
322	flddx $idx($np),${fni} ; np[0,1]
323	fldws 8($xfer),${fti}R ; tp[0]
324	addl $hi0,$ab1,$ab1
325	extrd,u $ab1,31,32,$hi0
326	extrd,u $ab1,63,32,$ab1
327	ldo 8($idx),$idx ; j++++
328	xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
329	xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
330	addl $hi1,$nm1,$nm1
331	addl $ab1,$nm1,$nm1
332	extrd,u $nm1,31,32,$hi1
333	fstws,mb ${fab0}L,-8($xfer) ; save high part
334	stw $nm1,-4($tp) ; tp[j-1]
335
336	fcpy,sgl %fr0,${fti}L ; zero high part
337	fcpy,sgl %fr0,${fab0}L
338	addl $hi1,$hi0,$hi0
339	extrd,u $hi0,31,32,$hi1
340	fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
341	fcnvxf,dbl,dbl ${fab0},${fab0}
342	stw $hi0,0($tp)
343	stw $hi1,4($tp)
344
345	fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
346	fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
347	xmpyu ${fn0},${fab0}R,${fm0}
348	ldo `$LOCALS+32+4`($fp),$tp
349	L\$outer
350	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
351	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
352	fstds ${fab0},-16($xfer) ; 33-bit value
353	fstds ${fnm0},-8($xfer)
354	flddx $idx($ap),${fai} ; ap[2]
355	flddx $idx($np),${fni} ; np[2]
356	ldo 8($idx),$idx ; j++++
357	ldd -16($xfer),$ab0 ; 33-bit value
358	ldd -8($xfer),$nm0
359	ldw 0($xfer),$hi0 ; high part
360
361	xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
362	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
363	extrd,u $ab0,31,32,$ti0 ; carry bit
364	extrd,u $ab0,63,32,$ab0
365	fstds ${fab1},0($xfer)
366	addl $ti0,$hi0,$hi0 ; account carry bit
367	fstds ${fnm1},8($xfer)
368	addl $ab0,$nm0,$nm0 ; low part is discarded
369	ldw 0($tp),$ti1 ; tp[1]
370	extrd,u $nm0,31,32,$hi1
371	fstds ${fab0},-16($xfer)
372	fstds ${fnm0},-8($xfer)
373
374
375	L\$inner
376	xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
377	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
378	ldd 0($xfer),$ab1
379	fstds ${fab1},0($xfer)
380	addl $hi0,$ti1,$ti1
381	addl $ti1,$ab1,$ab1
382	ldd 8($xfer),$nm1
383	fstds ${fnm1},8($xfer)
384	extrd,u $ab1,31,32,$hi0
385	extrd,u $ab1,63,32,$ab1
386	flddx $idx($ap),${fai} ; ap[j,j+1]
387	flddx $idx($np),${fni} ; np[j,j+1]
388	addl $hi1,$nm1,$nm1
389	addl $ab1,$nm1,$nm1
390	ldw 4($tp),$ti0 ; tp[j]
391	stw $nm1,-4($tp) ; tp[j-1]
392
393	xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
394	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
395	ldd -16($xfer),$ab0
396	fstds ${fab0},-16($xfer)
397	addl $hi0,$ti0,$ti0
398	addl $ti0,$ab0,$ab0
399	ldd -8($xfer),$nm0
400	fstds ${fnm0},-8($xfer)
401	extrd,u $ab0,31,32,$hi0
402	extrd,u $nm1,31,32,$hi1
403	ldw 8($tp),$ti1 ; tp[j]
404	extrd,u $ab0,63,32,$ab0
405	addl $hi1,$nm0,$nm0
406	addl $ab0,$nm0,$nm0
407	stw,ma $nm0,8($tp) ; tp[j-1]
408	addib,<> 8,$idx,L\$inner ; j++++
409	extrd,u $nm0,31,32,$hi1
410
411	xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
412	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
413	ldd 0($xfer),$ab1
414	fstds ${fab1},0($xfer)
415	addl $hi0,$ti1,$ti1
416	addl $ti1,$ab1,$ab1
417	ldd 8($xfer),$nm1
418	fstds ${fnm1},8($xfer)
419	extrd,u $ab1,31,32,$hi0
420	extrd,u $ab1,63,32,$ab1
421	ldw 4($tp),$ti0 ; tp[j]
422	addl $hi1,$nm1,$nm1
423	addl $ab1,$nm1,$nm1
424	ldd -16($xfer),$ab0
425	ldd -8($xfer),$nm0
426	extrd,u $nm1,31,32,$hi1
427
428	addl $hi0,$ab0,$ab0
429	addl $ti0,$ab0,$ab0
430	stw $nm1,-4($tp) ; tp[j-1]
431	extrd,u $ab0,31,32,$hi0
432	ldw 8($tp),$ti1 ; tp[j]
433	extrd,u $ab0,63,32,$ab0
434	addl $hi1,$nm0,$nm0
435	ldd 0($xfer),$ab1
436	addl $ab0,$nm0,$nm0
437	ldd,mb 8($xfer),$nm1
438	extrd,u $nm0,31,32,$hi1
439	stw,ma $nm0,8($tp) ; tp[j-1]
440
441	addib,= -1,$num,L\$outerdone ; i--
442	subi 0,$arrsz,$idx ; j=0
443	___
444	$code.=<<___ if ($BN_SZ==4);
445	fldws,ma 4($bp),${fbi} ; bp[i]
446	___
447	$code.=<<___ if ($BN_SZ==8);
448	ldi 12,$ti0 ; bp[i] in flipped word order
449	addl,ev %r0,$num,$num
450	ldi -4,$ti0
451	addl $ti0,$bp,$bp
452	fldws 0($bp),${fbi}
453	___
454	$code.=<<___;
455	flddx $idx($ap),${fai} ; ap[0]
456	addl $hi0,$ab1,$ab1
457	flddx $idx($np),${fni} ; np[0]
458	fldws 8($xfer),${fti}R ; tp[0]
459	addl $ti1,$ab1,$ab1
460	extrd,u $ab1,31,32,$hi0
461	extrd,u $ab1,63,32,$ab1
462
463	ldo 8($idx),$idx ; j++++
464	xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
465	xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
466	ldw 4($tp),$ti0 ; tp[j]
467
468	addl $hi1,$nm1,$nm1
469	fstws,mb ${fab0}L,-8($xfer) ; save high part
470	addl $ab1,$nm1,$nm1
471	extrd,u $nm1,31,32,$hi1
472	fcpy,sgl %fr0,${fti}L ; zero high part
473	fcpy,sgl %fr0,${fab0}L
474	stw $nm1,-4($tp) ; tp[j-1]
475
476	fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
477	fcnvxf,dbl,dbl ${fab0},${fab0}
478	addl $hi1,$hi0,$hi0
479	fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
480	addl $ti0,$hi0,$hi0
481	extrd,u $hi0,31,32,$hi1
482	fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
483	stw $hi0,0($tp)
484	stw $hi1,4($tp)
485	xmpyu ${fn0},${fab0}R,${fm0}
486
487	b L\$outer
488	ldo `$LOCALS+32+4`($fp),$tp
489
490
491	L\$outerdone
492	addl $hi0,$ab1,$ab1
493	addl $ti1,$ab1,$ab1
494	extrd,u $ab1,31,32,$hi0
495	extrd,u $ab1,63,32,$ab1
496
497	ldw 4($tp),$ti0 ; tp[j]
498
499	addl $hi1,$nm1,$nm1
500	addl $ab1,$nm1,$nm1
501	extrd,u $nm1,31,32,$hi1
502	stw $nm1,-4($tp) ; tp[j-1]
503
504	addl $hi1,$hi0,$hi0
505	addl $ti0,$hi0,$hi0
506	extrd,u $hi0,31,32,$hi1
507	stw $hi0,0($tp)
508	stw $hi1,4($tp)
509
510	ldo `$LOCALS+32`($fp),$tp
511	sub %r0,%r0,%r0 ; clear borrow
512	___
513	$code.=<<___ if ($BN_SZ==4);
514	ldws,ma 4($tp),$ti0
515	extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
516	b L\$sub_pa11
517	addl $tp,$arrsz,$tp
518	L\$sub
519	ldwx $idx($np),$hi0
520	subb $ti0,$hi0,$hi1
521	ldwx $idx($tp),$ti0
522	addib,<> 4,$idx,L\$sub
523	stws,ma $hi1,4($rp)
524
525	subb $ti0,%r0,$hi1
526	___
527	$code.=<<___ if ($BN_SZ==8);
528	ldd,ma 8($tp),$ti0
529	L\$sub
530	ldd $idx($np),$hi0
531	shrpd $ti0,$ti0,32,$ti0 ; flip word order
532	std $ti0,-8($tp) ; save flipped value
533	sub,db $ti0,$hi0,$hi1
534	ldd,ma 8($tp),$ti0
535	addib,<> 8,$idx,L\$sub
536	std,ma $hi1,8($rp)
537
538	extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
539	sub,db $ti0,%r0,$hi1
540	___
541	$code.=<<___;
542	ldo `$LOCALS+32`($fp),$tp
543	sub $rp,$arrsz,$rp ; rewind rp
544	subi 0,$arrsz,$idx
545	L\$copy
546	ldd 0($tp),$ti0
547	ldd 0($rp),$hi0
548	std,ma %r0,8($tp)
549	comiclr,= 0,$hi1,%r0
550	copy $ti0,$hi0
551	addib,<> 8,$idx,L\$copy
552	std,ma $hi0,8($rp)
553	___
554
555	if ($BN_SZ==4) { # PA-RISC 1.1 code-path
556	$ablo=$ab0;
557	$abhi=$ab1;
558	$nmlo0=$nm0;
559	$nmhi0=$nm1;
560	$nmlo1="%r9";
561	$nmhi1="%r8";
562
563	$code.=<<___;
564	b L\$done
565	nop
566
567	.ALIGN 8
568	L\$parisc11
569	xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
570	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
571	ldw -12($xfer),$ablo
572	ldw -16($xfer),$hi0
573	ldw -4($xfer),$nmlo0
574	ldw -8($xfer),$nmhi0
575	fstds ${fab0},-16($xfer)
576	fstds ${fnm0},-8($xfer)
577
578	ldo 8($idx),$idx ; j++++
579	add $ablo,$nmlo0,$nmlo0 ; discarded
580	addc %r0,$nmhi0,$hi1
581	ldw 4($xfer),$ablo
582	ldw 0($xfer),$abhi
583	nop
584
585
586	L\$1st_pa11
587	xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
588	flddx $idx($ap),${fai} ; ap[j,j+1]
589	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
590	flddx $idx($np),${fni} ; np[j,j+1]
591	add $hi0,$ablo,$ablo
592	ldw 12($xfer),$nmlo1
593	addc %r0,$abhi,$hi0
594	ldw 8($xfer),$nmhi1
595	add $ablo,$nmlo1,$nmlo1
596	fstds ${fab1},0($xfer)
597	addc %r0,$nmhi1,$nmhi1
598	fstds ${fnm1},8($xfer)
599	add $hi1,$nmlo1,$nmlo1
600	ldw -12($xfer),$ablo
601	addc %r0,$nmhi1,$hi1
602	ldw -16($xfer),$abhi
603
604	xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
605	ldw -4($xfer),$nmlo0
606	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
607	ldw -8($xfer),$nmhi0
608	add $hi0,$ablo,$ablo
609	stw $nmlo1,-4($tp) ; tp[j-1]
610	addc %r0,$abhi,$hi0
611	fstds ${fab0},-16($xfer)
612	add $ablo,$nmlo0,$nmlo0
613	fstds ${fnm0},-8($xfer)
614	addc %r0,$nmhi0,$nmhi0
615	ldw 0($xfer),$abhi
616	add $hi1,$nmlo0,$nmlo0
617	ldw 4($xfer),$ablo
618	stws,ma $nmlo0,8($tp) ; tp[j-1]
619	addib,<> 8,$idx,L\$1st_pa11 ; j++++
620	addc %r0,$nmhi0,$hi1
621
622	ldw 8($xfer),$nmhi1
623	ldw 12($xfer),$nmlo1
624	xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
625	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
626	add $hi0,$ablo,$ablo
627	fstds ${fab1},0($xfer)
628	addc %r0,$abhi,$hi0
629	fstds ${fnm1},8($xfer)
630	add $ablo,$nmlo1,$nmlo1
631	ldw -16($xfer),$abhi
632	addc %r0,$nmhi1,$nmhi1
633	ldw -12($xfer),$ablo
634	add $hi1,$nmlo1,$nmlo1
635	ldw -8($xfer),$nmhi0
636	addc %r0,$nmhi1,$hi1
637	ldw -4($xfer),$nmlo0
638
639	add $hi0,$ablo,$ablo
640	stw $nmlo1,-4($tp) ; tp[j-1]
641	addc %r0,$abhi,$hi0
642	ldw 0($xfer),$abhi
643	add $ablo,$nmlo0,$nmlo0
644	ldw 4($xfer),$ablo
645	addc %r0,$nmhi0,$nmhi0
646	ldws,mb 8($xfer),$nmhi1
647	add $hi1,$nmlo0,$nmlo0
648	ldw 4($xfer),$nmlo1
649	addc %r0,$nmhi0,$hi1
650	stws,ma $nmlo0,8($tp) ; tp[j-1]
651
652	ldo -1($num),$num ; i--
653	subi 0,$arrsz,$idx ; j=0
654
655	fldws,ma 4($bp),${fbi} ; bp[1]
656	flddx $idx($ap),${fai} ; ap[0,1]
657	flddx $idx($np),${fni} ; np[0,1]
658	fldws 8($xfer),${fti}R ; tp[0]
659	add $hi0,$ablo,$ablo
660	addc %r0,$abhi,$hi0
661	ldo 8($idx),$idx ; j++++
662	xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
663	xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
664	add $hi1,$nmlo1,$nmlo1
665	addc %r0,$nmhi1,$nmhi1
666	add $ablo,$nmlo1,$nmlo1
667	addc %r0,$nmhi1,$hi1
668	fstws,mb ${fab0}L,-8($xfer) ; save high part
669	stw $nmlo1,-4($tp) ; tp[j-1]
670
671	fcpy,sgl %fr0,${fti}L ; zero high part
672	fcpy,sgl %fr0,${fab0}L
673	add $hi1,$hi0,$hi0
674	addc %r0,%r0,$hi1
675	fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
676	fcnvxf,dbl,dbl ${fab0},${fab0}
677	stw $hi0,0($tp)
678	stw $hi1,4($tp)
679
680	fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
681	fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
682	xmpyu ${fn0},${fab0}R,${fm0}
683	ldo `$LOCALS+32+4`($fp),$tp
684	L\$outer_pa11
685	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
686	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
687	fstds ${fab0},-16($xfer) ; 33-bit value
688	fstds ${fnm0},-8($xfer)
689	flddx $idx($ap),${fai} ; ap[2,3]
690	flddx $idx($np),${fni} ; np[2,3]
691	ldw -16($xfer),$abhi ; carry bit actually
692	ldo 8($idx),$idx ; j++++
693	ldw -12($xfer),$ablo
694	ldw -8($xfer),$nmhi0
695	ldw -4($xfer),$nmlo0
696	ldw 0($xfer),$hi0 ; high part
697
698	xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
699	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
700	fstds ${fab1},0($xfer)
701	addl $abhi,$hi0,$hi0 ; account carry bit
702	fstds ${fnm1},8($xfer)
703	add $ablo,$nmlo0,$nmlo0 ; discarded
704	ldw 0($tp),$ti1 ; tp[1]
705	addc %r0,$nmhi0,$hi1
706	fstds ${fab0},-16($xfer)
707	fstds ${fnm0},-8($xfer)
708	ldw 4($xfer),$ablo
709	ldw 0($xfer),$abhi
710
711
712	L\$inner_pa11
713	xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
714	flddx $idx($ap),${fai} ; ap[j,j+1]
715	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
716	flddx $idx($np),${fni} ; np[j,j+1]
717	add $hi0,$ablo,$ablo
718	ldw 4($tp),$ti0 ; tp[j]
719	addc %r0,$abhi,$abhi
720	ldw 12($xfer),$nmlo1
721	add $ti1,$ablo,$ablo
722	ldw 8($xfer),$nmhi1
723	addc %r0,$abhi,$hi0
724	fstds ${fab1},0($xfer)
725	add $ablo,$nmlo1,$nmlo1
726	fstds ${fnm1},8($xfer)
727	addc %r0,$nmhi1,$nmhi1
728	ldw -12($xfer),$ablo
729	add $hi1,$nmlo1,$nmlo1
730	ldw -16($xfer),$abhi
731	addc %r0,$nmhi1,$hi1
732
733	xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
734	ldw 8($tp),$ti1 ; tp[j]
735	xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
736	ldw -4($xfer),$nmlo0
737	add $hi0,$ablo,$ablo
738	ldw -8($xfer),$nmhi0
739	addc %r0,$abhi,$abhi
740	stw $nmlo1,-4($tp) ; tp[j-1]
741	add $ti0,$ablo,$ablo
742	fstds ${fab0},-16($xfer)
743	addc %r0,$abhi,$hi0
744	fstds ${fnm0},-8($xfer)
745	add $ablo,$nmlo0,$nmlo0
746	ldw 4($xfer),$ablo
747	addc %r0,$nmhi0,$nmhi0
748	ldw 0($xfer),$abhi
749	add $hi1,$nmlo0,$nmlo0
750	stws,ma $nmlo0,8($tp) ; tp[j-1]
751	addib,<> 8,$idx,L\$inner_pa11 ; j++++
752	addc %r0,$nmhi0,$hi1
753
754	xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
755	ldw 12($xfer),$nmlo1
756	xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
757	ldw 8($xfer),$nmhi1
758	add $hi0,$ablo,$ablo
759	ldw 4($tp),$ti0 ; tp[j]
760	addc %r0,$abhi,$abhi
761	fstds ${fab1},0($xfer)
762	add $ti1,$ablo,$ablo
763	fstds ${fnm1},8($xfer)
764	addc %r0,$abhi,$hi0
765	ldw -16($xfer),$abhi
766	add $ablo,$nmlo1,$nmlo1
767	ldw -12($xfer),$ablo
768	addc %r0,$nmhi1,$nmhi1
769	ldw -8($xfer),$nmhi0
770	add $hi1,$nmlo1,$nmlo1
771	ldw -4($xfer),$nmlo0
772	addc %r0,$nmhi1,$hi1
773
774	add $hi0,$ablo,$ablo
775	stw $nmlo1,-4($tp) ; tp[j-1]
776	addc %r0,$abhi,$abhi
777	add $ti0,$ablo,$ablo
778	ldw 8($tp),$ti1 ; tp[j]
779	addc %r0,$abhi,$hi0
780	ldw 0($xfer),$abhi
781	add $ablo,$nmlo0,$nmlo0
782	ldw 4($xfer),$ablo
783	addc %r0,$nmhi0,$nmhi0
784	ldws,mb 8($xfer),$nmhi1
785	add $hi1,$nmlo0,$nmlo0
786	ldw 4($xfer),$nmlo1
787	addc %r0,$nmhi0,$hi1
788	stws,ma $nmlo0,8($tp) ; tp[j-1]
789
790	addib,= -1,$num,L\$outerdone_pa11; i--
791	subi 0,$arrsz,$idx ; j=0
792
793	fldws,ma 4($bp),${fbi} ; bp[i]
794	flddx $idx($ap),${fai} ; ap[0]
795	add $hi0,$ablo,$ablo
796	addc %r0,$abhi,$abhi
797	flddx $idx($np),${fni} ; np[0]
798	fldws 8($xfer),${fti}R ; tp[0]
799	add $ti1,$ablo,$ablo
800	addc %r0,$abhi,$hi0
801
802	ldo 8($idx),$idx ; j++++
803	xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
804	xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
805	ldw 4($tp),$ti0 ; tp[j]
806
807	add $hi1,$nmlo1,$nmlo1
808	addc %r0,$nmhi1,$nmhi1
809	fstws,mb ${fab0}L,-8($xfer) ; save high part
810	add $ablo,$nmlo1,$nmlo1
811	addc %r0,$nmhi1,$hi1
812	fcpy,sgl %fr0,${fti}L ; zero high part
813	fcpy,sgl %fr0,${fab0}L
814	stw $nmlo1,-4($tp) ; tp[j-1]
815
816	fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
817	fcnvxf,dbl,dbl ${fab0},${fab0}
818	add $hi1,$hi0,$hi0
819	addc %r0,%r0,$hi1
820	fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
821	add $ti0,$hi0,$hi0
822	addc %r0,$hi1,$hi1
823	fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
824	stw $hi0,0($tp)
825	stw $hi1,4($tp)
826	xmpyu ${fn0},${fab0}R,${fm0}
827
828	b L\$outer_pa11
829	ldo `$LOCALS+32+4`($fp),$tp
830
831
832	L\$outerdone_pa11
833	add $hi0,$ablo,$ablo
834	addc %r0,$abhi,$abhi
835	add $ti1,$ablo,$ablo
836	addc %r0,$abhi,$hi0
837
838	ldw 4($tp),$ti0 ; tp[j]
839
840	add $hi1,$nmlo1,$nmlo1
841	addc %r0,$nmhi1,$nmhi1
842	add $ablo,$nmlo1,$nmlo1
843	addc %r0,$nmhi1,$hi1
844	stw $nmlo1,-4($tp) ; tp[j-1]
845
846	add $hi1,$hi0,$hi0
847	addc %r0,%r0,$hi1
848	add $ti0,$hi0,$hi0
849	addc %r0,$hi1,$hi1
850	stw $hi0,0($tp)
851	stw $hi1,4($tp)
852
853	ldo `$LOCALS+32+4`($fp),$tp
854	sub %r0,%r0,%r0 ; clear borrow
855	ldw -4($tp),$ti0
856	addl $tp,$arrsz,$tp
857	L\$sub_pa11
858	ldwx $idx($np),$hi0
859	subb $ti0,$hi0,$hi1
860	ldwx $idx($tp),$ti0
861	addib,<> 4,$idx,L\$sub_pa11
862	stws,ma $hi1,4($rp)
863
864	subb $ti0,%r0,$hi1
865
866	ldo `$LOCALS+32`($fp),$tp
867	sub $rp,$arrsz,$rp ; rewind rp
868	subi 0,$arrsz,$idx
869	L\$copy_pa11
870	ldw 0($tp),$ti0
871	ldw 0($rp),$hi0
872	stws,ma %r0,4($tp)
873	comiclr,= 0,$hi1,%r0
874	copy $ti0,$hi0
875	addib,<> 4,$idx,L\$copy_pa11
876	stws,ma $hi0,4($rp)
877
878	nop ; alignment
879	L\$done
880	___
881	}
882
883
884	$code.=<<___;
885	ldi 1,%r28 ; signal "handled"
886	ldo $FRAME($fp),%sp ; destroy tp[num+1]
887
888	$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
889	$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
890	$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
891	$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
892	$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
893	$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
894	$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
895	$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
896	L\$abort
897	bv (%r2)
898	.EXIT
899	$POPMB -$FRAME(%sp),%r3
900	.PROCEND
901	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
902	___
903
904
905	# Explicitly encode PA-RISC 2.0 instructions used in this module, so
906	# that it can be compiled with .LEVEL 1.0. It should be noted that I
907	# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
908	# directive...
909
# Hand-encode an "ldd" instruction as a raw .WORD so the output can be
# assembled without PA-RISC 2.0 support in the assembler.
#
#   $mod  - completer text that followed the mnemonic ("", ",ma", ",mb")
#   $args - operand text, either "%rX(%rB),%rT"   (format 4, indexed)
#           or "disp(%rB),%rT"                    (format 5, short displacement)
#
# Returns "\t.WORD\t0x........\t; <original text>" when the operands are
# recognised, otherwise the original instruction text prefixed by a tab.
my $ldd = sub {
    my ($mod, $args) = @_;
    my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {          # format 4
        my ($index, $base, $target) = ($1, $2, $3);
        my $opcode = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {         # format 5
        # Copy the captures first: the $mod matches below reset $1..$3.
        my ($disp, $base, $target) = ($1, $2, $3);
        my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
        # encode offset: only the low five bits of the displacement are used
        $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);
        $opcode |= (1<<5)  if ($mod =~ /^,m/);     # any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);    # ",mb" additionally sets bit 13
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};
927
# Hand-encode a "std" instruction as a raw .WORD.
#
#   $mod  - completer text that followed the mnemonic ("", ",ma", ",mb")
#   $args - operand text of the form "%rS,disp(%rB)"
#
# Returns "\t.WORD\t0x........\t; <original text>" when the operands are
# recognised, otherwise the original instruction text prefixed by a tab.
my $std = sub {
    my ($mod, $args) = @_;
    my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {         # format 6
        # Copy the captures first: the $mod matches below reset $1..$3.
        my ($source, $disp, $base) = ($1, $2, $3);
        my $opcode = (0x03<<26)|($base<<21)|($source<<16)|(1<<12)|(0xB<<6);
        # encode offset: only the low five bits of the displacement are used
        $opcode |= (($disp&0xF)<<1)|(($disp&0x10)>>4);
        $opcode |= (1<<5)  if ($mod =~ /^,m/);     # any ",m*" completer
        $opcode |= (1<<13) if ($mod =~ /^,mb/);    # ",mb" additionally sets bit 13
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};
941
# Hand-encode an "extrd" instruction as a raw .WORD.
#
#   $mod  - completer text that followed the mnemonic (e.g. ",u", ",u,*=")
#   $args - operand text, either "%rS,pos,len,%rT"   (format 15, fixed position)
#           or "%rS,%sar,len,%rT"                    (format 12, position from %sar)
#
# Returns "\t.WORD\t0x........\t; <original text>" when the operands are
# recognised, otherwise the original instruction text prefixed by a tab.
my $extrd = sub {
    my ($mod, $args) = @_;
    my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {      # format 15
        my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
        my $opcode = (0x36<<26)|($source<<21)|($target<<16);
        my $len = 32-$width;
        $opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);   # encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);        # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    if ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {          # format 12
        # Copy the captures first: the $mod match below resets $1..$3.
        my ($source, $width, $target) = ($1, $2, $3);
        my $opcode = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
        my $len = 32-$width;
        $opcode |= (($len&0x20)<<3)|($len&0x1f);        # encode len
        $opcode |= (1<<13) if ($mod =~ /,\**=/);        # ",=" / ",*=" condition
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};
963
# Hand-encode a "shrpd" instruction with an immediate shift amount as a
# raw .WORD.
#
#   $mod  - completer text that followed the mnemonic (not encoded here)
#   $args - operand text of the form "%rA,%rB,sa,%rT"
#
# Returns "\t.WORD\t0x........\t; <original text>" when the operands are
# recognised, otherwise the original instruction text prefixed by a tab.
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {    # format 14
        my ($first, $second, $shift, $target) = ($1, $2, $3, $4);
        my $opcode = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
        my $cpos = 63-$shift;
        $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);   # encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};
976
# Hand-encode "sub,db" with three general-register operands as a raw
# .WORD.  Any other form of "sub" is passed through as text.
#
#   $mod  - completer text that followed the mnemonic; only ",db" is encoded
#   $args - operand text of the form "%rA,%rB,%rT"
#
# Returns "\t.WORD\t0x........\t; <original text>" for the encoded form,
# otherwise the original instruction text prefixed by a tab.
my $sub = sub {
    my ($mod, $args) = @_;
    my $orig = "sub$mod\t$args";

    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
        my ($first, $second, $target) = ($1, $2, $3);
        my $opcode = (0x02<<26)|($second<<21)|($first<<16)|$target;
        $opcode |= (1<<10);    # e1
        $opcode |= (1<<8);     # e2
        $opcode |= (1<<5);     # d
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};
990
# Render one instruction.  If a scalar named after the mnemonic holds a
# code reference, that encoder is called with the completer and operand
# text; otherwise the instruction is reassembled as plain text.
sub assemble {
    my ($mnemonic, $mod, $args) = @_;
    my $encoder = eval("\$$mnemonic");    # look up $<mnemonic> by name

    return $encoder->($mod, $args) if ref($encoder) eq 'CODE';
    return "\t$mnemonic$mod\t$args";
}
997
# Run the compiler named by $ENV{CC} on an empty assembler input with
# -Wa,-v and set $gnuas when its output mentions "GNU assembler".
$gnuas = 1
    if `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler/;
1002
# Post-process the assembled template line by line and print the result.
foreach (split("\n",$code)) {
    # evaluate the arithmetic expressions enclosed in backticks
    s/\`([^\`]*)\`/eval $1/ge;
    # flip word order in 64-bit mode...
    s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
    # assemble 2.0 instructions in 32-bit mode...
    # (mnemonic, optional completer, whole operand string)
    s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e if ($BN_SZ==4);

    # GNU assembler spelling of the directives in 64-bit builds
    s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
    s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
    s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
    s/\bbv\b/bve/ if ($SIZE_T==8);

    print $_,"\n";
}
# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/bn/asm/parisc-mont.pl@ 102334

以其他格式下載: