vis3-mont.pl@ 102334

最後變更在這個檔案從102334是 101211,由 vboxsync 提交於 17 月前
openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527
檔案大小: 9.1 KB

行
1	#! /usr/bin/env perl
2	# Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
3	#
4	# Licensed under the Apache License 2.0 (the "License"). You may not use
5	# this file except in compliance with the License. You can obtain a copy
6	# in the file LICENSE in the source distribution or at
7	# https://www.openssl.org/source/license.html
8
9
10	# ====================================================================
11	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12	# project. The module is, however, dual licensed under OpenSSL and
13	# CRYPTOGAMS licenses depending on where you obtain it. For further
14	# details see http://www.openssl.org/~appro/cryptogams/.
15	# ====================================================================
16
17	# October 2012.
18	#
19	# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
20	# onward. There are three new instructions used here: umulxhi,
21	# addxc[cc] and initializing store. On T3 RSA private key operations
22	# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
23	# lengths. This is without dedicated squaring procedure. On T4
24	# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
25	# for reference purposes, because T4 has dedicated Montgomery
26	# multiplication and squaring instructions that deliver even more.
27
# Output redirection: the last command-line argument, when present, names
# the file that receives the generated assembly; without it the text goes
# to the inherited STDOUT.
$output = pop;
if ($output) {
    # Three-argument open keeps the file name from being parsed as part of
    # the mode, and a failed open is reported here instead of surfacing
    # only at the final close.
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Symbolic names that are interpolated verbatim into the assembly text.
# NOTE(review): presumably resolved by the C preprocessor through
# crypto/sparc_arch.h -- confirm against that header.
$frame = "STACK_FRAME";
$bias = "STACK_BIAS";

# Preamble: pull in the architecture header, declare %g2/%g3 as scratch
# registers for 64-bit builds and open the text section.
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif

.section ".text",#alloc,#execinstr
___
46
# Working registers for the multiplication: five globals (%g1-%g5) followed
# by seven out registers (%o0-%o5 and %o7; %o6 is left out of the list).
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# Incoming arguments as seen on entry, before the register window changes.
# int bn_mul_mont(
$rp="%o0";	# BN_ULONG *rp,
$ap="%o1";	# const BN_ULONG *ap,
$bp="%o2";	# const BN_ULONG *bp,
$np="%o3";	# const BN_ULONG *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is even
				# and >=6

# Entry point and frame set-up.  num is turned into a byte count (num*4),
# rounded up to a multiple of 64 in %g5, and three such buffers plus the
# standard frame are reserved below the current top of stack, with the
# buffer area aligned at 64 bytes.  The final `save` receives the
# (negative) distance between the old and the new top of stack.
$code.=<<___;
.globl bn_mul_mont_vis3
.align 32
bn_mul_mont_vis3:
add %sp, $bias, %g4 ! real top of stack
sll $num, 2, $num ! size in bytes
add $num, 63, %g5
andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
add %g5, %g5, %g1
add %g5, %g1, %g1 ! 3*buffer size
sub %g4, %g1, %g1
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, $frame, %g1 ! new top of stack
sub %g1, %g4, %g1

save %sp, %g1, %sp
___
75
76
# Stack frame layout used by the code below (tp = %sp+bias+frame,
# anp = tp + rounded buffer size held in %g5):
#
#	+-------------------------------+<----- %sp
#	.                               .
#	+-------------------------------+<----- aligned at 64 bytes
#	| __int64 tmp[0]                |
#	+-------------------------------+
#	.                               .
#	.                               .
#	+-------------------------------+<----- aligned at 64 bytes
#	| __int64 ap[1..0]              |	converted ap[]
#	+-------------------------------+
#	| __int64 np[1..0]              |	converted np[]
#	+-------------------------------+
#	| __int64 ap[3..2]              |
#	.                               .
#	.                               .
#	+-------------------------------+

# From here on the arguments are addressed through %i0-%i5 (the code runs
# after the prologue's `save`), and %l0-%l7 serve as locals.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
# $bufsz is given %l6 but is not referenced in the assembly below, which
# reads the rounded buffer size from %g5 instead.
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
# The overflow bit and the outer loop counter share registers with the
# $t0/$t1 load temporaries.
($ovf,$i)=($t0,$t1);

# Body of bn_mul_mont_vis3.  $num is a byte count here and every counter
# steps by 8.  Phases, in order:
#   - setup:   load n0 and bp[0]; combine 32-bit word pairs of ap[]/np[]
#              into 64-bit values and store them interleaved at $anp;
#   - .L1st:   first pass, ap[j]*bp[0] + np[j]*m1, results written to tp[];
#   - .Louter/.Linner: the same for the remaining bp[i], adding tp[j];
#   - .Lsub:   tp[] - np[] written to rp[] as 32-bit halves in swapped order;
#   - .Lcopy:  carry-flag-driven choice between tp[] and the subtraction
#              result already in rp[], while zeroing tp[] and the
#              converted ap/np buffer.
# The heredoc is interpolated, hence the escaped \@ in the .asciz string.
$code.=<<___;
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, $bias+$frame, $tp
ld [$n0p+4], $t1
add $tp, %g5, $anp
ld [$bp+0], $t2 ! m0=bp[0]
sllx $t1, 32, $n0
ld [$bp+4], $t3
or $t0, $n0, $n0
add $bp, 8, $bp


ld [$ap+0], $t0 ! ap[0]
sllx $t3, 32, $m0
ld [$ap+4], $t1
or $t2, $m0, $m0

ld [$ap+8], $t2 ! ap[1]
sllx $t1, 32, $aj
ld [$ap+12], $t3
or $t0, $aj, $aj
add $ap, 16, $ap
stx $aj, [$anp] ! converted ap[0]

mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0

ld [$np+0], $t0 ! np[0]
sllx $t3, 32, $aj
ld [$np+4], $t1
or $t2, $aj, $aj

ld [$np+8], $t2 ! np[1]
sllx $t1, 32, $nj
ld [$np+12], $t3
or $t0, $nj, $nj
add $np, 16, $np
stx $nj, [$anp+8] ! converted np[0]

mulx $lo0, $n0, $m1 ! "tp[0]"*n0
stx $aj, [$anp+16] ! converted ap[1]

mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj

mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1

sllx $t3, 32, $nj
or $t2, $nj, $nj
stx $nj, [$anp+24] ! converted np[1]
add $anp, 32, $anp

addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1

mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj


ba .L1st
sub $num, 24, $cnt ! cnt=num-3

.align 16
.L1st:
ld [$ap+0], $t0 ! ap[j]
addcc $alo, $hi0, $lo0
ld [$ap+4], $t1
addxc $aj, %g0, $hi0

sllx $t1, 32, $aj
add $ap, 8, $ap
or $t0, $aj, $aj
stx $aj, [$anp] ! converted ap[j]

ld [$np+0], $t2 ! np[j]
addcc $nlo, $hi1, $lo1
ld [$np+4], $t3
addxc $nj, %g0, $hi1 ! nhi=nj

sllx $t3, 32, $nj
add $np, 8, $np
mulx $aj, $m0, $alo ! ap[j]*bp[0]
or $t2, $nj, $nj
umulxhi $aj, $m0, $aj ! ahi=aj
stx $nj, [$anp+8] ! converted np[j]
add $anp, 16, $anp ! anp++

mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp ! tp++

brnz,pt $cnt, .L1st
sub $cnt, 8, $cnt ! j--
!.L1st
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj

addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp

addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stx $hi1, [$tp]
add $tp, 8, $tp


ba .Louter
sub $num, 16, $i ! i=num-2

.align 16
.Louter:
ld [$bp+0], $t2 ! m0=bp[i]
ld [$bp+4], $t3

sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp

add $bp, 8, $bp
sllx $t3, 32, $m0
ldx [$anp+0], $aj ! ap[0]
or $t2, $m0, $m0
ldx [$anp+8], $nj ! np[0]

mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$anp+16], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$anp+24], $nj ! np[1]
add $anp, 32, $anp
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj


ba .Linner
sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner:
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$anp+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$anp+8], $nj ! np[j]
add $anp, 16, $anp
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
brnz,pt $cnt, .Linner
sub $cnt, 8, $cnt
!.Linner
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0

addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]

subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
stx $hi1, [$tp+8]
add $tp, 16, $tp

brnz,pt $i, .Louter
sub $i, 8, $i


sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
ba .Lsub
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc

.align 16
.Lsub:
ldx [$tp], $tj
add $tp, 8, $tp
ldx [$anp+8], $nj
add $anp, 16, $anp
subccc $tj, $nj, $t2 ! tp[j]-np[j]
srlx $tj, 32, $tj
srlx $nj, 32, $nj
subccc $tj, $nj, $t3
add $rp, 8, $rp
st $t2, [$rp-4] ! reverse order
st $t3, [$rp-8]
brnz,pt $cnt, .Lsub
sub $cnt, 8, $cnt

sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
sub $rp, $num, $rp

subccc $ovf, %g0, $ovf ! handle upmost overflow bit
ba .Lcopy
sub $num, 8, $cnt

.align 16
.Lcopy: ! conditional copy
ld [$tp+0], $t0
ld [$tp+4], $t1
ld [$rp+0], $t2
ld [$rp+4], $t3
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx %g0, [$anp] ! zap
stx %g0, [$anp+8]
add $anp, 16, $anp
movcs %icc, $t0, $t2
movcs %icc, $t1, $t3
st $t3, [$rp+0] ! flip order
st $t2, [$rp+4]
add $rp, 8, $rp
brnz $cnt, .Lcopy
sub $cnt, 8, $cnt

mov 1, %o0
ret
restore
.type bn_mul_mont_vis3, #function
.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
352
353
# unvis3 -- hand-encode one VIS3 instruction as a raw .word directive.
#
# Emitting the opcode as data lets the module be assembled without VIS
# extensions being enabled on the compiler command line (e.g. -xarch=v9
# rather than -xarch=v9a).  That keeps the option of producing a
# "universal" binary open and leaves it to the program to detect at run
# time whether the CPU is VIS capable.
#
# Arguments: $mnemonic ("addxc", "addxccc" or "umulxhi"), followed by the
#            first source, second source and destination register, each
#            spelled %g, %o, %l or %i plus a digit 0-7.
# Returns:   ".word\t0x<8 hex digits> !<instruction text>" when the
#            mnemonic is known and all three operands are such registers;
#            otherwise the plain instruction text "mnemonic\trs1,rs2,rd",
#            so the assembler gets to see the original instruction.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;

    # Offset of each register class within the 0..31 register numbering.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    # opf field value of each instruction handled here.
    my %visopf = ( "addxc"   => 0x011,
                   "addxccc" => 0x013,
                   "umulxhi" => 0x016 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless $opf;

    my @regno;
    foreach my $reg ($rs1,$rs2,$rd) {
        # Only a bare register with an index of 0-7 can be encoded; anything
        # else (including an impossible index such as %g8, which would
        # otherwise alias a register of the next class) is passed through.
        return $ref unless $reg =~ /\A%([goli])([0-7])\z/;
        push @regno, $bias{$1}+$2;
    }
    my ($src1,$src2,$dst) = @regno;

    # Field positions: rd at bit 25, rs1 at bit 14, opf at bit 5, rs2 at
    # bit 0, OR-ed into the fixed base word.
    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$dst<<25|$src1<<14|$opf<<5|$src2,
                   $ref;
}
382
# Post-process the accumulated assembly one line at a time and print it.
foreach (split("\n",$code)) {
    # Expand `...` spans by evaluating the enclosed text as Perl.
    s/\`([^\`]*)\`/eval $1/ge;

    # Replace VIS3 instructions whose three operands are plain registers by
    # their .word encoding.  \s* after each comma accepts operand lists
    # written with or without white space ("a,b" as well as "a, b").
    s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
        unvis3($1,$2,$3,$4)
     /ge;

    print $_,"\n";
}

# Buffered write errors only become visible when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/bn/asm/vis3-mont.pl@ 102334

以其他格式下載: