bn-c64xplus.asm@ 102334

最後變更在這個檔案從102334是 101211,由 vboxsync 提交於 17 月前
openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527
檔案大小: 9.9 KB

行
1	;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
2	;;
3	;; Licensed under the Apache License 2.0 (the "License"). You may not use
4	;; this file except in compliance with the License. You can obtain a copy
5	;; in the file LICENSE in the source distribution or at
6	;; https://www.openssl.org/source/license.html
7	;;
8	;;====================================================================
9	;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
10	;; project.
11	;;
12	;; Rights for redistribution and usage in source and binary forms are
13	;; granted according to the License. Warranty of any kind is disclaimed.
14	;;====================================================================
15	;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
16	;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
17	;; unrolled SPLOOP-free loops - at ~8n and ~5n. Below assembler
18	;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
19	;;====================================================================
20	.text
21
22	.if .ASSEMBLER_VERSION<7000000
23	.asg 0,__TI_EABI__
24	.endif
25	.if __TI_EABI__
26	.asg bn_mul_add_words,_bn_mul_add_words
27	.asg bn_mul_words,_bn_mul_words
28	.asg bn_sqr_words,_bn_sqr_words
29	.asg bn_add_words,_bn_add_words
30	.asg bn_sub_words,_bn_sub_words
31	.asg bn_div_words,_bn_div_words
32	.asg bn_sqr_comba8,_bn_sqr_comba8
33	.asg bn_mul_comba8,_bn_mul_comba8
34	.asg bn_sqr_comba4,_bn_sqr_comba4
35	.asg bn_mul_comba4,_bn_mul_comba4
36	.endif
37
38	.asg B3,RA
39	.asg A4,ARG0
40	.asg B4,ARG1
41	.asg A6,ARG2
42	.asg B6,ARG3
43	.asg A8,ARG4
44	.asg B8,ARG5
45	.asg A4,RET
46	.asg A15,FP
47	.asg B14,DP
48	.asg B15,SP
49
50	.global _bn_mul_add_words
51	_bn_mul_add_words:
52	.asmfunc
53	MV ARG2,B0
54	[!B0] BNOP RA
55	\|\|[!B0] MVK 0,RET
56	[B0] MVC B0,ILC
57	[B0] ZERO A19 ; high part of accumulator
58	\|\| [B0] MV ARG0,A2
59	\|\| [B0] MV ARG3,A3
60	NOP 3
61
62	SPLOOP 2 ; 2*n+10
63	;;====================================================================
64	LDW *ARG1++,B7 ; ap[i]
65	NOP 3
66	LDW *ARG0++,A7 ; rp[i]
67	MPY32U B7,A3,A17:A16
68	NOP 3 ; [2,0] in epilogue
69	ADDU A16,A7,A21:A20
70	ADDU A19,A21:A20,A19:A18
71	\|\| MV.S A17,A23
72	SPKERNEL 2,1 ; leave slot for "return value"
73	\|\| STW A18,*A2++ ; rp[i]
74	\|\| ADD A19,A23,A19
75	;;====================================================================
76	BNOP RA,4
77	MV A19,RET ; return value
78	.endasmfunc
79
80	.global _bn_mul_words
81	_bn_mul_words:
82	.asmfunc
83	MV ARG2,B0
84	[!B0] BNOP RA
85	\|\|[!B0] MVK 0,RET
86	[B0] MVC B0,ILC
87	[B0] ZERO A19 ; high part of accumulator
88	NOP 3
89
90	SPLOOP 2 ; 2*n+10
91	;;====================================================================
92	LDW *ARG1++,A7 ; ap[i]
93	NOP 4
94	MPY32U A7,ARG3,A17:A16
95	NOP 4 ; [2,0] in epiloque
96	ADDU A19,A16,A19:A18
97	\|\| MV.S A17,A21
98	SPKERNEL 2,1 ; leave slot for "return value"
99	\|\| STW A18,*ARG0++ ; rp[i]
100	\|\| ADD.L A19,A21,A19
101	;;====================================================================
102	BNOP RA,4
103	MV A19,RET ; return value
104	.endasmfunc
105
106	.global _bn_sqr_words
107	_bn_sqr_words:
108	.asmfunc
109	MV ARG2,B0
110	[!B0] BNOP RA
111	\|\|[!B0] MVK 0,RET
112	[B0] MVC B0,ILC
113	[B0] MV ARG0,B2
114	\|\| [B0] ADD 4,ARG0,ARG0
115	NOP 3
116
117	SPLOOP 2 ; 2*n+10
118	;;====================================================================
119	LDW *ARG1++,B7 ; ap[i]
120	NOP 4
121	MPY32U B7,B7,B1:B0
122	NOP 3 ; [2,0] in epilogue
123	STW B0,B2++(8) ; rp[2i]
124	MV B1,A1
125	SPKERNEL 2,0 ; fully overlap BNOP RA,5
126	\|\| STW A1,ARG0++(8) ; rp[2i+1]
127	;;====================================================================
128	BNOP RA,5
129	.endasmfunc
130
131	.global _bn_add_words
132	_bn_add_words:
133	.asmfunc
134	MV ARG3,B0
135	[!B0] BNOP RA
136	\|\|[!B0] MVK 0,RET
137	[B0] MVC B0,ILC
138	[B0] ZERO A1 ; carry flag
139	\|\| [B0] MV ARG0,A3
140	NOP 3
141
142	SPLOOP 2 ; 2*n+6
143	;;====================================================================
144	LDW *ARG2++,A7 ; bp[i]
145	\|\| LDW *ARG1++,B7 ; ap[i]
146	NOP 4
147	ADDU A7,B7,A9:A8
148	ADDU A1,A9:A8,A1:A0
149	SPKERNEL 0,0 ; fully overlap BNOP RA,5
150	\|\| STW A0,*A3++ ; write result
151	\|\| MV A1,RET ; keep carry flag in RET
152	;;====================================================================
153	BNOP RA,5
154	.endasmfunc
155
156	.global _bn_sub_words
157	_bn_sub_words:
158	.asmfunc
159	MV ARG3,B0
160	[!B0] BNOP RA
161	\|\|[!B0] MVK 0,RET
162	[B0] MVC B0,ILC
163	[B0] ZERO A2 ; borrow flag
164	\|\| [B0] MV ARG0,A3
165	NOP 3
166
167	SPLOOP 2 ; 2*n+6
168	;;====================================================================
169	LDW *ARG2++,A7 ; bp[i]
170	\|\| LDW *ARG1++,B7 ; ap[i]
171	NOP 4
172	SUBU B7,A7,A1:A0
173	[A2] SUB A1:A0,1,A1:A0
174	SPKERNEL 0,1 ; leave slot for "return borrow flag"
175	\|\| STW A0,*A3++ ; write result
176	\|\| AND 1,A1,A2 ; pass on borrow flag
177	;;====================================================================
178	BNOP RA,4
179	AND 1,A1,RET ; return borrow flag
180	.endasmfunc
181
182	.global _bn_div_words
183	_bn_div_words:
184	.asmfunc
185	LMBD 1,A6,A0 ; leading zero bits in dv
186	LMBD 1,A4,A1 ; leading zero bits in hi
187	\|\| MVK 32,B0
188	CMPLTU A1,A0,A2
189	\|\| ADD A0,B0,B0
190	[ A2] BNOP RA
191	\|\|[ A2] MVK -1,A4 ; return overflow
192	\|\|[!A2] MV A4,A3 ; reassign hi
193	[!A2] MV B4,A4 ; reassign lo, will be quotient
194	\|\|[!A2] MVC B0,ILC
195	[!A2] SHL A6,A0,A6 ; normalize dv
196	\|\| MVK 1,A1
197
198	[!A2] CMPLTU A3,A6,A1 ; hi<dv?
199	\|\|[!A2] SHL A4,1,A5:A4 ; lo<<1
200	[!A1] SUB A3,A6,A3 ; hi-=dv
201	\|\|[!A1] OR 1,A4,A4
202	[!A2] SHRU A3,31,A1 ; upper bit
203	\|\|[!A2] ADDAH A5,A3,A3 ; hi<<1\|lo>>31
204
205	SPLOOP 3
206	[!A1] CMPLTU A3,A6,A1 ; hi<dv?
207	\|\|[ A1] ZERO A1
208	\|\| SHL A4,1,A5:A4 ; lo<<1
209	[!A1] SUB A3,A6,A3 ; hi-=dv
210	\|\|[!A1] OR 1,A4,A4 ; quotient
211	SHRU A3,31,A1 ; upper bit
212	\|\| ADDAH A5,A3,A3 ; hi<<1\|lo>>31
213	SPKERNEL
214
215	BNOP RA,5
216	.endasmfunc
217
218	;;====================================================================
219	;; Not really Comba algorithm, just straightforward NxM... Dedicated
220	;; fully unrolled real Comba implementations are asymptotically 2x
221	;; faster, but naturally larger undertaking. Purpose of this exercise
222	;; was rather to learn to master nested SPLOOPs...
223	;;====================================================================
224	.global _bn_sqr_comba8
225	.global _bn_mul_comba8
226	_bn_sqr_comba8:
227	MV ARG1,ARG2
228	_bn_mul_comba8:
229	.asmfunc
230	MVK 8,B0 ; N, RILC
231	\|\| MVK 8,A0 ; M, outer loop counter
232	\|\| MV ARG1,A5 ; copy ap
233	\|\| MV ARG0,B4 ; copy rp
234	\|\| ZERO B19 ; high part of accumulator
235	MVC B0,RILC
236	\|\| SUB B0,2,B1 ; N-2, initial ILC
237	\|\| SUB B0,1,B2 ; const B2=N-1
238	\|\| LDW *A5++,B6 ; ap[0]
239	\|\| MV A0,A3 ; const A3=M
240	sploopNxM?: ; for best performance arrange M<=N
241	[A0] SPLOOPD 2 ; 2*n+10
242	\|\| MVC B1,ILC
243	\|\| ADDAW B4,B0,B5
244	\|\| ZERO B7
245	\|\| LDW *A5++,A9 ; pre-fetch ap[1]
246	\|\| ZERO A1
247	\|\| SUB A0,1,A0
248	;;====================================================================
249	;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
250	;; This is because of Advisory 15 from TI publication SPRZ247I.
251	LDW *ARG2++,A7 ; bp[i]
252	NOP 3
253	[A1] LDW *B5++,B7 ; rp[i]
254	MPY32U A7,B6,B17:B16
255	NOP 3
256	ADDU B16,B7,B21:B20
257	ADDU B19,B21:B20,B19:B18
258	\|\| MV.S B17,B23
259	SPKERNEL
260	\|\| STW B18,*B4++ ; rp[i]
261	\|\| ADD.S B19,B23,B19
262	;;====================================================================
263	outer?: ; m2(n+1)+10
264	SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
265	SPMASKR
266	\|\| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
267	MVD A9,B6 ; move through .M unit(*)
268	[A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
269	SUBAW B5,B2,B5 ; rewind rp to rp[1]
270	MVK 1,A1
271	[A0] BNOP.S1 outer?,4
272	\|\| [A0] SUB.L A0,1,A0
273	STW B19,*B4--[B2] ; rewind rp tp rp[1]
274	\|\| ZERO.S B19 ; high part of accumulator
275	;; end of outer?
276	BNOP RA,5 ; return
277	.endasmfunc
278	;; (*) It should be noted that B6 is used as input to MPY32U in
279	;; chronologically next cycle in preceding SPLOOP iteration.
280	;; Normally such arrangement would require DINT, but at this
281	;; point SPLOOP is draining and interrupts are disabled
282	;; implicitly.
283
284	.global _bn_sqr_comba4
285	.global _bn_mul_comba4
286	_bn_sqr_comba4:
287	MV ARG1,ARG2
288	_bn_mul_comba4:
289	.asmfunc
290	.if 0
291	BNOP sploopNxM?,3
292	;; Above mentioned m2(n+1)+10 does not apply in n=m=4 case,
293	;; because of low-counter effect, when prologue phase finishes
294	;; before SPKERNEL instruction is reached. As result it's 25%
295	;; slower than expected...
296	MVK 4,B0 ; N, RILC
297	\|\| MVK 4,A0 ; M, outer loop counter
298	\|\| MV ARG1,A5 ; copy ap
299	\|\| MV ARG0,B4 ; copy rp
300	\|\| ZERO B19 ; high part of accumulator
301	MVC B0,RILC
302	\|\| SUB B0,2,B1 ; first ILC
303	\|\| SUB B0,1,B2 ; const B2=N-1
304	\|\| LDW *A5++,B6 ; ap[0]
305	\|\| MV A0,A3 ; const A3=M
306	.else
307	;; This alternative is an exercise in fully unrolled Comba
308	;; algorithm implementation that operates at n*(n+1)+12, or
309	;; as little as 32 cycles...
310	LDW *ARG1[0],B16 ; a[0]
311	\|\| LDW *ARG2[0],A16 ; b[0]
312	LDW *ARG1[1],B17 ; a[1]
313	\|\| LDW *ARG2[1],A17 ; b[1]
314	LDW *ARG1[2],B18 ; a[2]
315	\|\| LDW *ARG2[2],A18 ; b[2]
316	LDW *ARG1[3],B19 ; a[3]
317	\|\| LDW *ARG2[3],A19 ; b[3]
318	NOP
319	MPY32U A16,B16,A1:A0 ; a[0]*b[0]
320	MPY32U A17,B16,A23:A22 ; a[0]*b[1]
321	MPY32U A16,B17,A25:A24 ; a[1]*b[0]
322	MPY32U A16,B18,A27:A26 ; a[2]*b[0]
323	STW A0,*ARG0[0]
324	\|\| MPY32U A17,B17,A29:A28 ; a[1]*b[1]
325	MPY32U A18,B16,A31:A30 ; a[0]*b[2]
326	\|\| ADDU A22,A1,A1:A0
327	MV A23,B0
328	\|\| MPY32U A19,B16,A21:A20 ; a[3]*b[0]
329	\|\| ADDU A24,A1:A0,A1:A0
330	ADDU A25,B0,B1:B0
331	\|\| STW A0,*ARG0[1]
332	\|\| MPY32U A18,B17,A23:A22 ; a[2]*b[1]
333	\|\| ADDU A26,A1,A9:A8
334	ADDU A27,B1,B9:B8
335	\|\| MPY32U A17,B18,A25:A24 ; a[1]*b[2]
336	\|\| ADDU A28,A9:A8,A9:A8
337	ADDU A29,B9:B8,B9:B8
338	\|\| MPY32U A16,B19,A27:A26 ; a[0]*b[3]
339	\|\| ADDU A30,A9:A8,A9:A8
340	ADDU A31,B9:B8,B9:B8
341	\|\| ADDU B0,A9:A8,A9:A8
342	STW A8,*ARG0[2]
343	\|\| ADDU A20,A9,A1:A0
344	ADDU A21,B9,B1:B0
345	\|\| MPY32U A19,B17,A21:A20 ; a[3]*b[1]
346	\|\| ADDU A22,A1:A0,A1:A0
347	ADDU A23,B1:B0,B1:B0
348	\|\| MPY32U A18,B18,A23:A22 ; a[2]*b[2]
349	\|\| ADDU A24,A1:A0,A1:A0
350	ADDU A25,B1:B0,B1:B0
351	\|\| MPY32U A17,B19,A25:A24 ; a[1]*b[3]
352	\|\| ADDU A26,A1:A0,A1:A0
353	ADDU A27,B1:B0,B1:B0
354	\|\| ADDU B8,A1:A0,A1:A0
355	STW A0,*ARG0[3]
356	\|\| MPY32U A19,B18,A27:A26 ; a[3]*b[2]
357	\|\| ADDU A20,A1,A9:A8
358	ADDU A21,B1,B9:B8
359	\|\| MPY32U A18,B19,A29:A28 ; a[2]*b[3]
360	\|\| ADDU A22,A9:A8,A9:A8
361	ADDU A23,B9:B8,B9:B8
362	\|\| MPY32U A19,B19,A31:A30 ; a[3]*b[3]
363	\|\| ADDU A24,A9:A8,A9:A8
364	ADDU A25,B9:B8,B9:B8
365	\|\| ADDU B0,A9:A8,A9:A8
366	STW A8,*ARG0[4]
367	\|\| ADDU A26,A9,A1:A0
368	ADDU A27,B9,B1:B0
369	\|\| ADDU A28,A1:A0,A1:A0
370	ADDU A29,B1:B0,B1:B0
371	\|\| BNOP RA
372	\|\| ADDU B8,A1:A0,A1:A0
373	STW A0,*ARG0[5]
374	\|\| ADDU A30,A1,A9:A8
375	ADD A31,B1,B8
376	ADDU B0,A9:A8,A9:A8 ; removed \|\| to avoid cross-path stall below
377	ADD B8,A9,A9
378	\|\| STW A8,*ARG0[6]
379	STW A9,*ARG0[7]
380	.endif
381	.endasmfunc

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/bn/asm/bn-c64xplus.asm@ 102334

以其他格式下載: