VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.3/crypto/bn/asm/bn-c64xplus.asm@ 102334

最後變更 在這個檔案從102334是 101211,由 vboxsync 提交於 17 月 前

openssl-3.1.3: Applied and adjusted our OpenSSL changes to 3.1.2. bugref:10527

檔案大小: 9.9 KB
 
1;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
2;;
3;; Licensed under the Apache License 2.0 (the "License"). You may not use
4;; this file except in compliance with the License. You can obtain a copy
5;; in the file LICENSE in the source distribution or at
6;; https://www.openssl.org/source/license.html
7;;
8;;====================================================================
9;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
10;; project.
11;;
12;; Rights for redistribution and usage in source and binary forms are
13;; granted according to the License. Warranty of any kind is disclaimed.
14;;====================================================================
15;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
16;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
17;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
18;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
19;;====================================================================
;; Assembly-time setup shared by all routines below: selects the code
;; section, reconciles exported symbol names between ABIs, and gives
;; the argument/return registers symbolic names.
20 .text
21
;; When .ASSEMBLER_VERSION is below 7000000, define __TI_EABI__ as 0 so
;; that the .if test that follows always has a value to evaluate.
;; NOTE(review): presumably such assemblers do not predefine
;; __TI_EABI__ - confirm against the TI tools documentation.
22 .if .ASSEMBLER_VERSION<7000000
23 .asg 0,__TI_EABI__
24 .endif
;; Under EABI every underscore-prefixed name used in this file is
;; substituted by the same name without the leading underscore, so the
;; labels and .global directives below export unprefixed symbols.
25 .if __TI_EABI__
26 .asg bn_mul_add_words,_bn_mul_add_words
27 .asg bn_mul_words,_bn_mul_words
28 .asg bn_sqr_words,_bn_sqr_words
29 .asg bn_add_words,_bn_add_words
30 .asg bn_sub_words,_bn_sub_words
31 .asg bn_div_words,_bn_div_words
32 .asg bn_sqr_comba8,_bn_sqr_comba8
33 .asg bn_mul_comba8,_bn_mul_comba8
34 .asg bn_sqr_comba4,_bn_sqr_comba4
35 .asg bn_mul_comba4,_bn_mul_comba4
36 .endif
37
;; Symbolic register names used by the routines below:
;;   RA          register every routine branches to on exit
;;   ARG0..ARG5  incoming arguments; they alternate between the A and
;;               B register files
;;   RET         return value; the same physical register as ARG0, so
;;               a routine that writes RET inside its loop must first
;;               copy ARG0 elsewhere
;; FP, DP and SP are defined but are not used as operands by any
;; routine visible in this file.
38 .asg B3,RA
39 .asg A4,ARG0
40 .asg B4,ARG1
41 .asg A6,ARG2
42 .asg B6,ARG3
43 .asg A8,ARG4
44 .asg B8,ARG5
45 .asg A4,RET
46 .asg A15,FP
47 .asg B14,DP
48 .asg B15,SP
49
;;--------------------------------------------------------------------
;; bn_mul_add_words: for each of num words computes
;;   ap[i]*w + rp[i] + carry, stores the low 32 bits back to rp[i]
;; and keeps the remainder as the carry into the next word.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = num (word count), ARG3 = w
;; Out: RET = carry left after the last word; 0 when num is 0
;; NOTE(review): the C prototype is not visible here; argument roles
;; are read off the register usage and the inline comments.
;;--------------------------------------------------------------------
50 .global _bn_mul_add_words
51_bn_mul_add_words:
52 .asmfunc
;; num == 0: return 0 at once.  The predicated packets and the NOP 3
;; that follow then do nothing and fill the branch delay slots.
53 MV ARG2,B0
54 [!B0] BNOP RA
55||[!B0] MVK 0,RET
;; num != 0: the loop count goes to ILC; rp and w are copied to A2/A3
;; (rp is both read through ARG0 and written through A2 in the loop).
56 [B0] MVC B0,ILC
57 [B0] ZERO A19 ; high part of accumulator
58|| [B0] MV ARG0,A2
59|| [B0] MV ARG3,A3
;; NOTE(review): presumably these cycles also let the ILC write take
;; effect before SPLOOP - confirm against the C64x+ CPU guide.
60 NOP 3
61
62 SPLOOP 2 ; 2*n+10
63;;====================================================================
64 LDW *ARG1++,B7 ; ap[i]
65 NOP 3
66 LDW *ARG0++,A7 ; rp[i]
;; A17:A16 = ap[i]*w (64-bit product)
67 MPY32U B7,A3,A17:A16
68 NOP 3 ; [2,0] in epilogue
;; A21:A20 = low(product) + rp[i]; A19:A18 = that + running carry
69 ADDU A16,A7,A21:A20
70 ADDU A19,A21:A20,A19:A18
71|| MV.S A17,A23
;; Store the result word; new carry = carry-out (A19) + high(product).
72 SPKERNEL 2,1 ; leave slot for "return value"
73|| STW A18,*A2++ ; rp[i]
74|| ADD A19,A23,A19
75;;====================================================================
76 BNOP RA,4
77 MV A19,RET ; return value
78 .endasmfunc
79
;;--------------------------------------------------------------------
;; bn_mul_words: for each of num words computes ap[i]*w + carry,
;; stores the low 32 bits to rp[i] and keeps the remainder as the
;; carry into the next word.  Unlike bn_mul_add_words, rp is only
;; written, never read.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = num (word count), ARG3 = w
;; Out: RET = carry left after the last word; 0 when num is 0
;; NOTE(review): the C prototype is not visible here; argument roles
;; are read off the register usage and the inline comments.
;;--------------------------------------------------------------------
80 .global _bn_mul_words
81_bn_mul_words:
82 .asmfunc
;; num == 0: return 0 at once.  The predicated packets and the NOP 3
;; that follow then do nothing and fill the branch delay slots.
83 MV ARG2,B0
84 [!B0] BNOP RA
85||[!B0] MVK 0,RET
86 [B0] MVC B0,ILC
87 [B0] ZERO A19 ; high part of accumulator
88 NOP 3
89
90 SPLOOP 2 ; 2*n+10
91;;====================================================================
92 LDW *ARG1++,A7 ; ap[i]
93 NOP 4
;; A17:A16 = ap[i]*w; w is read directly from ARG3
94 MPY32U A7,ARG3,A17:A16
95 NOP 4 ; [2,0] in epilogue
;; A19:A18 = running carry + low(product)
96 ADDU A19,A16,A19:A18
97|| MV.S A17,A21
;; Store the result word; new carry = carry-out (A19) + high(product).
98 SPKERNEL 2,1 ; leave slot for "return value"
99|| STW A18,*ARG0++ ; rp[i]
100|| ADD.L A19,A21,A19
101;;====================================================================
102 BNOP RA,4
103 MV A19,RET ; return value
104 .endasmfunc
105
;;--------------------------------------------------------------------
;; bn_sqr_words: squares each of num input words and writes the 64-bit
;; result as two consecutive words, rp[2*i] (low) and rp[2*i+1] (high).
;; Squares are independent; nothing is carried between words.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = num (word count)
;; Out: RET is written (with 0) only on the num == 0 path.
;; NOTE(review): the C prototype is not visible here; argument roles
;; are read off the register usage and the inline comments.
;;--------------------------------------------------------------------
106 .global _bn_sqr_words
107_bn_sqr_words:
108 .asmfunc
;; num == 0: return at once.  The predicated packets and the NOP 3
;; that follow then do nothing and fill the branch delay slots.
109 MV ARG2,B0
110 [!B0] BNOP RA
111||[!B0] MVK 0,RET
112 [B0] MVC B0,ILC
;; Two store pointers, one per register file: B2 -> rp[0] for the low
;; words and ARG0 -> rp[1] for the high words.  Each advances by 8
;; bytes per iteration.
113 [B0] MV ARG0,B2
114|| [B0] ADD 4,ARG0,ARG0
115 NOP 3
116
117 SPLOOP 2 ; 2*n+10
118;;====================================================================
119 LDW *ARG1++,B7 ; ap[i]
120 NOP 4
;; B1:B0 = ap[i]^2.  B0 is free to be overwritten here because the
;; word count has already been moved to ILC.
121 MPY32U B7,B7,B1:B0
122 NOP 3 ; [2,0] in epilogue
123 STW B0,*B2++(8) ; rp[2*i]
;; The high word is moved to the A side so that it can be stored
;; through ARG0.
124 MV B1,A1
125 SPKERNEL 2,0 ; fully overlap BNOP RA,5
126|| STW A1,*ARG0++(8) ; rp[2*i+1]
127;;====================================================================
128 BNOP RA,5
129 .endasmfunc
130
;;--------------------------------------------------------------------
;; bn_add_words: rp[i] = ap[i] + bp[i] + carry for each of num words.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num (word count)
;; Out: RET = carry out of the last word; 0 when num is 0
;; NOTE(review): the C prototype is not visible here; argument roles
;; are read off the register usage and the inline comments.
;;--------------------------------------------------------------------
131 .global _bn_add_words
132_bn_add_words:
133 .asmfunc
;; num == 0: return 0 at once.  The predicated packets and the NOP 3
;; that follow then do nothing and fill the branch delay slots.
134 MV ARG3,B0
135 [!B0] BNOP RA
136||[!B0] MVK 0,RET
137 [B0] MVC B0,ILC
;; rp is copied to A3 because RET (the same register as ARG0) is
;; overwritten with the carry on every iteration.
138 [B0] ZERO A1 ; carry flag
139|| [B0] MV ARG0,A3
140 NOP 3
141
142 SPLOOP 2 ; 2*n+6
143;;====================================================================
144 LDW *ARG2++,A7 ; bp[i]
145|| LDW *ARG1++,B7 ; ap[i]
146 NOP 4
;; A9:A8 = ap[i] + bp[i]; A1:A0 = that + incoming carry, which leaves
;; the sum word in A0 and the outgoing carry in A1.
147 ADDU A7,B7,A9:A8
148 ADDU A1,A9:A8,A1:A0
149 SPKERNEL 0,0 ; fully overlap BNOP RA,5
150|| STW A0,*A3++ ; write result
151|| MV A1,RET ; keep carry flag in RET
152;;====================================================================
153 BNOP RA,5
154 .endasmfunc
155
;;--------------------------------------------------------------------
;; bn_sub_words: rp[i] = ap[i] - bp[i] - borrow for each of num words.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num (word count)
;; Out: RET = borrow out of the last word (0 or 1); 0 when num is 0
;; NOTE(review): the C prototype is not visible here; argument roles
;; are read off the register usage and the inline comments.
;;--------------------------------------------------------------------
156 .global _bn_sub_words
157_bn_sub_words:
158 .asmfunc
;; num == 0: return 0 at once.  The predicated packets and the NOP 3
;; that follow then do nothing and fill the branch delay slots.
159 MV ARG3,B0
160 [!B0] BNOP RA
161||[!B0] MVK 0,RET
162 [B0] MVC B0,ILC
;; The result is stored through A3, a copy of rp.
163 [B0] ZERO A2 ; borrow flag
164|| [B0] MV ARG0,A3
165 NOP 3
166
167 SPLOOP 2 ; 2*n+6
168;;====================================================================
169 LDW *ARG2++,A7 ; bp[i]
170|| LDW *ARG1++,B7 ; ap[i]
171 NOP 4
;; A1:A0 = ap[i] - bp[i]; one more is subtracted when the previous
;; word borrowed (A2 != 0).
172 SUBU B7,A7,A1:A0
173 [A2] SUB A1:A0,1,A1:A0
;; Bit 0 of the upper result register is used as the borrow indicator.
174 SPKERNEL 0,1 ; leave slot for "return borrow flag"
175|| STW A0,*A3++ ; write result
176|| AND 1,A1,A2 ; pass on borrow flag
177;;====================================================================
178 BNOP RA,4
179 AND 1,A1,RET ; return borrow flag
180 .endasmfunc
181
;;--------------------------------------------------------------------
;; bn_div_words: divides the two-word value hi:lo by dv using a
;; shift-and-subtract loop that produces one quotient bit per step.
;; In:  A4 = hi, B4 = lo, A6 = dv
;; Out: A4 = quotient, or -1 when hi has fewer leading zero bits
;;      than dv (flagged below as "overflow")
;; NOTE(review): the C prototype is not visible here; argument roles
;; are read off the register usage and the inline comments.
;;--------------------------------------------------------------------
182 .global _bn_div_words
183_bn_div_words:
184 .asmfunc
185 LMBD 1,A6,A0 ; leading zero bits in dv
186 LMBD 1,A4,A1 ; leading zero bits in hi
187|| MVK 32,B0
;; A2 = overflow flag; B0 = 32 + leading zeros of dv, the SPLOOP
;; iteration count loaded into ILC below.
188 CMPLTU A1,A0,A2
189|| ADD A0,B0,B0
;; On overflow, return -1.  The packets up to the SPLOOP sit in the
;; branch delay slots, so all state changes in them are predicated on
;; !A2, or on !A1 with A1 forced to 1 just before.
190 [ A2] BNOP RA
191||[ A2] MVK -1,A4 ; return overflow
192||[!A2] MV A4,A3 ; reassign hi
193 [!A2] MV B4,A4 ; reassign lo, will be quotient
194||[!A2] MVC B0,ILC
195 [!A2] SHL A6,A0,A6 ; normalize dv
196|| MVK 1,A1
197
;; First division step, peeled out ahead of the loop.  A1 acts as
;; "skip the subtract": the unconditional MVK 1,A1 above sets it, and
;; it is replaced by the hi<dv comparison only when there is no
;; overflow.
198 [!A2] CMPLTU A3,A6,A1 ; hi<dv?
199||[!A2] SHL A4,1,A5:A4 ; lo<<1
200 [!A1] SUB A3,A6,A3 ; hi-=dv
201||[!A1] OR 1,A4,A4
202 [!A2] SHRU A3,31,A1 ; upper bit
203||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
204
;; Loop body.  On entry A1 holds the bit that the previous step
;; shifted out of hi.  When that bit is set, A1 is cleared instead of
;; compared, which forces the subtract; otherwise A1 = (hi < dv).
;; A5 receives the bit shifted out of lo and is merged into hi.
205 SPLOOP 3
206 [!A1] CMPLTU A3,A6,A1 ; hi<dv?
207||[ A1] ZERO A1
208|| SHL A4,1,A5:A4 ; lo<<1
209 [!A1] SUB A3,A6,A3 ; hi-=dv
210||[!A1] OR 1,A4,A4 ; quotient
211 SHRU A3,31,A1 ; upper bit
212|| ADDAH A5,A3,A3 ; hi<<1|lo>>31
213 SPKERNEL
214
;; The quotient is already in A4, the return register.
215 BNOP RA,5
216 .endasmfunc
217
218;;====================================================================
219;; Not really Comba algorithm, just straightforward NxM... Dedicated
220;; fully unrolled real Comba implementations are asymptotically 2x
221;; faster, but naturally larger undertaking. Purpose of this exercise
222;; was rather to learn to master nested SPLOOPs...
223;;====================================================================
;;--------------------------------------------------------------------
;; bn_mul_comba8 / bn_sqr_comba8: 8x8-word multiplication built from
;; an outer loop over the words of ap and an inner SPLOOP that runs
;; the bn_mul_add_words-style multiply-accumulate over the words of bp.
;; bn_sqr_comba8 only copies ARG1 into ARG2 (bp = ap) and falls
;; through into bn_mul_comba8.
;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp
;; Out: result words are stored through rp; RET is not written.
;; NOTE(review): the C prototypes are not visible here; argument roles
;; are read off the register usage and the inline comments.
;;--------------------------------------------------------------------
224 .global _bn_sqr_comba8
225 .global _bn_mul_comba8
226_bn_sqr_comba8:
227 MV ARG1,ARG2
228_bn_mul_comba8:
229 .asmfunc
230 MVK 8,B0 ; N, RILC
231|| MVK 8,A0 ; M, outer loop counter
232|| MV ARG1,A5 ; copy ap
233|| MV ARG0,B4 ; copy rp
234|| ZERO B19 ; high part of accumulator
235 MVC B0,RILC
236|| SUB B0,2,B1 ; N-2, initial ILC
237|| SUB B0,1,B2 ; const B2=N-1
238|| LDW *A5++,B6 ; ap[0]
239|| MV A0,A3 ; const A3=M
;; Shared entry point: the disabled (.if 0) variant of bn_mul_comba4
;; branches here with N = M = 4.
;; B5 = rp + N words is the read pointer for rp[i]; B4 stays the write
;; pointer.  A1 = 0 suppresses the rp[i] load during the first pass,
;; so B7 (zeroed here) is added instead and the first pass is a plain
;; multiply.
240sploopNxM?: ; for best performance arrange M<=N
241 [A0] SPLOOPD 2 ; 2*n+10
242|| MVC B1,ILC
243|| ADDAW B4,B0,B5
244|| ZERO B7
245|| LDW *A5++,A9 ; pre-fetch ap[1]
246|| ZERO A1
247|| SUB A0,1,A0
248;;====================================================================
249;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
250;; This is because of Advisory 15 from TI publication SPRZ247I.
251 LDW *ARG2++,A7 ; bp[i]
252 NOP 3
253 [A1] LDW *B5++,B7 ; rp[i]
;; B17:B16 = bp[i] * current ap word (held in B6)
254 MPY32U A7,B6,B17:B16
255 NOP 3
;; B21:B20 = low(product) + rp[i]; B19:B18 = that + running carry
256 ADDU B16,B7,B21:B20
257 ADDU B19,B21:B20,B19:B18
258|| MV.S B17,B23
;; Store the result word; new carry = carry-out (B19) + high(product).
259 SPKERNEL
260|| STW B18,*B4++ ; rp[i]
261|| ADD.S B19,B23,B19
262;;====================================================================
;; Per-pass bookkeeping, executed once for every outer iteration:
;; rewind bp, make the pre-fetched ap word the multiplier, pre-fetch
;; the next one, enable the rp[i] load (A1 = 1), then store the carry
;; word left in B19 and clear it for the next pass.
;; NOTE(review): SPMASKR presumably starts the reload of the loop
;; using RILC - confirm against the C64x+ CPU guide.
263outer?: ; m*2*(n+1)+10
264 SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
265 SPMASKR
266|| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
267 MVD A9,B6 ; move through .M unit(*)
268 [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
269 SUBAW B5,B2,B5 ; rewind rp to rp[1]
270 MVK 1,A1
271 [A0] BNOP.S1 outer?,4
272|| [A0] SUB.L A0,1,A0
273 STW B19,*B4--[B2] ; rewind rp to rp[1]
274|| ZERO.S B19 ; high part of accumulator
275;; end of outer?
276 BNOP RA,5 ; return
277 .endasmfunc
278;; (*) It should be noted that B6 is used as input to MPY32U in
279;; chronologically next cycle in *preceding* SPLOOP iteration.
280;; Normally such arrangement would require DINT, but at this
281;; point SPLOOP is draining and interrupts are disabled
282;; implicitly.
283
;;--------------------------------------------------------------------
;; bn_mul_comba4 / bn_sqr_comba4: 4x4-word multiplication producing
;; eight result words r[0..7].  bn_sqr_comba4 only copies ARG1 into
;; ARG2 (b = a) and falls through into bn_mul_comba4.
;; In:  ARG0 = r, ARG1 = a, ARG2 = b
;; Out: r[0..7] stored through ARG0; RET is not written.
;; NOTE(review): the C prototypes are not visible here; argument roles
;; are read off the register usage and the inline comments.
;;
;; Two implementations are present.  The ".if 0" branch, which would
;; reuse the generic loop at sploopNxM? with N = M = 4, is disabled;
;; the fully unrolled ".else" branch is the one assembled.
;;
;; Register usage in the unrolled branch:
;;   B16..B19 = a[0..3], A16..A19 = b[0..3]
;;   A1:A0 and A9:A8 alternate as the per-column sum of low product
;;   halves; B1:B0 and B9:B8 alternate as the sum of high product
;;   halves, which is folded into the following column.
;;--------------------------------------------------------------------
284 .global _bn_sqr_comba4
285 .global _bn_mul_comba4
286_bn_sqr_comba4:
287 MV ARG1,ARG2
288_bn_mul_comba4:
289 .asmfunc
290 .if 0
291 BNOP sploopNxM?,3
292 ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
293 ;; because of low-counter effect, when prologue phase finishes
294 ;; before SPKERNEL instruction is reached. As result it's 25%
295 ;; slower than expected...
296 MVK 4,B0 ; N, RILC
297|| MVK 4,A0 ; M, outer loop counter
298|| MV ARG1,A5 ; copy ap
299|| MV ARG0,B4 ; copy rp
300|| ZERO B19 ; high part of accumulator
301 MVC B0,RILC
302|| SUB B0,2,B1 ; first ILC
303|| SUB B0,1,B2 ; const B2=N-1
304|| LDW *A5++,B6 ; ap[0]
305|| MV A0,A3 ; const A3=M
306 .else
307 ;; This alternative is an exercise in fully unrolled Comba
308 ;; algorithm implementation that operates at n*(n+1)+12, or
309 ;; as little as 32 cycles...
310 LDW *ARG1[0],B16 ; a[0]
311|| LDW *ARG2[0],A16 ; b[0]
312 LDW *ARG1[1],B17 ; a[1]
313|| LDW *ARG2[1],A17 ; b[1]
314 LDW *ARG1[2],B18 ; a[2]
315|| LDW *ARG2[2],A18 ; b[2]
316 LDW *ARG1[3],B19 ; a[3]
317|| LDW *ARG2[3],A19 ; b[3]
318 NOP
319 MPY32U A16,B16,A1:A0 ; a[0]*b[0]
320 MPY32U A17,B16,A23:A22 ; a[0]*b[1]
321 MPY32U A16,B17,A25:A24 ; a[1]*b[0]
322 MPY32U A16,B18,A27:A26 ; a[2]*b[0]
323 STW A0,*ARG0[0]
324|| MPY32U A17,B17,A29:A28 ; a[1]*b[1]
325 MPY32U A18,B16,A31:A30 ; a[0]*b[2]
326|| ADDU A22,A1,A1:A0
327 MV A23,B0
328|| MPY32U A19,B16,A21:A20 ; a[0]*b[3]
329|| ADDU A24,A1:A0,A1:A0
330 ADDU A25,B0,B1:B0
331|| STW A0,*ARG0[1]
332|| MPY32U A18,B17,A23:A22 ; a[1]*b[2]
333|| ADDU A26,A1,A9:A8
334 ADDU A27,B1,B9:B8
335|| MPY32U A17,B18,A25:A24 ; a[2]*b[1]
336|| ADDU A28,A9:A8,A9:A8
337 ADDU A29,B9:B8,B9:B8
338|| MPY32U A16,B19,A27:A26 ; a[3]*b[0]
339|| ADDU A30,A9:A8,A9:A8
340 ADDU A31,B9:B8,B9:B8
341|| ADDU B0,A9:A8,A9:A8
342 STW A8,*ARG0[2]
343|| ADDU A20,A9,A1:A0
344 ADDU A21,B9,B1:B0
345|| MPY32U A19,B17,A21:A20 ; a[1]*b[3]
346|| ADDU A22,A1:A0,A1:A0
347 ADDU A23,B1:B0,B1:B0
348|| MPY32U A18,B18,A23:A22 ; a[2]*b[2]
349|| ADDU A24,A1:A0,A1:A0
350 ADDU A25,B1:B0,B1:B0
351|| MPY32U A17,B19,A25:A24 ; a[3]*b[1]
352|| ADDU A26,A1:A0,A1:A0
353 ADDU A27,B1:B0,B1:B0
354|| ADDU B8,A1:A0,A1:A0
355 STW A0,*ARG0[3]
356|| MPY32U A19,B18,A27:A26 ; a[2]*b[3]
357|| ADDU A20,A1,A9:A8
358 ADDU A21,B1,B9:B8
359|| MPY32U A18,B19,A29:A28 ; a[3]*b[2]
360|| ADDU A22,A9:A8,A9:A8
361 ADDU A23,B9:B8,B9:B8
362|| MPY32U A19,B19,A31:A30 ; a[3]*b[3]
363|| ADDU A24,A9:A8,A9:A8
364 ADDU A25,B9:B8,B9:B8
365|| ADDU B0,A9:A8,A9:A8
366 STW A8,*ARG0[4]
367|| ADDU A26,A9,A1:A0
368 ADDU A27,B9,B1:B0
369|| ADDU A28,A1:A0,A1:A0
;; The return branch is issued here; the five execute packets that
;; follow finish r[5..7] in its delay slots.
370 ADDU A29,B1:B0,B1:B0
371|| BNOP RA
372|| ADDU B8,A1:A0,A1:A0
373 STW A0,*ARG0[5]
374|| ADDU A30,A1,A9:A8
375 ADD A31,B1,B8
376 ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below
377 ADD B8,A9,A9
378|| STW A8,*ARG0[6]
379 STW A9,*ARG0[7]
380 .endif
381 .endasmfunc
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette