VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.0/crypto/genasm-nasm/rsaz-4k-avx512.S@ 100939

最後變更 在這個檔案從100939是 100939,由 vboxsync 提交於 18 月 前

openssl: adding missed files bugref:10418

檔案大小: 31.0 KB
 
1.text
2
# ossl_rsaz_amm52x40_x1_ifma256
#
# Going by the symbol name this is presumably one "almost Montgomery
# multiplication" on 40 digits of 52 bits using 256-bit IFMA instructions.
# NOTE(review): confirm against the generating perlasm script; the notes
# below describe only what the instructions visibly do.
#
# Registers read on entry:
#   %rdi  destination: ten 32-byte vectors (40 qwords) are stored at
#         0..288(%rdi)
#   %rsi  40-qword operand, used as the memory source of vpmadd52{l,h}uq
#   %rdx  operand copied to %r11 and consumed one qword per step
#         (4 steps per loop pass, 10 passes)
#   %rcx  40-qword operand multiplied by the per-step factor in %ymm2
#         (presumably the modulus; confirm)
#   %r8   64-bit value multiplied into the running low word to obtain that
#         factor (presumably the Montgomery constant k0; confirm)
#
# Register roles inside the loop:
#   %ymm3..%ymm12  40-lane accumulator; %ymm0 stays all-zero
#   %ymm1, %ymm2   broadcast of the current word / of the per-step factor
#   %r9, %r10      scalar accumulator for lane 0 and its high part
#   %rax           52-bit mask 0xfffffffffffff
#   %ebx           loop counter
#
# .Lzeros is referenced below but defined elsewhere in the file (not next to
# this function).
3.globl ossl_rsaz_amm52x40_x1_ifma256
4.type ossl_rsaz_amm52x40_x1_ifma256,@function
5.align 32
6ossl_rsaz_amm52x40_x1_ifma256:
7.cfi_startproc
# The four bytes below are f3 0f 1e fa, the encoding of endbr64.
8.byte 243,15,30,250
# Save six registers (48 bytes). %rbp and %r15 are saved and restored even
# though this function does not otherwise reference them.
9 pushq %rbx
10.cfi_adjust_cfa_offset 8
11.cfi_offset %rbx,-16
12 pushq %rbp
13.cfi_adjust_cfa_offset 8
14.cfi_offset %rbp,-24
15 pushq %r12
16.cfi_adjust_cfa_offset 8
17.cfi_offset %r12,-32
18 pushq %r13
19.cfi_adjust_cfa_offset 8
20.cfi_offset %r13,-40
21 pushq %r14
22.cfi_adjust_cfa_offset 8
23.cfi_offset %r14,-48
24 pushq %r15
25.cfi_adjust_cfa_offset 8
26.cfi_offset %r15,-56
27
# Zero %ymm0 and the ten accumulator vectors %ymm3..%ymm12.
28 vpxord %ymm0,%ymm0,%ymm0
29 vmovdqa64 %ymm0,%ymm3
30 vmovdqa64 %ymm0,%ymm4
31 vmovdqa64 %ymm0,%ymm5
32 vmovdqa64 %ymm0,%ymm6
33 vmovdqa64 %ymm0,%ymm7
34 vmovdqa64 %ymm0,%ymm8
35 vmovdqa64 %ymm0,%ymm9
36 vmovdqa64 %ymm0,%ymm10
37 vmovdqa64 %ymm0,%ymm11
38 vmovdqa64 %ymm0,%ymm12
39
# Scalar accumulator starts at zero.
40 xorl %r9d,%r9d
41
# %rdx is needed as the implicit mulx multiplicand, so the word pointer
# moves to %r11. %rax holds the 52-bit mask.
42 movq %rdx,%r11
43 movq $0xfffffffffffff,%rax
44
45
# 10 passes, each unrolled into 4 steps.
46 movl $10,%ebx
47
48.align 32
49.Lloop10:
# ---- step 1 of 4: word at 0(%r11) ----
50 movq 0(%r11),%r13
51
# Broadcast the word, then form the full 64x64 product with the qword at
# 0(%rsi): mulx leaves the low half in %r13 and the high half in %r12.
# After the add/adc, %r10:%r9 holds the running 128-bit sum.
52 vpbroadcastq %r13,%ymm1
53 movq 0(%rsi),%rdx
54 mulxq %r13,%r13,%r12
55 addq %r13,%r9
56 movq %r12,%r10
57 adcq $0,%r10
58
# Per-step factor: (%r8 * %r9) truncated to 52 bits.
59 movq %r8,%r13
60 imulq %r9,%r13
61 andq %rax,%r13
62
# Broadcast the factor and add factor * qword at 0(%rcx) into %r10:%r9.
63 vpbroadcastq %r13,%ymm2
64 movq 0(%rcx),%rdx
65 mulxq %r13,%r13,%r12
66 addq %r13,%r9
67 adcq %r12,%r10
68
# %r9 = (%r10:%r9) >> 52, keeping the low 64 bits of the shifted value.
69 shrq $52,%r9
70 salq $12,%r10
71 orq %r10,%r9
72
# Accumulate the low 52 bits of word * (%rsi)[lane] in every lane.
73 vpmadd52luq 0(%rsi),%ymm1,%ymm3
74 vpmadd52luq 32(%rsi),%ymm1,%ymm4
75 vpmadd52luq 64(%rsi),%ymm1,%ymm5
76 vpmadd52luq 96(%rsi),%ymm1,%ymm6
77 vpmadd52luq 128(%rsi),%ymm1,%ymm7
78 vpmadd52luq 160(%rsi),%ymm1,%ymm8
79 vpmadd52luq 192(%rsi),%ymm1,%ymm9
80 vpmadd52luq 224(%rsi),%ymm1,%ymm10
81 vpmadd52luq 256(%rsi),%ymm1,%ymm11
82 vpmadd52luq 288(%rsi),%ymm1,%ymm12
83
# Accumulate the low 52 bits of factor * (%rcx)[lane] in every lane.
84 vpmadd52luq 0(%rcx),%ymm2,%ymm3
85 vpmadd52luq 32(%rcx),%ymm2,%ymm4
86 vpmadd52luq 64(%rcx),%ymm2,%ymm5
87 vpmadd52luq 96(%rcx),%ymm2,%ymm6
88 vpmadd52luq 128(%rcx),%ymm2,%ymm7
89 vpmadd52luq 160(%rcx),%ymm2,%ymm8
90 vpmadd52luq 192(%rcx),%ymm2,%ymm9
91 vpmadd52luq 224(%rcx),%ymm2,%ymm10
92 vpmadd52luq 256(%rcx),%ymm2,%ymm11
93 vpmadd52luq 288(%rcx),%ymm2,%ymm12
94
95
# Shift the 40-lane accumulator down by one qword lane: each vector takes
# its own lanes 1..3 plus lane 0 of the next vector, and zero (from %ymm0)
# enters at the top. The old lane 0 is dropped here.
96 valignq $1,%ymm3,%ymm4,%ymm3
97 valignq $1,%ymm4,%ymm5,%ymm4
98 valignq $1,%ymm5,%ymm6,%ymm5
99 valignq $1,%ymm6,%ymm7,%ymm6
100 valignq $1,%ymm7,%ymm8,%ymm7
101 valignq $1,%ymm8,%ymm9,%ymm8
102 valignq $1,%ymm9,%ymm10,%ymm9
103 valignq $1,%ymm10,%ymm11,%ymm10
104 valignq $1,%ymm11,%ymm12,%ymm11
105 valignq $1,%ymm12,%ymm0,%ymm12
106
# Fold the new lane 0 into the scalar accumulator.
107 vmovq %xmm3,%r13
108 addq %r13,%r9
109
# Accumulate the high 52 bits of the same products into the shifted lanes.
110 vpmadd52huq 0(%rsi),%ymm1,%ymm3
111 vpmadd52huq 32(%rsi),%ymm1,%ymm4
112 vpmadd52huq 64(%rsi),%ymm1,%ymm5
113 vpmadd52huq 96(%rsi),%ymm1,%ymm6
114 vpmadd52huq 128(%rsi),%ymm1,%ymm7
115 vpmadd52huq 160(%rsi),%ymm1,%ymm8
116 vpmadd52huq 192(%rsi),%ymm1,%ymm9
117 vpmadd52huq 224(%rsi),%ymm1,%ymm10
118 vpmadd52huq 256(%rsi),%ymm1,%ymm11
119 vpmadd52huq 288(%rsi),%ymm1,%ymm12
120
121 vpmadd52huq 0(%rcx),%ymm2,%ymm3
122 vpmadd52huq 32(%rcx),%ymm2,%ymm4
123 vpmadd52huq 64(%rcx),%ymm2,%ymm5
124 vpmadd52huq 96(%rcx),%ymm2,%ymm6
125 vpmadd52huq 128(%rcx),%ymm2,%ymm7
126 vpmadd52huq 160(%rcx),%ymm2,%ymm8
127 vpmadd52huq 192(%rcx),%ymm2,%ymm9
128 vpmadd52huq 224(%rcx),%ymm2,%ymm10
129 vpmadd52huq 256(%rcx),%ymm2,%ymm11
130 vpmadd52huq 288(%rcx),%ymm2,%ymm12
# ---- step 2 of 4: word at 8(%r11); same sequence as step 1 ----
131 movq 8(%r11),%r13
132
133 vpbroadcastq %r13,%ymm1
134 movq 0(%rsi),%rdx
135 mulxq %r13,%r13,%r12
136 addq %r13,%r9
137 movq %r12,%r10
138 adcq $0,%r10
139
140 movq %r8,%r13
141 imulq %r9,%r13
142 andq %rax,%r13
143
144 vpbroadcastq %r13,%ymm2
145 movq 0(%rcx),%rdx
146 mulxq %r13,%r13,%r12
147 addq %r13,%r9
148 adcq %r12,%r10
149
150 shrq $52,%r9
151 salq $12,%r10
152 orq %r10,%r9
153
154 vpmadd52luq 0(%rsi),%ymm1,%ymm3
155 vpmadd52luq 32(%rsi),%ymm1,%ymm4
156 vpmadd52luq 64(%rsi),%ymm1,%ymm5
157 vpmadd52luq 96(%rsi),%ymm1,%ymm6
158 vpmadd52luq 128(%rsi),%ymm1,%ymm7
159 vpmadd52luq 160(%rsi),%ymm1,%ymm8
160 vpmadd52luq 192(%rsi),%ymm1,%ymm9
161 vpmadd52luq 224(%rsi),%ymm1,%ymm10
162 vpmadd52luq 256(%rsi),%ymm1,%ymm11
163 vpmadd52luq 288(%rsi),%ymm1,%ymm12
164
165 vpmadd52luq 0(%rcx),%ymm2,%ymm3
166 vpmadd52luq 32(%rcx),%ymm2,%ymm4
167 vpmadd52luq 64(%rcx),%ymm2,%ymm5
168 vpmadd52luq 96(%rcx),%ymm2,%ymm6
169 vpmadd52luq 128(%rcx),%ymm2,%ymm7
170 vpmadd52luq 160(%rcx),%ymm2,%ymm8
171 vpmadd52luq 192(%rcx),%ymm2,%ymm9
172 vpmadd52luq 224(%rcx),%ymm2,%ymm10
173 vpmadd52luq 256(%rcx),%ymm2,%ymm11
174 vpmadd52luq 288(%rcx),%ymm2,%ymm12
175
176
177 valignq $1,%ymm3,%ymm4,%ymm3
178 valignq $1,%ymm4,%ymm5,%ymm4
179 valignq $1,%ymm5,%ymm6,%ymm5
180 valignq $1,%ymm6,%ymm7,%ymm6
181 valignq $1,%ymm7,%ymm8,%ymm7
182 valignq $1,%ymm8,%ymm9,%ymm8
183 valignq $1,%ymm9,%ymm10,%ymm9
184 valignq $1,%ymm10,%ymm11,%ymm10
185 valignq $1,%ymm11,%ymm12,%ymm11
186 valignq $1,%ymm12,%ymm0,%ymm12
187
188 vmovq %xmm3,%r13
189 addq %r13,%r9
190
191 vpmadd52huq 0(%rsi),%ymm1,%ymm3
192 vpmadd52huq 32(%rsi),%ymm1,%ymm4
193 vpmadd52huq 64(%rsi),%ymm1,%ymm5
194 vpmadd52huq 96(%rsi),%ymm1,%ymm6
195 vpmadd52huq 128(%rsi),%ymm1,%ymm7
196 vpmadd52huq 160(%rsi),%ymm1,%ymm8
197 vpmadd52huq 192(%rsi),%ymm1,%ymm9
198 vpmadd52huq 224(%rsi),%ymm1,%ymm10
199 vpmadd52huq 256(%rsi),%ymm1,%ymm11
200 vpmadd52huq 288(%rsi),%ymm1,%ymm12
201
202 vpmadd52huq 0(%rcx),%ymm2,%ymm3
203 vpmadd52huq 32(%rcx),%ymm2,%ymm4
204 vpmadd52huq 64(%rcx),%ymm2,%ymm5
205 vpmadd52huq 96(%rcx),%ymm2,%ymm6
206 vpmadd52huq 128(%rcx),%ymm2,%ymm7
207 vpmadd52huq 160(%rcx),%ymm2,%ymm8
208 vpmadd52huq 192(%rcx),%ymm2,%ymm9
209 vpmadd52huq 224(%rcx),%ymm2,%ymm10
210 vpmadd52huq 256(%rcx),%ymm2,%ymm11
211 vpmadd52huq 288(%rcx),%ymm2,%ymm12
# ---- step 3 of 4: word at 16(%r11); same sequence as step 1 ----
212 movq 16(%r11),%r13
213
214 vpbroadcastq %r13,%ymm1
215 movq 0(%rsi),%rdx
216 mulxq %r13,%r13,%r12
217 addq %r13,%r9
218 movq %r12,%r10
219 adcq $0,%r10
220
221 movq %r8,%r13
222 imulq %r9,%r13
223 andq %rax,%r13
224
225 vpbroadcastq %r13,%ymm2
226 movq 0(%rcx),%rdx
227 mulxq %r13,%r13,%r12
228 addq %r13,%r9
229 adcq %r12,%r10
230
231 shrq $52,%r9
232 salq $12,%r10
233 orq %r10,%r9
234
235 vpmadd52luq 0(%rsi),%ymm1,%ymm3
236 vpmadd52luq 32(%rsi),%ymm1,%ymm4
237 vpmadd52luq 64(%rsi),%ymm1,%ymm5
238 vpmadd52luq 96(%rsi),%ymm1,%ymm6
239 vpmadd52luq 128(%rsi),%ymm1,%ymm7
240 vpmadd52luq 160(%rsi),%ymm1,%ymm8
241 vpmadd52luq 192(%rsi),%ymm1,%ymm9
242 vpmadd52luq 224(%rsi),%ymm1,%ymm10
243 vpmadd52luq 256(%rsi),%ymm1,%ymm11
244 vpmadd52luq 288(%rsi),%ymm1,%ymm12
245
246 vpmadd52luq 0(%rcx),%ymm2,%ymm3
247 vpmadd52luq 32(%rcx),%ymm2,%ymm4
248 vpmadd52luq 64(%rcx),%ymm2,%ymm5
249 vpmadd52luq 96(%rcx),%ymm2,%ymm6
250 vpmadd52luq 128(%rcx),%ymm2,%ymm7
251 vpmadd52luq 160(%rcx),%ymm2,%ymm8
252 vpmadd52luq 192(%rcx),%ymm2,%ymm9
253 vpmadd52luq 224(%rcx),%ymm2,%ymm10
254 vpmadd52luq 256(%rcx),%ymm2,%ymm11
255 vpmadd52luq 288(%rcx),%ymm2,%ymm12
256
257
258 valignq $1,%ymm3,%ymm4,%ymm3
259 valignq $1,%ymm4,%ymm5,%ymm4
260 valignq $1,%ymm5,%ymm6,%ymm5
261 valignq $1,%ymm6,%ymm7,%ymm6
262 valignq $1,%ymm7,%ymm8,%ymm7
263 valignq $1,%ymm8,%ymm9,%ymm8
264 valignq $1,%ymm9,%ymm10,%ymm9
265 valignq $1,%ymm10,%ymm11,%ymm10
266 valignq $1,%ymm11,%ymm12,%ymm11
267 valignq $1,%ymm12,%ymm0,%ymm12
268
269 vmovq %xmm3,%r13
270 addq %r13,%r9
271
272 vpmadd52huq 0(%rsi),%ymm1,%ymm3
273 vpmadd52huq 32(%rsi),%ymm1,%ymm4
274 vpmadd52huq 64(%rsi),%ymm1,%ymm5
275 vpmadd52huq 96(%rsi),%ymm1,%ymm6
276 vpmadd52huq 128(%rsi),%ymm1,%ymm7
277 vpmadd52huq 160(%rsi),%ymm1,%ymm8
278 vpmadd52huq 192(%rsi),%ymm1,%ymm9
279 vpmadd52huq 224(%rsi),%ymm1,%ymm10
280 vpmadd52huq 256(%rsi),%ymm1,%ymm11
281 vpmadd52huq 288(%rsi),%ymm1,%ymm12
282
283 vpmadd52huq 0(%rcx),%ymm2,%ymm3
284 vpmadd52huq 32(%rcx),%ymm2,%ymm4
285 vpmadd52huq 64(%rcx),%ymm2,%ymm5
286 vpmadd52huq 96(%rcx),%ymm2,%ymm6
287 vpmadd52huq 128(%rcx),%ymm2,%ymm7
288 vpmadd52huq 160(%rcx),%ymm2,%ymm8
289 vpmadd52huq 192(%rcx),%ymm2,%ymm9
290 vpmadd52huq 224(%rcx),%ymm2,%ymm10
291 vpmadd52huq 256(%rcx),%ymm2,%ymm11
292 vpmadd52huq 288(%rcx),%ymm2,%ymm12
# ---- step 4 of 4: word at 24(%r11); same sequence as step 1 ----
293 movq 24(%r11),%r13
294
295 vpbroadcastq %r13,%ymm1
296 movq 0(%rsi),%rdx
297 mulxq %r13,%r13,%r12
298 addq %r13,%r9
299 movq %r12,%r10
300 adcq $0,%r10
301
302 movq %r8,%r13
303 imulq %r9,%r13
304 andq %rax,%r13
305
306 vpbroadcastq %r13,%ymm2
307 movq 0(%rcx),%rdx
308 mulxq %r13,%r13,%r12
309 addq %r13,%r9
310 adcq %r12,%r10
311
312 shrq $52,%r9
313 salq $12,%r10
314 orq %r10,%r9
315
316 vpmadd52luq 0(%rsi),%ymm1,%ymm3
317 vpmadd52luq 32(%rsi),%ymm1,%ymm4
318 vpmadd52luq 64(%rsi),%ymm1,%ymm5
319 vpmadd52luq 96(%rsi),%ymm1,%ymm6
320 vpmadd52luq 128(%rsi),%ymm1,%ymm7
321 vpmadd52luq 160(%rsi),%ymm1,%ymm8
322 vpmadd52luq 192(%rsi),%ymm1,%ymm9
323 vpmadd52luq 224(%rsi),%ymm1,%ymm10
324 vpmadd52luq 256(%rsi),%ymm1,%ymm11
325 vpmadd52luq 288(%rsi),%ymm1,%ymm12
326
327 vpmadd52luq 0(%rcx),%ymm2,%ymm3
328 vpmadd52luq 32(%rcx),%ymm2,%ymm4
329 vpmadd52luq 64(%rcx),%ymm2,%ymm5
330 vpmadd52luq 96(%rcx),%ymm2,%ymm6
331 vpmadd52luq 128(%rcx),%ymm2,%ymm7
332 vpmadd52luq 160(%rcx),%ymm2,%ymm8
333 vpmadd52luq 192(%rcx),%ymm2,%ymm9
334 vpmadd52luq 224(%rcx),%ymm2,%ymm10
335 vpmadd52luq 256(%rcx),%ymm2,%ymm11
336 vpmadd52luq 288(%rcx),%ymm2,%ymm12
337
338
339 valignq $1,%ymm3,%ymm4,%ymm3
340 valignq $1,%ymm4,%ymm5,%ymm4
341 valignq $1,%ymm5,%ymm6,%ymm5
342 valignq $1,%ymm6,%ymm7,%ymm6
343 valignq $1,%ymm7,%ymm8,%ymm7
344 valignq $1,%ymm8,%ymm9,%ymm8
345 valignq $1,%ymm9,%ymm10,%ymm9
346 valignq $1,%ymm10,%ymm11,%ymm10
347 valignq $1,%ymm11,%ymm12,%ymm11
348 valignq $1,%ymm12,%ymm0,%ymm12
349
350 vmovq %xmm3,%r13
351 addq %r13,%r9
352
353 vpmadd52huq 0(%rsi),%ymm1,%ymm3
354 vpmadd52huq 32(%rsi),%ymm1,%ymm4
355 vpmadd52huq 64(%rsi),%ymm1,%ymm5
356 vpmadd52huq 96(%rsi),%ymm1,%ymm6
357 vpmadd52huq 128(%rsi),%ymm1,%ymm7
358 vpmadd52huq 160(%rsi),%ymm1,%ymm8
359 vpmadd52huq 192(%rsi),%ymm1,%ymm9
360 vpmadd52huq 224(%rsi),%ymm1,%ymm10
361 vpmadd52huq 256(%rsi),%ymm1,%ymm11
362 vpmadd52huq 288(%rsi),%ymm1,%ymm12
363
364 vpmadd52huq 0(%rcx),%ymm2,%ymm3
365 vpmadd52huq 32(%rcx),%ymm2,%ymm4
366 vpmadd52huq 64(%rcx),%ymm2,%ymm5
367 vpmadd52huq 96(%rcx),%ymm2,%ymm6
368 vpmadd52huq 128(%rcx),%ymm2,%ymm7
369 vpmadd52huq 160(%rcx),%ymm2,%ymm8
370 vpmadd52huq 192(%rcx),%ymm2,%ymm9
371 vpmadd52huq 224(%rcx),%ymm2,%ymm10
372 vpmadd52huq 256(%rcx),%ymm2,%ymm11
373 vpmadd52huq 288(%rcx),%ymm2,%ymm12
# Advance by the four words just consumed and loop.
374 leaq 32(%r11),%r11
375 decl %ebx
376 jne .Lloop10
377
# Put the scalar accumulator %r9 into lane 0 of %ymm3 (blend mask 3 takes
# the two low dwords from the broadcast).
378 vpbroadcastq %r9,%ymm0
379 vpblendd $3,%ymm0,%ymm3,%ymm3
380
381
382
# ---- normalise the lanes to 52 bits ----
# Extract the bits above bit 51 of every lane as carries.
383 vpsrlq $52,%ymm3,%ymm0
384 vpsrlq $52,%ymm4,%ymm1
385 vpsrlq $52,%ymm5,%ymm2
386 vpsrlq $52,%ymm6,%ymm23
387 vpsrlq $52,%ymm7,%ymm24
388 vpsrlq $52,%ymm8,%ymm25
389 vpsrlq $52,%ymm9,%ymm26
390 vpsrlq $52,%ymm10,%ymm27
391 vpsrlq $52,%ymm11,%ymm28
392 vpsrlq $52,%ymm12,%ymm29
393
394
# Move every carry up by one lane so it lines up with the lane it feeds.
# The lowest lane takes its value from .Lzeros (presumably all-zero data;
# its definition is not next to this function).
395 valignq $3,%ymm28,%ymm29,%ymm29
396 valignq $3,%ymm27,%ymm28,%ymm28
397 valignq $3,%ymm26,%ymm27,%ymm27
398 valignq $3,%ymm25,%ymm26,%ymm26
399 valignq $3,%ymm24,%ymm25,%ymm25
400 valignq $3,%ymm23,%ymm24,%ymm24
401 valignq $3,%ymm2,%ymm23,%ymm23
402 valignq $3,%ymm1,%ymm2,%ymm2
403 valignq $3,%ymm0,%ymm1,%ymm1
404 valignq $3,.Lzeros(%rip),%ymm0,%ymm0
405
406
# Keep the low 52 bits of every lane.
407 vpandq .Lmask52x4(%rip),%ymm3,%ymm3
408 vpandq .Lmask52x4(%rip),%ymm4,%ymm4
409 vpandq .Lmask52x4(%rip),%ymm5,%ymm5
410 vpandq .Lmask52x4(%rip),%ymm6,%ymm6
411 vpandq .Lmask52x4(%rip),%ymm7,%ymm7
412 vpandq .Lmask52x4(%rip),%ymm8,%ymm8
413 vpandq .Lmask52x4(%rip),%ymm9,%ymm9
414 vpandq .Lmask52x4(%rip),%ymm10,%ymm10
415 vpandq .Lmask52x4(%rip),%ymm11,%ymm11
416 vpandq .Lmask52x4(%rip),%ymm12,%ymm12
417
418
# Add the shifted carries; a lane may now exceed 52 bits again.
419 vpaddq %ymm0,%ymm3,%ymm3
420 vpaddq %ymm1,%ymm4,%ymm4
421 vpaddq %ymm2,%ymm5,%ymm5
422 vpaddq %ymm23,%ymm6,%ymm6
423 vpaddq %ymm24,%ymm7,%ymm7
424 vpaddq %ymm25,%ymm8,%ymm8
425 vpaddq %ymm26,%ymm9,%ymm9
426 vpaddq %ymm27,%ymm10,%ymm10
427 vpaddq %ymm28,%ymm11,%ymm11
428 vpaddq %ymm29,%ymm12,%ymm12
429
430
431
# Predicate 6 is unsigned "greater than": build a 40-bit mask of the lanes
# that exceed the 52-bit mask. Each compare gives 4 bits; two are packed per
# byte, giving the bytes %r14b (lanes 0-7), %r13b, %r12b, %r11b and
# %r10b (lanes 32-39).
432 vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
433 vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2
434 kmovb %k1,%r14d
435 kmovb %k2,%r13d
436 shlb $4,%r13b
437 orb %r13b,%r14b
438
439 vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1
440 vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2
441 kmovb %k1,%r13d
442 kmovb %k2,%r12d
443 shlb $4,%r12b
444 orb %r12b,%r13b
445
446 vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1
447 vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2
448 kmovb %k1,%r12d
449 kmovb %k2,%r11d
450 shlb $4,%r11b
451 orb %r11b,%r12b
452
453 vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1
454 vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2
455 kmovb %k1,%r11d
456 kmovb %k2,%r10d
457 shlb $4,%r10b
458 orb %r10b,%r11b
459
460 vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1
461 vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2
462 kmovb %k1,%r10d
463 kmovb %k2,%r9d
464 shlb $4,%r9b
465 orb %r9b,%r10b
466
# Shift the 40-bit "greater" mask left by one bit across the five bytes,
# so each bit sits at the lane that receives the carry.
467 addb %r14b,%r14b
468 adcb %r13b,%r13b
469 adcb %r12b,%r12b
470 adcb %r11b,%r11b
471 adcb %r10b,%r10b
472
473
# Predicate 0 is "equal": mask of lanes that are exactly 2^52-1 and would
# pass an incoming carry on. The input pointers are no longer needed, so
# %r8, %rdx, %rcx, %rbx and %rax serve as scratch from here on.
474 vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
475 vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2
476 kmovb %k1,%r9d
477 kmovb %k2,%r8d
478 shlb $4,%r8b
479 orb %r8b,%r9b
480
481 vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1
482 vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2
483 kmovb %k1,%r8d
484 kmovb %k2,%edx
485 shlb $4,%dl
486 orb %dl,%r8b
487
488 vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1
489 vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2
490 kmovb %k1,%edx
491 kmovb %k2,%ecx
492 shlb $4,%cl
493 orb %cl,%dl
494
495 vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1
496 vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2
497 kmovb %k1,%ecx
498 kmovb %k2,%ebx
499 shlb $4,%bl
500 orb %bl,%cl
501
502 vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1
503 vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2
504 kmovb %k1,%ebx
505 kmovb %k2,%eax
506 shlb $4,%al
507 orb %al,%bl
508
# Adding the "equal" mask lets the integer carry chain ripple a carry
# through runs of all-ones lanes; the xor that follows leaves a bit set
# for every lane that must be incremented.
509 addb %r9b,%r14b
510 adcb %r8b,%r13b
511 adcb %dl,%r12b
512 adcb %cl,%r11b
513 adcb %bl,%r10b
514
515 xorb %r9b,%r14b
516 xorb %r8b,%r13b
517 xorb %dl,%r12b
518 xorb %cl,%r11b
519 xorb %bl,%r10b
520
# Split the bytes back into 4-bit lane masks %k1..%k7 for the first seven
# vectors.
521 kmovb %r14d,%k1
522 shrb $4,%r14b
523 kmovb %r14d,%k2
524 kmovb %r13d,%k3
525 shrb $4,%r13b
526 kmovb %r13d,%k4
527 kmovb %r12d,%k5
528 shrb $4,%r12b
529 kmovb %r12d,%k6
530 kmovb %r11d,%k7
531
# In the selected lanes subtract 2^52-1; together with the masking below
# this is an increment modulo 2^52.
532 vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
533 vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2}
534 vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3}
535 vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4}
536 vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5}
537 vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6}
538 vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7}
539
540 vpandq .Lmask52x4(%rip),%ymm3,%ymm3
541 vpandq .Lmask52x4(%rip),%ymm4,%ymm4
542 vpandq .Lmask52x4(%rip),%ymm5,%ymm5
543 vpandq .Lmask52x4(%rip),%ymm6,%ymm6
544 vpandq .Lmask52x4(%rip),%ymm7,%ymm7
545 vpandq .Lmask52x4(%rip),%ymm8,%ymm8
546 vpandq .Lmask52x4(%rip),%ymm9,%ymm9
547
# Same treatment for the last three vectors, reusing %k1..%k3.
548 shrb $4,%r11b
549 kmovb %r11d,%k1
550 kmovb %r10d,%k2
551 shrb $4,%r10b
552 kmovb %r10d,%k3
553
554 vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1}
555 vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k2}
556 vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k3}
557
558 vpandq .Lmask52x4(%rip),%ymm10,%ymm10
559 vpandq .Lmask52x4(%rip),%ymm11,%ymm11
560 vpandq .Lmask52x4(%rip),%ymm12,%ymm12
561
# Store the 40 result qwords with unaligned stores.
562 vmovdqu64 %ymm3,0(%rdi)
563 vmovdqu64 %ymm4,32(%rdi)
564 vmovdqu64 %ymm5,64(%rdi)
565 vmovdqu64 %ymm6,96(%rdi)
566 vmovdqu64 %ymm7,128(%rdi)
567 vmovdqu64 %ymm8,160(%rdi)
568 vmovdqu64 %ymm9,192(%rdi)
569 vmovdqu64 %ymm10,224(%rdi)
570 vmovdqu64 %ymm11,256(%rdi)
571 vmovdqu64 %ymm12,288(%rdi)
572
# Epilogue: reload the six saved registers through %rax (reverse push
# order), then release the 48 bytes in one step.
573 vzeroupper
574 leaq (%rsp),%rax
575.cfi_def_cfa_register %rax
576 movq 0(%rax),%r15
577.cfi_restore %r15
578 movq 8(%rax),%r14
579.cfi_restore %r14
580 movq 16(%rax),%r13
581.cfi_restore %r13
582 movq 24(%rax),%r12
583.cfi_restore %r12
584 movq 32(%rax),%rbp
585.cfi_restore %rbp
586 movq 40(%rax),%rbx
587.cfi_restore %rbx
588 leaq 48(%rax),%rsp
589.cfi_def_cfa %rsp,8
590.Lossl_rsaz_amm52x40_x1_ifma256_epilogue:
591
# The bytes f3 c3 encode "rep ret".
592 .byte 0xf3,0xc3
593.cfi_endproc
594.size ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
# .Lmask52x4: four qwords, each 0xfffffffffffff (the low 52 bits set).
# 32-byte aligned and used as a 256-bit memory operand by the vpandq,
# vpcmpuq and vpsubq instructions of the functions in this file.
# Placed in .data; the section switches back to .text afterwards.
595.data
596.align 32
597.Lmask52x4:
598.quad 0xfffffffffffff
599.quad 0xfffffffffffff
600.quad 0xfffffffffffff
601.quad 0xfffffffffffff
602.text
603
# ossl_rsaz_amm52x40_x2_ifma256
#
# Two independent instances of the 40-digit, 52-bit multiply-and-reduce
# sequence, interleaved in a single loop. Going by the symbol name these are
# presumably two "almost Montgomery multiplications".
# NOTE(review): confirm against the generating perlasm script; the notes
# below describe only what the instructions visibly do.
#
# Every array argument holds both instances back to back; the second one
# starts 320 bytes (40 qwords) after the first.
#
# Registers read on entry:
#   %rdi  destination: 0..288(%rdi) for instance 1, 320..608(%rdi) for
#         instance 2
#   %rsi  operands used as memory source of vpmadd52{l,h}uq
#   %rdx  operands copied to %r11 and consumed one qword per loop pass
#         (0(%r11) and 320(%r11))
#   %rcx  operands multiplied by the per-step factor in %ymm2
#         (presumably the moduli; confirm)
#   %r8   pointer to two qwords, (%r8) for instance 1 and 8(%r8) for
#         instance 2 (presumably the Montgomery constants k0; confirm)
#
# Register roles inside the loop:
#   %ymm3..%ymm12, %r9    accumulator and scalar lane 0 of instance 1
#   %ymm13..%ymm22, %r15  accumulator and scalar lane 0 of instance 2
#   %ymm0 all-zero; %ymm1, %ymm2, %r10, %r12, %r13 scratch shared by both
#   %rax  52-bit mask 0xfffffffffffff, %ebx loop counter
#
# .Lzeros is referenced below but defined elsewhere in the file (not next to
# this function).
604.globl ossl_rsaz_amm52x40_x2_ifma256
605.type ossl_rsaz_amm52x40_x2_ifma256,@function
606.align 32
607ossl_rsaz_amm52x40_x2_ifma256:
608.cfi_startproc
# The four bytes below are f3 0f 1e fa, the encoding of endbr64.
609.byte 243,15,30,250
# Save six registers (48 bytes). %rbp is saved and restored even though
# this function does not otherwise reference it.
610 pushq %rbx
611.cfi_adjust_cfa_offset 8
612.cfi_offset %rbx,-16
613 pushq %rbp
614.cfi_adjust_cfa_offset 8
615.cfi_offset %rbp,-24
616 pushq %r12
617.cfi_adjust_cfa_offset 8
618.cfi_offset %r12,-32
619 pushq %r13
620.cfi_adjust_cfa_offset 8
621.cfi_offset %r13,-40
622 pushq %r14
623.cfi_adjust_cfa_offset 8
624.cfi_offset %r14,-48
625 pushq %r15
626.cfi_adjust_cfa_offset 8
627.cfi_offset %r15,-56
628
# Zero %ymm0 and the accumulator of instance 1.
629 vpxord %ymm0,%ymm0,%ymm0
630 vmovdqa64 %ymm0,%ymm3
631 vmovdqa64 %ymm0,%ymm4
632 vmovdqa64 %ymm0,%ymm5
633 vmovdqa64 %ymm0,%ymm6
634 vmovdqa64 %ymm0,%ymm7
635 vmovdqa64 %ymm0,%ymm8
636 vmovdqa64 %ymm0,%ymm9
637 vmovdqa64 %ymm0,%ymm10
638 vmovdqa64 %ymm0,%ymm11
639 vmovdqa64 %ymm0,%ymm12
640
# Zero the accumulator of instance 2.
641 vmovdqa64 %ymm0,%ymm13
642 vmovdqa64 %ymm0,%ymm14
643 vmovdqa64 %ymm0,%ymm15
644 vmovdqa64 %ymm0,%ymm16
645 vmovdqa64 %ymm0,%ymm17
646 vmovdqa64 %ymm0,%ymm18
647 vmovdqa64 %ymm0,%ymm19
648 vmovdqa64 %ymm0,%ymm20
649 vmovdqa64 %ymm0,%ymm21
650 vmovdqa64 %ymm0,%ymm22
651
652
# Both scalar accumulators start at zero.
653 xorl %r9d,%r9d
654 xorl %r15d,%r15d
655
# %rdx is needed as the implicit mulx multiplicand, so the word pointer
# moves to %r11. %rax holds the 52-bit mask.
656 movq %rdx,%r11
657 movq $0xfffffffffffff,%rax
658
# 40 passes; each pass handles one word of each instance.
659 movl $40,%ebx
660
661.align 32
662.Lloop40:
# ---- instance 1: word at 0(%r11) ----
663 movq 0(%r11),%r13
664
# Broadcast the word, then form the full 64x64 product with the qword at
# 0(%rsi): mulx leaves the low half in %r13 and the high half in %r12.
# After the add/adc, %r10:%r9 holds the running 128-bit sum.
665 vpbroadcastq %r13,%ymm1
666 movq 0(%rsi),%rdx
667 mulxq %r13,%r13,%r12
668 addq %r13,%r9
669 movq %r12,%r10
670 adcq $0,%r10
671
# Per-step factor: (qword at (%r8) * %r9) truncated to 52 bits.
672 movq (%r8),%r13
673 imulq %r9,%r13
674 andq %rax,%r13
675
# Broadcast the factor and add factor * qword at 0(%rcx) into %r10:%r9.
676 vpbroadcastq %r13,%ymm2
677 movq 0(%rcx),%rdx
678 mulxq %r13,%r13,%r12
679 addq %r13,%r9
680 adcq %r12,%r10
681
# %r9 = (%r10:%r9) >> 52, keeping the low 64 bits of the shifted value.
682 shrq $52,%r9
683 salq $12,%r10
684 orq %r10,%r9
685
# Accumulate the low 52 bits of the lane products.
686 vpmadd52luq 0(%rsi),%ymm1,%ymm3
687 vpmadd52luq 32(%rsi),%ymm1,%ymm4
688 vpmadd52luq 64(%rsi),%ymm1,%ymm5
689 vpmadd52luq 96(%rsi),%ymm1,%ymm6
690 vpmadd52luq 128(%rsi),%ymm1,%ymm7
691 vpmadd52luq 160(%rsi),%ymm1,%ymm8
692 vpmadd52luq 192(%rsi),%ymm1,%ymm9
693 vpmadd52luq 224(%rsi),%ymm1,%ymm10
694 vpmadd52luq 256(%rsi),%ymm1,%ymm11
695 vpmadd52luq 288(%rsi),%ymm1,%ymm12
696
697 vpmadd52luq 0(%rcx),%ymm2,%ymm3
698 vpmadd52luq 32(%rcx),%ymm2,%ymm4
699 vpmadd52luq 64(%rcx),%ymm2,%ymm5
700 vpmadd52luq 96(%rcx),%ymm2,%ymm6
701 vpmadd52luq 128(%rcx),%ymm2,%ymm7
702 vpmadd52luq 160(%rcx),%ymm2,%ymm8
703 vpmadd52luq 192(%rcx),%ymm2,%ymm9
704 vpmadd52luq 224(%rcx),%ymm2,%ymm10
705 vpmadd52luq 256(%rcx),%ymm2,%ymm11
706 vpmadd52luq 288(%rcx),%ymm2,%ymm12
707
708
# Shift the 40-lane accumulator down by one qword lane; zero (from %ymm0)
# enters at the top and the old lane 0 is dropped.
709 valignq $1,%ymm3,%ymm4,%ymm3
710 valignq $1,%ymm4,%ymm5,%ymm4
711 valignq $1,%ymm5,%ymm6,%ymm5
712 valignq $1,%ymm6,%ymm7,%ymm6
713 valignq $1,%ymm7,%ymm8,%ymm7
714 valignq $1,%ymm8,%ymm9,%ymm8
715 valignq $1,%ymm9,%ymm10,%ymm9
716 valignq $1,%ymm10,%ymm11,%ymm10
717 valignq $1,%ymm11,%ymm12,%ymm11
718 valignq $1,%ymm12,%ymm0,%ymm12
719
# Fold the new lane 0 into the scalar accumulator.
720 vmovq %xmm3,%r13
721 addq %r13,%r9
722
# Accumulate the high 52 bits of the same products into the shifted lanes.
723 vpmadd52huq 0(%rsi),%ymm1,%ymm3
724 vpmadd52huq 32(%rsi),%ymm1,%ymm4
725 vpmadd52huq 64(%rsi),%ymm1,%ymm5
726 vpmadd52huq 96(%rsi),%ymm1,%ymm6
727 vpmadd52huq 128(%rsi),%ymm1,%ymm7
728 vpmadd52huq 160(%rsi),%ymm1,%ymm8
729 vpmadd52huq 192(%rsi),%ymm1,%ymm9
730 vpmadd52huq 224(%rsi),%ymm1,%ymm10
731 vpmadd52huq 256(%rsi),%ymm1,%ymm11
732 vpmadd52huq 288(%rsi),%ymm1,%ymm12
733
734 vpmadd52huq 0(%rcx),%ymm2,%ymm3
735 vpmadd52huq 32(%rcx),%ymm2,%ymm4
736 vpmadd52huq 64(%rcx),%ymm2,%ymm5
737 vpmadd52huq 96(%rcx),%ymm2,%ymm6
738 vpmadd52huq 128(%rcx),%ymm2,%ymm7
739 vpmadd52huq 160(%rcx),%ymm2,%ymm8
740 vpmadd52huq 192(%rcx),%ymm2,%ymm9
741 vpmadd52huq 224(%rcx),%ymm2,%ymm10
742 vpmadd52huq 256(%rcx),%ymm2,%ymm11
743 vpmadd52huq 288(%rcx),%ymm2,%ymm12
# ---- instance 2: word at 320(%r11); same sequence with all offsets
# moved by 320 bytes, scalar accumulator %r15 and factor from 8(%r8) ----
744 movq 320(%r11),%r13
745
746 vpbroadcastq %r13,%ymm1
747 movq 320(%rsi),%rdx
748 mulxq %r13,%r13,%r12
749 addq %r13,%r15
750 movq %r12,%r10
751 adcq $0,%r10
752
753 movq 8(%r8),%r13
754 imulq %r15,%r13
755 andq %rax,%r13
756
757 vpbroadcastq %r13,%ymm2
758 movq 320(%rcx),%rdx
759 mulxq %r13,%r13,%r12
760 addq %r13,%r15
761 adcq %r12,%r10
762
763 shrq $52,%r15
764 salq $12,%r10
765 orq %r10,%r15
766
767 vpmadd52luq 320(%rsi),%ymm1,%ymm13
768 vpmadd52luq 352(%rsi),%ymm1,%ymm14
769 vpmadd52luq 384(%rsi),%ymm1,%ymm15
770 vpmadd52luq 416(%rsi),%ymm1,%ymm16
771 vpmadd52luq 448(%rsi),%ymm1,%ymm17
772 vpmadd52luq 480(%rsi),%ymm1,%ymm18
773 vpmadd52luq 512(%rsi),%ymm1,%ymm19
774 vpmadd52luq 544(%rsi),%ymm1,%ymm20
775 vpmadd52luq 576(%rsi),%ymm1,%ymm21
776 vpmadd52luq 608(%rsi),%ymm1,%ymm22
777
778 vpmadd52luq 320(%rcx),%ymm2,%ymm13
779 vpmadd52luq 352(%rcx),%ymm2,%ymm14
780 vpmadd52luq 384(%rcx),%ymm2,%ymm15
781 vpmadd52luq 416(%rcx),%ymm2,%ymm16
782 vpmadd52luq 448(%rcx),%ymm2,%ymm17
783 vpmadd52luq 480(%rcx),%ymm2,%ymm18
784 vpmadd52luq 512(%rcx),%ymm2,%ymm19
785 vpmadd52luq 544(%rcx),%ymm2,%ymm20
786 vpmadd52luq 576(%rcx),%ymm2,%ymm21
787 vpmadd52luq 608(%rcx),%ymm2,%ymm22
788
789
790 valignq $1,%ymm13,%ymm14,%ymm13
791 valignq $1,%ymm14,%ymm15,%ymm14
792 valignq $1,%ymm15,%ymm16,%ymm15
793 valignq $1,%ymm16,%ymm17,%ymm16
794 valignq $1,%ymm17,%ymm18,%ymm17
795 valignq $1,%ymm18,%ymm19,%ymm18
796 valignq $1,%ymm19,%ymm20,%ymm19
797 valignq $1,%ymm20,%ymm21,%ymm20
798 valignq $1,%ymm21,%ymm22,%ymm21
799 valignq $1,%ymm22,%ymm0,%ymm22
800
801 vmovq %xmm13,%r13
802 addq %r13,%r15
803
804 vpmadd52huq 320(%rsi),%ymm1,%ymm13
805 vpmadd52huq 352(%rsi),%ymm1,%ymm14
806 vpmadd52huq 384(%rsi),%ymm1,%ymm15
807 vpmadd52huq 416(%rsi),%ymm1,%ymm16
808 vpmadd52huq 448(%rsi),%ymm1,%ymm17
809 vpmadd52huq 480(%rsi),%ymm1,%ymm18
810 vpmadd52huq 512(%rsi),%ymm1,%ymm19
811 vpmadd52huq 544(%rsi),%ymm1,%ymm20
812 vpmadd52huq 576(%rsi),%ymm1,%ymm21
813 vpmadd52huq 608(%rsi),%ymm1,%ymm22
814
815 vpmadd52huq 320(%rcx),%ymm2,%ymm13
816 vpmadd52huq 352(%rcx),%ymm2,%ymm14
817 vpmadd52huq 384(%rcx),%ymm2,%ymm15
818 vpmadd52huq 416(%rcx),%ymm2,%ymm16
819 vpmadd52huq 448(%rcx),%ymm2,%ymm17
820 vpmadd52huq 480(%rcx),%ymm2,%ymm18
821 vpmadd52huq 512(%rcx),%ymm2,%ymm19
822 vpmadd52huq 544(%rcx),%ymm2,%ymm20
823 vpmadd52huq 576(%rcx),%ymm2,%ymm21
824 vpmadd52huq 608(%rcx),%ymm2,%ymm22
# Advance to the next word of both instances and loop.
825 leaq 8(%r11),%r11
826 decl %ebx
827 jne .Lloop40
828
# ==== normalise instance 1 (%ymm3..%ymm12) to 52-bit lanes ====
# Put the scalar accumulator %r9 into lane 0 of %ymm3.
829 vpbroadcastq %r9,%ymm0
830 vpblendd $3,%ymm0,%ymm3,%ymm3
831
832
833
# Extract the bits above bit 51 of every lane as carries.
834 vpsrlq $52,%ymm3,%ymm0
835 vpsrlq $52,%ymm4,%ymm1
836 vpsrlq $52,%ymm5,%ymm2
837 vpsrlq $52,%ymm6,%ymm23
838 vpsrlq $52,%ymm7,%ymm24
839 vpsrlq $52,%ymm8,%ymm25
840 vpsrlq $52,%ymm9,%ymm26
841 vpsrlq $52,%ymm10,%ymm27
842 vpsrlq $52,%ymm11,%ymm28
843 vpsrlq $52,%ymm12,%ymm29
844
845
# Move every carry up by one lane; the lowest lane takes its value from
# .Lzeros (presumably all-zero data; its definition is not next to this
# function).
846 valignq $3,%ymm28,%ymm29,%ymm29
847 valignq $3,%ymm27,%ymm28,%ymm28
848 valignq $3,%ymm26,%ymm27,%ymm27
849 valignq $3,%ymm25,%ymm26,%ymm26
850 valignq $3,%ymm24,%ymm25,%ymm25
851 valignq $3,%ymm23,%ymm24,%ymm24
852 valignq $3,%ymm2,%ymm23,%ymm23
853 valignq $3,%ymm1,%ymm2,%ymm2
854 valignq $3,%ymm0,%ymm1,%ymm1
855 valignq $3,.Lzeros(%rip),%ymm0,%ymm0
856
857
# Keep the low 52 bits of every lane.
858 vpandq .Lmask52x4(%rip),%ymm3,%ymm3
859 vpandq .Lmask52x4(%rip),%ymm4,%ymm4
860 vpandq .Lmask52x4(%rip),%ymm5,%ymm5
861 vpandq .Lmask52x4(%rip),%ymm6,%ymm6
862 vpandq .Lmask52x4(%rip),%ymm7,%ymm7
863 vpandq .Lmask52x4(%rip),%ymm8,%ymm8
864 vpandq .Lmask52x4(%rip),%ymm9,%ymm9
865 vpandq .Lmask52x4(%rip),%ymm10,%ymm10
866 vpandq .Lmask52x4(%rip),%ymm11,%ymm11
867 vpandq .Lmask52x4(%rip),%ymm12,%ymm12
868
869
# Add the shifted carries; a lane may now exceed 52 bits again.
870 vpaddq %ymm0,%ymm3,%ymm3
871 vpaddq %ymm1,%ymm4,%ymm4
872 vpaddq %ymm2,%ymm5,%ymm5
873 vpaddq %ymm23,%ymm6,%ymm6
874 vpaddq %ymm24,%ymm7,%ymm7
875 vpaddq %ymm25,%ymm8,%ymm8
876 vpaddq %ymm26,%ymm9,%ymm9
877 vpaddq %ymm27,%ymm10,%ymm10
878 vpaddq %ymm28,%ymm11,%ymm11
879 vpaddq %ymm29,%ymm12,%ymm12
880
881
882
# Predicate 6 is unsigned "greater than": 40-bit mask of the lanes that
# exceed the 52-bit mask, packed into %r14b (lanes 0-7), %r13b, %r12b,
# %r11b and %r10b (lanes 32-39). The loop is finished, so %r11, %r10 and
# %r9 are free to be reused here.
883 vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
884 vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2
885 kmovb %k1,%r14d
886 kmovb %k2,%r13d
887 shlb $4,%r13b
888 orb %r13b,%r14b
889
890 vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1
891 vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2
892 kmovb %k1,%r13d
893 kmovb %k2,%r12d
894 shlb $4,%r12b
895 orb %r12b,%r13b
896
897 vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1
898 vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2
899 kmovb %k1,%r12d
900 kmovb %k2,%r11d
901 shlb $4,%r11b
902 orb %r11b,%r12b
903
904 vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1
905 vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2
906 kmovb %k1,%r11d
907 kmovb %k2,%r10d
908 shlb $4,%r10b
909 orb %r10b,%r11b
910
911 vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1
912 vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2
913 kmovb %k1,%r10d
914 kmovb %k2,%r9d
915 shlb $4,%r9b
916 orb %r9b,%r10b
917
# Shift the 40-bit "greater" mask left by one bit across the five bytes.
918 addb %r14b,%r14b
919 adcb %r13b,%r13b
920 adcb %r12b,%r12b
921 adcb %r11b,%r11b
922 adcb %r10b,%r10b
923
924
# Predicate 0 is "equal": mask of lanes that are exactly 2^52-1. The input
# pointers are no longer needed, so %r8, %rdx, %rcx, %rbx and %rax serve
# as scratch from here on.
925 vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
926 vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2
927 kmovb %k1,%r9d
928 kmovb %k2,%r8d
929 shlb $4,%r8b
930 orb %r8b,%r9b
931
932 vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1
933 vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2
934 kmovb %k1,%r8d
935 kmovb %k2,%edx
936 shlb $4,%dl
937 orb %dl,%r8b
938
939 vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1
940 vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2
941 kmovb %k1,%edx
942 kmovb %k2,%ecx
943 shlb $4,%cl
944 orb %cl,%dl
945
946 vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1
947 vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2
948 kmovb %k1,%ecx
949 kmovb %k2,%ebx
950 shlb $4,%bl
951 orb %bl,%cl
952
953 vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1
954 vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2
955 kmovb %k1,%ebx
956 kmovb %k2,%eax
957 shlb $4,%al
958 orb %al,%bl
959
# Adding the "equal" mask lets the integer carry chain ripple a carry
# through runs of all-ones lanes; the xor that follows leaves a bit set
# for every lane that must be incremented.
960 addb %r9b,%r14b
961 adcb %r8b,%r13b
962 adcb %dl,%r12b
963 adcb %cl,%r11b
964 adcb %bl,%r10b
965
966 xorb %r9b,%r14b
967 xorb %r8b,%r13b
968 xorb %dl,%r12b
969 xorb %cl,%r11b
970 xorb %bl,%r10b
971
# Split the bytes back into 4-bit lane masks %k1..%k7.
972 kmovb %r14d,%k1
973 shrb $4,%r14b
974 kmovb %r14d,%k2
975 kmovb %r13d,%k3
976 shrb $4,%r13b
977 kmovb %r13d,%k4
978 kmovb %r12d,%k5
979 shrb $4,%r12b
980 kmovb %r12d,%k6
981 kmovb %r11d,%k7
982
# In the selected lanes subtract 2^52-1; together with the masking below
# this is an increment modulo 2^52.
983 vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
984 vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2}
985 vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3}
986 vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4}
987 vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5}
988 vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6}
989 vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7}
990
991 vpandq .Lmask52x4(%rip),%ymm3,%ymm3
992 vpandq .Lmask52x4(%rip),%ymm4,%ymm4
993 vpandq .Lmask52x4(%rip),%ymm5,%ymm5
994 vpandq .Lmask52x4(%rip),%ymm6,%ymm6
995 vpandq .Lmask52x4(%rip),%ymm7,%ymm7
996 vpandq .Lmask52x4(%rip),%ymm8,%ymm8
997 vpandq .Lmask52x4(%rip),%ymm9,%ymm9
998
# Same treatment for the last three vectors, reusing %k1..%k3.
999 shrb $4,%r11b
1000 kmovb %r11d,%k1
1001 kmovb %r10d,%k2
1002 shrb $4,%r10b
1003 kmovb %r10d,%k3
1004
1005 vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1}
1006 vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k2}
1007 vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k3}
1008
1009 vpandq .Lmask52x4(%rip),%ymm10,%ymm10
1010 vpandq .Lmask52x4(%rip),%ymm11,%ymm11
1011 vpandq .Lmask52x4(%rip),%ymm12,%ymm12
1012
# ==== normalise instance 2 (%ymm13..%ymm22); same sequence as above ====
# Put the scalar accumulator %r15 into lane 0 of %ymm13.
1013 vpbroadcastq %r15,%ymm0
1014 vpblendd $3,%ymm0,%ymm13,%ymm13
1015
1016
1017
1018 vpsrlq $52,%ymm13,%ymm0
1019 vpsrlq $52,%ymm14,%ymm1
1020 vpsrlq $52,%ymm15,%ymm2
1021 vpsrlq $52,%ymm16,%ymm23
1022 vpsrlq $52,%ymm17,%ymm24
1023 vpsrlq $52,%ymm18,%ymm25
1024 vpsrlq $52,%ymm19,%ymm26
1025 vpsrlq $52,%ymm20,%ymm27
1026 vpsrlq $52,%ymm21,%ymm28
1027 vpsrlq $52,%ymm22,%ymm29
1028
1029
1030 valignq $3,%ymm28,%ymm29,%ymm29
1031 valignq $3,%ymm27,%ymm28,%ymm28
1032 valignq $3,%ymm26,%ymm27,%ymm27
1033 valignq $3,%ymm25,%ymm26,%ymm26
1034 valignq $3,%ymm24,%ymm25,%ymm25
1035 valignq $3,%ymm23,%ymm24,%ymm24
1036 valignq $3,%ymm2,%ymm23,%ymm23
1037 valignq $3,%ymm1,%ymm2,%ymm2
1038 valignq $3,%ymm0,%ymm1,%ymm1
1039 valignq $3,.Lzeros(%rip),%ymm0,%ymm0
1040
1041
1042 vpandq .Lmask52x4(%rip),%ymm13,%ymm13
1043 vpandq .Lmask52x4(%rip),%ymm14,%ymm14
1044 vpandq .Lmask52x4(%rip),%ymm15,%ymm15
1045 vpandq .Lmask52x4(%rip),%ymm16,%ymm16
1046 vpandq .Lmask52x4(%rip),%ymm17,%ymm17
1047 vpandq .Lmask52x4(%rip),%ymm18,%ymm18
1048 vpandq .Lmask52x4(%rip),%ymm19,%ymm19
1049 vpandq .Lmask52x4(%rip),%ymm20,%ymm20
1050 vpandq .Lmask52x4(%rip),%ymm21,%ymm21
1051 vpandq .Lmask52x4(%rip),%ymm22,%ymm22
1052
1053
1054 vpaddq %ymm0,%ymm13,%ymm13
1055 vpaddq %ymm1,%ymm14,%ymm14
1056 vpaddq %ymm2,%ymm15,%ymm15
1057 vpaddq %ymm23,%ymm16,%ymm16
1058 vpaddq %ymm24,%ymm17,%ymm17
1059 vpaddq %ymm25,%ymm18,%ymm18
1060 vpaddq %ymm26,%ymm19,%ymm19
1061 vpaddq %ymm27,%ymm20,%ymm20
1062 vpaddq %ymm28,%ymm21,%ymm21
1063 vpaddq %ymm29,%ymm22,%ymm22
1064
1065
1066
# "Greater than 2^52-1" lane mask for instance 2.
1067 vpcmpuq $6,.Lmask52x4(%rip),%ymm13,%k1
1068 vpcmpuq $6,.Lmask52x4(%rip),%ymm14,%k2
1069 kmovb %k1,%r14d
1070 kmovb %k2,%r13d
1071 shlb $4,%r13b
1072 orb %r13b,%r14b
1073
1074 vpcmpuq $6,.Lmask52x4(%rip),%ymm15,%k1
1075 vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2
1076 kmovb %k1,%r13d
1077 kmovb %k2,%r12d
1078 shlb $4,%r12b
1079 orb %r12b,%r13b
1080
1081 vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k1
1082 vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k2
1083 kmovb %k1,%r12d
1084 kmovb %k2,%r11d
1085 shlb $4,%r11b
1086 orb %r11b,%r12b
1087
1088 vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k1
1089 vpcmpuq $6,.Lmask52x4(%rip),%ymm20,%k2
1090 kmovb %k1,%r11d
1091 kmovb %k2,%r10d
1092 shlb $4,%r10b
1093 orb %r10b,%r11b
1094
1095 vpcmpuq $6,.Lmask52x4(%rip),%ymm21,%k1
1096 vpcmpuq $6,.Lmask52x4(%rip),%ymm22,%k2
1097 kmovb %k1,%r10d
1098 kmovb %k2,%r9d
1099 shlb $4,%r9b
1100 orb %r9b,%r10b
1101
1102 addb %r14b,%r14b
1103 adcb %r13b,%r13b
1104 adcb %r12b,%r12b
1105 adcb %r11b,%r11b
1106 adcb %r10b,%r10b
1107
1108
# "Equal to 2^52-1" lane mask for instance 2.
1109 vpcmpuq $0,.Lmask52x4(%rip),%ymm13,%k1
1110 vpcmpuq $0,.Lmask52x4(%rip),%ymm14,%k2
1111 kmovb %k1,%r9d
1112 kmovb %k2,%r8d
1113 shlb $4,%r8b
1114 orb %r8b,%r9b
1115
1116 vpcmpuq $0,.Lmask52x4(%rip),%ymm15,%k1
1117 vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2
1118 kmovb %k1,%r8d
1119 kmovb %k2,%edx
1120 shlb $4,%dl
1121 orb %dl,%r8b
1122
1123 vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k1
1124 vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k2
1125 kmovb %k1,%edx
1126 kmovb %k2,%ecx
1127 shlb $4,%cl
1128 orb %cl,%dl
1129
1130 vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k1
1131 vpcmpuq $0,.Lmask52x4(%rip),%ymm20,%k2
1132 kmovb %k1,%ecx
1133 kmovb %k2,%ebx
1134 shlb $4,%bl
1135 orb %bl,%cl
1136
1137 vpcmpuq $0,.Lmask52x4(%rip),%ymm21,%k1
1138 vpcmpuq $0,.Lmask52x4(%rip),%ymm22,%k2
1139 kmovb %k1,%ebx
1140 kmovb %k2,%eax
1141 shlb $4,%al
1142 orb %al,%bl
1143
# Carry ripple and selection of the lanes to increment, as for instance 1.
1144 addb %r9b,%r14b
1145 adcb %r8b,%r13b
1146 adcb %dl,%r12b
1147 adcb %cl,%r11b
1148 adcb %bl,%r10b
1149
1150 xorb %r9b,%r14b
1151 xorb %r8b,%r13b
1152 xorb %dl,%r12b
1153 xorb %cl,%r11b
1154 xorb %bl,%r10b
1155
1156 kmovb %r14d,%k1
1157 shrb $4,%r14b
1158 kmovb %r14d,%k2
1159 kmovb %r13d,%k3
1160 shrb $4,%r13b
1161 kmovb %r13d,%k4
1162 kmovb %r12d,%k5
1163 shrb $4,%r12b
1164 kmovb %r12d,%k6
1165 kmovb %r11d,%k7
1166
1167 vpsubq .Lmask52x4(%rip),%ymm13,%ymm13{%k1}
1168 vpsubq .Lmask52x4(%rip),%ymm14,%ymm14{%k2}
1169 vpsubq .Lmask52x4(%rip),%ymm15,%ymm15{%k3}
1170 vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k4}
1171 vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k5}
1172 vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k6}
1173 vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k7}
1174
1175 vpandq .Lmask52x4(%rip),%ymm13,%ymm13
1176 vpandq .Lmask52x4(%rip),%ymm14,%ymm14
1177 vpandq .Lmask52x4(%rip),%ymm15,%ymm15
1178 vpandq .Lmask52x4(%rip),%ymm16,%ymm16
1179 vpandq .Lmask52x4(%rip),%ymm17,%ymm17
1180 vpandq .Lmask52x4(%rip),%ymm18,%ymm18
1181 vpandq .Lmask52x4(%rip),%ymm19,%ymm19
1182
1183 shrb $4,%r11b
1184 kmovb %r11d,%k1
1185 kmovb %r10d,%k2
1186 shrb $4,%r10b
1187 kmovb %r10d,%k3
1188
1189 vpsubq .Lmask52x4(%rip),%ymm20,%ymm20{%k1}
1190 vpsubq .Lmask52x4(%rip),%ymm21,%ymm21{%k2}
1191 vpsubq .Lmask52x4(%rip),%ymm22,%ymm22{%k3}
1192
1193 vpandq .Lmask52x4(%rip),%ymm20,%ymm20
1194 vpandq .Lmask52x4(%rip),%ymm21,%ymm21
1195 vpandq .Lmask52x4(%rip),%ymm22,%ymm22
1196
# Store the 40 result qwords of instance 1.
1197 vmovdqu64 %ymm3,0(%rdi)
1198 vmovdqu64 %ymm4,32(%rdi)
1199 vmovdqu64 %ymm5,64(%rdi)
1200 vmovdqu64 %ymm6,96(%rdi)
1201 vmovdqu64 %ymm7,128(%rdi)
1202 vmovdqu64 %ymm8,160(%rdi)
1203 vmovdqu64 %ymm9,192(%rdi)
1204 vmovdqu64 %ymm10,224(%rdi)
1205 vmovdqu64 %ymm11,256(%rdi)
1206 vmovdqu64 %ymm12,288(%rdi)
1207
# Store the 40 result qwords of instance 2 directly after them.
1208 vmovdqu64 %ymm13,320(%rdi)
1209 vmovdqu64 %ymm14,352(%rdi)
1210 vmovdqu64 %ymm15,384(%rdi)
1211 vmovdqu64 %ymm16,416(%rdi)
1212 vmovdqu64 %ymm17,448(%rdi)
1213 vmovdqu64 %ymm18,480(%rdi)
1214 vmovdqu64 %ymm19,512(%rdi)
1215 vmovdqu64 %ymm20,544(%rdi)
1216 vmovdqu64 %ymm21,576(%rdi)
1217 vmovdqu64 %ymm22,608(%rdi)
1218
# Epilogue: reload the six saved registers through %rax (reverse push
# order), then release the 48 bytes in one step.
1219 vzeroupper
1220 leaq (%rsp),%rax
1221.cfi_def_cfa_register %rax
1222 movq 0(%rax),%r15
1223.cfi_restore %r15
1224 movq 8(%rax),%r14
1225.cfi_restore %r14
1226 movq 16(%rax),%r13
1227.cfi_restore %r13
1228 movq 24(%rax),%r12
1229.cfi_restore %r12
1230 movq 32(%rax),%rbp
1231.cfi_restore %rbp
1232 movq 40(%rax),%rbx
1233.cfi_restore %rbx
1234 leaq 48(%rax),%rsp
1235.cfi_def_cfa %rsp,8
1236.Lossl_rsaz_amm52x40_x2_ifma256_epilogue:
# The bytes f3 c3 encode "rep ret".
1237 .byte 0xf3,0xc3
1238.cfi_endproc
1239.size ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256
.text

#-----------------------------------------------------------------------
# ossl_extract_multiplier_2x40_win5
#
# Copies one 640-byte entry out of a 32-entry table. Every entry of the
# table is read on every call and the wanted one is picked with a mask
# blend, so neither a branch nor a load address depends on the index
# values.
#
# In (register roles as used by the code; they line up with the
#     System V AMD64 integer argument order):
#   %rdi  destination buffer; bytes 0..639 are written
#   %rsi  table base; 32 entries x 640 bytes = 20480 bytes are read
#   %rdx  index of the entry whose bytes   0..319 go to output   0..319
#   %rcx  index of the entry whose bytes 320..639 go to output 320..639
# Out:    nothing in registers; the result is stored through %rdi.
# Clobbers: %rax, %rsi, %r10, %k1, %ymm0-%ymm5, %ymm16-%ymm24, flags.
#           No vzeroupper is issued before returning.
#
# Notes:
#   * The indices are broadcast and compared as full 64-bit values, so
#     the caller has to pass them zero-extended.
#     NOTE(review): confirm against the C prototype.
#   * An index outside 0..31 never matches. For %rdx that leaves zeros
#     in output 0..319. For %rcx the accumulators are not cleared
#     between the two passes, so output 320..639 would then repeat what
#     was stored to 0..319.
#-----------------------------------------------------------------------
.align 32
.globl ossl_extract_multiplier_2x40_win5
.type ossl_extract_multiplier_2x40_win5,@function
ossl_extract_multiplier_2x40_win5:
.cfi_startproc
.byte 243,15,30,250                             # endbr64
        vmovdqa64    .Lones(%rip),%ymm24        # ymm24 = {1,1,1,1}, counter step
        vpbroadcastq %rdx,%ymm22                # ymm22 = first index in all lanes
        vpbroadcastq %rcx,%ymm23                # ymm23 = second index in all lanes
        leaq         20480(%rsi),%rax           # rax = one past the last entry

        movq         %rsi,%r10                  # keep table base for the second pass

        # Clear the ten accumulators (ymm0-5, ymm16-19).
        vpxor        %xmm0,%xmm0,%xmm0
        vmovdqa64    %ymm0,%ymm1
        vmovdqa64    %ymm0,%ymm2
        vmovdqa64    %ymm0,%ymm3
        vmovdqa64    %ymm0,%ymm4
        vmovdqa64    %ymm0,%ymm5
        vmovdqa64    %ymm0,%ymm16
        vmovdqa64    %ymm0,%ymm17
        vmovdqa64    %ymm0,%ymm18
        vmovdqa64    %ymm0,%ymm19
        vpxorq       %ymm21,%ymm21,%ymm21       # ymm21 = running entry number, starts at 0

        # Pass 1: bytes 0..319 of every entry; keep the one whose entry
        # number equals the first index.
.align 32
.Lloop_0:
        vpcmpq       $0,%ymm21,%ymm22,%k1       # k1 = all lanes set iff entry number == index
        vmovdqu64    0(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm0,%ymm0{%k1}    # take the loaded data where k1 is set, else keep
        vmovdqu64    32(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm1,%ymm1{%k1}
        vmovdqu64    64(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm2,%ymm2{%k1}
        vmovdqu64    96(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm3,%ymm3{%k1}
        vmovdqu64    128(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm4,%ymm4{%k1}
        vmovdqu64    160(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm5,%ymm5{%k1}
        vmovdqu64    192(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm16,%ymm16{%k1}
        vmovdqu64    224(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm17,%ymm17{%k1}
        vmovdqu64    256(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm18,%ymm18{%k1}
        vmovdqu64    288(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm19,%ymm19{%k1}
        vpaddq       %ymm24,%ymm21,%ymm21       # next entry number
        addq         $640,%rsi                  # next entry
        cmpq         %rsi,%rax
        jne          .Lloop_0

        # Store the first half of the result.
        vmovdqu64    %ymm0,0(%rdi)
        vmovdqu64    %ymm1,32(%rdi)
        vmovdqu64    %ymm2,64(%rdi)
        vmovdqu64    %ymm3,96(%rdi)
        vmovdqu64    %ymm4,128(%rdi)
        vmovdqu64    %ymm5,160(%rdi)
        vmovdqu64    %ymm16,192(%rdi)
        vmovdqu64    %ymm17,224(%rdi)
        vmovdqu64    %ymm18,256(%rdi)
        vmovdqu64    %ymm19,288(%rdi)

        # Pass 2: rewind to the table base and restart the entry counter.
        # The accumulators still hold the pass-1 result here.
        movq         %r10,%rsi
        vpxorq       %ymm21,%ymm21,%ymm21

        # Bytes 320..639 of every entry; keep the one whose entry number
        # equals the second index.
.align 32
.Lloop_320:
        vpcmpq       $0,%ymm21,%ymm23,%k1
        vmovdqu64    320(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm0,%ymm0{%k1}
        vmovdqu64    352(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm1,%ymm1{%k1}
        vmovdqu64    384(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm2,%ymm2{%k1}
        vmovdqu64    416(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm3,%ymm3{%k1}
        vmovdqu64    448(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm4,%ymm4{%k1}
        vmovdqu64    480(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm5,%ymm5{%k1}
        vmovdqu64    512(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm16,%ymm16{%k1}
        vmovdqu64    544(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm17,%ymm17{%k1}
        vmovdqu64    576(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm18,%ymm18{%k1}
        vmovdqu64    608(%rsi),%ymm20
        vpblendmq    %ymm20,%ymm19,%ymm19{%k1}
        vpaddq       %ymm24,%ymm21,%ymm21
        addq         $640,%rsi
        cmpq         %rsi,%rax
        jne          .Lloop_320

        # Store the second half of the result.
        vmovdqu64    %ymm0,320(%rdi)
        vmovdqu64    %ymm1,352(%rdi)
        vmovdqu64    %ymm2,384(%rdi)
        vmovdqu64    %ymm3,416(%rdi)
        vmovdqu64    %ymm4,448(%rdi)
        vmovdqu64    %ymm5,480(%rdi)
        vmovdqu64    %ymm16,512(%rdi)
        vmovdqu64    %ymm17,544(%rdi)
        vmovdqu64    %ymm18,576(%rdi)
        vmovdqu64    %ymm19,608(%rdi)

        .byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5
# Vector constants, 32-byte aligned because .Lones is fetched with an
# aligned 256-bit load (vmovdqa64). They are only ever read in the code
# visible here, so they are placed in .rodata instead of writable .data.
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1                   # per-lane increment for the table entry counter
.Lzeros:
.quad 0,0,0,0                   # NOTE(review): no reference to .Lzeros is visible in this part of the file
# GNU property note. The numeric local labels 0..4 delimit the name and
# descriptor fields so that their sizes are computed by the assembler.
        .section ".note.gnu.property", "a"
        .p2align 3
        .long 1f - 0f                   # namesz: length of the name field
        .long 4f - 1f                   # descsz: length of the descriptor
        .long 5                         # note type (NT_GNU_PROPERTY_TYPE_0)
0:
        # "GNU" encoded with .byte, since .asciz isn't supported
        # on Solaris.
        .byte 0x47
        .byte 0x4e
        .byte 0x55
        .byte 0
1:
        .p2align 3
        .long 0xc0000002                # property type (GNU_PROPERTY_X86_FEATURE_1_AND)
        .long 3f - 2f                   # property data size
2:
        .long 3                         # feature bits 0 and 1 (IBT and SHSTK)
3:
        .p2align 3
4:
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette