VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.0/crypto/genasm-nasm/poly1305-x86_64.S@ 99371

此檔案自 r99371 以來的最後變更為 r99371,由 vboxsync 於 23 個月前提交

openssl-3.1.0: After generating headers and asm (kmk recreate-headers recreate-headers)

  • 屬性 svn:eol-style 設為 native
  • 屬性 svn:keywords 設為 Author Date Id Revision
檔案大小: 70.6 KB
 
; NASM (Intel operand order) source for the Poly1305 routines below.
; "default rel": symbol-only memory operands are addressed RIP-relative.
1default rel
; The operand-size keywords written on vector memory operands are defined
; as empty text, so they vanish during preprocessing.
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section .text code align=64
6
7
; Capability words defined elsewhere; poly1305_init reads the qword at +4
; to choose between the scalar and the vector implementations.
8EXTERN OPENSSL_ia32cap_P
9
; Exported entry points.
10global poly1305_init
11
12global poly1305_blocks
13
14global poly1305_emit
15
16
17
ALIGN 32
;-----------------------------------------------------------------------
; poly1305_init -- clear the accumulator, clamp the key and pick the
;                  block/emit implementation for this CPU.
;
; ABI:   Microsoft x64 (arguments in rcx, rdx, r8).
; In:    rcx = context
;        rdx = key pointer (16 bytes are read here) or NULL
;        r8  = table that receives two function pointers
; Out:   rax = 0 when rdx is NULL (only context +0..+23 is cleared)
;        rax = 1 after the key words and both pointers were stored
;        (the $L$init_base2_44 path is outside this excerpt)
; Writes: context +0/+8/+16 = 0, +24/+32 = clamped key qwords,
;         [r8] = blocks function, [r8+8] = emit function
; Clobb: rcx, rdx, r9, r10, r11, flags.  rdi/rsi are parked in the
;        caller's home area and reloaded before returning.
;-----------------------------------------------------------------------
poly1305_init:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_poly1305_init:
        mov     rdi,rcx                 ; rdi = context
        mov     rsi,rdx                 ; rsi = key
        mov     rdx,r8                  ; rdx = function table

        xor     eax,eax                 ; rax = 0; doubles as the NULL-key result
        mov     QWORD[rdi],rax          ; accumulator = 0
        mov     QWORD[8+rdi],rax
        mov     QWORD[16+rdi],rax

        test    rsi,rsi                 ; no key: leave with rax = 0
        je      NEAR $L$no_key

        ; Start from the scalar pair and upgrade according to the
        ; capability bits (dwords 1 and 2 of OPENSSL_ia32cap_P).
        lea     r10,[poly1305_blocks]
        lea     r11,[poly1305_emit]
        mov     r9,QWORD[((OPENSSL_ia32cap_P+4))]
        lea     rax,[poly1305_blocks_avx]
        lea     rcx,[poly1305_emit_avx]
        bt      r9,28                   ; dword 1 bit 28 (AVX per OPENSSL_ia32cap layout)
        cmovc   r10,rax
        cmovc   r11,rcx
        lea     rax,[poly1305_blocks_avx2]
        bt      r9,37                   ; dword 2 bit 5 (AVX2 per OPENSSL_ia32cap layout)
        cmovc   r10,rax

        ; All of bits 31, 21 and 16 of dword 2 set -> separate init path.
        mov     rax,0x80210000          ; (1<<31)|(1<<21)|(1<<16)
        shr     r9,32
        and     r9,rax
        cmp     r9,rax
        je      NEAR $L$init_base2_44

        ; Clamp the two key qwords and publish key + function pointers.
        mov     rax,0x0ffffffc0fffffff
        mov     rcx,0x0ffffffc0ffffffc
        and     rax,QWORD[rsi]
        and     rcx,QWORD[8+rsi]
        mov     QWORD[24+rdi],rax
        mov     QWORD[32+rdi],rcx
        mov     QWORD[rdx],r10
        mov     QWORD[8+rdx],r11
        mov     eax,1
$L$no_key:
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret

$L$SEH_end_poly1305_init:
69
70
ALIGN 32
;-----------------------------------------------------------------------
; poly1305_blocks -- absorb whole 16-byte blocks with 64-bit integer code
;
; ABI:   Microsoft x64
; In:    rcx = context, rdx = input, r8 = length in bytes,
;        r9  = value added to the third accumulator word for every block
; Context: +0/+8/+16 accumulator words h0/h1/h2, +24 key word r0,
;          +32 key word r1
; Loop registers:
;        r14:rbx:rbp = h0:h1:h2      r11 = r0      r12 = r1
;        r13 = r1 + (r1 >> 2)        r15 = blocks left
; $L$blocks is also a jump target for the vector entry points; they
; arrive with rdi/rsi/rdx/rcx already holding the values set up below.
;-----------------------------------------------------------------------
poly1305_blocks:
        mov     qword [rsp+8], rdi              ; WIN64 prologue
        mov     qword [rsp+16], rsi
        mov     rax, rsp
$L$SEH_begin_poly1305_blocks:
        mov     rdi, rcx                        ; rdi = context
        mov     rsi, rdx                        ; rsi = input
        mov     rdx, r8                         ; rdx = length
        mov     rcx, r9                         ; rcx = per-block top-word addend

$L$blocks:
        shr     rdx, 4                          ; bytes -> 16-byte blocks
        jz      NEAR $L$no_data                 ; nothing to do

        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15

$L$blocks_body:
        mov     r15, rdx                        ; r15 = block counter

        mov     r11, qword [rdi+24]             ; r0
        mov     r13, qword [rdi+32]             ; r1

        mov     r14, qword [rdi]                ; h0
        mov     rbx, qword [rdi+8]              ; h1
        mov     rbp, qword [rdi+16]             ; h2

        mov     r12, r13                        ; keep r1
        shr     r13, 2
        mov     rax, r12                        ; rax = r1 for the first mul
        add     r13, r12                        ; r13 = r1 + (r1 >> 2)
        jmp     NEAR $L$oop

ALIGN 32
$L$oop:
        ; h += block, plus rcx in the third word
        add     r14, qword [rsi]
        adc     rbx, qword [rsi+8]
        lea     rsi, [rsi+16]
        adc     rbp, rcx

        mul     r14                             ; rdx:rax = h0 * r1
        mov     r9, rax
        mov     rax, r11
        mov     r10, rdx

        mul     r14                             ; rdx:rax = h0 * r0
        mov     r14, rax
        mov     rax, r11
        mov     r8, rdx

        mul     rbx                             ; rdx:rax = h1 * r0
        add     r9, rax
        mov     rax, r13
        adc     r10, rdx

        mul     rbx                             ; rdx:rax = h1 * r13
        mov     rbx, rbp
        add     r14, rax
        adc     r8, rdx

        imul    rbx, r13                        ; h2 * r13
        add     r9, rbx
        mov     rbx, r8
        adc     r10, 0

        imul    rbp, r11                        ; h2 * r0
        add     rbx, r9
        mov     rax, -4
        adc     r10, rbp

        ; Fold the top word: keep its low two bits in h2 and add
        ; (top & ~3) + (top >> 2) back into the low words.
        and     rax, r10
        mov     rbp, r10
        shr     r10, 2
        and     rbp, 3
        add     rax, r10
        add     r14, rax
        adc     rbx, 0
        adc     rbp, 0
        mov     rax, r12                        ; rax = r1 for the next block
        dec     r15
        jnz     NEAR $L$oop

        mov     qword [rdi], r14                ; write h back
        mov     qword [rdi+8], rbx
        mov     qword [rdi+16], rbp

        ; Reload the six saved registers, then drop their 48 bytes.
        mov     r15, qword [rsp]
        mov     r14, qword [rsp+8]
        mov     r13, qword [rsp+16]
        mov     r12, qword [rsp+24]
        mov     rbp, qword [rsp+32]
        mov     rbx, qword [rsp+40]
        lea     rsp, [rsp+48]

$L$no_data:
$L$blocks_epilogue:
        mov     rdi, qword [rsp+8]              ; WIN64 epilogue
        mov     rsi, qword [rsp+16]
        db      0F3h, 0C3h                      ; repret

$L$SEH_end_poly1305_blocks:
190
191
ALIGN 32
;-----------------------------------------------------------------------
; poly1305_emit -- final reduction and tag output (base 2^64 accumulator)
;
; ABI:   Microsoft x64
; In:    rcx = context, rdx = 16-byte output buffer,
;        r8  = pointer to the 16-byte value added to the result
; Out:   16 bytes stored at the output buffer
; Clobb: rax, rcx, rdx, r8, r9, r10, flags
; $L$emit is also the target poly1305_emit_avx jumps to when the
; context flag at +20 is zero.
;-----------------------------------------------------------------------
poly1305_emit:
        mov     qword [rsp+8], rdi              ; WIN64 prologue
        mov     qword [rsp+16], rsi
        mov     rax, rsp
$L$SEH_begin_poly1305_emit:
        mov     rdi, rcx                        ; rdi = context
        mov     rsi, rdx                        ; rsi = output
        mov     rdx, r8                         ; rdx = addend

$L$emit:
        mov     r8, qword [rdi]                 ; h0
        mov     r9, qword [rdi+8]               ; h1
        mov     r10, qword [rdi+16]             ; h2

        ; Compute h + 5 next to h; if the sum reaches bit 130
        ; (r10 >> 2 != 0) the low 128 bits of h + 5 are the result.
        mov     rax, r8
        add     r8, 5
        mov     rcx, r9
        adc     r9, 0
        adc     r10, 0
        shr     r10, 2
        cmovnz  rax, r8
        cmovnz  rcx, r9

        ; Add the 128-bit value at rdx (carry out is dropped) and store.
        add     rax, qword [rdx]
        adc     rcx, qword [rdx+8]
        mov     qword [rsi], rax
        mov     qword [rsi+8], rcx

        mov     rdi, qword [rsp+8]              ; WIN64 epilogue
        mov     rsi, qword [rsp+16]
        db      0F3h, 0C3h                      ; repret

$L$SEH_end_poly1305_emit:
228
ALIGN 32
;-----------------------------------------------------------------------
; __poly1305_block -- one multiply-and-fold step shared by the vector
;                     entry points (same arithmetic as the loop body of
;                     poly1305_blocks, without the input addition).
;
; In:    r14:rbx:rbp = h0:h1:h2
;        r11 = r0, rax = r1, r13 = r1 + (r1 >> 2)
; Out:   r14:rbx:rbp = updated h (rbp ends up at most 4)
; Clobb: rax, rdx, r8, r9, r10, flags.  r11, r12, r13 are not written.
;-----------------------------------------------------------------------
__poly1305_block:

        mul     r14                             ; rdx:rax = h0 * r1
        mov     r9, rax
        mov     rax, r11
        mov     r10, rdx

        mul     r14                             ; rdx:rax = h0 * r0
        mov     r14, rax
        mov     rax, r11
        mov     r8, rdx

        mul     rbx                             ; rdx:rax = h1 * r0
        add     r9, rax
        mov     rax, r13
        adc     r10, rdx

        mul     rbx                             ; rdx:rax = h1 * r13
        mov     rbx, rbp
        add     r14, rax
        adc     r8, rdx

        imul    rbx, r13                        ; h2 * r13
        add     r9, rbx
        mov     rbx, r8
        adc     r10, 0

        imul    rbp, r11                        ; h2 * r0
        add     rbx, r9
        mov     rax, -4
        adc     r10, rbp

        ; Fold the top word: low two bits stay in rbp, the rest is
        ; added back as (top & ~3) + (top >> 2).
        and     rax, r10
        mov     rbp, r10
        shr     r10, 2
        and     rbp, 3
        add     rax, r10
        add     r14, rax
        adc     rbx, 0
        adc     rbp, 0
        db      0F3h, 0C3h                      ; repret
271
272
273
274
275ALIGN 32
;-----------------------------------------------------------------------
; __poly1305_init_avx -- build the table of key powers read by the
; vector code.
; In:    rdi = context, r11/r12 = the two key qwords, r13 = the third
;        multiplier __poly1305_block expects (callers pass r12+(r12>>2)).
; Each __poly1305_block call multiplies r14:rbx:rbp by the key, so the
; three calls below yield the 2nd, 3rd and 4th power of the key.
; Every value is split into five 26-bit limbs stored as dwords.  The
; table starts at context+48 and has nine 16-byte rows:
;   limb0, limb1, 5*limb1, limb2, 5*limb2, limb3, 5*limb3, limb4, 5*limb4
; Inside a row: +0 = key^2, +4 = key, +8 = key^4, +12 = key^3.
; rdi is biased by +112 while the table is written and restored at exit.
; Clobb: rax, rdx, r8, r9, r10, r14, rbx, rbp, flags.
;-----------------------------------------------------------------------
276__poly1305_init_avx:
277
; Start from h = key (top word 0) and square it.
278 mov r14,r11
279 mov rbx,r12
280 xor rbp,rbp
281
282 lea rdi,[((48+64))+rdi]
283
284 mov rax,r12
285 call __poly1305_block
286
; limb0 of key^2 (eax) and of key (edx)
287 mov eax,0x3ffffff
288 mov edx,0x3ffffff
289 mov r8,r14
290 and eax,r14d
291 mov r9,r11
292 and edx,r11d
293 mov DWORD[((-64))+rdi],eax
294 shr r8,26
295 mov DWORD[((-60))+rdi],edx
296 shr r9,26
297
; limb1 and 5*limb1 (lea computes x*4+x)
298 mov eax,0x3ffffff
299 mov edx,0x3ffffff
300 and eax,r8d
301 and edx,r9d
302 mov DWORD[((-48))+rdi],eax
303 lea eax,[rax*4+rax]
304 mov DWORD[((-44))+rdi],edx
305 lea edx,[rdx*4+rdx]
306 mov DWORD[((-32))+rdi],eax
307 shr r8,26
308 mov DWORD[((-28))+rdi],edx
309 shr r9,26
310
; limb2 straddles the two qwords: low 12 bits from word 0, rest from word 1
311 mov rax,rbx
312 mov rdx,r12
313 shl rax,12
314 shl rdx,12
315 or rax,r8
316 or rdx,r9
317 and eax,0x3ffffff
318 and edx,0x3ffffff
319 mov DWORD[((-16))+rdi],eax
320 lea eax,[rax*4+rax]
321 mov DWORD[((-12))+rdi],edx
322 lea edx,[rdx*4+rdx]
323 mov DWORD[rdi],eax
324 mov r8,rbx
325 mov DWORD[4+rdi],edx
326 mov r9,r12
327
; limb3 = bits 14..39 of word 1
328 mov eax,0x3ffffff
329 mov edx,0x3ffffff
330 shr r8,14
331 shr r9,14
332 and eax,r8d
333 and edx,r9d
334 mov DWORD[16+rdi],eax
335 lea eax,[rax*4+rax]
336 mov DWORD[20+rdi],edx
337 lea edx,[rdx*4+rdx]
338 mov DWORD[32+rdi],eax
339 shr r8,26
340 mov DWORD[36+rdi],edx
341 shr r9,26
342
; limb4: remaining bits of word 1; key^2 also takes its top word (rbp << 24)
343 mov rax,rbp
344 shl rax,24
345 or r8,rax
346 mov DWORD[48+rdi],r8d
347 lea r8,[r8*4+r8]
348 mov DWORD[52+rdi],r9d
349 lea r9,[r9*4+r9]
350 mov DWORD[64+rdi],r8d
351 mov DWORD[68+rdi],r9d
352
; key^2 * key = key^3, stored in the +12 column of every row
353 mov rax,r12
354 call __poly1305_block
355
356 mov eax,0x3ffffff
357 mov r8,r14
358 and eax,r14d
359 shr r8,26
360 mov DWORD[((-52))+rdi],eax
361
362 mov edx,0x3ffffff
363 and edx,r8d
364 mov DWORD[((-36))+rdi],edx
365 lea edx,[rdx*4+rdx]
366 shr r8,26
367 mov DWORD[((-20))+rdi],edx
368
369 mov rax,rbx
370 shl rax,12
371 or rax,r8
372 and eax,0x3ffffff
373 mov DWORD[((-4))+rdi],eax
374 lea eax,[rax*4+rax]
375 mov r8,rbx
376 mov DWORD[12+rdi],eax
377
378 mov edx,0x3ffffff
379 shr r8,14
380 and edx,r8d
381 mov DWORD[28+rdi],edx
382 lea edx,[rdx*4+rdx]
383 shr r8,26
384 mov DWORD[44+rdi],edx
385
386 mov rax,rbp
387 shl rax,24
388 or r8,rax
389 mov DWORD[60+rdi],r8d
390 lea r8,[r8*4+r8]
391 mov DWORD[76+rdi],r8d
392
; key^3 * key = key^4, stored in the +8 column of every row
393 mov rax,r12
394 call __poly1305_block
395
396 mov eax,0x3ffffff
397 mov r8,r14
398 and eax,r14d
399 shr r8,26
400 mov DWORD[((-56))+rdi],eax
401
402 mov edx,0x3ffffff
403 and edx,r8d
404 mov DWORD[((-40))+rdi],edx
405 lea edx,[rdx*4+rdx]
406 shr r8,26
407 mov DWORD[((-24))+rdi],edx
408
409 mov rax,rbx
410 shl rax,12
411 or rax,r8
412 and eax,0x3ffffff
413 mov DWORD[((-8))+rdi],eax
414 lea eax,[rax*4+rax]
415 mov r8,rbx
416 mov DWORD[8+rdi],eax
417
418 mov edx,0x3ffffff
419 shr r8,14
420 and edx,r8d
421 mov DWORD[24+rdi],edx
422 lea edx,[rdx*4+rdx]
423 shr r8,26
424 mov DWORD[40+rdi],edx
425
426 mov rax,rbp
427 shl rax,24
428 or r8,rax
429 mov DWORD[56+rdi],r8d
430 lea r8,[r8*4+r8]
431 mov DWORD[72+rdi],r8d
432
; undo the +112 bias so the caller sees its context pointer again
433 lea rdi,[((-48-64))+rdi]
434 DB 0F3h,0C3h ;repret
435
436
437
438
439ALIGN 32
;-----------------------------------------------------------------------
; poly1305_blocks_avx -- AVX block function (Microsoft x64 entry).
; In:  rcx = context, rdx = input, r8 = length in bytes,
;      r9  = value added to the top accumulator word of every block.
; The dword at context+20 is a format flag: non-zero means the
; accumulator at context+0 is held as five 26-bit limbs (dwords);
; zero means the three-qword form used by poly1305_blocks.
; This first part dispatches on length/format and, when the
; accumulator is already in 26-bit form but the length is an odd
; number of blocks, absorbs one block with __poly1305_block.
;-----------------------------------------------------------------------
440poly1305_blocks_avx:
441 mov QWORD[8+rsp],rdi ;WIN64 prologue
442 mov QWORD[16+rsp],rsi
443 mov rax,rsp
444$L$SEH_begin_poly1305_blocks_avx:
445 mov rdi,rcx
446 mov rsi,rdx
447 mov rdx,r8
448 mov rcx,r9
449
450
451
; Fewer than 128 bytes and flag still zero: use the scalar $L$blocks.
452 mov r8d,DWORD[20+rdi]
453 cmp rdx,128
454 jae NEAR $L$blocks_avx
455 test r8d,r8d
456 jz NEAR $L$blocks
457
458$L$blocks_avx:
; Drop a trailing partial block; return at once if nothing is left.
459 and rdx,-16
460 jz NEAR $L$no_data_avx
461
462 vzeroupper
463
; Flag zero: accumulator is still three qwords.
464 test r8d,r8d
465 jz NEAR $L$base2_64_avx
466
; Length already a multiple of 32: go straight to the vector code.
467 test rdx,31
468 jz NEAR $L$even_avx
469
470 push rbx
471
472 push rbp
473
474 push r12
475
476 push r13
477
478 push r14
479
480 push r15
481
482$L$blocks_avx_body:
483
484 mov r15,rdx
485
; r8 = limb0 | limb1<<32, r9 = limb2 | limb3<<32, ebp = limb4
486 mov r8,QWORD[rdi]
487 mov r9,QWORD[8+rdi]
488 mov ebp,DWORD[16+rdi]
489
490 mov r11,QWORD[24+rdi]
491 mov r13,QWORD[32+rdi]
492
493
; Recombine the five limbs into r14:rbx:rbp.  Each limb is moved to its
; bit position (26, 52, 78, 104) and added with carry propagation.
; NOTE(review): the -2^31 masks keep bit 31 of the low dword as well,
; which presumes the stored limbs stay below 2^31 -- confirm.
494 mov r14d,r8d
495 and r8,-2147483648
496 mov r12,r9
497 mov ebx,r9d
498 and r9,-2147483648
499
500 shr r8,6
501 shl r12,52
502 add r14,r8
503 shr rbx,12
504 shr r9,18
505 add r14,r12
506 adc rbx,r9
507
508 mov r8,rbp
509 shl r8,40
510 shr rbp,24
511 add rbx,r8
512 adc rbp,0
513
; Fold the top word: keep its low two bits, add (top & ~3) + (top >> 2)
; back into the low words.
514 mov r9,-4
515 mov r8,rbp
516 and r9,rbp
517 shr r8,2
518 and rbp,3
519 add r8,r9
520 add r14,r8
521 adc rbx,0
522 adc rbp,0
523
; r12 = rax = second key qword, r13 = it plus itself >> 2
524 mov r12,r13
525 mov rax,r13
526 shr r13,2
527 add r13,r12
528
; Absorb one 16-byte block with the scalar multiply.
529 add r14,QWORD[rsi]
530 adc rbx,QWORD[8+rsi]
531 lea rsi,[16+rsi]
532 adc rbp,rcx
533
534 call __poly1305_block
535
; rcx == 0: leave the result in three-qword form.
536 test rcx,rcx
537 jz NEAR $L$store_base2_64_avx
538
539
; Split r14:rbx:rbp back into 26-bit limbs:
; rax = limb0, rdx = limb1, r14 = limb2, rbx = limb3, rbp = limb4
540 mov rax,r14
541 mov rdx,r14
542 shr r14,52
543 mov r11,rbx
544 mov r12,rbx
545 shr rdx,26
546 and rax,0x3ffffff
547 shl r11,12
548 and rdx,0x3ffffff
549 shr rbx,14
550 or r14,r11
551 shl rbp,24
552 and r14,0x3ffffff
553 shr r12,40
554 and rbx,0x3ffffff
555 or rbp,r12
556
; That block was the only one: store the limbs and return.
557 sub r15,16
558 jz NEAR $L$store_base2_26_avx
559
; Otherwise hand the limbs to the vector code in xmm0..xmm4.
560 vmovd xmm0,eax
561 vmovd xmm1,edx
562 vmovd xmm2,r14d
563 vmovd xmm3,ebx
564 vmovd xmm4,ebp
565 jmp NEAR $L$proceed_avx
566
567ALIGN 32
568$L$store_base2_64_avx:
; The 8-byte store at +16 also rewrites the flag dword at +20 with the
; upper half of rbp; rbp is at most 4 after __poly1305_block, so the
; flag becomes zero.
569 mov QWORD[rdi],r14
570 mov QWORD[8+rdi],rbx
571 mov QWORD[16+rdi],rbp
572 jmp NEAR $L$done_avx
573
574ALIGN 16
575$L$store_base2_26_avx:
576 mov DWORD[rdi],eax
577 mov DWORD[4+rdi],edx
578 mov DWORD[8+rdi],r14d
579 mov DWORD[12+rdi],ebx
580 mov DWORD[16+rdi],ebp
581ALIGN 16
582$L$done_avx:
; Reload the six registers pushed above and release their 48 bytes.
583 mov r15,QWORD[rsp]
584
585 mov r14,QWORD[8+rsp]
586
587 mov r13,QWORD[16+rsp]
588
589 mov r12,QWORD[24+rsp]
590
591 mov rbp,QWORD[32+rsp]
592
593 mov rbx,QWORD[40+rsp]
594
595 lea rsp,[48+rsp]
596
597$L$no_data_avx:
598$L$blocks_avx_epilogue:
599 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
600 mov rsi,QWORD[16+rsp]
601 DB 0F3h,0C3h ;repret
602
603
604ALIGN 32
; $L$base2_64_avx -- part of poly1305_blocks_avx, entered when the
; format flag at context+20 is zero (accumulator held as three qwords).
; Absorbs one block with scalar code if the length is not a multiple of
; 32, converts the accumulator to five 26-bit limbs in xmm0..xmm4, sets
; the flag, builds the key-power table and continues at $L$do_avx.
605$L$base2_64_avx:
606
607 push rbx
608
609 push rbp
610
611 push r12
612
613 push r13
614
615 push r14
616
617 push r15
618
619$L$base2_64_avx_body:
620
621 mov r15,rdx
622
623 mov r11,QWORD[24+rdi]
624 mov r13,QWORD[32+rdi]
625
626 mov r14,QWORD[rdi]
627 mov rbx,QWORD[8+rdi]
628 mov ebp,DWORD[16+rdi]
629
; r12 = rax = second key qword, r13 = it plus itself >> 2
630 mov r12,r13
631 mov rax,r13
632 shr r13,2
633 add r13,r12
634
635 test rdx,31
636 jz NEAR $L$init_avx
637
; Odd block count: take one block through the scalar multiply.
638 add r14,QWORD[rsi]
639 adc rbx,QWORD[8+rsi]
640 lea rsi,[16+rsi]
641 adc rbp,rcx
642 sub r15,16
643
644 call __poly1305_block
645
646$L$init_avx:
647
; Split r14:rbx:rbp into 26-bit limbs:
; rax = limb0, rdx = limb1, r14 = limb2, rbx = limb3, rbp = limb4
648 mov rax,r14
649 mov rdx,r14
650 shr r14,52
651 mov r8,rbx
652 mov r9,rbx
653 shr rdx,26
654 and rax,0x3ffffff
655 shl r8,12
656 and rdx,0x3ffffff
657 shr rbx,14
658 or r14,r8
659 shl rbp,24
660 and r14,0x3ffffff
661 shr r9,40
662 and rbx,0x3ffffff
663 or rbp,r9
664
665 vmovd xmm0,eax
666 vmovd xmm1,edx
667 vmovd xmm2,r14d
668 vmovd xmm3,ebx
669 vmovd xmm4,ebp
; From now on the context accumulator is in 26-bit limb form.
670 mov DWORD[20+rdi],1
671
; r11/r12/r13 still hold the key values __poly1305_init_avx expects.
672 call __poly1305_init_avx
673
674$L$proceed_avx:
; rdx = bytes still to process; then undo the six pushes.
675 mov rdx,r15
676
677 mov r15,QWORD[rsp]
678
679 mov r14,QWORD[8+rsp]
680
681 mov r13,QWORD[16+rsp]
682
683 mov r12,QWORD[24+rsp]
684
685 mov rbp,QWORD[32+rsp]
686
687 mov rbx,QWORD[40+rsp]
688
689 lea rax,[48+rsp]
690 lea rsp,[48+rsp]
691
692$L$base2_64_avx_epilogue:
693 jmp NEAR $L$do_avx
694
695
696ALIGN 32
; $L$even_avx -- part of poly1305_blocks_avx: the accumulator is already
; in 26-bit limb form and the length is a multiple of 32, so the five
; limbs are loaded straight from the context.
697$L$even_avx:
698
699 vmovd xmm0,DWORD[rdi]
700 vmovd xmm1,DWORD[4+rdi]
701 vmovd xmm2,DWORD[8+rdi]
702 vmovd xmm3,DWORD[12+rdi]
703 vmovd xmm4,DWORD[16+rdi]
704
; $L$do_avx -- common vector entry.  xmm0..xmm4 = accumulator limbs,
; rdx = remaining length, rsi = input, rdi = context.
705$L$do_avx:
; Reserve 0x218 bytes.  r11 = entry rsp - 248; xmm6..xmm15 are saved at
; r11+80 .. r11+239 and restored from there in the epilogue.
706 lea r11,[((-248))+rsp]
707 sub rsp,0x218
708 vmovdqa XMMWORD[80+r11],xmm6
709 vmovdqa XMMWORD[96+r11],xmm7
710 vmovdqa XMMWORD[112+r11],xmm8
711 vmovdqa XMMWORD[128+r11],xmm9
712 vmovdqa XMMWORD[144+r11],xmm10
713 vmovdqa XMMWORD[160+r11],xmm11
714 vmovdqa XMMWORD[176+r11],xmm12
715 vmovdqa XMMWORD[192+r11],xmm13
716 vmovdqa XMMWORD[208+r11],xmm14
717 vmovdqa XMMWORD[224+r11],xmm15
718$L$do_avx_body:
; The flags of this subtraction are consumed by cmovc just below and by
; the jbe further down; nothing in between writes flags.
; If fewer than 64 bytes remain, rsi is moved back by 32 so that the
; loads at rsi+32 / rsi+48 pick up the first two blocks.
719 sub rdx,64
720 lea rax,[((-32))+rsi]
721 cmovc rsi,rax
722
; xmm14 = first table row (limb0 of the key powers); rdi is biased by
; +112 for the rest of the function; rcx = constant table.
723 vmovdqu xmm14,XMMWORD[48+rdi]
724 lea rdi,[112+rdi]
725 lea rcx,[$L$const]
726
727
728
; Load two 16-byte blocks and split each into five 26-bit limbs,
; one block per 64-bit lane: xmm5..xmm9 = limb0..limb4.
; xmm15 is used as the limb mask and the OR with [32+rcx] sets the
; extra top bit (constants are outside this excerpt -- TODO confirm).
729 vmovdqu xmm5,XMMWORD[32+rsi]
730 vmovdqu xmm6,XMMWORD[48+rsi]
731 vmovdqa xmm15,XMMWORD[64+rcx]
732
733 vpsrldq xmm7,xmm5,6
734 vpsrldq xmm8,xmm6,6
735 vpunpckhqdq xmm9,xmm5,xmm6
736 vpunpcklqdq xmm5,xmm5,xmm6
737 vpunpcklqdq xmm8,xmm7,xmm8
738
739 vpsrlq xmm9,xmm9,40
740 vpsrlq xmm6,xmm5,26
741 vpand xmm5,xmm5,xmm15
742 vpsrlq xmm7,xmm8,4
743 vpand xmm6,xmm6,xmm15
744 vpsrlq xmm8,xmm8,30
745 vpand xmm7,xmm7,xmm15
746 vpand xmm8,xmm8,xmm15
747 vpor xmm9,xmm9,XMMWORD[32+rcx]
748
; 64 bytes or fewer in total: skip the main loop.
749 jbe NEAR $L$skip_loop_avx
750
751
; Expand the nine table rows for the loop.  vpshufd 0xEE copies dwords
; 2,3 into both halves (the +8 column, key^4) and is stored below r11;
; vpshufd 0x44 copies dwords 0,1 (the +0 column, key^2) and is stored
; at rsp.  vpmuludq reads only the even dwords, so each lane multiplies
; by the same power.
752 vmovdqu xmm11,XMMWORD[((-48))+rdi]
753 vmovdqu xmm12,XMMWORD[((-32))+rdi]
754 vpshufd xmm13,xmm14,0xEE
755 vpshufd xmm10,xmm14,0x44
756 vmovdqa XMMWORD[(-144)+r11],xmm13
757 vmovdqa XMMWORD[rsp],xmm10
758 vpshufd xmm14,xmm11,0xEE
759 vmovdqu xmm10,XMMWORD[((-16))+rdi]
760 vpshufd xmm11,xmm11,0x44
761 vmovdqa XMMWORD[(-128)+r11],xmm14
762 vmovdqa XMMWORD[16+rsp],xmm11
763 vpshufd xmm13,xmm12,0xEE
764 vmovdqu xmm11,XMMWORD[rdi]
765 vpshufd xmm12,xmm12,0x44
766 vmovdqa XMMWORD[(-112)+r11],xmm13
767 vmovdqa XMMWORD[32+rsp],xmm12
768 vpshufd xmm14,xmm10,0xEE
769 vmovdqu xmm12,XMMWORD[16+rdi]
770 vpshufd xmm10,xmm10,0x44
771 vmovdqa XMMWORD[(-96)+r11],xmm14
772 vmovdqa XMMWORD[48+rsp],xmm10
773 vpshufd xmm13,xmm11,0xEE
774 vmovdqu xmm10,XMMWORD[32+rdi]
775 vpshufd xmm11,xmm11,0x44
776 vmovdqa XMMWORD[(-80)+r11],xmm13
777 vmovdqa XMMWORD[64+rsp],xmm11
778 vpshufd xmm14,xmm12,0xEE
779 vmovdqu xmm11,XMMWORD[48+rdi]
780 vpshufd xmm12,xmm12,0x44
781 vmovdqa XMMWORD[(-64)+r11],xmm14
782 vmovdqa XMMWORD[80+rsp],xmm12
783 vpshufd xmm13,xmm10,0xEE
784 vmovdqu xmm12,XMMWORD[64+rdi]
785 vpshufd xmm10,xmm10,0x44
786 vmovdqa XMMWORD[(-48)+r11],xmm13
787 vmovdqa XMMWORD[96+rsp],xmm10
788 vpshufd xmm14,xmm11,0xEE
789 vpshufd xmm11,xmm11,0x44
790 vmovdqa XMMWORD[(-32)+r11],xmm14
791 vmovdqa XMMWORD[112+rsp],xmm11
792 vpshufd xmm13,xmm12,0xEE
; xmm14 = limb0 of key^2, the first multiplier used in the loop
793 vmovdqa xmm14,XMMWORD[rsp]
794 vpshufd xmm12,xmm12,0x44
795 vmovdqa XMMWORD[(-16)+r11],xmm13
796 vmovdqa XMMWORD[128+rsp],xmm12
797
798 jmp NEAR $L$oop_avx
799
800ALIGN 32
; $L$oop_avx -- main loop of poly1305_blocks_avx, 64 input bytes per pass.
; On entry: xmm0..xmm4  = accumulator limbs,
;           xmm5..xmm9  = limbs of the two blocks read from rsi+32/rsi+48,
;           xmm14       = limb0 of key^2 (reloaded from [rsp] each pass),
;           xmm15       = limb mask.
; xmm10..xmm14 collect the five product columns.  The pass first
; multiplies xmm5..xmm9 by the key^2 rows at rsp, then loads the blocks
; at rsi/rsi+16, adds the saved accumulator to them and multiplies by
; the key^4 rows below r11, loads the next pair of blocks, and finally
; carries the columns back into 26-bit limbs in xmm0..xmm4.
801$L$oop_avx:
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
; --- (blocks in xmm5..xmm9) * key^2; accumulator parked at r11+0..+64 ---
822 vpmuludq xmm10,xmm14,xmm5
823 vpmuludq xmm11,xmm14,xmm6
824 vmovdqa XMMWORD[32+r11],xmm2
825 vpmuludq xmm12,xmm14,xmm7
826 vmovdqa xmm2,XMMWORD[16+rsp]
827 vpmuludq xmm13,xmm14,xmm8
828 vpmuludq xmm14,xmm14,xmm9
829
830 vmovdqa XMMWORD[r11],xmm0
831 vpmuludq xmm0,xmm9,XMMWORD[32+rsp]
832 vmovdqa XMMWORD[16+r11],xmm1
833 vpmuludq xmm1,xmm2,xmm8
834 vpaddq xmm10,xmm10,xmm0
835 vpaddq xmm14,xmm14,xmm1
836 vmovdqa XMMWORD[48+r11],xmm3
837 vpmuludq xmm0,xmm2,xmm7
838 vpmuludq xmm1,xmm2,xmm6
839 vpaddq xmm13,xmm13,xmm0
840 vmovdqa xmm3,XMMWORD[48+rsp]
841 vpaddq xmm12,xmm12,xmm1
842 vmovdqa XMMWORD[64+r11],xmm4
843 vpmuludq xmm2,xmm2,xmm5
844 vpmuludq xmm0,xmm3,xmm7
845 vpaddq xmm11,xmm11,xmm2
846
847 vmovdqa xmm4,XMMWORD[64+rsp]
848 vpaddq xmm14,xmm14,xmm0
849 vpmuludq xmm1,xmm3,xmm6
850 vpmuludq xmm3,xmm3,xmm5
851 vpaddq xmm13,xmm13,xmm1
852 vmovdqa xmm2,XMMWORD[80+rsp]
853 vpaddq xmm12,xmm12,xmm3
854 vpmuludq xmm0,xmm4,xmm9
855 vpmuludq xmm4,xmm4,xmm8
856 vpaddq xmm11,xmm11,xmm0
857 vmovdqa xmm3,XMMWORD[96+rsp]
858 vpaddq xmm10,xmm10,xmm4
859
860 vmovdqa xmm4,XMMWORD[128+rsp]
861 vpmuludq xmm1,xmm2,xmm6
862 vpmuludq xmm2,xmm2,xmm5
863 vpaddq xmm14,xmm14,xmm1
864 vpaddq xmm13,xmm13,xmm2
865 vpmuludq xmm0,xmm3,xmm9
866 vpmuludq xmm1,xmm3,xmm8
867 vpaddq xmm12,xmm12,xmm0
; first of the two blocks at rsi (loads are interleaved with the products)
868 vmovdqu xmm0,XMMWORD[rsi]
869 vpaddq xmm11,xmm11,xmm1
870 vpmuludq xmm3,xmm3,xmm7
871 vpmuludq xmm7,xmm4,xmm7
872 vpaddq xmm10,xmm10,xmm3
873
874 vmovdqu xmm1,XMMWORD[16+rsi]
875 vpaddq xmm11,xmm11,xmm7
876 vpmuludq xmm8,xmm4,xmm8
877 vpmuludq xmm9,xmm4,xmm9
878 vpsrldq xmm2,xmm0,6
879 vpaddq xmm12,xmm12,xmm8
880 vpaddq xmm13,xmm13,xmm9
881 vpsrldq xmm3,xmm1,6
882 vpmuludq xmm9,xmm5,XMMWORD[112+rsp]
883 vpmuludq xmm5,xmm4,xmm6
884 vpunpckhqdq xmm4,xmm0,xmm1
885 vpaddq xmm14,xmm14,xmm9
; xmm9 = limb0 of key^4 for the second set of products
886 vmovdqa xmm9,XMMWORD[((-144))+r11]
887 vpaddq xmm10,xmm10,xmm5
888
889 vpunpcklqdq xmm0,xmm0,xmm1
890 vpunpcklqdq xmm3,xmm2,xmm3
891
892
; split the blocks from rsi/rsi+16 into limbs xmm0..xmm4
; NOTE(review): here limb4 is formed with a 5-byte shift plus a mask
; loaded from [rcx]; that constant is outside this excerpt.
893 vpsrldq xmm4,xmm4,5
894 vpsrlq xmm1,xmm0,26
895 vpand xmm0,xmm0,xmm15
896 vpsrlq xmm2,xmm3,4
897 vpand xmm1,xmm1,xmm15
898 vpand xmm4,xmm4,XMMWORD[rcx]
899 vpsrlq xmm3,xmm3,30
900 vpand xmm2,xmm2,xmm15
901 vpand xmm3,xmm3,xmm15
902 vpor xmm4,xmm4,XMMWORD[32+rcx]
903
; add the accumulator limbs parked at r11+0..+64
904 vpaddq xmm0,xmm0,XMMWORD[r11]
905 vpaddq xmm1,xmm1,XMMWORD[16+r11]
906 vpaddq xmm2,xmm2,XMMWORD[32+r11]
907 vpaddq xmm3,xmm3,XMMWORD[48+r11]
908 vpaddq xmm4,xmm4,XMMWORD[64+r11]
909
; Advance by 64 bytes; if the subtraction borrows, advance by only 32.
; These flags survive to the "ja" at the bottom of the loop because no
; instruction in between writes flags.
910 lea rax,[32+rsi]
911 lea rsi,[64+rsi]
912 sub rdx,64
913 cmovc rsi,rax
914
915
916
917
918
919
920
921
922
923
; --- (accumulator + blocks in xmm0..xmm4) * key^4, added to the columns ---
924 vpmuludq xmm5,xmm9,xmm0
925 vpmuludq xmm6,xmm9,xmm1
926 vpaddq xmm10,xmm10,xmm5
927 vpaddq xmm11,xmm11,xmm6
928 vmovdqa xmm7,XMMWORD[((-128))+r11]
929 vpmuludq xmm5,xmm9,xmm2
930 vpmuludq xmm6,xmm9,xmm3
931 vpaddq xmm12,xmm12,xmm5
932 vpaddq xmm13,xmm13,xmm6
933 vpmuludq xmm9,xmm9,xmm4
934 vpmuludq xmm5,xmm4,XMMWORD[((-112))+r11]
935 vpaddq xmm14,xmm14,xmm9
936
937 vpaddq xmm10,xmm10,xmm5
938 vpmuludq xmm6,xmm7,xmm2
939 vpmuludq xmm5,xmm7,xmm3
940 vpaddq xmm13,xmm13,xmm6
941 vmovdqa xmm8,XMMWORD[((-96))+r11]
942 vpaddq xmm14,xmm14,xmm5
943 vpmuludq xmm6,xmm7,xmm1
944 vpmuludq xmm7,xmm7,xmm0
945 vpaddq xmm12,xmm12,xmm6
946 vpaddq xmm11,xmm11,xmm7
947
948 vmovdqa xmm9,XMMWORD[((-80))+r11]
949 vpmuludq xmm5,xmm8,xmm2
950 vpmuludq xmm6,xmm8,xmm1
951 vpaddq xmm14,xmm14,xmm5
952 vpaddq xmm13,xmm13,xmm6
953 vmovdqa xmm7,XMMWORD[((-64))+r11]
954 vpmuludq xmm8,xmm8,xmm0
955 vpmuludq xmm5,xmm9,xmm4
956 vpaddq xmm12,xmm12,xmm8
957 vpaddq xmm11,xmm11,xmm5
958 vmovdqa xmm8,XMMWORD[((-48))+r11]
959 vpmuludq xmm9,xmm9,xmm3
960 vpmuludq xmm6,xmm7,xmm1
961 vpaddq xmm10,xmm10,xmm9
962
963 vmovdqa xmm9,XMMWORD[((-16))+r11]
964 vpaddq xmm14,xmm14,xmm6
965 vpmuludq xmm7,xmm7,xmm0
966 vpmuludq xmm5,xmm8,xmm4
967 vpaddq xmm13,xmm13,xmm7
968 vpaddq xmm12,xmm12,xmm5
; next pair of blocks for the following pass (or for the tail)
969 vmovdqu xmm5,XMMWORD[32+rsi]
970 vpmuludq xmm7,xmm8,xmm3
971 vpmuludq xmm8,xmm8,xmm2
972 vpaddq xmm11,xmm11,xmm7
973 vmovdqu xmm6,XMMWORD[48+rsi]
974 vpaddq xmm10,xmm10,xmm8
975
976 vpmuludq xmm2,xmm9,xmm2
977 vpmuludq xmm3,xmm9,xmm3
978 vpsrldq xmm7,xmm5,6
979 vpaddq xmm11,xmm11,xmm2
980 vpmuludq xmm4,xmm9,xmm4
981 vpsrldq xmm8,xmm6,6
; the finished columns move into xmm0..xmm4 from here on
982 vpaddq xmm2,xmm12,xmm3
983 vpaddq xmm3,xmm13,xmm4
984 vpmuludq xmm4,xmm0,XMMWORD[((-32))+r11]
985 vpmuludq xmm0,xmm9,xmm1
986 vpunpckhqdq xmm9,xmm5,xmm6
987 vpaddq xmm4,xmm14,xmm4
988 vpaddq xmm0,xmm10,xmm0
989
990 vpunpcklqdq xmm5,xmm5,xmm6
991 vpunpcklqdq xmm8,xmm7,xmm8
992
993
; split the newly loaded blocks into limbs xmm5..xmm9
994 vpsrldq xmm9,xmm9,5
995 vpsrlq xmm6,xmm5,26
996 vmovdqa xmm14,XMMWORD[rsp]
997 vpand xmm5,xmm5,xmm15
998 vpsrlq xmm7,xmm8,4
999 vpand xmm6,xmm6,xmm15
1000 vpand xmm9,xmm9,XMMWORD[rcx]
1001 vpsrlq xmm8,xmm8,30
1002 vpand xmm7,xmm7,xmm15
1003 vpand xmm8,xmm8,xmm15
1004 vpor xmm9,xmm9,XMMWORD[32+rcx]
1005
1006
1007
1008
1009
; --- carry propagation: bring every column back to 26 bits.  The carry
; out of the top column is added to column 0 five times (x + 4x). ---
1010 vpsrlq xmm13,xmm3,26
1011 vpand xmm3,xmm3,xmm15
1012 vpaddq xmm4,xmm4,xmm13
1013
1014 vpsrlq xmm10,xmm0,26
1015 vpand xmm0,xmm0,xmm15
1016 vpaddq xmm1,xmm11,xmm10
1017
1018 vpsrlq xmm10,xmm4,26
1019 vpand xmm4,xmm4,xmm15
1020
1021 vpsrlq xmm11,xmm1,26
1022 vpand xmm1,xmm1,xmm15
1023 vpaddq xmm2,xmm2,xmm11
1024
1025 vpaddq xmm0,xmm0,xmm10
1026 vpsllq xmm10,xmm10,2
1027 vpaddq xmm0,xmm0,xmm10
1028
1029 vpsrlq xmm12,xmm2,26
1030 vpand xmm2,xmm2,xmm15
1031 vpaddq xmm3,xmm3,xmm12
1032
1033 vpsrlq xmm10,xmm0,26
1034 vpand xmm0,xmm0,xmm15
1035 vpaddq xmm1,xmm1,xmm10
1036
1037 vpsrlq xmm13,xmm3,26
1038 vpand xmm3,xmm3,xmm15
1039 vpaddq xmm4,xmm4,xmm13
1040
; loop while the "sub rdx,64" above left a positive remainder
1041 ja NEAR $L$oop_avx
1042
1043$L$skip_loop_avx:
; Tail of poly1305_blocks_avx.  xmm5..xmm9 hold the limbs of the last
; loaded pair of blocks, xmm0..xmm4 the accumulator limbs.
1044
1045
1046
; vpshufd 0x10 puts dword 0 in the low lane and dword 1 in the high
; lane, i.e. the key^2 and key entries of a table row.
1047 vpshufd xmm14,xmm14,0x10
; rdx + 32 == 0: only this pair is left, so the accumulator is added to
; it here.  The flags of this add are also read by the "jz" at the end
; of the first product group (no flag-writing instruction in between).
1048 add rdx,32
1049 jnz NEAR $L$ong_tail_avx
1050
1051 vpaddq xmm7,xmm7,xmm2
1052 vpaddq xmm5,xmm5,xmm0
1053 vpaddq xmm6,xmm6,xmm1
1054 vpaddq xmm8,xmm8,xmm3
1055 vpaddq xmm9,xmm9,xmm4
1056
1057$L$ong_tail_avx:
; park the accumulator; it is added to the other pair below if needed
1058 vmovdqa XMMWORD[32+r11],xmm2
1059 vmovdqa XMMWORD[r11],xmm0
1060 vmovdqa XMMWORD[16+r11],xmm1
1061 vmovdqa XMMWORD[48+r11],xmm3
1062 vmovdqa XMMWORD[64+r11],xmm4
1063
1064
1065
1066
1067
1068
1069
; --- xmm5..xmm9 times (key^2 : key); columns collect in xmm10..xmm14 ---
1070 vpmuludq xmm12,xmm14,xmm7
1071 vpmuludq xmm10,xmm14,xmm5
1072 vpshufd xmm2,XMMWORD[((-48))+rdi],0x10
1073 vpmuludq xmm11,xmm14,xmm6
1074 vpmuludq xmm13,xmm14,xmm8
1075 vpmuludq xmm14,xmm14,xmm9
1076
1077 vpmuludq xmm0,xmm2,xmm8
1078 vpaddq xmm14,xmm14,xmm0
1079 vpshufd xmm3,XMMWORD[((-32))+rdi],0x10
1080 vpmuludq xmm1,xmm2,xmm7
1081 vpaddq xmm13,xmm13,xmm1
1082 vpshufd xmm4,XMMWORD[((-16))+rdi],0x10
1083 vpmuludq xmm0,xmm2,xmm6
1084 vpaddq xmm12,xmm12,xmm0
1085 vpmuludq xmm2,xmm2,xmm5
1086 vpaddq xmm11,xmm11,xmm2
1087 vpmuludq xmm3,xmm3,xmm9
1088 vpaddq xmm10,xmm10,xmm3
1089
1090 vpshufd xmm2,XMMWORD[rdi],0x10
1091 vpmuludq xmm1,xmm4,xmm7
1092 vpaddq xmm14,xmm14,xmm1
1093 vpmuludq xmm0,xmm4,xmm6
1094 vpaddq xmm13,xmm13,xmm0
1095 vpshufd xmm3,XMMWORD[16+rdi],0x10
1096 vpmuludq xmm4,xmm4,xmm5
1097 vpaddq xmm12,xmm12,xmm4
1098 vpmuludq xmm1,xmm2,xmm9
1099 vpaddq xmm11,xmm11,xmm1
1100 vpshufd xmm4,XMMWORD[32+rdi],0x10
1101 vpmuludq xmm2,xmm2,xmm8
1102 vpaddq xmm10,xmm10,xmm2
1103
1104 vpmuludq xmm0,xmm3,xmm6
1105 vpaddq xmm14,xmm14,xmm0
1106 vpmuludq xmm3,xmm3,xmm5
1107 vpaddq xmm13,xmm13,xmm3
1108 vpshufd xmm2,XMMWORD[48+rdi],0x10
1109 vpmuludq xmm1,xmm4,xmm9
1110 vpaddq xmm12,xmm12,xmm1
1111 vpshufd xmm3,XMMWORD[64+rdi],0x10
1112 vpmuludq xmm0,xmm4,xmm8
1113 vpaddq xmm11,xmm11,xmm0
1114 vpmuludq xmm4,xmm4,xmm7
1115 vpaddq xmm10,xmm10,xmm4
1116
1117 vpmuludq xmm2,xmm2,xmm5
1118 vpaddq xmm14,xmm14,xmm2
1119 vpmuludq xmm1,xmm3,xmm9
1120 vpaddq xmm13,xmm13,xmm1
1121 vpmuludq xmm0,xmm3,xmm8
1122 vpaddq xmm12,xmm12,xmm0
1123 vpmuludq xmm1,xmm3,xmm7
1124 vpaddq xmm11,xmm11,xmm1
1125 vpmuludq xmm3,xmm3,xmm6
1126 vpaddq xmm10,xmm10,xmm3
1127
; ZF still comes from "add rdx,32": zero means no further pair.
1128 jz NEAR $L$short_tail_avx
1129
; Long tail: load the pair at rsi/rsi+16, split it into limbs.
1130 vmovdqu xmm0,XMMWORD[rsi]
1131 vmovdqu xmm1,XMMWORD[16+rsi]
1132
1133 vpsrldq xmm2,xmm0,6
1134 vpsrldq xmm3,xmm1,6
1135 vpunpckhqdq xmm4,xmm0,xmm1
1136 vpunpcklqdq xmm0,xmm0,xmm1
1137 vpunpcklqdq xmm3,xmm2,xmm3
1138
1139 vpsrlq xmm4,xmm4,40
1140 vpsrlq xmm1,xmm0,26
1141 vpand xmm0,xmm0,xmm15
1142 vpsrlq xmm2,xmm3,4
1143 vpand xmm1,xmm1,xmm15
1144 vpsrlq xmm3,xmm3,30
1145 vpand xmm2,xmm2,xmm15
1146 vpand xmm3,xmm3,xmm15
1147 vpor xmm4,xmm4,XMMWORD[32+rcx]
1148
; vpshufd 0x32 selects dwords 2 and 3 of a row: key^4 : key^3.
; The parked accumulator is added to this pair before multiplying.
1149 vpshufd xmm9,XMMWORD[((-64))+rdi],0x32
1150 vpaddq xmm0,xmm0,XMMWORD[r11]
1151 vpaddq xmm1,xmm1,XMMWORD[16+r11]
1152 vpaddq xmm2,xmm2,XMMWORD[32+r11]
1153 vpaddq xmm3,xmm3,XMMWORD[48+r11]
1154 vpaddq xmm4,xmm4,XMMWORD[64+r11]
1155
1156
1157
1158
; --- xmm0..xmm4 times (key^4 : key^3), added to the columns ---
1159 vpmuludq xmm5,xmm9,xmm0
1160 vpaddq xmm10,xmm10,xmm5
1161 vpmuludq xmm6,xmm9,xmm1
1162 vpaddq xmm11,xmm11,xmm6
1163 vpmuludq xmm5,xmm9,xmm2
1164 vpaddq xmm12,xmm12,xmm5
1165 vpshufd xmm7,XMMWORD[((-48))+rdi],0x32
1166 vpmuludq xmm6,xmm9,xmm3
1167 vpaddq xmm13,xmm13,xmm6
1168 vpmuludq xmm9,xmm9,xmm4
1169 vpaddq xmm14,xmm14,xmm9
1170
1171 vpmuludq xmm5,xmm7,xmm3
1172 vpaddq xmm14,xmm14,xmm5
1173 vpshufd xmm8,XMMWORD[((-32))+rdi],0x32
1174 vpmuludq xmm6,xmm7,xmm2
1175 vpaddq xmm13,xmm13,xmm6
1176 vpshufd xmm9,XMMWORD[((-16))+rdi],0x32
1177 vpmuludq xmm5,xmm7,xmm1
1178 vpaddq xmm12,xmm12,xmm5
1179 vpmuludq xmm7,xmm7,xmm0
1180 vpaddq xmm11,xmm11,xmm7
1181 vpmuludq xmm8,xmm8,xmm4
1182 vpaddq xmm10,xmm10,xmm8
1183
1184 vpshufd xmm7,XMMWORD[rdi],0x32
1185 vpmuludq xmm6,xmm9,xmm2
1186 vpaddq xmm14,xmm14,xmm6
1187 vpmuludq xmm5,xmm9,xmm1
1188 vpaddq xmm13,xmm13,xmm5
1189 vpshufd xmm8,XMMWORD[16+rdi],0x32
1190 vpmuludq xmm9,xmm9,xmm0
1191 vpaddq xmm12,xmm12,xmm9
1192 vpmuludq xmm6,xmm7,xmm4
1193 vpaddq xmm11,xmm11,xmm6
1194 vpshufd xmm9,XMMWORD[32+rdi],0x32
1195 vpmuludq xmm7,xmm7,xmm3
1196 vpaddq xmm10,xmm10,xmm7
1197
1198 vpmuludq xmm5,xmm8,xmm1
1199 vpaddq xmm14,xmm14,xmm5
1200 vpmuludq xmm8,xmm8,xmm0
1201 vpaddq xmm13,xmm13,xmm8
1202 vpshufd xmm7,XMMWORD[48+rdi],0x32
1203 vpmuludq xmm6,xmm9,xmm4
1204 vpaddq xmm12,xmm12,xmm6
1205 vpshufd xmm8,XMMWORD[64+rdi],0x32
1206 vpmuludq xmm5,xmm9,xmm3
1207 vpaddq xmm11,xmm11,xmm5
1208 vpmuludq xmm9,xmm9,xmm2
1209 vpaddq xmm10,xmm10,xmm9
1210
1211 vpmuludq xmm7,xmm7,xmm0
1212 vpaddq xmm14,xmm14,xmm7
1213 vpmuludq xmm6,xmm8,xmm4
1214 vpaddq xmm13,xmm13,xmm6
1215 vpmuludq xmm5,xmm8,xmm3
1216 vpaddq xmm12,xmm12,xmm5
1217 vpmuludq xmm6,xmm8,xmm2
1218 vpaddq xmm11,xmm11,xmm6
1219 vpmuludq xmm8,xmm8,xmm1
1220 vpaddq xmm10,xmm10,xmm8
1221
1222$L$short_tail_avx:
; Final part of poly1305_blocks_avx: combine the two lanes, carry the
; columns back to 26-bit limbs, store them and unwind the vector frame.
1223
1224
1225
; horizontal add: high 64-bit lane onto the low lane of each column
1226 vpsrldq xmm9,xmm14,8
1227 vpsrldq xmm8,xmm13,8
1228 vpsrldq xmm6,xmm11,8
1229 vpsrldq xmm5,xmm10,8
1230 vpsrldq xmm7,xmm12,8
1231 vpaddq xmm13,xmm13,xmm8
1232 vpaddq xmm14,xmm14,xmm9
1233 vpaddq xmm10,xmm10,xmm5
1234 vpaddq xmm11,xmm11,xmm6
1235 vpaddq xmm12,xmm12,xmm7
1236
1237
1238
1239
; carry propagation; the carry out of the top column (xmm14) is added
; to column 0 five times (x + 4x)
1240 vpsrlq xmm3,xmm13,26
1241 vpand xmm13,xmm13,xmm15
1242 vpaddq xmm14,xmm14,xmm3
1243
1244 vpsrlq xmm0,xmm10,26
1245 vpand xmm10,xmm10,xmm15
1246 vpaddq xmm11,xmm11,xmm0
1247
1248 vpsrlq xmm4,xmm14,26
1249 vpand xmm14,xmm14,xmm15
1250
1251 vpsrlq xmm1,xmm11,26
1252 vpand xmm11,xmm11,xmm15
1253 vpaddq xmm12,xmm12,xmm1
1254
1255 vpaddq xmm10,xmm10,xmm4
1256 vpsllq xmm4,xmm4,2
1257 vpaddq xmm10,xmm10,xmm4
1258
1259 vpsrlq xmm2,xmm12,26
1260 vpand xmm12,xmm12,xmm15
1261 vpaddq xmm13,xmm13,xmm2
1262
1263 vpsrlq xmm0,xmm10,26
1264 vpand xmm10,xmm10,xmm15
1265 vpaddq xmm11,xmm11,xmm0
1266
1267 vpsrlq xmm3,xmm13,26
1268 vpand xmm13,xmm13,xmm15
1269 vpaddq xmm14,xmm14,xmm3
1270
; rdi is still biased by +112, so these are context +0 .. +16:
; the five accumulator limbs.
1271 vmovd DWORD[(-112)+rdi],xmm10
1272 vmovd DWORD[(-108)+rdi],xmm11
1273 vmovd DWORD[(-104)+rdi],xmm12
1274 vmovd DWORD[(-100)+rdi],xmm13
1275 vmovd DWORD[(-96)+rdi],xmm14
; restore xmm6..xmm15 saved at $L$do_avx
1276 vmovdqa xmm6,XMMWORD[80+r11]
1277 vmovdqa xmm7,XMMWORD[96+r11]
1278 vmovdqa xmm8,XMMWORD[112+r11]
1279 vmovdqa xmm9,XMMWORD[128+r11]
1280 vmovdqa xmm10,XMMWORD[144+r11]
1281 vmovdqa xmm11,XMMWORD[160+r11]
1282 vmovdqa xmm12,XMMWORD[176+r11]
1283 vmovdqa xmm13,XMMWORD[192+r11]
1284 vmovdqa xmm14,XMMWORD[208+r11]
1285 vmovdqa xmm15,XMMWORD[224+r11]
; r11 was rsp-248 at $L$do_avx, so this puts rsp back to that value
1286 lea rsp,[248+r11]
1287$L$do_avx_epilogue:
1288 vzeroupper
1289 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
1290 mov rsi,QWORD[16+rsp]
1291 DB 0F3h,0C3h ;repret
1292
1293$L$SEH_end_poly1305_blocks_avx:
1294
1295
ALIGN 32
;-----------------------------------------------------------------------
; poly1305_emit_avx -- tag output for the vector implementation
;
; ABI:   Microsoft x64
; In:    rcx = context, rdx = 16-byte output buffer,
;        r8  = pointer to the 16-byte value added to the result
; If the format flag at context+20 is zero the accumulator is still in
; three-qword form and the work is handed to $L$emit in poly1305_emit.
; Otherwise the five dword limbs (26 bits apart) are recombined into
; r8:r9:r10 first and then finished exactly like poly1305_emit.
; Clobb: rax, rcx, rdx, r8, r9, r10, r11, flags
;-----------------------------------------------------------------------
poly1305_emit_avx:
        mov     qword [rsp+8], rdi              ; WIN64 prologue
        mov     qword [rsp+16], rsi
        mov     rax, rsp
$L$SEH_begin_poly1305_emit_avx:
        mov     rdi, rcx                        ; rdi = context
        mov     rsi, rdx                        ; rsi = output
        mov     rdx, r8                         ; rdx = addend

        cmp     dword [rdi+20], 0               ; flag clear -> scalar emit
        je      NEAR $L$emit

        mov     eax, dword [rdi]                ; limb0
        mov     ecx, dword [rdi+4]              ; limb1
        mov     r8d, dword [rdi+8]              ; limb2
        mov     r11d, dword [rdi+12]            ; limb3
        mov     r10d, dword [rdi+16]            ; limb4

        ; r8 = limb0 + limb1<<26 + limb2<<52 (low word), carry into r9
        shl     rcx, 26
        mov     r9, r8
        shl     r8, 52
        add     rax, rcx
        shr     r9, 12
        add     r8, rax
        adc     r9, 0

        ; r9 += limb3<<14 + limb4<<40, r10 = limb4>>24 plus carry
        shl     r11, 14
        mov     rax, r10
        shr     r10, 24
        add     r9, r11
        shl     rax, 40
        add     r9, rax
        adc     r10, 0

        ; Fold the top word: keep its low two bits, add
        ; (top & ~3) + (top >> 2) back into the low words.
        mov     rax, r10
        mov     rcx, r10
        and     r10, 3
        shr     rax, 2
        and     rcx, -4
        add     rax, rcx
        add     r8, rax
        adc     r9, 0
        adc     r10, 0

        ; Compute h + 5 next to h; if the sum reaches bit 130
        ; (r10 >> 2 != 0) the low 128 bits of h + 5 are the result.
        mov     rax, r8
        add     r8, 5
        mov     rcx, r9
        adc     r9, 0
        adc     r10, 0
        shr     r10, 2
        cmovnz  rax, r8
        cmovnz  rcx, r9

        ; Add the 128-bit value at rdx (carry out is dropped) and store.
        add     rax, qword [rdx]
        adc     rcx, qword [rdx+8]
        mov     qword [rsi], rax
        mov     qword [rsi+8], rcx

        mov     rdi, qword [rsp+8]              ; WIN64 epilogue
        mov     rsi, qword [rsp+16]
        db      0F3h, 0C3h                      ; repret

$L$SEH_end_poly1305_emit_avx:
1362
1363ALIGN 32
1364poly1305_blocks_avx2:
1365 mov QWORD[8+rsp],rdi ;WIN64 prologue
1366 mov QWORD[16+rsp],rsi
1367 mov rax,rsp
1368$L$SEH_begin_poly1305_blocks_avx2:
1369 mov rdi,rcx
1370 mov rsi,rdx
1371 mov rdx,r8
1372 mov rcx,r9
1373
1374
1375
1376 mov r8d,DWORD[20+rdi]
1377 cmp rdx,128
1378 jae NEAR $L$blocks_avx2
1379 test r8d,r8d
1380 jz NEAR $L$blocks
1381
1382$L$blocks_avx2:
1383 and rdx,-16
1384 jz NEAR $L$no_data_avx2
1385
1386 vzeroupper
1387
1388 test r8d,r8d
1389 jz NEAR $L$base2_64_avx2
1390
1391 test rdx,63
1392 jz NEAR $L$even_avx2
1393
1394 push rbx
1395
1396 push rbp
1397
1398 push r12
1399
1400 push r13
1401
1402 push r14
1403
1404 push r15
1405
1406$L$blocks_avx2_body:
1407
1408 mov r15,rdx
1409
1410 mov r8,QWORD[rdi]
1411 mov r9,QWORD[8+rdi]
1412 mov ebp,DWORD[16+rdi]
1413
1414 mov r11,QWORD[24+rdi]
1415 mov r13,QWORD[32+rdi]
1416
1417
1418 mov r14d,r8d
1419 and r8,-2147483648
1420 mov r12,r9
1421 mov ebx,r9d
1422 and r9,-2147483648
1423
1424 shr r8,6
1425 shl r12,52
1426 add r14,r8
1427 shr rbx,12
1428 shr r9,18
1429 add r14,r12
1430 adc rbx,r9
1431
1432 mov r8,rbp
1433 shl r8,40
1434 shr rbp,24
1435 add rbx,r8
1436 adc rbp,0
1437
1438 mov r9,-4
1439 mov r8,rbp
1440 and r9,rbp
1441 shr r8,2
1442 and rbp,3
1443 add r8,r9
1444 add r14,r8
1445 adc rbx,0
1446 adc rbp,0
1447
1448 mov r12,r13
1449 mov rax,r13
1450 shr r13,2
1451 add r13,r12
1452
1453$L$base2_26_pre_avx2:
1454 add r14,QWORD[rsi]
1455 adc rbx,QWORD[8+rsi]
1456 lea rsi,[16+rsi]
1457 adc rbp,rcx
1458 sub r15,16
1459
1460 call __poly1305_block
1461 mov rax,r12
1462
1463 test r15,63
1464 jnz NEAR $L$base2_26_pre_avx2
1465
1466 test rcx,rcx
1467 jz NEAR $L$store_base2_64_avx2
1468
1469
1470 mov rax,r14
1471 mov rdx,r14
1472 shr r14,52
1473 mov r11,rbx
1474 mov r12,rbx
1475 shr rdx,26
1476 and rax,0x3ffffff
1477 shl r11,12
1478 and rdx,0x3ffffff
1479 shr rbx,14
1480 or r14,r11
1481 shl rbp,24
1482 and r14,0x3ffffff
1483 shr r12,40
1484 and rbx,0x3ffffff
1485 or rbp,r12
1486
1487 test r15,r15
1488 jz NEAR $L$store_base2_26_avx2
1489
1490 vmovd xmm0,eax
1491 vmovd xmm1,edx
1492 vmovd xmm2,r14d
1493 vmovd xmm3,ebx
1494 vmovd xmm4,ebp
1495 jmp NEAR $L$proceed_avx2
1496
1497ALIGN 32
1498$L$store_base2_64_avx2:
1499 mov QWORD[rdi],r14
1500 mov QWORD[8+rdi],rbx
1501 mov QWORD[16+rdi],rbp
1502 jmp NEAR $L$done_avx2
1503
1504ALIGN 16
1505$L$store_base2_26_avx2:
1506 mov DWORD[rdi],eax
1507 mov DWORD[4+rdi],edx
1508 mov DWORD[8+rdi],r14d
1509 mov DWORD[12+rdi],ebx
1510 mov DWORD[16+rdi],ebp
1511ALIGN 16
1512$L$done_avx2:
1513 mov r15,QWORD[rsp]
1514
1515 mov r14,QWORD[8+rsp]
1516
1517 mov r13,QWORD[16+rsp]
1518
1519 mov r12,QWORD[24+rsp]
1520
1521 mov rbp,QWORD[32+rsp]
1522
1523 mov rbx,QWORD[40+rsp]
1524
1525 lea rsp,[48+rsp]
1526
1527$L$no_data_avx2:
1528$L$blocks_avx2_epilogue:
1529 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
1530 mov rsi,QWORD[16+rsp]
1531 DB 0F3h,0C3h ;repret
1532
1533
1534ALIGN 32
1535$L$base2_64_avx2:
1536
1537 push rbx
1538
1539 push rbp
1540
1541 push r12
1542
1543 push r13
1544
1545 push r14
1546
1547 push r15
1548
1549$L$base2_64_avx2_body:
1550
1551 mov r15,rdx
1552
1553 mov r11,QWORD[24+rdi]
1554 mov r13,QWORD[32+rdi]
1555
1556 mov r14,QWORD[rdi]
1557 mov rbx,QWORD[8+rdi]
1558 mov ebp,DWORD[16+rdi]
1559
1560 mov r12,r13
1561 mov rax,r13
1562 shr r13,2
1563 add r13,r12
1564
1565 test rdx,63
1566 jz NEAR $L$init_avx2
1567
1568$L$base2_64_pre_avx2:
1569 add r14,QWORD[rsi]
1570 adc rbx,QWORD[8+rsi]
1571 lea rsi,[16+rsi]
1572 adc rbp,rcx
1573 sub r15,16
1574
1575 call __poly1305_block
1576 mov rax,r12
1577
1578 test r15,63
1579 jnz NEAR $L$base2_64_pre_avx2
1580
1581$L$init_avx2:
1582
1583 mov rax,r14
1584 mov rdx,r14
1585 shr r14,52
1586 mov r8,rbx
1587 mov r9,rbx
1588 shr rdx,26
1589 and rax,0x3ffffff
1590 shl r8,12
1591 and rdx,0x3ffffff
1592 shr rbx,14
1593 or r14,r8
1594 shl rbp,24
1595 and r14,0x3ffffff
1596 shr r9,40
1597 and rbx,0x3ffffff
1598 or rbp,r9
1599
1600 vmovd xmm0,eax
1601 vmovd xmm1,edx
1602 vmovd xmm2,r14d
1603 vmovd xmm3,ebx
1604 vmovd xmm4,ebp
1605 mov DWORD[20+rdi],1
1606
1607 call __poly1305_init_avx
1608
1609$L$proceed_avx2:
1610 mov rdx,r15
1611 mov r10d,DWORD[((OPENSSL_ia32cap_P+8))]
1612 mov r11d,3221291008
1613
1614 mov r15,QWORD[rsp]
1615
1616 mov r14,QWORD[8+rsp]
1617
1618 mov r13,QWORD[16+rsp]
1619
1620 mov r12,QWORD[24+rsp]
1621
1622 mov rbp,QWORD[32+rsp]
1623
1624 mov rbx,QWORD[40+rsp]
1625
1626 lea rax,[48+rsp]
1627 lea rsp,[48+rsp]
1628
1629$L$base2_64_avx2_epilogue:
1630 jmp NEAR $L$do_avx2
1631
1632
1633ALIGN 32
1634$L$even_avx2:
1635
1636 mov r10d,DWORD[((OPENSSL_ia32cap_P+8))]
1637 vmovd xmm0,DWORD[rdi]
1638 vmovd xmm1,DWORD[4+rdi]
1639 vmovd xmm2,DWORD[8+rdi]
1640 vmovd xmm3,DWORD[12+rdi]
1641 vmovd xmm4,DWORD[16+rdi]
1642
1643$L$do_avx2:
1644 cmp rdx,512
1645 jb NEAR $L$skip_avx512
1646 and r10d,r11d
1647 test r10d,65536
1648 jnz NEAR $L$blocks_avx512
1649$L$skip_avx512:
1650 lea r11,[((-248))+rsp]
1651 sub rsp,0x1c8
1652 vmovdqa XMMWORD[80+r11],xmm6
1653 vmovdqa XMMWORD[96+r11],xmm7
1654 vmovdqa XMMWORD[112+r11],xmm8
1655 vmovdqa XMMWORD[128+r11],xmm9
1656 vmovdqa XMMWORD[144+r11],xmm10
1657 vmovdqa XMMWORD[160+r11],xmm11
1658 vmovdqa XMMWORD[176+r11],xmm12
1659 vmovdqa XMMWORD[192+r11],xmm13
1660 vmovdqa XMMWORD[208+r11],xmm14
1661 vmovdqa XMMWORD[224+r11],xmm15
1662$L$do_avx2_body:
1663 lea rcx,[$L$const]
1664 lea rdi,[((48+64))+rdi]
1665 vmovdqa ymm7,YMMWORD[96+rcx]
1666
1667
1668 vmovdqu xmm9,XMMWORD[((-64))+rdi]
1669 and rsp,-512
1670 vmovdqu xmm10,XMMWORD[((-48))+rdi]
1671 vmovdqu xmm6,XMMWORD[((-32))+rdi]
1672 vmovdqu xmm11,XMMWORD[((-16))+rdi]
1673 vmovdqu xmm12,XMMWORD[rdi]
1674 vmovdqu xmm13,XMMWORD[16+rdi]
1675 lea rax,[144+rsp]
1676 vmovdqu xmm14,XMMWORD[32+rdi]
1677 vpermd ymm9,ymm7,ymm9
1678 vmovdqu xmm15,XMMWORD[48+rdi]
1679 vpermd ymm10,ymm7,ymm10
1680 vmovdqu xmm5,XMMWORD[64+rdi]
1681 vpermd ymm6,ymm7,ymm6
1682 vmovdqa YMMWORD[rsp],ymm9
1683 vpermd ymm11,ymm7,ymm11
1684 vmovdqa YMMWORD[(32-144)+rax],ymm10
1685 vpermd ymm12,ymm7,ymm12
1686 vmovdqa YMMWORD[(64-144)+rax],ymm6
1687 vpermd ymm13,ymm7,ymm13
1688 vmovdqa YMMWORD[(96-144)+rax],ymm11
1689 vpermd ymm14,ymm7,ymm14
1690 vmovdqa YMMWORD[(128-144)+rax],ymm12
1691 vpermd ymm15,ymm7,ymm15
1692 vmovdqa YMMWORD[(160-144)+rax],ymm13
1693 vpermd ymm5,ymm7,ymm5
1694 vmovdqa YMMWORD[(192-144)+rax],ymm14
1695 vmovdqa YMMWORD[(224-144)+rax],ymm15
1696 vmovdqa YMMWORD[(256-144)+rax],ymm5
1697 vmovdqa ymm5,YMMWORD[64+rcx]
1698
1699
1700
1701 vmovdqu xmm7,XMMWORD[rsi]
1702 vmovdqu xmm8,XMMWORD[16+rsi]
1703 vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
1704 vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
1705 lea rsi,[64+rsi]
1706
1707 vpsrldq ymm9,ymm7,6
1708 vpsrldq ymm10,ymm8,6
1709 vpunpckhqdq ymm6,ymm7,ymm8
1710 vpunpcklqdq ymm9,ymm9,ymm10
1711 vpunpcklqdq ymm7,ymm7,ymm8
1712
1713 vpsrlq ymm10,ymm9,30
1714 vpsrlq ymm9,ymm9,4
1715 vpsrlq ymm8,ymm7,26
1716 vpsrlq ymm6,ymm6,40
1717 vpand ymm9,ymm9,ymm5
1718 vpand ymm7,ymm7,ymm5
1719 vpand ymm8,ymm8,ymm5
1720 vpand ymm10,ymm10,ymm5
1721 vpor ymm6,ymm6,YMMWORD[32+rcx]
1722
1723 vpaddq ymm2,ymm9,ymm2
1724 sub rdx,64
1725 jz NEAR $L$tail_avx2
1726 jmp NEAR $L$oop_avx2
1727
1728ALIGN 32
1729$L$oop_avx2:
1730
1731
1732
1733
1734
1735
1736
1737
1738 vpaddq ymm0,ymm7,ymm0
1739 vmovdqa ymm7,YMMWORD[rsp]
1740 vpaddq ymm1,ymm8,ymm1
1741 vmovdqa ymm8,YMMWORD[32+rsp]
1742 vpaddq ymm3,ymm10,ymm3
1743 vmovdqa ymm9,YMMWORD[96+rsp]
1744 vpaddq ymm4,ymm6,ymm4
1745 vmovdqa ymm10,YMMWORD[48+rax]
1746 vmovdqa ymm5,YMMWORD[112+rax]
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763 vpmuludq ymm13,ymm7,ymm2
1764 vpmuludq ymm14,ymm8,ymm2
1765 vpmuludq ymm15,ymm9,ymm2
1766 vpmuludq ymm11,ymm10,ymm2
1767 vpmuludq ymm12,ymm5,ymm2
1768
1769 vpmuludq ymm6,ymm8,ymm0
1770 vpmuludq ymm2,ymm8,ymm1
1771 vpaddq ymm12,ymm12,ymm6
1772 vpaddq ymm13,ymm13,ymm2
1773 vpmuludq ymm6,ymm8,ymm3
1774 vpmuludq ymm2,ymm4,YMMWORD[64+rsp]
1775 vpaddq ymm15,ymm15,ymm6
1776 vpaddq ymm11,ymm11,ymm2
1777 vmovdqa ymm8,YMMWORD[((-16))+rax]
1778
1779 vpmuludq ymm6,ymm7,ymm0
1780 vpmuludq ymm2,ymm7,ymm1
1781 vpaddq ymm11,ymm11,ymm6
1782 vpaddq ymm12,ymm12,ymm2
1783 vpmuludq ymm6,ymm7,ymm3
1784 vpmuludq ymm2,ymm7,ymm4
1785 vmovdqu xmm7,XMMWORD[rsi]
1786 vpaddq ymm14,ymm14,ymm6
1787 vpaddq ymm15,ymm15,ymm2
1788 vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
1789
1790 vpmuludq ymm6,ymm8,ymm3
1791 vpmuludq ymm2,ymm8,ymm4
1792 vmovdqu xmm8,XMMWORD[16+rsi]
1793 vpaddq ymm11,ymm11,ymm6
1794 vpaddq ymm12,ymm12,ymm2
1795 vmovdqa ymm2,YMMWORD[16+rax]
1796 vpmuludq ymm6,ymm9,ymm1
1797 vpmuludq ymm9,ymm9,ymm0
1798 vpaddq ymm14,ymm14,ymm6
1799 vpaddq ymm13,ymm13,ymm9
1800 vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
1801 lea rsi,[64+rsi]
1802
1803 vpmuludq ymm6,ymm2,ymm1
1804 vpmuludq ymm2,ymm2,ymm0
1805 vpsrldq ymm9,ymm7,6
1806 vpaddq ymm15,ymm15,ymm6
1807 vpaddq ymm14,ymm14,ymm2
1808 vpmuludq ymm6,ymm10,ymm3
1809 vpmuludq ymm2,ymm10,ymm4
1810 vpsrldq ymm10,ymm8,6
1811 vpaddq ymm12,ymm12,ymm6
1812 vpaddq ymm13,ymm13,ymm2
1813 vpunpckhqdq ymm6,ymm7,ymm8
1814
1815 vpmuludq ymm3,ymm5,ymm3
1816 vpmuludq ymm4,ymm5,ymm4
1817 vpunpcklqdq ymm7,ymm7,ymm8
1818 vpaddq ymm2,ymm13,ymm3
1819 vpaddq ymm3,ymm14,ymm4
1820 vpunpcklqdq ymm10,ymm9,ymm10
1821 vpmuludq ymm4,ymm0,YMMWORD[80+rax]
1822 vpmuludq ymm0,ymm5,ymm1
1823 vmovdqa ymm5,YMMWORD[64+rcx]
1824 vpaddq ymm4,ymm15,ymm4
1825 vpaddq ymm0,ymm11,ymm0
1826
1827
1828
1829
1830 vpsrlq ymm14,ymm3,26
1831 vpand ymm3,ymm3,ymm5
1832 vpaddq ymm4,ymm4,ymm14
1833
1834 vpsrlq ymm11,ymm0,26
1835 vpand ymm0,ymm0,ymm5
1836 vpaddq ymm1,ymm12,ymm11
1837
1838 vpsrlq ymm15,ymm4,26
1839 vpand ymm4,ymm4,ymm5
1840
1841 vpsrlq ymm9,ymm10,4
1842
1843 vpsrlq ymm12,ymm1,26
1844 vpand ymm1,ymm1,ymm5
1845 vpaddq ymm2,ymm2,ymm12
1846
1847 vpaddq ymm0,ymm0,ymm15
1848 vpsllq ymm15,ymm15,2
1849 vpaddq ymm0,ymm0,ymm15
1850
1851 vpand ymm9,ymm9,ymm5
1852 vpsrlq ymm8,ymm7,26
1853
1854 vpsrlq ymm13,ymm2,26
1855 vpand ymm2,ymm2,ymm5
1856 vpaddq ymm3,ymm3,ymm13
1857
1858 vpaddq ymm2,ymm2,ymm9
1859 vpsrlq ymm10,ymm10,30
1860
1861 vpsrlq ymm11,ymm0,26
1862 vpand ymm0,ymm0,ymm5
1863 vpaddq ymm1,ymm1,ymm11
1864
1865 vpsrlq ymm6,ymm6,40
1866
1867 vpsrlq ymm14,ymm3,26
1868 vpand ymm3,ymm3,ymm5
1869 vpaddq ymm4,ymm4,ymm14
1870
1871 vpand ymm7,ymm7,ymm5
1872 vpand ymm8,ymm8,ymm5
1873 vpand ymm10,ymm10,ymm5
1874 vpor ymm6,ymm6,YMMWORD[32+rcx]
1875
1876 sub rdx,64
1877 jnz NEAR $L$oop_avx2
1878
1879DB 0x66,0x90
1880$L$tail_avx2:
1881
1882
1883
1884
1885
1886
1887
1888 vpaddq ymm0,ymm7,ymm0
1889 vmovdqu ymm7,YMMWORD[4+rsp]
1890 vpaddq ymm1,ymm8,ymm1
1891 vmovdqu ymm8,YMMWORD[36+rsp]
1892 vpaddq ymm3,ymm10,ymm3
1893 vmovdqu ymm9,YMMWORD[100+rsp]
1894 vpaddq ymm4,ymm6,ymm4
1895 vmovdqu ymm10,YMMWORD[52+rax]
1896 vmovdqu ymm5,YMMWORD[116+rax]
1897
1898 vpmuludq ymm13,ymm7,ymm2
1899 vpmuludq ymm14,ymm8,ymm2
1900 vpmuludq ymm15,ymm9,ymm2
1901 vpmuludq ymm11,ymm10,ymm2
1902 vpmuludq ymm12,ymm5,ymm2
1903
1904 vpmuludq ymm6,ymm8,ymm0
1905 vpmuludq ymm2,ymm8,ymm1
1906 vpaddq ymm12,ymm12,ymm6
1907 vpaddq ymm13,ymm13,ymm2
1908 vpmuludq ymm6,ymm8,ymm3
1909 vpmuludq ymm2,ymm4,YMMWORD[68+rsp]
1910 vpaddq ymm15,ymm15,ymm6
1911 vpaddq ymm11,ymm11,ymm2
1912
1913 vpmuludq ymm6,ymm7,ymm0
1914 vpmuludq ymm2,ymm7,ymm1
1915 vpaddq ymm11,ymm11,ymm6
1916 vmovdqu ymm8,YMMWORD[((-12))+rax]
1917 vpaddq ymm12,ymm12,ymm2
1918 vpmuludq ymm6,ymm7,ymm3
1919 vpmuludq ymm2,ymm7,ymm4
1920 vpaddq ymm14,ymm14,ymm6
1921 vpaddq ymm15,ymm15,ymm2
1922
1923 vpmuludq ymm6,ymm8,ymm3
1924 vpmuludq ymm2,ymm8,ymm4
1925 vpaddq ymm11,ymm11,ymm6
1926 vpaddq ymm12,ymm12,ymm2
1927 vmovdqu ymm2,YMMWORD[20+rax]
1928 vpmuludq ymm6,ymm9,ymm1
1929 vpmuludq ymm9,ymm9,ymm0
1930 vpaddq ymm14,ymm14,ymm6
1931 vpaddq ymm13,ymm13,ymm9
1932
1933 vpmuludq ymm6,ymm2,ymm1
1934 vpmuludq ymm2,ymm2,ymm0
1935 vpaddq ymm15,ymm15,ymm6
1936 vpaddq ymm14,ymm14,ymm2
1937 vpmuludq ymm6,ymm10,ymm3
1938 vpmuludq ymm2,ymm10,ymm4
1939 vpaddq ymm12,ymm12,ymm6
1940 vpaddq ymm13,ymm13,ymm2
1941
1942 vpmuludq ymm3,ymm5,ymm3
1943 vpmuludq ymm4,ymm5,ymm4
1944 vpaddq ymm2,ymm13,ymm3
1945 vpaddq ymm3,ymm14,ymm4
1946 vpmuludq ymm4,ymm0,YMMWORD[84+rax]
1947 vpmuludq ymm0,ymm5,ymm1
1948 vmovdqa ymm5,YMMWORD[64+rcx]
1949 vpaddq ymm4,ymm15,ymm4
1950 vpaddq ymm0,ymm11,ymm0
1951
1952
1953
1954
1955 vpsrldq ymm8,ymm12,8
1956 vpsrldq ymm9,ymm2,8
1957 vpsrldq ymm10,ymm3,8
1958 vpsrldq ymm6,ymm4,8
1959 vpsrldq ymm7,ymm0,8
1960 vpaddq ymm12,ymm12,ymm8
1961 vpaddq ymm2,ymm2,ymm9
1962 vpaddq ymm3,ymm3,ymm10
1963 vpaddq ymm4,ymm4,ymm6
1964 vpaddq ymm0,ymm0,ymm7
1965
1966 vpermq ymm10,ymm3,0x2
1967 vpermq ymm6,ymm4,0x2
1968 vpermq ymm7,ymm0,0x2
1969 vpermq ymm8,ymm12,0x2
1970 vpermq ymm9,ymm2,0x2
1971 vpaddq ymm3,ymm3,ymm10
1972 vpaddq ymm4,ymm4,ymm6
1973 vpaddq ymm0,ymm0,ymm7
1974 vpaddq ymm12,ymm12,ymm8
1975 vpaddq ymm2,ymm2,ymm9
1976
1977
1978
1979
1980 vpsrlq ymm14,ymm3,26
1981 vpand ymm3,ymm3,ymm5
1982 vpaddq ymm4,ymm4,ymm14
1983
1984 vpsrlq ymm11,ymm0,26
1985 vpand ymm0,ymm0,ymm5
1986 vpaddq ymm1,ymm12,ymm11
1987
1988 vpsrlq ymm15,ymm4,26
1989 vpand ymm4,ymm4,ymm5
1990
1991 vpsrlq ymm12,ymm1,26
1992 vpand ymm1,ymm1,ymm5
1993 vpaddq ymm2,ymm2,ymm12
1994
1995 vpaddq ymm0,ymm0,ymm15
1996 vpsllq ymm15,ymm15,2
1997 vpaddq ymm0,ymm0,ymm15
1998
1999 vpsrlq ymm13,ymm2,26
2000 vpand ymm2,ymm2,ymm5
2001 vpaddq ymm3,ymm3,ymm13
2002
2003 vpsrlq ymm11,ymm0,26
2004 vpand ymm0,ymm0,ymm5
2005 vpaddq ymm1,ymm1,ymm11
2006
2007 vpsrlq ymm14,ymm3,26
2008 vpand ymm3,ymm3,ymm5
2009 vpaddq ymm4,ymm4,ymm14
2010
2011 vmovd DWORD[(-112)+rdi],xmm0
2012 vmovd DWORD[(-108)+rdi],xmm1
2013 vmovd DWORD[(-104)+rdi],xmm2
2014 vmovd DWORD[(-100)+rdi],xmm3
2015 vmovd DWORD[(-96)+rdi],xmm4
2016 vmovdqa xmm6,XMMWORD[80+r11]
2017 vmovdqa xmm7,XMMWORD[96+r11]
2018 vmovdqa xmm8,XMMWORD[112+r11]
2019 vmovdqa xmm9,XMMWORD[128+r11]
2020 vmovdqa xmm10,XMMWORD[144+r11]
2021 vmovdqa xmm11,XMMWORD[160+r11]
2022 vmovdqa xmm12,XMMWORD[176+r11]
2023 vmovdqa xmm13,XMMWORD[192+r11]
2024 vmovdqa xmm14,XMMWORD[208+r11]
2025 vmovdqa xmm15,XMMWORD[224+r11]
2026 lea rsp,[248+r11]
2027$L$do_avx2_epilogue:
2028 vzeroupper
2029 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2030 mov rsi,QWORD[16+rsp]
2031 DB 0F3h,0C3h ;repret
2032
2033$L$SEH_end_poly1305_blocks_avx2:
2034
; poly1305_blocks_avx512 -- AVX-512 bulk path, Win64 entry wrapper plus body.
; The wrapper moves the Win64 arguments rcx,rdx,r8,r9 into rdi,rsi,rdx,rcx.
; Below, rdi is used as the context pointer, rsi as the input pointer and
; rdx as the remaining byte count; rcx is overwritten with &$L$const.
; $L$blocks_avx512 is also a jump target of $L$do_avx2.  zmm0..zmm4 are read
; as the five accumulator limbs without being loaded after that label, so
; they must already be set up on entry (the $L$do_avx2 path does this).
; NOTE(review): entering through the function label itself leaves zmm0..zmm4
; unspecified -- confirm that nothing calls this symbol directly.
2035ALIGN 32
2036poly1305_blocks_avx512:
2037 mov QWORD[8+rsp],rdi ;WIN64 prologue
2038 mov QWORD[16+rsp],rsi
2039 mov rax,rsp
2040$L$SEH_begin_poly1305_blocks_avx512:
2041 mov rdi,rcx
2042 mov rsi,rdx
2043 mov rdx,r8
2044 mov rcx,r9
2045
2046
2047
2048$L$blocks_avx512:
2049 mov eax,15
2050 kmovw k2,eax ; k2 = 0x000f: selects the low four qwords (32 bytes) in the masked stores below
2051 lea r11,[((-248))+rsp] ; r11 = frame base; xmm6..xmm15 are saved at r11+80 .. r11+224
2052 sub rsp,0x1c8
2053 vmovdqa XMMWORD[80+r11],xmm6
2054 vmovdqa XMMWORD[96+r11],xmm7
2055 vmovdqa XMMWORD[112+r11],xmm8
2056 vmovdqa XMMWORD[128+r11],xmm9
2057 vmovdqa XMMWORD[144+r11],xmm10
2058 vmovdqa XMMWORD[160+r11],xmm11
2059 vmovdqa XMMWORD[176+r11],xmm12
2060 vmovdqa XMMWORD[192+r11],xmm13
2061 vmovdqa XMMWORD[208+r11],xmm14
2062 vmovdqa XMMWORD[224+r11],xmm15
2063$L$do_avx512_body:
2064 lea rcx,[$L$const] ; rcx now addresses the constant table; the 4th argument value is discarded
2065 lea rdi,[((48+64))+rdi] ; rdi += 112: table loads below use offsets -64..64, final stores use -112..-96
2066 vmovdqa ymm9,YMMWORD[96+rcx] ; index vector for the vpermd expansion below (table contents not visible here)
2067
2068
; Load nine 16-byte table entries from the context and expand each with vpermd.
2069 vmovdqu xmm11,XMMWORD[((-64))+rdi]
2070 and rsp,-512 ; align the scratch area; the original rsp is recovered through r11 at exit
2071 vmovdqu xmm12,XMMWORD[((-48))+rdi]
2072 mov rax,0x20 ; rax = 32, used as index for the odd 32-byte scratch slots
2073 vmovdqu xmm7,XMMWORD[((-32))+rdi]
2074 vmovdqu xmm13,XMMWORD[((-16))+rdi]
2075 vmovdqu xmm8,XMMWORD[rdi]
2076 vmovdqu xmm14,XMMWORD[16+rdi]
2077 vmovdqu xmm10,XMMWORD[32+rdi]
2078 vmovdqu xmm15,XMMWORD[48+rdi]
2079 vmovdqu xmm6,XMMWORD[64+rdi]
2080 vpermd zmm16,zmm9,zmm11
2081 vpbroadcastq zmm5,QWORD[64+rcx] ; zmm5 = limb mask for every vpandq/vpand below (presumably 2^26-1 -- value is in $L$const)
2082 vpermd zmm17,zmm9,zmm12
2083 vpermd zmm21,zmm9,zmm7
2084 vpermd zmm18,zmm9,zmm13
2085 vmovdqa64 ZMMWORD[rsp]{k2},zmm16 ; spill low 32 bytes of each expanded entry; $L$tail_avx2 reads this stack area
2086 vpsrlq zmm7,zmm16,32 ; high dword of each qword moved low, since vpmuludq multiplies low dwords only
2087 vpermd zmm22,zmm9,zmm8
2088 vmovdqu64 ZMMWORD[rax*1+rsp]{k2},zmm17
2089 vpsrlq zmm8,zmm17,32
2090 vpermd zmm19,zmm9,zmm14
2091 vmovdqa64 ZMMWORD[64+rsp]{k2},zmm21
2092 vpermd zmm23,zmm9,zmm10
2093 vpermd zmm20,zmm9,zmm15
2094 vmovdqu64 ZMMWORD[64+rax*1+rsp]{k2},zmm18
2095 vpermd zmm24,zmm9,zmm6
2096 vmovdqa64 ZMMWORD[128+rsp]{k2},zmm22
2097 vmovdqu64 ZMMWORD[128+rax*1+rsp]{k2},zmm19
2098 vmovdqa64 ZMMWORD[192+rsp]{k2},zmm23
2099 vmovdqu64 ZMMWORD[192+rax*1+rsp]{k2},zmm20
2100 vmovdqa64 ZMMWORD[256+rsp]{k2},zmm24
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
; Multiply the table entries (zmm16..zmm24) by their own upper-dword halves
; (zmm7, zmm8, zmm9, zmm10, zmm6), accumulating five limb sums in zmm11..zmm15.
2111 vpmuludq zmm11,zmm16,zmm7
2112 vpmuludq zmm12,zmm17,zmm7
2113 vpmuludq zmm13,zmm18,zmm7
2114 vpmuludq zmm14,zmm19,zmm7
2115 vpmuludq zmm15,zmm20,zmm7
2116 vpsrlq zmm9,zmm18,32
2117
2118 vpmuludq zmm25,zmm24,zmm8
2119 vpmuludq zmm26,zmm16,zmm8
2120 vpmuludq zmm27,zmm17,zmm8
2121 vpmuludq zmm28,zmm18,zmm8
2122 vpmuludq zmm29,zmm19,zmm8
2123 vpsrlq zmm10,zmm19,32
2124 vpaddq zmm11,zmm11,zmm25
2125 vpaddq zmm12,zmm12,zmm26
2126 vpaddq zmm13,zmm13,zmm27
2127 vpaddq zmm14,zmm14,zmm28
2128 vpaddq zmm15,zmm15,zmm29
2129
2130 vpmuludq zmm25,zmm23,zmm9
2131 vpmuludq zmm26,zmm24,zmm9
2132 vpmuludq zmm28,zmm17,zmm9
2133 vpmuludq zmm29,zmm18,zmm9
2134 vpmuludq zmm27,zmm16,zmm9
2135 vpsrlq zmm6,zmm20,32
2136 vpaddq zmm11,zmm11,zmm25
2137 vpaddq zmm12,zmm12,zmm26
2138 vpaddq zmm14,zmm14,zmm28
2139 vpaddq zmm15,zmm15,zmm29
2140 vpaddq zmm13,zmm13,zmm27
2141
2142 vpmuludq zmm25,zmm22,zmm10
2143 vpmuludq zmm28,zmm16,zmm10
2144 vpmuludq zmm29,zmm17,zmm10
2145 vpmuludq zmm26,zmm23,zmm10
2146 vpmuludq zmm27,zmm24,zmm10
2147 vpaddq zmm11,zmm11,zmm25
2148 vpaddq zmm14,zmm14,zmm28
2149 vpaddq zmm15,zmm15,zmm29
2150 vpaddq zmm12,zmm12,zmm26
2151 vpaddq zmm13,zmm13,zmm27
2152
2153 vpmuludq zmm28,zmm24,zmm6
2154 vpmuludq zmm29,zmm16,zmm6
2155 vpmuludq zmm25,zmm21,zmm6
2156 vpmuludq zmm26,zmm22,zmm6
2157 vpmuludq zmm27,zmm23,zmm6
2158 vpaddq zmm14,zmm14,zmm28
2159 vpaddq zmm15,zmm15,zmm29
2160 vpaddq zmm11,zmm11,zmm25
2161 vpaddq zmm12,zmm12,zmm26
2162 vpaddq zmm13,zmm13,zmm27
2163
2164
2165
2166 vmovdqu64 zmm10,ZMMWORD[rsi] ; fetch the first 128 input bytes while the products settle
2167 vmovdqu64 zmm6,ZMMWORD[64+rsi]
2168 lea rsi,[128+rsi]
2169
2170
2171
2172
; Carry propagation over zmm11..zmm15: shift right 26, mask with zmm5, add into the next limb.
2173 vpsrlq zmm28,zmm14,26
2174 vpandq zmm14,zmm14,zmm5
2175 vpaddq zmm15,zmm15,zmm28
2176
2177 vpsrlq zmm25,zmm11,26
2178 vpandq zmm11,zmm11,zmm5
2179 vpaddq zmm12,zmm12,zmm25
2180
2181 vpsrlq zmm29,zmm15,26
2182 vpandq zmm15,zmm15,zmm5
2183
2184 vpsrlq zmm26,zmm12,26
2185 vpandq zmm12,zmm12,zmm5
2186 vpaddq zmm13,zmm13,zmm26
2187
2188 vpaddq zmm11,zmm11,zmm29 ; top-limb carry wraps to limb 0 multiplied by 5: c + (c<<2)
2189 vpsllq zmm29,zmm29,2
2190 vpaddq zmm11,zmm11,zmm29
2191
2192 vpsrlq zmm27,zmm13,26
2193 vpandq zmm13,zmm13,zmm5
2194 vpaddq zmm14,zmm14,zmm27
2195
2196 vpsrlq zmm25,zmm11,26
2197 vpandq zmm11,zmm11,zmm5
2198 vpaddq zmm12,zmm12,zmm25
2199
2200 vpsrlq zmm28,zmm14,26
2201 vpandq zmm14,zmm14,zmm5
2202 vpaddq zmm15,zmm15,zmm28
2203
2204
2205
2206
2207
2208 vpunpcklqdq zmm7,zmm10,zmm6 ; zmm7 = low qwords, zmm6 = high qwords of the interleaved input blocks
2209 vpunpckhqdq zmm6,zmm10,zmm6
2210
2211
2212
2213
2214
2215
2216 vmovdqa32 zmm25,ZMMWORD[128+rcx] ; second vpermd index vector (contents not visible here)
2217 mov eax,0x7777
2218 kmovw k1,eax ; k1 = 0x7777: dword lanes that receive the freshly reduced values
2219
; Re-spread the old table values, then merge the new products in under k1.
2220 vpermd zmm16,zmm25,zmm16
2221 vpermd zmm17,zmm25,zmm17
2222 vpermd zmm18,zmm25,zmm18
2223 vpermd zmm19,zmm25,zmm19
2224 vpermd zmm20,zmm25,zmm20
2225
2226 vpermd zmm16{k1},zmm25,zmm11
2227 vpermd zmm17{k1},zmm25,zmm12
2228 vpermd zmm18{k1},zmm25,zmm13
2229 vpermd zmm19{k1},zmm25,zmm14
2230 vpermd zmm20{k1},zmm25,zmm15
2231
; zmm21..zmm24 = 5 * zmm17..zmm20, computed as (x << 2) + x.
2232 vpslld zmm21,zmm17,2
2233 vpslld zmm22,zmm18,2
2234 vpslld zmm23,zmm19,2
2235 vpslld zmm24,zmm20,2
2236 vpaddd zmm21,zmm21,zmm17
2237 vpaddd zmm22,zmm22,zmm18
2238 vpaddd zmm23,zmm23,zmm19
2239 vpaddd zmm24,zmm24,zmm20
2240
2241 vpbroadcastq zmm30,QWORD[32+rcx] ; constant OR-ed into the top input limb (presumably the pad bit -- value is in $L$const)
2242
; Split the 128-bit input blocks into five limbs: zmm7, zmm8, zmm9, zmm10, zmm6.
2243 vpsrlq zmm9,zmm7,52
2244 vpsllq zmm10,zmm6,12
2245 vporq zmm9,zmm9,zmm10 ; limb 2 straddles the two qwords: (lo >> 52) | (hi << 12)
2246 vpsrlq zmm8,zmm7,26 ; limb 1, masked later inside the loop / tail
2247 vpsrlq zmm10,zmm6,14 ; limb 3, masked later inside the loop / tail
2248 vpsrlq zmm6,zmm6,40 ; limb 4
2249 vpandq zmm9,zmm9,zmm5
2250 vpandq zmm7,zmm7,zmm5 ; limb 0
2251
2252
2253
2254
2255 vpaddq zmm2,zmm9,zmm2 ; limb 2 of the input is accumulated one step ahead of the other limbs
2256 sub rdx,192
2257 jbe NEAR $L$tail_avx512 ; taken when the subtraction borrows or leaves zero: no full loop iteration to run
2258 jmp NEAR $L$oop_avx512
2259
2260ALIGN 32
2261$L$oop_avx512: ; main loop: consumes 128 input bytes per iteration
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
; Accumulator limbs zmm0..zmm4 times table entries zmm16..zmm24; sums build in zmm11..zmm15.
; The remaining input limbs are masked and added into zmm0/1/3/4 in between.
2290 vpmuludq zmm14,zmm17,zmm2
2291 vpaddq zmm0,zmm7,zmm0
2292 vpmuludq zmm15,zmm18,zmm2
2293 vpandq zmm8,zmm8,zmm5
2294 vpmuludq zmm11,zmm23,zmm2
2295 vpandq zmm10,zmm10,zmm5
2296 vpmuludq zmm12,zmm24,zmm2
2297 vporq zmm6,zmm6,zmm30
2298 vpmuludq zmm13,zmm16,zmm2
2299 vpaddq zmm1,zmm8,zmm1
2300 vpaddq zmm3,zmm10,zmm3
2301 vpaddq zmm4,zmm6,zmm4
2302
2303 vmovdqu64 zmm10,ZMMWORD[rsi] ; next 128 input bytes
2304 vmovdqu64 zmm6,ZMMWORD[64+rsi]
2305 lea rsi,[128+rsi]
2306 vpmuludq zmm28,zmm19,zmm0
2307 vpmuludq zmm29,zmm20,zmm0
2308 vpmuludq zmm25,zmm16,zmm0
2309 vpmuludq zmm26,zmm17,zmm0
2310 vpaddq zmm14,zmm14,zmm28
2311 vpaddq zmm15,zmm15,zmm29
2312 vpaddq zmm11,zmm11,zmm25
2313 vpaddq zmm12,zmm12,zmm26
2314
2315 vpmuludq zmm28,zmm18,zmm1
2316 vpmuludq zmm29,zmm19,zmm1
2317 vpmuludq zmm25,zmm24,zmm1
2318 vpmuludq zmm27,zmm18,zmm0
2319 vpaddq zmm14,zmm14,zmm28
2320 vpaddq zmm15,zmm15,zmm29
2321 vpaddq zmm11,zmm11,zmm25
2322 vpaddq zmm13,zmm13,zmm27
2323
2324 vpunpcklqdq zmm7,zmm10,zmm6
2325 vpunpckhqdq zmm6,zmm10,zmm6
2326
2327 vpmuludq zmm28,zmm16,zmm3
2328 vpmuludq zmm29,zmm17,zmm3
2329 vpmuludq zmm26,zmm16,zmm1
2330 vpmuludq zmm27,zmm17,zmm1
2331 vpaddq zmm14,zmm14,zmm28
2332 vpaddq zmm15,zmm15,zmm29
2333 vpaddq zmm12,zmm12,zmm26
2334 vpaddq zmm13,zmm13,zmm27
2335
2336 vpmuludq zmm28,zmm24,zmm4
2337 vpmuludq zmm29,zmm16,zmm4
2338 vpmuludq zmm25,zmm22,zmm3
2339 vpmuludq zmm26,zmm23,zmm3
2340 vpaddq zmm14,zmm14,zmm28
2341 vpmuludq zmm27,zmm24,zmm3
2342 vpaddq zmm15,zmm15,zmm29
2343 vpaddq zmm11,zmm11,zmm25
2344 vpaddq zmm12,zmm12,zmm26
2345 vpaddq zmm13,zmm13,zmm27
2346
2347 vpmuludq zmm25,zmm21,zmm4
2348 vpmuludq zmm26,zmm22,zmm4
2349 vpmuludq zmm27,zmm23,zmm4
2350 vpaddq zmm0,zmm11,zmm25
2351 vpaddq zmm1,zmm12,zmm26
2352 vpaddq zmm2,zmm13,zmm27
2353
2354
2355
2356
; Carry propagation on the new accumulator, interleaved with splitting the next input.
2357 vpsrlq zmm9,zmm7,52
2358 vpsllq zmm10,zmm6,12
2359
2360 vpsrlq zmm3,zmm14,26
2361 vpandq zmm14,zmm14,zmm5
2362 vpaddq zmm4,zmm15,zmm3
2363
2364 vporq zmm9,zmm9,zmm10
2365
2366 vpsrlq zmm11,zmm0,26
2367 vpandq zmm0,zmm0,zmm5
2368 vpaddq zmm1,zmm1,zmm11
2369
2370 vpandq zmm9,zmm9,zmm5
2371
2372 vpsrlq zmm15,zmm4,26
2373 vpandq zmm4,zmm4,zmm5
2374
2375 vpsrlq zmm12,zmm1,26
2376 vpandq zmm1,zmm1,zmm5
2377 vpaddq zmm2,zmm2,zmm12
2378
2379 vpaddq zmm0,zmm0,zmm15 ; top-limb carry folded into limb 0 times 5
2380 vpsllq zmm15,zmm15,2
2381 vpaddq zmm0,zmm0,zmm15
2382
2383 vpaddq zmm2,zmm2,zmm9 ; next block's limb 2 accumulated ahead of time
2384 vpsrlq zmm8,zmm7,26
2385
2386 vpsrlq zmm13,zmm2,26
2387 vpandq zmm2,zmm2,zmm5
2388 vpaddq zmm3,zmm14,zmm13
2389
2390 vpsrlq zmm10,zmm6,14
2391
2392 vpsrlq zmm11,zmm0,26
2393 vpandq zmm0,zmm0,zmm5
2394 vpaddq zmm1,zmm1,zmm11
2395
2396 vpsrlq zmm6,zmm6,40
2397
2398 vpsrlq zmm14,zmm3,26
2399 vpandq zmm3,zmm3,zmm5
2400 vpaddq zmm4,zmm4,zmm14
2401
2402 vpandq zmm7,zmm7,zmm5
2403
2404
2405
2406
2407 sub rdx,128
2408 ja NEAR $L$oop_avx512 ; continue while the count stays strictly positive (no borrow, not zero)
2409
2410$L$tail_avx512:
2411
2412
2413
2414
2415
; Shift every table register right by 32 so the other dword of each qword is the multiplier.
2416 vpsrlq zmm16,zmm16,32
2417 vpsrlq zmm17,zmm17,32
2418 vpsrlq zmm18,zmm18,32
2419 vpsrlq zmm23,zmm23,32
2420 vpsrlq zmm24,zmm24,32
2421 vpsrlq zmm19,zmm19,32
2422 vpsrlq zmm20,zmm20,32
2423 vpsrlq zmm21,zmm21,32
2424 vpsrlq zmm22,zmm22,32
2425
2426
2427
2428 lea rsi,[rdx*1+rsi] ; rdx is zero or has wrapped below zero on every path here, so rsi does not advance
2429
2430
2431 vpaddq zmm0,zmm7,zmm0
2432
2433 vpmuludq zmm14,zmm17,zmm2
2434 vpmuludq zmm15,zmm18,zmm2
2435 vpmuludq zmm11,zmm23,zmm2
2436 vpandq zmm8,zmm8,zmm5
2437 vpmuludq zmm12,zmm24,zmm2
2438 vpandq zmm10,zmm10,zmm5
2439 vpmuludq zmm13,zmm16,zmm2
2440 vporq zmm6,zmm6,zmm30
2441 vpaddq zmm1,zmm8,zmm1
2442 vpaddq zmm3,zmm10,zmm3
2443 vpaddq zmm4,zmm6,zmm4
2444
2445 vmovdqu xmm7,XMMWORD[rsi] ; preload 64 bytes at rsi into ymm7/ymm8, the registers $L$tail_avx2 consumes
2446 vpmuludq zmm28,zmm19,zmm0
2447 vpmuludq zmm29,zmm20,zmm0
2448 vpmuludq zmm25,zmm16,zmm0
2449 vpmuludq zmm26,zmm17,zmm0
2450 vpaddq zmm14,zmm14,zmm28
2451 vpaddq zmm15,zmm15,zmm29
2452 vpaddq zmm11,zmm11,zmm25
2453 vpaddq zmm12,zmm12,zmm26
2454
2455 vmovdqu xmm8,XMMWORD[16+rsi]
2456 vpmuludq zmm28,zmm18,zmm1
2457 vpmuludq zmm29,zmm19,zmm1
2458 vpmuludq zmm25,zmm24,zmm1
2459 vpmuludq zmm27,zmm18,zmm0
2460 vpaddq zmm14,zmm14,zmm28
2461 vpaddq zmm15,zmm15,zmm29
2462 vpaddq zmm11,zmm11,zmm25
2463 vpaddq zmm13,zmm13,zmm27
2464
2465 vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1
2466 vpmuludq zmm28,zmm16,zmm3
2467 vpmuludq zmm29,zmm17,zmm3
2468 vpmuludq zmm26,zmm16,zmm1
2469 vpmuludq zmm27,zmm17,zmm1
2470 vpaddq zmm14,zmm14,zmm28
2471 vpaddq zmm15,zmm15,zmm29
2472 vpaddq zmm12,zmm12,zmm26
2473 vpaddq zmm13,zmm13,zmm27
2474
2475 vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1
2476 vpmuludq zmm28,zmm24,zmm4
2477 vpmuludq zmm29,zmm16,zmm4
2478 vpmuludq zmm25,zmm22,zmm3
2479 vpmuludq zmm26,zmm23,zmm3
2480 vpmuludq zmm27,zmm24,zmm3
2481 vpaddq zmm3,zmm14,zmm28
2482 vpaddq zmm15,zmm15,zmm29
2483 vpaddq zmm11,zmm11,zmm25
2484 vpaddq zmm12,zmm12,zmm26
2485 vpaddq zmm13,zmm13,zmm27
2486
2487 vpmuludq zmm25,zmm21,zmm4
2488 vpmuludq zmm26,zmm22,zmm4
2489 vpmuludq zmm27,zmm23,zmm4
2490 vpaddq zmm0,zmm11,zmm25
2491 vpaddq zmm1,zmm12,zmm26
2492 vpaddq zmm2,zmm13,zmm27
2493
2494
2495
2496
; Horizontal sum of the eight qword lanes of each limb (limb 4 is still in zmm15 on entry).
2497 mov eax,1
2498 vpermq zmm14,zmm3,0xb1 ; 0xb1 swaps adjacent qwords within each 256-bit half
2499 vpermq zmm4,zmm15,0xb1
2500 vpermq zmm11,zmm0,0xb1
2501 vpermq zmm12,zmm1,0xb1
2502 vpermq zmm13,zmm2,0xb1
2503 vpaddq zmm3,zmm3,zmm14
2504 vpaddq zmm4,zmm4,zmm15
2505 vpaddq zmm0,zmm0,zmm11
2506 vpaddq zmm1,zmm1,zmm12
2507 vpaddq zmm2,zmm2,zmm13
2508
2509 kmovw k3,eax ; k3 = 1: only qword 0 survives the zero-masked adds below
2510 vpermq zmm14,zmm3,0x2 ; bring qword 2 of each 256-bit half down to qword 0
2511 vpermq zmm15,zmm4,0x2
2512 vpermq zmm11,zmm0,0x2
2513 vpermq zmm12,zmm1,0x2
2514 vpermq zmm13,zmm2,0x2
2515 vpaddq zmm3,zmm3,zmm14
2516 vpaddq zmm4,zmm4,zmm15
2517 vpaddq zmm0,zmm0,zmm11
2518 vpaddq zmm1,zmm1,zmm12
2519 vpaddq zmm2,zmm2,zmm13
2520
2521 vextracti64x4 ymm14,zmm3,0x1 ; add the upper 256-bit half onto the lower one
2522 vextracti64x4 ymm15,zmm4,0x1
2523 vextracti64x4 ymm11,zmm0,0x1
2524 vextracti64x4 ymm12,zmm1,0x1
2525 vextracti64x4 ymm13,zmm2,0x1
2526 vpaddq zmm3{k3}{z},zmm3,zmm14
2527 vpaddq zmm4{k3}{z},zmm4,zmm15
2528 vpaddq zmm0{k3}{z},zmm0,zmm11
2529 vpaddq zmm1{k3}{z},zmm1,zmm12
2530 vpaddq zmm2{k3}{z},zmm2,zmm13
2531
2532
2533
; Final carry propagation in ymm width, interleaved with splitting the preloaded 64 bytes
; (ymm7/ymm8) into limbs the same way the AVX2 path does before $L$tail_avx2.
2534 vpsrlq ymm14,ymm3,26
2535 vpand ymm3,ymm3,ymm5
2536 vpsrldq ymm9,ymm7,6
2537 vpsrldq ymm10,ymm8,6
2538 vpunpckhqdq ymm6,ymm7,ymm8
2539 vpaddq ymm4,ymm4,ymm14
2540
2541 vpsrlq ymm11,ymm0,26
2542 vpand ymm0,ymm0,ymm5
2543 vpunpcklqdq ymm9,ymm9,ymm10
2544 vpunpcklqdq ymm7,ymm7,ymm8
2545 vpaddq ymm1,ymm1,ymm11
2546
2547 vpsrlq ymm15,ymm4,26
2548 vpand ymm4,ymm4,ymm5
2549
2550 vpsrlq ymm12,ymm1,26
2551 vpand ymm1,ymm1,ymm5
2552 vpsrlq ymm10,ymm9,30
2553 vpsrlq ymm9,ymm9,4
2554 vpaddq ymm2,ymm2,ymm12
2555
2556 vpaddq ymm0,ymm0,ymm15
2557 vpsllq ymm15,ymm15,2
2558 vpsrlq ymm8,ymm7,26
2559 vpsrlq ymm6,ymm6,40
2560 vpaddq ymm0,ymm0,ymm15
2561
2562 vpsrlq ymm13,ymm2,26
2563 vpand ymm2,ymm2,ymm5
2564 vpand ymm9,ymm9,ymm5
2565 vpand ymm7,ymm7,ymm5
2566 vpaddq ymm3,ymm3,ymm13
2567
2568 vpsrlq ymm11,ymm0,26
2569 vpand ymm0,ymm0,ymm5
2570 vpaddq ymm2,ymm9,ymm2 ; speculative accumulation of input limb 2; undone below if no data remains
2571 vpand ymm8,ymm8,ymm5
2572 vpaddq ymm1,ymm1,ymm11
2573
2574 vpsrlq ymm14,ymm3,26
2575 vpand ymm3,ymm3,ymm5
2576 vpand ymm10,ymm10,ymm5
2577 vpor ymm6,ymm6,YMMWORD[32+rcx]
2578 vpaddq ymm4,ymm4,ymm14
2579
2580 lea rax,[144+rsp] ; rax = rsp+144, the base $L$tail_avx2 uses for its table loads
2581 add rdx,64
2582 jnz NEAR $L$tail_avx2 ; rdx+64 != 0: a final 64-byte group remains, finish in the AVX2 tail
2583
2584 vpsubq ymm2,ymm2,ymm9 ; nothing left: take the speculative limb-2 addition back out
2585 vmovd DWORD[(-112)+rdi],xmm0 ; store five 32-bit limbs at original context offsets 0,4,8,12,16
2586 vmovd DWORD[(-108)+rdi],xmm1
2587 vmovd DWORD[(-104)+rdi],xmm2
2588 vmovd DWORD[(-100)+rdi],xmm3
2589 vmovd DWORD[(-96)+rdi],xmm4
2590 vzeroall ; clear every vector register before the saved xmm6..xmm15 are reloaded
2591 movdqa xmm6,XMMWORD[80+r11]
2592 movdqa xmm7,XMMWORD[96+r11]
2593 movdqa xmm8,XMMWORD[112+r11]
2594 movdqa xmm9,XMMWORD[128+r11]
2595 movdqa xmm10,XMMWORD[144+r11]
2596 movdqa xmm11,XMMWORD[160+r11]
2597 movdqa xmm12,XMMWORD[176+r11]
2598 movdqa xmm13,XMMWORD[192+r11]
2599 movdqa xmm14,XMMWORD[208+r11]
2600 movdqa xmm15,XMMWORD[224+r11]
2601 lea rsp,[248+r11] ; r11 was rsp-248 at frame setup, so this restores the pre-frame rsp
2602$L$do_avx512_epilogue:
2603 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2604 mov rsi,QWORD[16+rsp]
2605 DB 0F3h,0C3h ;repret
2606
2607$L$SEH_end_poly1305_blocks_avx512:
2608
; poly1305_init_base2_44 -- context/key setup for the vpmadd52 code path.
; After the Win64 shuffle: rdi = context, rsi = 16-byte key, rdx = two-slot
; output table that receives the block and emit routine addresses.
; $L$init_base2_44 is additionally a jump target of poly1305_init, which has
; already zeroed the first 24 context bytes and performed the same shuffle.
; Context layout written here:
;   [24] = 20*r1   [32] = 20*r2   [40] = r0   [48] = r1   [56] = r2   [64] = -1
; where r0/r1/r2 are the 44/44/40-bit pieces of the masked key.
; Returns eax = 1.
2609ALIGN 32
2610poly1305_init_base2_44:
2611 mov QWORD[8+rsp],rdi ;WIN64 prologue
2612 mov QWORD[16+rsp],rsi
2613 mov rax,rsp
2614$L$SEH_begin_poly1305_init_base2_44:
2615 mov rdi,rcx
2616 mov rsi,rdx
2617 mov rdx,r8
2618
2619
2620
2621 xor rax,rax
2622 mov QWORD[rdi],rax ; zero the three accumulator qwords at context offsets 0, 8, 16
2623 mov QWORD[8+rdi],rax
2624 mov QWORD[16+rdi],rax
2625
2626$L$init_base2_44:
2627 lea r10,[poly1305_blocks_vpmadd52] ; routine addresses handed back through [rdx] and [8+rdx]
2628 lea r11,[poly1305_emit_base2_44]
2629
2630 mov rax,0x0ffffffc0fffffff ; key masks: clear the bits the masked multiplier must not have
2631 mov rcx,0x0ffffffc0ffffffc
2632 and rax,QWORD[rsi] ; rax = masked low key qword
2633 mov r8,0x00000fffffffffff ; 44-bit mask
2634 and rcx,QWORD[8+rsi] ; rcx = masked high key qword
2635 mov r9,0x00000fffffffffff
2636 and r8,rax ; r0 = key bits 0..43
2637 shrd rax,rcx,44 ; rax = (rcx:rax) >> 44
2638 mov QWORD[40+rdi],r8
2639 and rax,r9 ; r1 = key bits 44..87
2640 shr rcx,24 ; r2 = key bits 88..127
2641 mov QWORD[48+rdi],rax
2642 lea rax,[rax*4+rax] ; rax = 5*r1
2643 mov QWORD[56+rdi],rcx
2644 shl rax,2 ; rax = 20*r1
2645 lea rcx,[rcx*4+rcx] ; rcx = 5*r2
2646 shl rcx,2 ; rcx = 20*r2
2647 mov QWORD[24+rdi],rax
2648 mov QWORD[32+rdi],rcx
2649 mov QWORD[64+rdi],-1 ; negative sentinel; poly1305_blocks_vpmadd52 tests the sign of this qword
2650 mov QWORD[rdx],r10
2651 mov QWORD[8+rdx],r11
2652 mov eax,1
2653 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2654 mov rsi,QWORD[16+rsp]
2655 DB 0F3h,0C3h ;repret
2656
2657$L$SEH_end_poly1305_init_base2_44:
2658
; poly1305_blocks_vpmadd52 -- block routine built on vpmadd52luq/vpmadd52huq.
; After the Win64 shuffle: rdi = context, rsi = input, rdx = length in bytes,
; rcx = 4th argument (shifted left by 40 and OR-ed into every block's top limb).
; This routine itself processes 16-byte blocks one at a time; whatever it does
; not handle is passed to $L$blocks_vpmadd52_4x with rdx = remaining block
; count, rcx and r8 still live.
2659ALIGN 32
2660poly1305_blocks_vpmadd52:
2661 mov QWORD[8+rsp],rdi ;WIN64 prologue
2662 mov QWORD[16+rsp],rsi
2663 mov rax,rsp
2664$L$SEH_begin_poly1305_blocks_vpmadd52:
2665 mov rdi,rcx
2666 mov rsi,rdx
2667 mov rdx,r8
2668 mov rcx,r9
2669
2670
2671
2672DB 243,15,30,250 ; bytes F3 0F 1E FA = endbr64
2673 shr rdx,4 ; rdx = number of whole 16-byte blocks
2674 jz NEAR $L$no_data_vpmadd52
2675
2676 shl rcx,40
2677 mov r8,QWORD[64+rdi] ; context qword at offset 64; poly1305_init_base2_44 stores -1 here
2678
2679
2680
2681
2682
2683
; Choose how many blocks to handle here (rax):
;   rdx < 4 and r8 negative  -> rax = rdx & 3 = rdx (everything, one block at a time)
;   otherwise                -> rax = rdx & 1 (at most one block, rest goes to the 4x code)
2684 mov rax,3
2685 mov r10,1
2686 cmp rdx,4
2687 cmovae rax,r10
2688 test r8,r8
2689 cmovns rax,r10
2690
2691 and rax,rdx
2692 jz NEAR $L$blocks_vpmadd52_4x ; nothing to do singly: hand the whole input to the 4x code
2693
2694 sub rdx,rax ; rdx = blocks left over for the 4x code
2695 mov r10d,7
2696 mov r11d,1
2697 kmovw k7,r10d ; k7 = 0b111: the three limb qwords
2698 lea r10,[$L$2_44_inp_permd] ; base of five consecutive 32-byte constants (contents not visible here)
2699 kmovw k1,r11d ; k1 = 0b001: qword 0 only
2700
2701 vmovq xmm21,rcx
2702 vmovdqa64 ymm19,YMMWORD[r10]
2703 vmovdqa64 ymm20,YMMWORD[32+r10]
2704 vpermq ymm21,ymm21,0xcf ; 0xcf: qword 2 <- old qword 0, other qwords <- old qword 3 (zero after vmovq)
2705 vmovdqa64 ymm22,YMMWORD[64+r10]
2706
2707 vmovdqu64 ymm16{k7}{z},[rdi] ; accumulator limbs from context offsets 0, 8, 16
2708 vmovdqu64 ymm3{k7}{z},[40+rdi] ; three overlapping 3-qword windows of the key area at offsets 24..63
2709 vmovdqu64 ymm4{k7}{z},[32+rdi]
2710 vmovdqu64 ymm5{k7}{z},[24+rdi]
2711
2712 vmovdqa64 ymm23,YMMWORD[96+r10]
2713 vmovdqa64 ymm24,YMMWORD[128+r10]
2714
2715 jmp NEAR $L$oop_vpmadd52
2716
2717ALIGN 32
2718$L$oop_vpmadd52: ; one 16-byte block per iteration, rax iterations
2719 vmovdqu32 xmm18,XMMWORD[rsi]
2720 lea rsi,[16+rsi]
2721
2722 vpermd ymm18,ymm19,ymm18 ; spread the input dwords, then shift/mask each qword into one limb
2723 vpsrlvq ymm18,ymm18,ymm20
2724 vpandq ymm18,ymm18,ymm22
2725 vporq ymm18,ymm18,ymm21 ; OR the shifted 4th argument into qword 2
2726
2727 vpaddq ymm16,ymm16,ymm18 ; accumulator += input limbs
2728
2729 vpermq ymm0{k7}{z},ymm16,0 ; broadcast accumulator qword 0 / 1 / 2 into lanes 0..2
2730 vpermq ymm1{k7}{z},ymm16,85
2731 vpermq ymm2{k7}{z},ymm16,170
2732
2733 vpxord ymm16,ymm16,ymm16
2734 vpxord ymm17,ymm17,ymm17
2735
2736 vpmadd52luq ymm16,ymm0,ymm3 ; ymm16 collects the low 52 bits, ymm17 the high 52 bits of each product
2737 vpmadd52huq ymm17,ymm0,ymm3
2738
2739 vpmadd52luq ymm16,ymm1,ymm4
2740 vpmadd52huq ymm17,ymm1,ymm4
2741
2742 vpmadd52luq ymm16,ymm2,ymm5
2743 vpmadd52huq ymm17,ymm2,ymm5
2744
2745 vpsrlvq ymm18,ymm16,ymm23 ; per-lane carry out of the low halves
2746 vpsllvq ymm17,ymm17,ymm24 ; realign the high halves to limb weight
2747 vpandq ymm16,ymm16,ymm22
2748
2749 vpaddq ymm17,ymm17,ymm18
2750
2751 vpermq ymm17,ymm17,147 ; 147 = 0b10010011: rotate qwords up one lane (lane 3 wraps to lane 0)
2752
2753 vpaddq ymm16,ymm16,ymm17
2754
2755 vpsrlvq ymm18,ymm16,ymm23
2756 vpandq ymm16,ymm16,ymm22
2757
2758 vpermq ymm18,ymm18,147
2759
2760 vpaddq ymm16,ymm16,ymm18
2761
2762 vpermq ymm18{k1}{z},ymm16,147 ; lane 0 <- lane 3 of ymm16, all other lanes zeroed
2763
2764 vpaddq ymm16,ymm16,ymm18 ; add that value times 5 into lane 0: x + (x << 2)
2765 vpsllq ymm18,ymm18,2
2766
2767 vpaddq ymm16,ymm16,ymm18
2768
2769 dec rax
2770 jnz NEAR $L$oop_vpmadd52
2771
2772 vmovdqu64 YMMWORD[rdi]{k7},ymm16 ; write the three limb qwords back to context offsets 0, 8, 16
2773
2774 test rdx,rdx
2775 jnz NEAR $L$blocks_vpmadd52_4x ; remaining blocks go to the 4x code
2776
2777$L$no_data_vpmadd52:
2778 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
2779 mov rsi,QWORD[16+rsp]
2780 DB 0F3h,0C3h ;repret
2781
2782$L$SEH_end_poly1305_blocks_vpmadd52:
2783
2784ALIGN 32
2785poly1305_blocks_vpmadd52_4x:
2786 mov QWORD[8+rsp],rdi ;WIN64 prologue
2787 mov QWORD[16+rsp],rsi
2788 mov rax,rsp
2789$L$SEH_begin_poly1305_blocks_vpmadd52_4x:
2790 mov rdi,rcx
2791 mov rsi,rdx
2792 mov rdx,r8
2793 mov rcx,r9
2794
2795
2796
2797 shr rdx,4
2798 jz NEAR $L$no_data_vpmadd52_4x
2799
2800 shl rcx,40
2801 mov r8,QWORD[64+rdi]
2802
2803$L$blocks_vpmadd52_4x:
2804 vpbroadcastq ymm31,rcx
2805
2806 vmovdqa64 ymm28,YMMWORD[$L$x_mask44]
2807 mov eax,5
2808 vmovdqa64 ymm29,YMMWORD[$L$x_mask42]
2809 kmovw k1,eax
2810
2811 test r8,r8
2812 js NEAR $L$init_vpmadd52
2813
2814 vmovq xmm0,QWORD[rdi]
2815 vmovq xmm1,QWORD[8+rdi]
2816 vmovq xmm2,QWORD[16+rdi]
2817
2818 test rdx,3
2819 jnz NEAR $L$blocks_vpmadd52_2x_do
2820
2821$L$blocks_vpmadd52_4x_do:
2822 vpbroadcastq ymm3,QWORD[64+rdi]
2823 vpbroadcastq ymm4,QWORD[96+rdi]
2824 vpbroadcastq ymm5,QWORD[128+rdi]
2825 vpbroadcastq ymm16,QWORD[160+rdi]
2826
2827$L$blocks_vpmadd52_4x_key_loaded:
2828 vpsllq ymm17,ymm5,2
2829 vpaddq ymm17,ymm17,ymm5
2830 vpsllq ymm17,ymm17,2
2831
2832 test rdx,7
2833 jz NEAR $L$blocks_vpmadd52_8x
2834
2835 vmovdqu64 ymm26,YMMWORD[rsi]
2836 vmovdqu64 ymm27,YMMWORD[32+rsi]
2837 lea rsi,[64+rsi]
2838
2839 vpunpcklqdq ymm25,ymm26,ymm27
2840 vpunpckhqdq ymm27,ymm26,ymm27
2841
2842
2843
2844 vpsrlq ymm26,ymm27,24
2845 vporq ymm26,ymm26,ymm31
2846 vpaddq ymm2,ymm2,ymm26
2847 vpandq ymm24,ymm25,ymm28
2848 vpsrlq ymm25,ymm25,44
2849 vpsllq ymm27,ymm27,20
2850 vporq ymm25,ymm25,ymm27
2851 vpandq ymm25,ymm25,ymm28
2852
2853 sub rdx,4
2854 jz NEAR $L$tail_vpmadd52_4x
2855 jmp NEAR $L$oop_vpmadd52_4x
2856 ud2
2857
2858ALIGN 32
2859$L$init_vpmadd52:
2860 vmovq xmm16,QWORD[24+rdi]
2861 vmovq xmm2,QWORD[56+rdi]
2862 vmovq xmm17,QWORD[32+rdi]
2863 vmovq xmm3,QWORD[40+rdi]
2864 vmovq xmm4,QWORD[48+rdi]
2865
2866 vmovdqa ymm0,ymm3
2867 vmovdqa ymm1,ymm4
2868 vmovdqa ymm5,ymm2
2869
2870 mov eax,2
2871
2872$L$mul_init_vpmadd52:
2873 vpxorq ymm18,ymm18,ymm18
2874 vpmadd52luq ymm18,ymm16,ymm2
2875 vpxorq ymm19,ymm19,ymm19
2876 vpmadd52huq ymm19,ymm16,ymm2
2877 vpxorq ymm20,ymm20,ymm20
2878 vpmadd52luq ymm20,ymm17,ymm2
2879 vpxorq ymm21,ymm21,ymm21
2880 vpmadd52huq ymm21,ymm17,ymm2
2881 vpxorq ymm22,ymm22,ymm22
2882 vpmadd52luq ymm22,ymm3,ymm2
2883 vpxorq ymm23,ymm23,ymm23
2884 vpmadd52huq ymm23,ymm3,ymm2
2885
2886 vpmadd52luq ymm18,ymm3,ymm0
2887 vpmadd52huq ymm19,ymm3,ymm0
2888 vpmadd52luq ymm20,ymm4,ymm0
2889 vpmadd52huq ymm21,ymm4,ymm0
2890 vpmadd52luq ymm22,ymm5,ymm0
2891 vpmadd52huq ymm23,ymm5,ymm0
2892
2893 vpmadd52luq ymm18,ymm17,ymm1
2894 vpmadd52huq ymm19,ymm17,ymm1
2895 vpmadd52luq ymm20,ymm3,ymm1
2896 vpmadd52huq ymm21,ymm3,ymm1
2897 vpmadd52luq ymm22,ymm4,ymm1
2898 vpmadd52huq ymm23,ymm4,ymm1
2899
2900
2901
2902 vpsrlq ymm30,ymm18,44
2903 vpsllq ymm19,ymm19,8
2904 vpandq ymm0,ymm18,ymm28
2905 vpaddq ymm19,ymm19,ymm30
2906
2907 vpaddq ymm20,ymm20,ymm19
2908
2909 vpsrlq ymm30,ymm20,44
2910 vpsllq ymm21,ymm21,8
2911 vpandq ymm1,ymm20,ymm28
2912 vpaddq ymm21,ymm21,ymm30
2913
2914 vpaddq ymm22,ymm22,ymm21
2915
2916 vpsrlq ymm30,ymm22,42
2917 vpsllq ymm23,ymm23,10
2918 vpandq ymm2,ymm22,ymm29
2919 vpaddq ymm23,ymm23,ymm30
2920
2921 vpaddq ymm0,ymm0,ymm23
2922 vpsllq ymm23,ymm23,2
2923
2924 vpaddq ymm0,ymm0,ymm23
2925
2926 vpsrlq ymm30,ymm0,44
2927 vpandq ymm0,ymm0,ymm28
2928
2929 vpaddq ymm1,ymm1,ymm30
2930
2931 dec eax
2932 jz NEAR $L$done_init_vpmadd52
2933
2934 vpunpcklqdq ymm4,ymm1,ymm4
2935 vpbroadcastq xmm1,xmm1
2936 vpunpcklqdq ymm5,ymm2,ymm5
2937 vpbroadcastq xmm2,xmm2
2938 vpunpcklqdq ymm3,ymm0,ymm3
2939 vpbroadcastq xmm0,xmm0
2940
2941 vpsllq ymm16,ymm4,2
2942 vpsllq ymm17,ymm5,2
2943 vpaddq ymm16,ymm16,ymm4
2944 vpaddq ymm17,ymm17,ymm5
2945 vpsllq ymm16,ymm16,2
2946 vpsllq ymm17,ymm17,2
2947
2948 jmp NEAR $L$mul_init_vpmadd52
2949 ud2
2950
2951ALIGN 32
2952$L$done_init_vpmadd52:
2953 vinserti128 ymm4,ymm1,xmm4,1
2954 vinserti128 ymm5,ymm2,xmm5,1
2955 vinserti128 ymm3,ymm0,xmm3,1
2956
2957 vpermq ymm4,ymm4,216
2958 vpermq ymm5,ymm5,216
2959 vpermq ymm3,ymm3,216
2960
2961 vpsllq ymm16,ymm4,2
2962 vpaddq ymm16,ymm16,ymm4
2963 vpsllq ymm16,ymm16,2
2964
2965 vmovq xmm0,QWORD[rdi]
2966 vmovq xmm1,QWORD[8+rdi]
2967 vmovq xmm2,QWORD[16+rdi]
2968
2969 test rdx,3
2970 jnz NEAR $L$done_init_vpmadd52_2x
2971
2972 vmovdqu64 YMMWORD[64+rdi],ymm3
2973 vpbroadcastq ymm3,xmm3
2974 vmovdqu64 YMMWORD[96+rdi],ymm4
2975 vpbroadcastq ymm4,xmm4
2976 vmovdqu64 YMMWORD[128+rdi],ymm5
2977 vpbroadcastq ymm5,xmm5
2978 vmovdqu64 YMMWORD[160+rdi],ymm16
2979 vpbroadcastq ymm16,xmm16
2980
2981 jmp NEAR $L$blocks_vpmadd52_4x_key_loaded
2982 ud2
2983
2984ALIGN 32
2985$L$done_init_vpmadd52_2x:
2986 vmovdqu64 YMMWORD[64+rdi],ymm3
2987 vpsrldq ymm3,ymm3,8
2988 vmovdqu64 YMMWORD[96+rdi],ymm4
2989 vpsrldq ymm4,ymm4,8
2990 vmovdqu64 YMMWORD[128+rdi],ymm5
2991 vpsrldq ymm5,ymm5,8
2992 vmovdqu64 YMMWORD[160+rdi],ymm16
2993 vpsrldq ymm16,ymm16,8
2994 jmp NEAR $L$blocks_vpmadd52_2x_key_loaded
2995 ud2
2996
2997ALIGN 32
2998$L$blocks_vpmadd52_2x_do:
2999 vmovdqu64 ymm5{k1}{z},[((128+8))+rdi]
3000 vmovdqu64 ymm16{k1}{z},[((160+8))+rdi]
3001 vmovdqu64 ymm3{k1}{z},[((64+8))+rdi]
3002 vmovdqu64 ymm4{k1}{z},[((96+8))+rdi]
3003
3004$L$blocks_vpmadd52_2x_key_loaded:
3005 vmovdqu64 ymm26,YMMWORD[rsi]
3006 vpxorq ymm27,ymm27,ymm27
3007 lea rsi,[32+rsi]
3008
3009 vpunpcklqdq ymm25,ymm26,ymm27
3010 vpunpckhqdq ymm27,ymm26,ymm27
3011
3012
3013
3014 vpsrlq ymm26,ymm27,24
3015 vporq ymm26,ymm26,ymm31
3016 vpaddq ymm2,ymm2,ymm26
3017 vpandq ymm24,ymm25,ymm28
3018 vpsrlq ymm25,ymm25,44
3019 vpsllq ymm27,ymm27,20
3020 vporq ymm25,ymm25,ymm27
3021 vpandq ymm25,ymm25,ymm28
3022
3023 jmp NEAR $L$tail_vpmadd52_2x
3024 ud2
3025
3026ALIGN 32
3027$L$oop_vpmadd52_4x:
3028
3029 vpaddq ymm0,ymm0,ymm24
3030 vpaddq ymm1,ymm1,ymm25
3031
3032 vpxorq ymm18,ymm18,ymm18
3033 vpmadd52luq ymm18,ymm16,ymm2
3034 vpxorq ymm19,ymm19,ymm19
3035 vpmadd52huq ymm19,ymm16,ymm2
3036 vpxorq ymm20,ymm20,ymm20
3037 vpmadd52luq ymm20,ymm17,ymm2
3038 vpxorq ymm21,ymm21,ymm21
3039 vpmadd52huq ymm21,ymm17,ymm2
3040 vpxorq ymm22,ymm22,ymm22
3041 vpmadd52luq ymm22,ymm3,ymm2
3042 vpxorq ymm23,ymm23,ymm23
3043 vpmadd52huq ymm23,ymm3,ymm2
3044
3045 vmovdqu64 ymm26,YMMWORD[rsi]
3046 vmovdqu64 ymm27,YMMWORD[32+rsi]
3047 lea rsi,[64+rsi]
3048 vpmadd52luq ymm18,ymm3,ymm0
3049 vpmadd52huq ymm19,ymm3,ymm0
3050 vpmadd52luq ymm20,ymm4,ymm0
3051 vpmadd52huq ymm21,ymm4,ymm0
3052 vpmadd52luq ymm22,ymm5,ymm0
3053 vpmadd52huq ymm23,ymm5,ymm0
3054
3055 vpunpcklqdq ymm25,ymm26,ymm27
3056 vpunpckhqdq ymm27,ymm26,ymm27
3057 vpmadd52luq ymm18,ymm17,ymm1
3058 vpmadd52huq ymm19,ymm17,ymm1
3059 vpmadd52luq ymm20,ymm3,ymm1
3060 vpmadd52huq ymm21,ymm3,ymm1
3061 vpmadd52luq ymm22,ymm4,ymm1
3062 vpmadd52huq ymm23,ymm4,ymm1
3063
3064
3065
3066 vpsrlq ymm30,ymm18,44
3067 vpsllq ymm19,ymm19,8
3068 vpandq ymm0,ymm18,ymm28
3069 vpaddq ymm19,ymm19,ymm30
3070
3071 vpsrlq ymm26,ymm27,24
3072 vporq ymm26,ymm26,ymm31
3073 vpaddq ymm20,ymm20,ymm19
3074
3075 vpsrlq ymm30,ymm20,44
3076 vpsllq ymm21,ymm21,8
3077 vpandq ymm1,ymm20,ymm28
3078 vpaddq ymm21,ymm21,ymm30
3079
3080 vpandq ymm24,ymm25,ymm28
3081 vpsrlq ymm25,ymm25,44
3082 vpsllq ymm27,ymm27,20
3083 vpaddq ymm22,ymm22,ymm21
3084
3085 vpsrlq ymm30,ymm22,42
3086 vpsllq ymm23,ymm23,10
3087 vpandq ymm2,ymm22,ymm29
3088 vpaddq ymm23,ymm23,ymm30
3089
3090 vpaddq ymm2,ymm2,ymm26
3091 vpaddq ymm0,ymm0,ymm23
3092 vpsllq ymm23,ymm23,2
3093
3094 vpaddq ymm0,ymm0,ymm23
3095 vporq ymm25,ymm25,ymm27
3096 vpandq ymm25,ymm25,ymm28
3097
3098 vpsrlq ymm30,ymm0,44
3099 vpandq ymm0,ymm0,ymm28
3100
3101 vpaddq ymm1,ymm1,ymm30
3102
3103 sub rdx,4
3104 jnz NEAR $L$oop_vpmadd52_4x
3105
3106$L$tail_vpmadd52_4x:
3107 vmovdqu64 ymm5,YMMWORD[128+rdi]
3108 vmovdqu64 ymm16,YMMWORD[160+rdi]
3109 vmovdqu64 ymm3,YMMWORD[64+rdi]
3110 vmovdqu64 ymm4,YMMWORD[96+rdi]
3111
3112$L$tail_vpmadd52_2x:
3113 vpsllq ymm17,ymm5,2
3114 vpaddq ymm17,ymm17,ymm5
3115 vpsllq ymm17,ymm17,2
3116
3117
3118 vpaddq ymm0,ymm0,ymm24
3119 vpaddq ymm1,ymm1,ymm25
3120
3121 vpxorq ymm18,ymm18,ymm18
3122 vpmadd52luq ymm18,ymm16,ymm2
3123 vpxorq ymm19,ymm19,ymm19
3124 vpmadd52huq ymm19,ymm16,ymm2
3125 vpxorq ymm20,ymm20,ymm20
3126 vpmadd52luq ymm20,ymm17,ymm2
3127 vpxorq ymm21,ymm21,ymm21
3128 vpmadd52huq ymm21,ymm17,ymm2
3129 vpxorq ymm22,ymm22,ymm22
3130 vpmadd52luq ymm22,ymm3,ymm2
3131 vpxorq ymm23,ymm23,ymm23
3132 vpmadd52huq ymm23,ymm3,ymm2
3133
3134 vpmadd52luq ymm18,ymm3,ymm0
3135 vpmadd52huq ymm19,ymm3,ymm0
3136 vpmadd52luq ymm20,ymm4,ymm0
3137 vpmadd52huq ymm21,ymm4,ymm0
3138 vpmadd52luq ymm22,ymm5,ymm0
3139 vpmadd52huq ymm23,ymm5,ymm0
3140
3141 vpmadd52luq ymm18,ymm17,ymm1
3142 vpmadd52huq ymm19,ymm17,ymm1
3143 vpmadd52luq ymm20,ymm3,ymm1
3144 vpmadd52huq ymm21,ymm3,ymm1
3145 vpmadd52luq ymm22,ymm4,ymm1
3146 vpmadd52huq ymm23,ymm4,ymm1
3147
3148
3149
3150
3151 mov eax,1
3152 kmovw k1,eax
3153 vpsrldq ymm24,ymm18,8
3154 vpsrldq ymm0,ymm19,8
3155 vpsrldq ymm25,ymm20,8
3156 vpsrldq ymm1,ymm21,8
3157 vpaddq ymm18,ymm18,ymm24
3158 vpaddq ymm19,ymm19,ymm0
3159 vpsrldq ymm26,ymm22,8
3160 vpsrldq ymm2,ymm23,8
3161 vpaddq ymm20,ymm20,ymm25
3162 vpaddq ymm21,ymm21,ymm1
3163 vpermq ymm24,ymm18,0x2
3164 vpermq ymm0,ymm19,0x2
3165 vpaddq ymm22,ymm22,ymm26
3166 vpaddq ymm23,ymm23,ymm2
3167
3168 vpermq ymm25,ymm20,0x2
3169 vpermq ymm1,ymm21,0x2
3170 vpaddq ymm18{k1}{z},ymm18,ymm24
3171 vpaddq ymm19{k1}{z},ymm19,ymm0
3172 vpermq ymm26,ymm22,0x2
3173 vpermq ymm2,ymm23,0x2
3174 vpaddq ymm20{k1}{z},ymm20,ymm25
3175 vpaddq ymm21{k1}{z},ymm21,ymm1
3176 vpaddq ymm22{k1}{z},ymm22,ymm26
3177 vpaddq ymm23{k1}{z},ymm23,ymm2
3178
3179
3180
3181 vpsrlq ymm30,ymm18,44
3182 vpsllq ymm19,ymm19,8
3183 vpandq ymm0,ymm18,ymm28
3184 vpaddq ymm19,ymm19,ymm30
3185
3186 vpaddq ymm20,ymm20,ymm19
3187
3188 vpsrlq ymm30,ymm20,44
3189 vpsllq ymm21,ymm21,8
3190 vpandq ymm1,ymm20,ymm28
3191 vpaddq ymm21,ymm21,ymm30
3192
3193 vpaddq ymm22,ymm22,ymm21
3194
3195 vpsrlq ymm30,ymm22,42
3196 vpsllq ymm23,ymm23,10
3197 vpandq ymm2,ymm22,ymm29
3198 vpaddq ymm23,ymm23,ymm30
3199
3200 vpaddq ymm0,ymm0,ymm23
3201 vpsllq ymm23,ymm23,2
3202
3203 vpaddq ymm0,ymm0,ymm23
3204
3205 vpsrlq ymm30,ymm0,44
3206 vpandq ymm0,ymm0,ymm28
3207
3208 vpaddq ymm1,ymm1,ymm30
3209
3210
3211 sub rdx,2
3212 ja NEAR $L$blocks_vpmadd52_4x_do
3213
3214 vmovq QWORD[rdi],xmm0
3215 vmovq QWORD[8+rdi],xmm1
3216 vmovq QWORD[16+rdi],xmm2
3217 vzeroall
3218
3219$L$no_data_vpmadd52_4x:
3220 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
3221 mov rsi,QWORD[16+rsp]
3222 DB 0F3h,0C3h ;repret
3223
3224$L$SEH_end_poly1305_blocks_vpmadd52_4x:
3225
3226ALIGN 32
; poly1305_blocks_vpmadd52_8x
; Processes input eight 16-byte blocks per loop iteration using 52-bit
; multiply-add (vpmadd52luq/vpmadd52huq) on 512-bit registers, with the
; accumulator held as three limbs that are masked to 44/44/42 bits.
;
; Win64 arguments, remapped by the prologue:
;   rcx -> rdi : base of the state; accumulator limbs at [rdi], [8+rdi],
;                [16+rdi], multiplier tables at [64..191+rdi]
;   rdx -> rsi : input pointer, advanced by 128 bytes per iteration
;   r8  -> rdx : byte length; shifted right by 4 to count 16-byte blocks
;   r9  -> rcx : shifted left by 40 and OR-ed into the top limb of every
;                block (presumably the Poly1305 pad bit - confirm)
;
; The block count is decremented by 8 and the loop exits only when it
; reaches exactly zero, so callers must pass a non-zero multiple of 8 blocks.
;
; NOTE(review): ymm6-ymm10 are written and vzeroall clears every vector
; register, yet xmm6-xmm15 are never saved here; those are callee-saved in
; the Microsoft x64 ABI - confirm that callers tolerate this.
3227poly1305_blocks_vpmadd52_8x:
3228 mov QWORD[8+rsp],rdi ;WIN64 prologue
3229 mov QWORD[16+rsp],rsi
3230 mov rax,rsp
3231$L$SEH_begin_poly1305_blocks_vpmadd52_8x:
3232 mov rdi,rcx
3233 mov rsi,rdx
3234 mov rdx,r8
3235 mov rcx,r9
3236
3237
3238
3239 shr rdx,4 ; rdx = number of 16-byte blocks
3240 jz NEAR $L$no_data_vpmadd52_8x ; fewer than 16 bytes: nothing to do
3241
3242 shl rcx,40 ; move arg4 to bit 40, where it is merged into the top limb
3243 mov r8,QWORD[64+rdi] ; peek at the first stored table word
3244
3245 vmovdqa64 ymm28,YMMWORD[$L$x_mask44]
3246 vmovdqa64 ymm29,YMMWORD[$L$x_mask42]
3247
3248 test r8,r8
3249 js NEAR $L$init_vpmadd52 ; sign bit set: take the init path (label lies outside this function; presumably the tables are not yet computed)
3250
3251 vmovq xmm0,QWORD[rdi] ; load the three accumulator limbs
3252 vmovq xmm1,QWORD[8+rdi]
3253 vmovq xmm2,QWORD[16+rdi]
3254
3255$L$blocks_vpmadd52_8x:
3256
3257
3258
; Load the four stored 256-bit tables and derive further multipliers from
; them (presumably higher key powers - confirm against the init code).
3259 vmovdqu64 ymm5,YMMWORD[128+rdi]
3260 vmovdqu64 ymm16,YMMWORD[160+rdi]
3261 vmovdqu64 ymm3,YMMWORD[64+rdi]
3262 vmovdqu64 ymm4,YMMWORD[96+rdi]
3263
3264 vpsllq ymm17,ymm5,2 ; ymm17 = ((ymm5<<2)+ymm5)<<2 = 20*ymm5
3265 vpaddq ymm17,ymm17,ymm5
3266 vpsllq ymm17,ymm17,2
3267
3268 vpbroadcastq ymm8,xmm5 ; replicate lane 0 of each table
3269 vpbroadcastq ymm6,xmm3
3270 vpbroadcastq ymm7,xmm4
3271
; Multiply the tables by the broadcast lane-0 values; low/high 52-bit
; product halves accumulate in ymm18/19, ymm20/21 and ymm22/23.
3272 vpxorq ymm18,ymm18,ymm18
3273 vpmadd52luq ymm18,ymm16,ymm8
3274 vpxorq ymm19,ymm19,ymm19
3275 vpmadd52huq ymm19,ymm16,ymm8
3276 vpxorq ymm20,ymm20,ymm20
3277 vpmadd52luq ymm20,ymm17,ymm8
3278 vpxorq ymm21,ymm21,ymm21
3279 vpmadd52huq ymm21,ymm17,ymm8
3280 vpxorq ymm22,ymm22,ymm22
3281 vpmadd52luq ymm22,ymm3,ymm8
3282 vpxorq ymm23,ymm23,ymm23
3283 vpmadd52huq ymm23,ymm3,ymm8
3284
3285 vpmadd52luq ymm18,ymm3,ymm6
3286 vpmadd52huq ymm19,ymm3,ymm6
3287 vpmadd52luq ymm20,ymm4,ymm6
3288 vpmadd52huq ymm21,ymm4,ymm6
3289 vpmadd52luq ymm22,ymm5,ymm6
3290 vpmadd52huq ymm23,ymm5,ymm6
3291
3292 vpmadd52luq ymm18,ymm17,ymm7
3293 vpmadd52huq ymm19,ymm17,ymm7
3294 vpmadd52luq ymm20,ymm3,ymm7
3295 vpmadd52huq ymm21,ymm3,ymm7
3296 vpmadd52luq ymm22,ymm4,ymm7
3297 vpmadd52huq ymm23,ymm4,ymm7
3298
3299
3300
; Carry propagation: keep 44/44/42 bits per limb and push the excess upward.
3301 vpsrlq ymm30,ymm18,44
3302 vpsllq ymm19,ymm19,8
3303 vpandq ymm6,ymm18,ymm28
3304 vpaddq ymm19,ymm19,ymm30
3305
3306 vpaddq ymm20,ymm20,ymm19
3307
3308 vpsrlq ymm30,ymm20,44
3309 vpsllq ymm21,ymm21,8
3310 vpandq ymm7,ymm20,ymm28
3311 vpaddq ymm21,ymm21,ymm30
3312
3313 vpaddq ymm22,ymm22,ymm21
3314
3315 vpsrlq ymm30,ymm22,42
3316 vpsllq ymm23,ymm23,10
3317 vpandq ymm8,ymm22,ymm29
3318 vpaddq ymm23,ymm23,ymm30
3319
3320 vpaddq ymm6,ymm6,ymm23 ; top carry wraps into limb 0 times 5 (c + 4c)
3321 vpsllq ymm23,ymm23,2
3322
3323 vpaddq ymm6,ymm6,ymm23
3324
3325 vpsrlq ymm30,ymm6,44
3326 vpandq ymm6,ymm6,ymm28
3327
3328 vpaddq ymm7,ymm7,ymm30
3329
3330
3331
3332
3333
; Interleave the new values with the stored tables and join the two
; 256-bit halves into 512-bit per-lane multipliers zmm6/zmm7/zmm8.
3334 vpunpcklqdq ymm26,ymm8,ymm5
3335 vpunpckhqdq ymm5,ymm8,ymm5
3336 vpunpcklqdq ymm24,ymm6,ymm3
3337 vpunpckhqdq ymm3,ymm6,ymm3
3338 vpunpcklqdq ymm25,ymm7,ymm4
3339 vpunpckhqdq ymm4,ymm7,ymm4
3340 vshufi64x2 zmm8,zmm26,zmm5,0x44
3341 vshufi64x2 zmm6,zmm24,zmm3,0x44
3342 vshufi64x2 zmm7,zmm25,zmm4,0x44
3343
3344 vmovdqu64 zmm26,ZMMWORD[rsi] ; load the first 128 input bytes (8 blocks)
3345 vmovdqu64 zmm27,ZMMWORD[64+rsi]
3346 lea rsi,[128+rsi]
3347
3348 vpsllq zmm10,zmm8,2 ; zmm10 = 20*zmm8, zmm9 = 20*zmm7
3349 vpsllq zmm9,zmm7,2
3350 vpaddq zmm10,zmm10,zmm8
3351 vpaddq zmm9,zmm9,zmm7
3352 vpsllq zmm10,zmm10,2
3353 vpsllq zmm9,zmm9,2
3354
3355 vpbroadcastq zmm31,rcx ; arg4<<40 in every lane
3356 vpbroadcastq zmm28,xmm28 ; widen the 44-bit and 42-bit masks to 512 bits
3357 vpbroadcastq zmm29,xmm29
3358
3359 vpbroadcastq zmm16,xmm9 ; lane 0 of each multiplier, replicated for the main loop
3360 vpbroadcastq zmm17,xmm10
3361 vpbroadcastq zmm3,xmm6
3362 vpbroadcastq zmm4,xmm7
3363 vpbroadcastq zmm5,xmm8
3364
3365 vpunpcklqdq zmm25,zmm26,zmm27 ; gather the low qwords of the blocks
3366 vpunpckhqdq zmm27,zmm26,zmm27 ; gather the high qwords of the blocks
3367
3368
3369
; Split each 128-bit block into limbs of 44, 44 and 40 bits (+ zmm31).
3370 vpsrlq zmm26,zmm27,24 ; top limb = high qword >> 24
3371 vporq zmm26,zmm26,zmm31
3372 vpaddq zmm2,zmm2,zmm26 ; top limb is added to the accumulator right away
3373 vpandq zmm24,zmm25,zmm28 ; limb 0 = low qword & mask44
3374 vpsrlq zmm25,zmm25,44
3375 vpsllq zmm27,zmm27,20
3376 vporq zmm25,zmm25,zmm27
3377 vpandq zmm25,zmm25,zmm28 ; limb 1 = (low>>44 | high<<20) & mask44
3378
3379 sub rdx,8
3380 jz NEAR $L$tail_vpmadd52_8x ; exactly 8 blocks: go straight to the tail
3381 jmp NEAR $L$oop_vpmadd52_8x
3382
3383ALIGN 32
3384$L$oop_vpmadd52_8x:
3385
3386 vpaddq zmm0,zmm0,zmm24 ; add the pending limbs 0 and 1 to the accumulator
3387 vpaddq zmm1,zmm1,zmm25
3388
3389 vpxorq zmm18,zmm18,zmm18
3390 vpmadd52luq zmm18,zmm16,zmm2
3391 vpxorq zmm19,zmm19,zmm19
3392 vpmadd52huq zmm19,zmm16,zmm2
3393 vpxorq zmm20,zmm20,zmm20
3394 vpmadd52luq zmm20,zmm17,zmm2
3395 vpxorq zmm21,zmm21,zmm21
3396 vpmadd52huq zmm21,zmm17,zmm2
3397 vpxorq zmm22,zmm22,zmm22
3398 vpmadd52luq zmm22,zmm3,zmm2
3399 vpxorq zmm23,zmm23,zmm23
3400 vpmadd52huq zmm23,zmm3,zmm2
3401
3402 vmovdqu64 zmm26,ZMMWORD[rsi] ; load the next 128 bytes, interleaved with the multiplies
3403 vmovdqu64 zmm27,ZMMWORD[64+rsi]
3404 lea rsi,[128+rsi]
3405 vpmadd52luq zmm18,zmm3,zmm0
3406 vpmadd52huq zmm19,zmm3,zmm0
3407 vpmadd52luq zmm20,zmm4,zmm0
3408 vpmadd52huq zmm21,zmm4,zmm0
3409 vpmadd52luq zmm22,zmm5,zmm0
3410 vpmadd52huq zmm23,zmm5,zmm0
3411
3412 vpunpcklqdq zmm25,zmm26,zmm27
3413 vpunpckhqdq zmm27,zmm26,zmm27
3414 vpmadd52luq zmm18,zmm17,zmm1
3415 vpmadd52huq zmm19,zmm17,zmm1
3416 vpmadd52luq zmm20,zmm3,zmm1
3417 vpmadd52huq zmm21,zmm3,zmm1
3418 vpmadd52luq zmm22,zmm4,zmm1
3419 vpmadd52huq zmm23,zmm4,zmm1
3420
3421
3422
; Carry propagation, interleaved with splitting the freshly loaded input.
3423 vpsrlq zmm30,zmm18,44
3424 vpsllq zmm19,zmm19,8
3425 vpandq zmm0,zmm18,zmm28
3426 vpaddq zmm19,zmm19,zmm30
3427
3428 vpsrlq zmm26,zmm27,24
3429 vporq zmm26,zmm26,zmm31
3430 vpaddq zmm20,zmm20,zmm19
3431
3432 vpsrlq zmm30,zmm20,44
3433 vpsllq zmm21,zmm21,8
3434 vpandq zmm1,zmm20,zmm28
3435 vpaddq zmm21,zmm21,zmm30
3436
3437 vpandq zmm24,zmm25,zmm28
3438 vpsrlq zmm25,zmm25,44
3439 vpsllq zmm27,zmm27,20
3440 vpaddq zmm22,zmm22,zmm21
3441
3442 vpsrlq zmm30,zmm22,42
3443 vpsllq zmm23,zmm23,10
3444 vpandq zmm2,zmm22,zmm29
3445 vpaddq zmm23,zmm23,zmm30
3446
3447 vpaddq zmm2,zmm2,zmm26 ; add the next top limb to the accumulator
3448 vpaddq zmm0,zmm0,zmm23 ; top carry wraps into limb 0 times 5 (c + 4c)
3449 vpsllq zmm23,zmm23,2
3450
3451 vpaddq zmm0,zmm0,zmm23
3452 vporq zmm25,zmm25,zmm27
3453 vpandq zmm25,zmm25,zmm28
3454
3455 vpsrlq zmm30,zmm0,44
3456 vpandq zmm0,zmm0,zmm28
3457
3458 vpaddq zmm1,zmm1,zmm30
3459
3460 sub rdx,8
3461 jnz NEAR $L$oop_vpmadd52_8x ; exits only when the count reaches exactly 0
3462
3463$L$tail_vpmadd52_8x:
3464
; Last multiply: uses the per-lane multipliers zmm6-zmm10 rather than the
; broadcast ones used inside the loop.
3465 vpaddq zmm0,zmm0,zmm24
3466 vpaddq zmm1,zmm1,zmm25
3467
3468 vpxorq zmm18,zmm18,zmm18
3469 vpmadd52luq zmm18,zmm9,zmm2
3470 vpxorq zmm19,zmm19,zmm19
3471 vpmadd52huq zmm19,zmm9,zmm2
3472 vpxorq zmm20,zmm20,zmm20
3473 vpmadd52luq zmm20,zmm10,zmm2
3474 vpxorq zmm21,zmm21,zmm21
3475 vpmadd52huq zmm21,zmm10,zmm2
3476 vpxorq zmm22,zmm22,zmm22
3477 vpmadd52luq zmm22,zmm6,zmm2
3478 vpxorq zmm23,zmm23,zmm23
3479 vpmadd52huq zmm23,zmm6,zmm2
3480
3481 vpmadd52luq zmm18,zmm6,zmm0
3482 vpmadd52huq zmm19,zmm6,zmm0
3483 vpmadd52luq zmm20,zmm7,zmm0
3484 vpmadd52huq zmm21,zmm7,zmm0
3485 vpmadd52luq zmm22,zmm8,zmm0
3486 vpmadd52huq zmm23,zmm8,zmm0
3487
3488 vpmadd52luq zmm18,zmm10,zmm1
3489 vpmadd52huq zmm19,zmm10,zmm1
3490 vpmadd52luq zmm20,zmm6,zmm1
3491 vpmadd52huq zmm21,zmm6,zmm1
3492 vpmadd52luq zmm22,zmm7,zmm1
3493 vpmadd52huq zmm23,zmm7,zmm1
3494
3495
3496
3497
; Horizontal addition: fold the eight 64-bit lanes of every product
; register into lane 0.
3498 mov eax,1
3499 kmovw k1,eax ; k1 = 1: the masked adds below keep only lane 0
3500 vpsrldq zmm24,zmm18,8
3501 vpsrldq zmm0,zmm19,8
3502 vpsrldq zmm25,zmm20,8
3503 vpsrldq zmm1,zmm21,8
3504 vpaddq zmm18,zmm18,zmm24
3505 vpaddq zmm19,zmm19,zmm0
3506 vpsrldq zmm26,zmm22,8
3507 vpsrldq zmm2,zmm23,8
3508 vpaddq zmm20,zmm20,zmm25
3509 vpaddq zmm21,zmm21,zmm1
3510 vpermq zmm24,zmm18,0x2
3511 vpermq zmm0,zmm19,0x2
3512 vpaddq zmm22,zmm22,zmm26
3513 vpaddq zmm23,zmm23,zmm2
3514
3515 vpermq zmm25,zmm20,0x2
3516 vpermq zmm1,zmm21,0x2
3517 vpaddq zmm18,zmm18,zmm24
3518 vpaddq zmm19,zmm19,zmm0
3519 vpermq zmm26,zmm22,0x2
3520 vpermq zmm2,zmm23,0x2
3521 vpaddq zmm20,zmm20,zmm25
3522 vpaddq zmm21,zmm21,zmm1
3523 vextracti64x4 ymm24,zmm18,1
3524 vextracti64x4 ymm0,zmm19,1
3525 vpaddq zmm22,zmm22,zmm26
3526 vpaddq zmm23,zmm23,zmm2
3527
3528 vextracti64x4 ymm25,zmm20,1
3529 vextracti64x4 ymm1,zmm21,1
3530 vextracti64x4 ymm26,zmm22,1
3531 vextracti64x4 ymm2,zmm23,1
3532 vpaddq ymm18{k1}{z},ymm18,ymm24 ; zero-masked: lanes other than 0 are cleared
3533 vpaddq ymm19{k1}{z},ymm19,ymm0
3534 vpaddq ymm20{k1}{z},ymm20,ymm25
3535 vpaddq ymm21{k1}{z},ymm21,ymm1
3536 vpaddq ymm22{k1}{z},ymm22,ymm26
3537 vpaddq ymm23{k1}{z},ymm23,ymm2
3538
3539
3540
; Final carry propagation back to limbs of 44/44/42 bits.
3541 vpsrlq ymm30,ymm18,44
3542 vpsllq ymm19,ymm19,8
3543 vpandq ymm0,ymm18,ymm28
3544 vpaddq ymm19,ymm19,ymm30
3545
3546 vpaddq ymm20,ymm20,ymm19
3547
3548 vpsrlq ymm30,ymm20,44
3549 vpsllq ymm21,ymm21,8
3550 vpandq ymm1,ymm20,ymm28
3551 vpaddq ymm21,ymm21,ymm30
3552
3553 vpaddq ymm22,ymm22,ymm21
3554
3555 vpsrlq ymm30,ymm22,42
3556 vpsllq ymm23,ymm23,10
3557 vpandq ymm2,ymm22,ymm29
3558 vpaddq ymm23,ymm23,ymm30
3559
3560 vpaddq ymm0,ymm0,ymm23
3561 vpsllq ymm23,ymm23,2
3562
3563 vpaddq ymm0,ymm0,ymm23
3564
3565 vpsrlq ymm30,ymm0,44
3566 vpandq ymm0,ymm0,ymm28
3567
3568 vpaddq ymm1,ymm1,ymm30
3569
3570
3571
3572 vmovq QWORD[rdi],xmm0 ; write the three accumulator limbs back to the state
3573 vmovq QWORD[8+rdi],xmm1
3574 vmovq QWORD[16+rdi],xmm2
3575 vzeroall ; clears every vector register, including xmm6-xmm15
3576
3577$L$no_data_vpmadd52_8x:
3578 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
3579 mov rsi,QWORD[16+rsp]
3580 DB 0F3h,0C3h ;repret
3581
3582$L$SEH_end_poly1305_blocks_vpmadd52_8x:
3583
3584ALIGN 32
; poly1305_emit_base2_44
; Converts the three-limb accumulator (limbs weighted 2^0, 2^44, 2^88) to
; two 64-bit words plus a small top word, conditionally replaces it with
; value+5 (the reduced form), adds a 128-bit value and stores 16 bytes.
;
; Win64 arguments, remapped by the prologue:
;   rcx -> rdi : state; limbs are read from [rdi], [8+rdi], [16+rdi]
;   rdx -> rsi : 16-byte output buffer
;   r8  -> rdx : 128-bit addend (presumably the nonce/"s" key half - confirm)
;
; NOTE(review): the endbr64 bytes follow the prologue moves rather than
; sitting at the entry label; confirm that this is acceptable if the symbol
; is ever reached through an IBT-checked indirect call.
3585poly1305_emit_base2_44:
3586 mov QWORD[8+rsp],rdi ;WIN64 prologue
3587 mov QWORD[16+rsp],rsi
3588 mov rax,rsp
3589$L$SEH_begin_poly1305_emit_base2_44:
3590 mov rdi,rcx
3591 mov rsi,rdx
3592 mov rdx,r8
3593
3594
3595
3596DB 243,15,30,250 ; F3 0F 1E FA = endbr64
3597 mov r8,QWORD[rdi] ; limb 0
3598 mov r9,QWORD[8+rdi] ; limb 1
3599 mov r10,QWORD[16+rdi] ; limb 2
3600
3601 mov rax,r9
3602 shr r9,20 ; bits of limb 1 that belong to the second 64-bit word
3603 shl rax,44 ; bits of limb 1 that belong to the first 64-bit word
3604 mov rcx,r10
3605 shr r10,40 ; bits of limb 2 above bit 128
3606 shl rcx,24 ; bits of limb 2 that belong to the second 64-bit word
3607
3608 add r8,rax ; r10:r9:r8 = value in base 2^64 (carry chain)
3609 adc r9,rcx
3610 adc r10,0
3611
3612 mov rax,r8 ; keep the unmodified low 128 bits in rcx:rax
3613 add r8,5 ; r10:r9:r8 = value + 5
3614 mov rcx,r9
3615 adc r9,0
3616 adc r10,0
3617 shr r10,2 ; non-zero when value + 5 reaches 2^130
3618 cmovnz rax,r8 ; in that case use the low 128 bits of value + 5
3619 cmovnz rcx,r9
3620
3621 add rax,QWORD[rdx] ; add the 128-bit addend; the carry out of bit 127 is dropped
3622 adc rcx,QWORD[8+rdx]
3623 mov QWORD[rsi],rax ; store the 16-byte result
3624 mov QWORD[8+rsi],rcx
3625
3626 mov rdi,QWORD[8+rsp] ;WIN64 epilogue
3627 mov rsi,QWORD[16+rsp]
3628 DB 0F3h,0C3h ;repret
3629
3630$L$SEH_end_poly1305_emit_base2_44:
3631ALIGN 64
; Constant tables shared by the vector code paths. Only $L$x_mask44 and
; $L$x_mask42 are referenced by the code visible in this part of the file;
; the users of the other tables lie elsewhere.
3632$L$const:
3633$L$mask24:
3634 DD 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 ; four qwords with the low 24 bits set
3635$L$129:
3636 DD 16777216,0,16777216,0,16777216,0,16777216,0 ; four qwords holding 1<<24
3637$L$mask26:
3638 DD 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 ; four qwords with the low 26 bits set
3639$L$permd_avx2:
3640 DD 2,2,2,3,2,0,2,1 ; dword index table (presumably for vpermd - confirm)
3641$L$permd_avx512:
3642 DD 0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7 ; dword index table (presumably for vpermd - confirm)
3643
3644$L$2_44_inp_permd:
3645 DD 0,1,1,2,2,3,7,7
3646$L$2_44_inp_shift:
3647 DQ 0,12,24,64
3648$L$2_44_mask:
3649 DQ 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff ; 44-, 44- and 42-bit masks, then all ones
3650$L$2_44_shift_rgt:
3651 DQ 44,44,42,64
3652$L$2_44_shift_lft:
3653 DQ 8,8,10,64
3654
3655ALIGN 64
3656$L$x_mask44:
3657 DQ 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff ; low 44 bits set in every qword
3658 DQ 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3659$L$x_mask42:
3660 DQ 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff ; low 42 bits set in every qword
3661 DQ 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
; NUL-terminated ASCII string:
; "Poly1305 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
3662DB 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
3663DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
3664DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
3665DB 108,46,111,114,103,62,0
3666ALIGN 16
3667global xor128_encrypt_n_pad
3668
3669ALIGN 16
; xor128_encrypt_n_pad
; XORs r9 input bytes with a buffer, writes the result both to the output
; and back into that buffer, then zero-fills the buffer up to the next
; 16-byte boundary.
;
; Arguments are taken directly from the Win64 argument registers:
;   rcx : output pointer
;   rdx : input pointer
;   r8  : buffer that is XOR-ed with the input and overwritten with the
;         result; accessed with movdqa, so it must be 16-byte aligned
;   r9  : length in bytes
; Returns rax = r8 advanced past the data and the zero padding.
; Clobbers rcx, rdx, r8-r10, xmm0 and the flags.
;
; A zero length now returns the buffer pointer unchanged. Without the early
; exit the byte loop was entered with r10 == 0, "dec r10" wrapped around and
; the loop ran away through memory.
3670xor128_encrypt_n_pad:
3671
 test r9,r9 ; len == 0: nothing to XOR and nothing to pad
 jz NEAR $L$done_enc
3672 sub rdx,r8 ; make the input and output pointers relative to the buffer,
3673 sub rcx,r8 ; so that r8 alone indexes all three
3674 mov r10,r9 ; keep the full length for the tail
3675 shr r9,4 ; r9 = number of whole 16-byte chunks
3676 jz NEAR $L$tail_enc
3677 nop
3678$L$oop_enc_xmm:
3679 movdqu xmm0,XMMWORD[r8*1+rdx] ; input chunk (may be unaligned)
3680 pxor xmm0,XMMWORD[r8] ; XOR with the buffer
3681 movdqu XMMWORD[r8*1+rcx],xmm0 ; result to the output
3682 movdqa XMMWORD[r8],xmm0 ; result back into the buffer
3683 lea r8,[16+r8]
3684 dec r9
3685 jnz NEAR $L$oop_enc_xmm
3686
3687 and r10,15 ; bytes left over after the whole chunks
3688 jz NEAR $L$done_enc
3689
3690$L$tail_enc:
3691 mov r9,16
3692 sub r9,r10 ; r9 = number of padding bytes (1..15 here)
3693 xor eax,eax
3694$L$oop_enc_byte:
3695 mov al,BYTE[r8*1+rdx]
3696 xor al,BYTE[r8]
3697 mov BYTE[r8*1+rcx],al
3698 mov BYTE[r8],al
3699 lea r8,[1+r8]
3700 dec r10
3701 jnz NEAR $L$oop_enc_byte
3702
3703 xor eax,eax
3704$L$oop_enc_pad:
3705 mov BYTE[r8],al ; zero-fill the rest of the 16-byte block
3706 lea r8,[1+r8]
3707 dec r9
3708 jnz NEAR $L$oop_enc_pad
3709
3710$L$done_enc:
3711 mov rax,r8 ; return the advanced buffer pointer
3712 DB 0F3h,0C3h ;repret
3713
3714
3715
3716global xor128_decrypt_n_pad
3717
3718ALIGN 16
; xor128_decrypt_n_pad
; XORs r9 input bytes with a buffer and writes the result to the output,
; while the buffer itself is overwritten with the unmodified input bytes;
; the buffer is then zero-filled up to the next 16-byte boundary.
;
; Arguments are taken directly from the Win64 argument registers:
;   rcx : output pointer
;   rdx : input pointer
;   r8  : buffer that is XOR-ed with the input and overwritten with the
;         input; accessed with movdqa, so it must be 16-byte aligned
;   r9  : length in bytes
; Returns rax = r8 advanced past the data and the zero padding.
; Clobbers rcx, rdx, r8-r11, xmm0, xmm1 and the flags.
;
; A zero length now returns the buffer pointer unchanged. Without the early
; exit the byte loop was entered with r10 == 0, "dec r10" wrapped around and
; the loop ran away through memory.
3719xor128_decrypt_n_pad:
3720
 test r9,r9 ; len == 0: nothing to XOR and nothing to pad
 jz NEAR $L$done_dec
3721 sub rdx,r8 ; make the input and output pointers relative to the buffer,
3722 sub rcx,r8 ; so that r8 alone indexes all three
3723 mov r10,r9 ; keep the full length for the tail
3724 shr r9,4 ; r9 = number of whole 16-byte chunks
3725 jz NEAR $L$tail_dec
3726 nop
3727$L$oop_dec_xmm:
3728 movdqu xmm0,XMMWORD[r8*1+rdx] ; input chunk (may be unaligned)
3729 movdqa xmm1,XMMWORD[r8] ; buffer chunk
3730 pxor xmm1,xmm0
3731 movdqu XMMWORD[r8*1+rcx],xmm1 ; XOR result to the output
3732 movdqa XMMWORD[r8],xmm0 ; the buffer keeps the input bytes
3733 lea r8,[16+r8]
3734 dec r9
3735 jnz NEAR $L$oop_dec_xmm
3736
3737 pxor xmm1,xmm1 ; do not leave the XOR result behind in xmm1
3738 and r10,15 ; bytes left over after the whole chunks
3739 jz NEAR $L$done_dec
3740
3741$L$tail_dec:
3742 mov r9,16
3743 sub r9,r10 ; r9 = number of padding bytes (1..15 here)
3744 xor eax,eax
3745 xor r11,r11
3746$L$oop_dec_byte:
3747 mov r11b,BYTE[r8*1+rdx]
3748 mov al,BYTE[r8]
3749 xor al,r11b
3750 mov BYTE[r8*1+rcx],al
3751 mov BYTE[r8],r11b
3752 lea r8,[1+r8]
3753 dec r10
3754 jnz NEAR $L$oop_dec_byte
3755
3756 xor eax,eax
3757$L$oop_dec_pad:
3758 mov BYTE[r8],al ; zero-fill the rest of the 16-byte block
3759 lea r8,[1+r8]
3760 dec r9
3761 jnz NEAR $L$oop_dec_pad
3762
3763$L$done_dec:
3764 mov rax,r8 ; return the advanced buffer pointer
3765 DB 0F3h,0C3h ;repret
3766
3767
3768EXTERN __imp_RtlVirtualUnwind
3769
3770ALIGN 16
; se_handler
; Exception handler named by the .xdata records of the scalar code paths.
; On entry r8 points to the context record and r9 to the dispatcher record
; (presumably the Win64 CONTEXT and DISPATCHER_CONTEXT structures; the
; field names given below follow that layout - confirm against winnt.h).
; If the faulting address lies between the two labels stored in the
; handler data, the six registers pushed by the function body are recovered
; from its stack frame and written into the context record. Every path
; ends in $L$common_seh_tail, which is part of avx_handler.
3771se_handler:
3772 push rsi
3773 push rdi
3774 push rbx
3775 push rbp
3776 push r12
3777 push r13
3778 push r14
3779 push r15
3780 pushfq
3781 sub rsp,64 ; room for the RtlVirtualUnwind call made in the common tail
3782
3783 mov rax,QWORD[120+r8] ; presumably context->Rax
3784 mov rbx,QWORD[248+r8] ; presumably context->Rip
3785
3786 mov rsi,QWORD[8+r9] ; presumably disp->ImageBase
3787 mov r11,QWORD[56+r9] ; presumably disp->HandlerData
3788
3789 mov r10d,DWORD[r11] ; first handler-data entry: image-relative start label
3790 lea r10,[r10*1+rsi]
3791 cmp rbx,r10
3792 jb NEAR $L$common_seh_tail ; fault before that label: nothing pushed yet
3793
3794 mov rax,QWORD[152+r8] ; presumably context->Rsp
3795
3796 mov r10d,DWORD[4+r11] ; second handler-data entry: image-relative end label
3797 lea r10,[r10*1+rsi]
3798 cmp rbx,r10
3799 jae NEAR $L$common_seh_tail ; fault at or after that label: registers already popped
3800
3801 lea rax,[48+rax] ; step over the six pushed registers
3802
3803 mov rbx,QWORD[((-8))+rax]
3804 mov rbp,QWORD[((-16))+rax]
3805 mov r12,QWORD[((-24))+rax]
3806 mov r13,QWORD[((-32))+rax]
3807 mov r14,QWORD[((-40))+rax]
3808 mov r15,QWORD[((-48))+rax]
3809 mov QWORD[144+r8],rbx ; presumably context->Rbx
3810 mov QWORD[160+r8],rbp ; presumably context->Rbp
3811 mov QWORD[216+r8],r12 ; presumably context->R12
3812 mov QWORD[224+r8],r13 ; presumably context->R13
3813 mov QWORD[232+r8],r14 ; presumably context->R14
3814 mov QWORD[240+r8],r15 ; presumably context->R15
3815
3816 jmp NEAR $L$common_seh_tail
3817
3818
3819
3820ALIGN 16
; avx_handler
; Exception handler named by the .xdata records of the vector code paths.
; On entry r8 points to the context record and r9 to the dispatcher record
; (presumably the Win64 CONTEXT and DISPATCHER_CONTEXT structures; the
; field names given below follow that layout - confirm against winnt.h).
; If the faulting address lies between the two labels stored in the
; handler data, 160 bytes saved in the function's frame are copied into the
; context record at offset 512 (presumably the xmm6-xmm15 slots).
; $L$common_seh_tail, shared with se_handler, restores rsi/rdi/rsp in the
; context, copies the context, calls RtlVirtualUnwind and returns 1.
3821avx_handler:
3822 push rsi
3823 push rdi
3824 push rbx
3825 push rbp
3826 push r12
3827 push r13
3828 push r14
3829 push r15
3830 pushfq
3831 sub rsp,64 ; 32 bytes of shadow space plus four stack arguments
3832
3833 mov rax,QWORD[120+r8] ; presumably context->Rax
3834 mov rbx,QWORD[248+r8] ; presumably context->Rip
3835
3836 mov rsi,QWORD[8+r9] ; presumably disp->ImageBase
3837 mov r11,QWORD[56+r9] ; presumably disp->HandlerData
3838
3839 mov r10d,DWORD[r11] ; first handler-data entry: image-relative start label
3840 lea r10,[r10*1+rsi]
3841 cmp rbx,r10
3842 jb NEAR $L$common_seh_tail
3843
3844 mov rax,QWORD[152+r8] ; presumably context->Rsp
3845
3846 mov r10d,DWORD[4+r11] ; second handler-data entry: image-relative end label
3847 lea r10,[r10*1+rsi]
3848 cmp rbx,r10
3849 jae NEAR $L$common_seh_tail
3850
3851 mov rax,QWORD[208+r8] ; presumably context->R11, used as the frame base
3852
3853 lea rsi,[80+rax] ; source: save area inside the frame
3854 lea rax,[248+rax] ; rax = stack pointer value at function entry
3855 lea rdi,[512+r8] ; destination inside the context record
3856 mov ecx,20 ; 20 qwords = 160 bytes
3857 DD 0xa548f3fc ; bytes FC F3 48 A5 = cld; rep movsq
3858
3859$L$common_seh_tail:
3860 mov rdi,QWORD[8+rax] ; rdi/rsi as parked above the return address by the WIN64 prologue
3861 mov rsi,QWORD[16+rax]
3862 mov QWORD[152+r8],rax ; presumably context->Rsp
3863 mov QWORD[168+r8],rsi ; presumably context->Rsi
3864 mov QWORD[176+r8],rdi ; presumably context->Rdi
3865
3866 mov rdi,QWORD[40+r9] ; presumably disp->ContextRecord
3867 mov rsi,r8
3868 mov ecx,154 ; 154 qwords = 1232 bytes (presumably sizeof(CONTEXT))
3869 DD 0xa548f3fc ; bytes FC F3 48 A5 = cld; rep movsq
3870
3871 mov rsi,r9
3872 xor rcx,rcx ; arg1 = 0
3873 mov rdx,QWORD[8+rsi] ; arg2
3874 mov r8,QWORD[rsi] ; arg3
3875 mov r9,QWORD[16+rsi] ; arg4
3876 mov r10,QWORD[40+rsi]
3877 lea r11,[56+rsi]
3878 lea r12,[24+rsi]
3879 mov QWORD[32+rsp],r10 ; arg5
3880 mov QWORD[40+rsp],r11 ; arg6
3881 mov QWORD[48+rsp],r12 ; arg7
3882 mov QWORD[56+rsp],rcx ; arg8 = 0
3883 call QWORD[__imp_RtlVirtualUnwind]
3884
3885 mov eax,1 ; return value 1 (presumably ExceptionContinueSearch)
3886 add rsp,64
3887 popfq
3888 pop r15
3889 pop r14
3890 pop r13
3891 pop r12
3892 pop rbp
3893 pop rbx
3894 pop rdi
3895 pop rsi
3896 DB 0F3h,0C3h ;repret
3897
3898
; Function table: each entry is three image-relative addresses - start of
; the covered range, end of the covered range, and the matching record in
; .xdata (presumably the Win64 RUNTIME_FUNCTION layout - confirm).
; NOTE(review): there are no entries for poly1305_blocks_vpmadd52_4x,
; poly1305_blocks_vpmadd52_8x or poly1305_emit_base2_44.
3899section .pdata rdata align=4
3900ALIGN 4
3901 DD $L$SEH_begin_poly1305_init wrt ..imagebase
3902 DD $L$SEH_end_poly1305_init wrt ..imagebase
3903 DD $L$SEH_info_poly1305_init wrt ..imagebase
3904
3905 DD $L$SEH_begin_poly1305_blocks wrt ..imagebase
3906 DD $L$SEH_end_poly1305_blocks wrt ..imagebase
3907 DD $L$SEH_info_poly1305_blocks wrt ..imagebase
3908
3909 DD $L$SEH_begin_poly1305_emit wrt ..imagebase
3910 DD $L$SEH_end_poly1305_emit wrt ..imagebase
3911 DD $L$SEH_info_poly1305_emit wrt ..imagebase
; poly1305_blocks_avx is covered by three consecutive ranges
3912 DD $L$SEH_begin_poly1305_blocks_avx wrt ..imagebase
3913 DD $L$base2_64_avx wrt ..imagebase
3914 DD $L$SEH_info_poly1305_blocks_avx_1 wrt ..imagebase
3915
3916 DD $L$base2_64_avx wrt ..imagebase
3917 DD $L$even_avx wrt ..imagebase
3918 DD $L$SEH_info_poly1305_blocks_avx_2 wrt ..imagebase
3919
3920 DD $L$even_avx wrt ..imagebase
3921 DD $L$SEH_end_poly1305_blocks_avx wrt ..imagebase
3922 DD $L$SEH_info_poly1305_blocks_avx_3 wrt ..imagebase
3923
3924 DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase
3925 DD $L$SEH_end_poly1305_emit_avx wrt ..imagebase
3926 DD $L$SEH_info_poly1305_emit_avx wrt ..imagebase
; poly1305_blocks_avx2 is covered by three consecutive ranges
3927 DD $L$SEH_begin_poly1305_blocks_avx2 wrt ..imagebase
3928 DD $L$base2_64_avx2 wrt ..imagebase
3929 DD $L$SEH_info_poly1305_blocks_avx2_1 wrt ..imagebase
3930
3931 DD $L$base2_64_avx2 wrt ..imagebase
3932 DD $L$even_avx2 wrt ..imagebase
3933 DD $L$SEH_info_poly1305_blocks_avx2_2 wrt ..imagebase
3934
3935 DD $L$even_avx2 wrt ..imagebase
3936 DD $L$SEH_end_poly1305_blocks_avx2 wrt ..imagebase
3937 DD $L$SEH_info_poly1305_blocks_avx2_3 wrt ..imagebase
3938 DD $L$SEH_begin_poly1305_blocks_avx512 wrt ..imagebase
3939 DD $L$SEH_end_poly1305_blocks_avx512 wrt ..imagebase
3940 DD $L$SEH_info_poly1305_blocks_avx512 wrt ..imagebase
; Unwind records referenced from .pdata. Each record holds four header
; bytes (9,0,0,0 - presumably version 1 with the exception-handler flag and
; no unwind codes - confirm), the image-relative address of the handler,
; and two image-relative labels. se_handler and avx_handler read these two
; labels as handler data and only restore registers when the faulting
; address is at or after the first label and before the second. Records
; that give the same label twice therefore never restore pushed registers.
3941section .xdata rdata align=8
3942ALIGN 8
3943$L$SEH_info_poly1305_init:
3944DB 9,0,0,0
3945 DD se_handler wrt ..imagebase
3946 DD $L$SEH_begin_poly1305_init wrt ..imagebase,$L$SEH_begin_poly1305_init wrt ..imagebase
3947
3948$L$SEH_info_poly1305_blocks:
3949DB 9,0,0,0
3950 DD se_handler wrt ..imagebase
3951 DD $L$blocks_body wrt ..imagebase,$L$blocks_epilogue wrt ..imagebase
3952
3953$L$SEH_info_poly1305_emit:
3954DB 9,0,0,0
3955 DD se_handler wrt ..imagebase
3956 DD $L$SEH_begin_poly1305_emit wrt ..imagebase,$L$SEH_begin_poly1305_emit wrt ..imagebase
3957$L$SEH_info_poly1305_blocks_avx_1:
3958DB 9,0,0,0
3959 DD se_handler wrt ..imagebase
3960 DD $L$blocks_avx_body wrt ..imagebase,$L$blocks_avx_epilogue wrt ..imagebase
3961
3962$L$SEH_info_poly1305_blocks_avx_2:
3963DB 9,0,0,0
3964 DD se_handler wrt ..imagebase
3965 DD $L$base2_64_avx_body wrt ..imagebase,$L$base2_64_avx_epilogue wrt ..imagebase
3966
3967$L$SEH_info_poly1305_blocks_avx_3:
3968DB 9,0,0,0
3969 DD avx_handler wrt ..imagebase
3970 DD $L$do_avx_body wrt ..imagebase,$L$do_avx_epilogue wrt ..imagebase
3971
3972$L$SEH_info_poly1305_emit_avx:
3973DB 9,0,0,0
3974 DD se_handler wrt ..imagebase
3975 DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase,$L$SEH_begin_poly1305_emit_avx wrt ..imagebase
3976$L$SEH_info_poly1305_blocks_avx2_1:
3977DB 9,0,0,0
3978 DD se_handler wrt ..imagebase
3979 DD $L$blocks_avx2_body wrt ..imagebase,$L$blocks_avx2_epilogue wrt ..imagebase
3980
3981$L$SEH_info_poly1305_blocks_avx2_2:
3982DB 9,0,0,0
3983 DD se_handler wrt ..imagebase
3984 DD $L$base2_64_avx2_body wrt ..imagebase,$L$base2_64_avx2_epilogue wrt ..imagebase
3985
3986$L$SEH_info_poly1305_blocks_avx2_3:
3987DB 9,0,0,0
3988 DD avx_handler wrt ..imagebase
3989 DD $L$do_avx2_body wrt ..imagebase,$L$do_avx2_epilogue wrt ..imagebase
3990$L$SEH_info_poly1305_blocks_avx512:
3991DB 9,0,0,0
3992 DD avx_handler wrt ..imagebase
3993 DD $L$do_avx512_body wrt ..imagebase,$L$do_avx512_epilogue wrt ..imagebase
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette