VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.0/crypto/modes/asm/ghash-sparcv9.pl@ 99371

Last change on this file since 99371 was 99366, checked in by vboxsync, 23 months ago

openssl-3.1.0: Applied and adjusted our OpenSSL changes to 3.0.7. bugref:10418

File size: 12.8 KB
 
#! /usr/bin/env perl
# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1	this assembler
#
# 32-bit build	566		50	(+1000%)
# 64-bit build	56		50	(+12%)
#
# I don't quite understand why difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.

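# [Editorial sketch, not part of the original module] The per-nibble step
# described above folds the four bits shifted out of Z.lo back into Z.hi
# via the shared rem_4bit table. As a rough illustration, the 16-bit
# constants hard-coded in rem_4bit below can be reproduced by carry-less
# multiplying the nibble value by 0xE1 (the GCM reduction byte) and
# shifting left by 5; the assembler code then keeps them in the top 16
# bits of a 64-bit word (hence the `<<16` in the .long pairs).
sub _rem_4bit_sketch {			# illustrative only, never called
    my @tab;
    for my $i (0 .. 15) {
	my $r = 0;
	for my $b (0 .. 3) {		# carry-less multiply $i by 0xE1
	    $r ^= 0xE1 << $b if $i & (1 << $b);
	}
	push @tab, $r << 5;		# 0x0000, 0x1C20, 0x3840, 0x2460, ...
    }
    return @tab;
}
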
$output=pop and open STDOUT,">$output";

$frame="STACK_FRAME";
$bias="STACK_BIAS";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;

$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___


{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.

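# [Editorial sketch, not part of the original module] The Karatsuba split
# above works on 64-bit halves of the 128-bit operands: with only three
# carry-less products lo = Xlo·Hlo, hi = Xhi·Hhi and
# mid = (Xlo^Xhi)·(Hlo^Hhi) ^ lo ^ hi, the full product is
# hi·2^128 ^ mid·2^64 ^ lo. A scaled-down model using 32-bit halves, so
# that everything fits in native integers on a 64-bit perl:
sub _clmul32_sketch {			# schoolbook carry-less 32x32->64
    my ($a, $b) = @_;
    my $r = 0;
    for my $i (0 .. 31) {
	$r ^= $b << $i if $a & (1 << $i);
    }
    return $r;
}
sub _clmul64_karatsuba_sketch {		# 64x64->128 via three 32x32 products
    my ($a, $b) = @_;
    my ($ahi, $alo) = ($a >> 32, $a & 0xffffffff);
    my ($bhi, $blo) = ($b >> 32, $b & 0xffffffff);
    my $lo  = _clmul32_sketch($alo, $blo);
    my $hi  = _clmul32_sketch($ahi, $bhi);
    my $mid = _clmul32_sketch($alo ^ $ahi, $blo ^ $bhi) ^ $lo ^ $hi;
    # return the 128-bit product as (high 64 bits, low 64 bits)
    return ($hi ^ ($mid >> 32), $lo ^ (($mid & 0xffffffff) << 32));
}
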
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___



# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"xmulx"		=> 0x115,
		"xmulxhi"	=> 0x116	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
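
# [Editorial note, not part of the original source] As a worked example of
# the encoding formula above: %o1/%o2/%o3 map to registers 9/10/11, so
#
#	unvis3("xmulx","%o1","%o2","%o3")
#
# should return ".word\t0x97b262aa !xmulx\t%o1,%o2,%o3", letting the module
# assemble even when the assembler itself knows nothing about VIS3.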

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";