VirtualBox

source: vbox/trunk/src/libs/openssl-3.1.0/crypto/modes/asm/ghash-sparcv9.pl@ 99371

Last change on this file since 99371 was 99366, checked in by vboxsync, 23 months ago

openssl-3.1.0: Applied and adjusted our OpenSSL changes to 3.0.7. bugref:10418

File size: 12.8 KB
 
#! /usr/bin/env perl
# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1	this assembler
#
# 32-bit build	566		50	(+1000%)
# 64-bit build	56		50	(+12%)
#
# I don't quite understand why difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.

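# [Editorial sketch, not part of the original module] The per-nibble step
# described above folds the four bits shifted out of Z.lo back into Z.hi
# via the shared rem_4bit table. As a rough illustration, the 16-bit
# constants hard-coded in rem_4bit below can be reproduced by carry-less
# multiplying the nibble value by 0xE1 (the GCM reduction byte) and
# shifting left by 5; the assembler code then keeps them in the top 16
# bits of a 64-bit word (hence the `<<16` in the .long pairs).
sub _rem_4bit_sketch {			# illustrative only, never called
    my @tab;
    for my $i (0 .. 15) {
	my $r = 0;
	for my $b (0 .. 3) {		# carry-less multiply $i by 0xE1
	    $r ^= 0xE1 << $b if $i & (1 << $b);
	}
	push @tab, $r << 5;		# 0x0000, 0x1C20, 0x3840, 0x2460, ...
    }
    return @tab;
}
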
$output=pop and open STDOUT,">$output";

$frame="STACK_FRAME";
$bias="STACK_BIAS";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;

$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___


{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.

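# [Editorial sketch, not part of the original module] The Karatsuba split
# above works on 64-bit halves of the 128-bit operands: with only three
# carry-less products lo = Xlo·Hlo, hi = Xhi·Hhi and
# mid = (Xlo^Xhi)·(Hlo^Hhi) ^ lo ^ hi, the full product is
# hi·2^128 ^ mid·2^64 ^ lo. A scaled-down model using 32-bit halves, so
# that everything fits in native integers on a 64-bit perl:
sub _clmul32_sketch {			# schoolbook carry-less 32x32->64
    my ($a, $b) = @_;
    my $r = 0;
    for my $i (0 .. 31) {
	$r ^= $b << $i if $a & (1 << $i);
    }
    return $r;
}
sub _clmul64_karatsuba_sketch {		# 64x64->128 via three 32x32 products
    my ($a, $b) = @_;
    my ($ahi, $alo) = ($a >> 32, $a & 0xffffffff);
    my ($bhi, $blo) = ($b >> 32, $b & 0xffffffff);
    my $lo  = _clmul32_sketch($alo, $blo);
    my $hi  = _clmul32_sketch($ahi, $bhi);
    my $mid = _clmul32_sketch($alo ^ $ahi, $blo ^ $bhi) ^ $lo ^ $hi;
    # return the 128-bit product as (high 64 bits, low 64 bits)
    return ($hi ^ ($mid >> 32), $lo ^ (($mid & 0xffffffff) << 32));
}
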
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___



# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"xmulx"		=> 0x115,
		"xmulxhi"	=> 0x116	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
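
# [Editorial note, not part of the original source] As a worked example of
# the encoding formula above: %o1/%o2/%o3 map to registers 9/10/11, so
#
#	unvis3("xmulx","%o1","%o2","%o3")
#
# should return ".word\t0x97b262aa !xmulx\t%o1,%o2,%o3", letting the module
# assemble even when the assembler itself knows nothing about VIS3.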

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";