1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the OpenSSL license (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 |
|
---|
10 | # ====================================================================
|
---|
11 | # Copyright (c) 2008 Andy Polyakov <[email protected]>
|
---|
12 | #
|
---|
13 | # This module may be used under the terms of either the GNU General
|
---|
14 | # Public License version 2 or later, the GNU Lesser General Public
|
---|
15 | # License version 2.1 or later, the Mozilla Public License version
|
---|
16 | # 1.1 or the BSD License. The exact terms of either license are
|
---|
17 | # distributed along with this module. For further details see
|
---|
18 | # http://www.openssl.org/~appro/camellia/.
|
---|
19 | # ====================================================================
|
---|
20 |
|
---|
21 | # Performance in cycles per processed byte (less is better) in
|
---|
22 | # 'openssl speed ...' benchmark:
|
---|
23 | #
|
---|
24 | # AMD K8 Core2 PIII P4
|
---|
25 | # -evp camellia-128-ecb 21.5 22.8 27.0 28.9
|
---|
26 | # + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
|
---|
27 | # + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
|
---|
28 | #
|
---|
29 | # camellia-128-cbc 17.3 21.1 23.9 25.9
|
---|
30 | #
|
---|
31 | # 128-bit key setup 196 280 256 240 cycles/key
|
---|
32 | # + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
|
---|
33 | # + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
|
---|
34 | #
|
---|
35 | # Pairs of numbers in "+" rows represent performance improvement over
|
---|
36 | # compiler generated position-independent code, PIC, and non-PIC
|
---|
37 | # respectively. PIC results are of greater relevance, as this module
|
---|
38 | # is position-independent, i.e. suitable for a shared library or PIE.
|
---|
39 | # Position independence "costs" one register, which is why compilers
|
---|
40 | # are so close with non-PIC results, they have an extra register to
|
---|
41 | # spare. CBC results are better than ECB ones thanks to "zero-copy"
|
---|
42 | # private _x86_* interface, and are ~30-40% better than with compiler
|
---|
43 | # generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
|
---|
44 | # same CPU (where applicable).
|
---|
45 |
|
---|
# Find the directory this script lives in so that the perlasm helper can be
# loaded no matter what the current working directory is.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

$OPENSSL=1;

# The last command-line argument names the output file.  Everything the
# generator emits goes through STDOUT, so redirect it there.  Use the
# three-argument form so the name is never parsed for a mode, and stop
# immediately if the file cannot be opened: a failed re-open leaves STDOUT
# closed and all generated code would be silently lost.
$output = pop;
open STDOUT,">",$output or die "can't open $output: $!";

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# Register allocation shared by every routine below.
@T=("eax","ebx","ecx","edx");   # the four 32-bit words of the cipher state
$idx="esi";                     # table index / key word scratch
$key="edi";                     # key schedule pointer
$Tbl="ebp";                     # S-box table base

# Stack frame layout as seen inside the _x86_Camellia_* routines.  The
# frame is allocated by the caller.
$__ra=&DWP(0,"esp");    # return address
$__s0=&DWP(4,"esp");    # s0 backing store
$__s1=&DWP(8,"esp");    # s1 backing store
$__s2=&DWP(12,"esp");   # s2 backing store
$__s3=&DWP(16,"esp");   # s3 backing store
$__end=&DWP(20,"esp");  # pointer to end/start of key schedule

# Stack frame layout as seen inside the Camellia_[en|de]crypt routines.
# It differs from the one above by 4 bytes (no return address pushed yet)
# and overlaps it at the end/start-of-key-schedule pointer.
$_end=&DWP(16,"esp");
$_esp=&DWP(20,"esp");

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][], and
# [2][] with [3][].  This is done to optimize code size.  The values
# below are byte offsets from the Camellia_SBOX label.
$SBOX1_1110=0;          # Camellia_SBOX[0]
$SBOX4_4404=4;          # Camellia_SBOX[1]
$SBOX2_0222=2048;       # Camellia_SBOX[2]
$SBOX3_3033=2052;       # Camellia_SBOX[3]
&static_label("Camellia_SIGMA");
&static_label("Camellia_SBOX");
85 |
|
---|
# Emit the instructions for one Camellia Feistel round.
#
# Positional arguments:
#   $i     - round index.  Its parity decides which pair of @T registers
#            is the active half, and it scales the key displacement.
#   $seed  - constant added to the key displacement; a negative seed makes
#            successive rounds step backwards through the schedule
#            (8 bytes per round either way).  Defaults to 0.
#   $frame - byte offset from %esp of the s[0-3] backing store.
#            Defaults to 0.
#
# On entry $idx must already hold the first key word of this round; on
# exit it holds the first key word of the next round (it is prefetched
# near the end).  The two updated state words are written back to the
# backing store.
sub Camellia_Feistel {
    my ($i,$seed,$frame)=@_;
    $seed=0  unless defined($seed);
    $frame=0 unless defined($frame);
    my $scale=$seed<0?-8:8;
    my $j=($i&1)*2;
    # Declare all four as one lexical list: "my $t0=...,$t1=..." would make
    # only $t0 lexical and turn the others into package globals.
    my ($t0,$t1,$t2,$t3)=map { $T[($j+$_)%4] } (0..3);

    &xor    ($t0,$idx);                             # t0^=key[0]
    &xor    ($t1,&DWP($seed+$i*$scale+4,$key));     # t1^=key[1]
    &movz   ($idx,&HB($t0));                        # (t0>>8)&0xff
    &mov    ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));    # t3=SBOX3_3033[0]
    &movz   ($idx,&LB($t0));                        # (t0>>0)&0xff
    &xor    ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));    # t3^=SBOX4_4404[0]
    &shr    ($t0,16);
    &movz   ($idx,&LB($t1));                        # (t1>>0)&0xff
    &mov    ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));    # t2=SBOX1_1110[1]
    &movz   ($idx,&HB($t0));                        # (t0>>24)&0xff
    &xor    ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));    # t3^=SBOX1_1110[0]
    &movz   ($idx,&HB($t1));                        # (t1>>8)&0xff
    &xor    ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));    # t2^=SBOX4_4404[1]
    &shr    ($t1,16);
    &movz   ($t0,&LB($t0));                         # (t0>>16)&0xff
    &xor    ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));     # t3^=SBOX2_0222[0]
    &movz   ($idx,&HB($t1));                        # (t1>>24)&0xff
    &mov    ($t0,&DWP($frame+4*(($j+3)%4),"esp"));  # prefetch "s3"
    &xor    ($t2,$t3);                              # t2^=t3
    &rotr   ($t3,8);                                # t3=RightRotate(t3,8)
    &xor    ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));    # t2^=SBOX2_0222[1]
    &movz   ($idx,&LB($t1));                        # (t1>>16)&0xff
    &mov    ($t1,&DWP($frame+4*(($j+2)%4),"esp"));  # prefetch "s2"
    &xor    ($t3,$t0);                              # t3^=s3
    &xor    ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));    # t2^=SBOX3_3033[1]
    &mov    ($idx,&DWP($seed+($i+1)*$scale,$key));  # prefetch key[i+1]
    &xor    ($t3,$t2);                              # t3^=t2
    &mov    (&DWP($frame+4*(($j+3)%4),"esp"),$t3);  # s3=t3
    &xor    ($t2,$t1);                              # t2^=s2
    &mov    (&DWP($frame+4*(($j+2)%4),"esp"),$t2);  # s2=t2
}
125 |
|
---|
# void Camellia_EncryptBlock_Rounds(
#               int grandRounds,
#               const Byte plaintext[],
#               const KEY_TABLE_TYPE keyTable,
#               Byte ciphertext[])
#
# Public entry point: sets up an aligned private stack frame, loads the
# 16-byte block as four big-endian words, runs _x86_Camellia_encrypt and
# stores the result.
&function_begin("Camellia_EncryptBlock_Rounds");
    &mov    ("eax",&wparam(0));         # grandRounds
    &mov    ($idx,&wparam(1));          # plaintext pointer
    &mov    ($key,&wparam(2));          # key schedule pointer

    &mov    ("ebx","esp");              # keep the caller's %esp
    &sub    ("esp",7*4);                # room for s[0-3],keyEnd,esp and ra
    &and    ("esp",-64);

    # Place the stack frame just "above mod 1024" the key schedule;
    # this ensures that cache associativity of 2 suffices.
    &lea    ("ecx",&DWP(-64-63,$key));
    &sub    ("ecx","esp");
    &neg    ("ecx");
    &and    ("ecx",0x3C0);              # modulo 1024, but aligned to cache-line
    &sub    ("esp","ecx");
    &add    ("esp",4);                  # 4 is reserved for callee's return address

    &shl    ("eax",6);                  # 64 bytes of schedule per grand round
    &lea    ("eax",&DWP(0,$key,"eax"));
    &mov    ($_esp,"ebx");              # save %esp
    &mov    ($_end,"eax");              # save keyEnd

    # Position-independent address of the S-box tables.
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    # Load the plaintext; byte swaps are interleaved with the loads.
    foreach my $n (0..2) {
        &mov    ($T[$n],&DWP(4*$n,$idx));
    }
    &bswap  ($T[0]);
    &mov    ($T[3],&DWP(12,$idx));
    foreach my $n (1..3) {
        &bswap  ($T[$n]);
    }

    &call   ("_x86_Camellia_encrypt");

    &mov    ("esp",$_esp);              # back on the caller's stack
    &bswap  ($T[0]);
    &mov    ($idx,&wparam(3));          # ciphertext pointer
    foreach my $n (1..3) {
        &bswap  ($T[$n]);
    }
    foreach my $n (0..3) {              # write ciphertext
        &mov    (&DWP(4*$n,$idx),$T[$n]);
    }
&function_end("Camellia_EncryptBlock_Rounds");
# V1.x API
#
# Same as Camellia_EncryptBlock_Rounds, except that the first argument is
# the key length in bits.  It is converted in place to the grand round
# count and control is passed on with a plain jump.
&function_begin_B("Camellia_EncryptBlock");
    &mov    ("eax",128);
    &sub    ("eax",&wparam(0));         # 128-keyBitLength, borrow sets carry
    &mov    ("eax",3);                  # mov leaves the carry flag alone
    &adc    ("eax",0);                  # keyBitLength==128?3:4
    &mov    (&wparam(0),"eax");         # overwrite the argument slot
    &jmp    (&label("Camellia_EncryptBlock_Rounds"));
&function_end_B("Camellia_EncryptBlock");
190 |
|
---|
if ($OPENSSL) {
    # void Camellia_encrypt(
    #           const unsigned char *in,
    #           unsigned char *out,
    #           const CAMELLIA_KEY *key)
    #
    # OpenSSL-style entry point.  The grand round count is read from the
    # key structure at byte offset 272 instead of being passed in.
    &function_begin("Camellia_encrypt");
        &mov    ($idx,&wparam(0));          # plaintext pointer
        &mov    ($key,&wparam(2));          # key schedule pointer

        &mov    ("ebx","esp");              # keep the caller's %esp
        &sub    ("esp",7*4);                # room for s[0-3],keyEnd,esp and ra
        &and    ("esp",-64);
        &mov    ("eax",&DWP(272,$key));     # grandRounds counter

        # Place the stack frame just "above mod 1024" the key schedule;
        # this ensures that cache associativity of 2 suffices.
        &lea    ("ecx",&DWP(-64-63,$key));
        &sub    ("ecx","esp");
        &neg    ("ecx");
        &and    ("ecx",0x3C0);              # modulo 1024, but aligned to cache-line
        &sub    ("esp","ecx");
        &add    ("esp",4);                  # 4 is reserved for callee's return address

        &shl    ("eax",6);                  # 64 bytes of schedule per grand round
        &lea    ("eax",&DWP(0,$key,"eax"));
        &mov    ($_esp,"ebx");              # save %esp
        &mov    ($_end,"eax");              # save keyEnd

        # Position-independent address of the S-box tables.
        &call   (&label("pic_point"));
        &set_label("pic_point");
        &blindpop($Tbl);
        &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

        # Load the plaintext; byte swaps are interleaved with the loads.
        foreach my $n (0..2) {
            &mov    ($T[$n],&DWP(4*$n,$idx));
        }
        &bswap  ($T[0]);
        &mov    ($T[3],&DWP(12,$idx));
        foreach my $n (1..3) {
            &bswap  ($T[$n]);
        }

        &call   ("_x86_Camellia_encrypt");

        &mov    ("esp",$_esp);              # back on the caller's stack
        &bswap  ($T[0]);
        &mov    ($idx,&wparam(1));          # ciphertext pointer
        foreach my $n (1..3) {
            &bswap  ($T[$n]);
        }
        foreach my $n (0..3) {              # write ciphertext
            &mov    (&DWP(4*$n,$idx),$T[$n]);
        }
    &function_end("Camellia_encrypt");
}
247 |
|
---|
# Private encryption core.  The caller provides the block in @T, the key
# schedule pointer in $key, the table base in $Tbl and a stack frame with
# the s[0-3] backing store and the end-of-schedule pointer ($__end).
# Each pass runs six Feistel rounds and advances $key by 64 bytes; between
# passes the key-dependent mixing step is applied.  The result is returned
# in @T.
&function_begin_B("_x86_Camellia_encrypt");
    foreach my $n (0..3) {                  # ^=key[0-3]
        &xor    ($T[$n],&DWP(4*$n,$key));
    }
    &mov    ($idx,&DWP(16,$key));           # prefetch key[4]

    &mov    ($__s0,$T[0]);                  # save s[0-3]
    &mov    ($__s1,$T[1]);
    &mov    ($__s2,$T[2]);
    &mov    ($__s3,$T[3]);

&set_label("loop",16);
    for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }

    &add    ($key,16*4);
    &cmp    ($key,$__end);
    &je     (&label("done"));

    # $T[0] and $T[1] still hold s0 and s1, $idx is preloaded with key[0].
    &and    ($idx,$T[0]);
    &mov    ($T[3],$__s3);
    &rotl   ($idx,1);
    &mov    ($T[2],$T[3]);
    &xor    ($T[1],$idx);
    &or     ($T[2],&DWP(12,$key));
    &mov    ($__s1,$T[1]);                  # s1^=LeftRotate(s0&key[0],1);
    &xor    ($T[2],$__s2);

    &mov    ($idx,&DWP(4,$key));
    &mov    ($__s2,$T[2]);                  # s2^=s3|key[3];
    &or     ($idx,$T[1]);
    &and    ($T[2],&DWP(8,$key));
    &xor    ($T[0],$idx);
    &rotl   ($T[2],1);
    &mov    ($__s0,$T[0]);                  # s0^=s1|key[1];
    &xor    ($T[3],$T[2]);
    &mov    ($idx,&DWP(16,$key));           # prefetch key[4]
    &mov    ($__s3,$T[3]);                  # s3^=LeftRotate(s2&key[2],1);
    &jmp    (&label("loop"));

&set_label("done",8);
    &mov    ($T[2],$T[0]);                  # SwapHalf
    &mov    ($T[3],$T[1]);
    &mov    ($T[0],$__s2);
    &mov    ($T[1],$__s3);
    &xor    ($T[0],$idx);                   # $idx is preloaded with key[0]
    &xor    ($T[1],&DWP(4,$key));
    &xor    ($T[2],&DWP(8,$key));
    &xor    ($T[3],&DWP(12,$key));
    &ret    ();
&function_end_B("_x86_Camellia_encrypt");
300 |
|
---|
# void Camellia_DecryptBlock_Rounds(
#               int grandRounds,
#               const Byte ciphertext[],
#               const KEY_TABLE_TYPE keyTable,
#               Byte plaintext[])
#
# Public entry point: sets up an aligned private stack frame, loads the
# 16-byte block as four big-endian words, runs _x86_Camellia_decrypt and
# stores the result.  Decryption walks the schedule backwards, so $key is
# advanced to its end and the start is what gets saved in the frame.
&function_begin("Camellia_DecryptBlock_Rounds");
    &mov    ("eax",&wparam(0));         # grandRounds
    &mov    ($idx,&wparam(1));          # ciphertext pointer
    &mov    ($key,&wparam(2));          # key schedule pointer

    &mov    ("ebx","esp");              # keep the caller's %esp
    &sub    ("esp",7*4);                # room for s[0-3],keyStart,esp and ra
    &and    ("esp",-64);

    # Place the stack frame just "above mod 1024" the key schedule;
    # this ensures that cache associativity of 2 suffices.
    &lea    ("ecx",&DWP(-64-63,$key));
    &sub    ("ecx","esp");
    &neg    ("ecx");
    &and    ("ecx",0x3C0);              # modulo 1024, but aligned to cache-line
    &sub    ("esp","ecx");
    &add    ("esp",4);                  # 4 is reserved for callee's return address

    &shl    ("eax",6);                  # 64 bytes of schedule per grand round
    &mov    ($_end,$key);               # save keyStart
    &lea    ($key,&DWP(0,$key,"eax"));  # point $key at the end of the schedule
    &mov    ($_esp,"ebx");              # save %esp

    # Position-independent address of the S-box tables.
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    # Load the ciphertext; byte swaps are interleaved with the loads.
    foreach my $n (0..2) {
        &mov    ($T[$n],&DWP(4*$n,$idx));
    }
    &bswap  ($T[0]);
    &mov    ($T[3],&DWP(12,$idx));
    foreach my $n (1..3) {
        &bswap  ($T[$n]);
    }

    &call   ("_x86_Camellia_decrypt");

    &mov    ("esp",$_esp);              # back on the caller's stack
    &bswap  ($T[0]);
    &mov    ($idx,&wparam(3));          # plaintext pointer
    foreach my $n (1..3) {
        &bswap  ($T[$n]);
    }
    foreach my $n (0..3) {              # write plaintext
        &mov    (&DWP(4*$n,$idx),$T[$n]);
    }
&function_end("Camellia_DecryptBlock_Rounds");
# V1.x API
#
# Same as Camellia_DecryptBlock_Rounds, except that the first argument is
# the key length in bits.  It is converted in place to the grand round
# count and control is passed on with a plain jump.
&function_begin_B("Camellia_DecryptBlock");
    &mov    ("eax",128);
    &sub    ("eax",&wparam(0));         # 128-keyBitLength, borrow sets carry
    &mov    ("eax",3);                  # mov leaves the carry flag alone
    &adc    ("eax",0);                  # keyBitLength==128?3:4
    &mov    (&wparam(0),"eax");         # overwrite the argument slot
    &jmp    (&label("Camellia_DecryptBlock_Rounds"));
&function_end_B("Camellia_DecryptBlock");
365 |
|
---|
if ($OPENSSL) {
    # void Camellia_decrypt(
    #           const unsigned char *in,
    #           unsigned char *out,
    #           const CAMELLIA_KEY *key)
    #
    # OpenSSL-style entry point.  The grand round count is read from the
    # key structure at byte offset 272 instead of being passed in.
    # Decryption walks the schedule backwards, so $key is advanced to its
    # end and the start is what gets saved in the frame.
    &function_begin("Camellia_decrypt");
        &mov    ($idx,&wparam(0));          # ciphertext pointer
        &mov    ($key,&wparam(2));          # key schedule pointer

        &mov    ("ebx","esp");              # keep the caller's %esp
        &sub    ("esp",7*4);                # room for s[0-3],keyStart,esp and ra
        &and    ("esp",-64);
        &mov    ("eax",&DWP(272,$key));     # grandRounds counter

        # Place the stack frame just "above mod 1024" the key schedule;
        # this ensures that cache associativity of 2 suffices.
        &lea    ("ecx",&DWP(-64-63,$key));
        &sub    ("ecx","esp");
        &neg    ("ecx");
        &and    ("ecx",0x3C0);              # modulo 1024, but aligned to cache-line
        &sub    ("esp","ecx");
        &add    ("esp",4);                  # 4 is reserved for callee's return address

        &shl    ("eax",6);                  # 64 bytes of schedule per grand round
        &mov    ($_end,$key);               # save keyStart
        &lea    ($key,&DWP(0,$key,"eax"));  # point $key at the end of the schedule
        &mov    ($_esp,"ebx");              # save %esp

        # Position-independent address of the S-box tables.
        &call   (&label("pic_point"));
        &set_label("pic_point");
        &blindpop($Tbl);
        &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

        # Load the ciphertext; byte swaps are interleaved with the loads.
        foreach my $n (0..2) {
            &mov    ($T[$n],&DWP(4*$n,$idx));
        }
        &bswap  ($T[0]);
        &mov    ($T[3],&DWP(12,$idx));
        foreach my $n (1..3) {
            &bswap  ($T[$n]);
        }

        &call   ("_x86_Camellia_decrypt");

        &mov    ("esp",$_esp);              # back on the caller's stack
        &bswap  ($T[0]);
        &mov    ($idx,&wparam(1));          # plaintext pointer
        foreach my $n (1..3) {
            &bswap  ($T[$n]);
        }
        foreach my $n (0..3) {              # write plaintext
            &mov    (&DWP(4*$n,$idx),$T[$n]);
        }
    &function_end("Camellia_decrypt");
}
422 |
|
---|
# Private decryption core.  The caller provides the block in @T, $key
# pointing at the END of the key schedule, the table base in $Tbl and a
# stack frame with the s[0-3] backing store and the start-of-schedule
# pointer ($__end).  Each pass runs six Feistel rounds with descending key
# offsets and moves $key back by 64 bytes; between passes the
# key-dependent mixing step is applied.  The result is returned in @T.
# In the comments key[n] is the 32-bit word at byte offset 4*n from $key.
&function_begin_B("_x86_Camellia_decrypt");
    foreach my $n (0..3) {                  # ^=key[0-3]
        &xor    ($T[$n],&DWP(4*$n,$key));
    }
    &mov    ($idx,&DWP(-8,$key));           # prefetch key[-2]

    &mov    ($__s0,$T[0]);                  # save s[0-3]
    &mov    ($__s1,$T[1]);
    &mov    ($__s2,$T[2]);
    &mov    ($__s3,$T[3]);

&set_label("loop",16);
    for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }

    &sub    ($key,16*4);
    &cmp    ($key,$__end);
    &je     (&label("done"));

    # $T[0] and $T[1] still hold s0 and s1, $idx is preloaded with key[2].
    &and    ($idx,$T[0]);
    &mov    ($T[3],$__s3);
    &rotl   ($idx,1);
    &mov    ($T[2],$T[3]);
    &xor    ($T[1],$idx);
    &or     ($T[2],&DWP(4,$key));
    &mov    ($__s1,$T[1]);                  # s1^=LeftRotate(s0&key[2],1);
    &xor    ($T[2],$__s2);

    &mov    ($idx,&DWP(12,$key));
    &mov    ($__s2,$T[2]);                  # s2^=s3|key[1];
    &or     ($idx,$T[1]);
    &and    ($T[2],&DWP(0,$key));
    &xor    ($T[0],$idx);
    &rotl   ($T[2],1);
    &mov    ($__s0,$T[0]);                  # s0^=s1|key[3];
    &xor    ($T[3],$T[2]);
    &mov    ($idx,&DWP(-8,$key));           # prefetch key[-2]
    &mov    ($__s3,$T[3]);                  # s3^=LeftRotate(s2&key[0],1);
    &jmp    (&label("loop"));

&set_label("done",8);
    &mov    ($T[2],$T[0]);                  # SwapHalf
    &mov    ($T[3],$T[1]);
    &mov    ($T[0],$__s2);
    &mov    ($T[1],$__s3);
    &xor    ($T[2],$idx);                   # $idx is preloaded with key[2]
    &xor    ($T[3],&DWP(12,$key));
    &xor    ($T[0],&DWP(0,$key));
    &xor    ($T[1],&DWP(4,$key));
    &ret    ();
&function_end_B("_x86_Camellia_decrypt");
475 |
|
---|
# shld is very slow on Intel P4 family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance. PIII, PM and Core[2] seem to be the only ones which
# execute this code ~7% faster...
#
# Rotate the 128-bit value held in ($i0,$i1,$i2,$i3), most significant
# word first, left by $rot bits using shld, then store selected words into
# the key schedule.  $rnd is the index, in 8-byte units, of the first slot
# to write (relative to $key-128).  The trailing list names the registers
# to store: each of $i0..$i3 is written, in that order, only while it
# matches the next pending name.  $idx is used as scratch.
sub __rotl128 {
    my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
    my $slot=2*$rnd;            # 4-byte slot counter

    if ($rot) {
        &mov    ($idx,$i0);     # keep the old top word for the wrap-around
        &shld   ($i0,$i1,$rot);
        &shld   ($i1,$i2,$rot);
        &shld   ($i2,$i3,$rot);
        &shld   ($i3,$idx,$rot);
    }
    foreach my $reg ($i0,$i1,$i2,$i3) {
        &mov    (&DWP(-128+4*$slot++,$key),shift(@T)) if ($reg eq $T[0]);
    }
}
496 |
|
---|
# ... Implementing 128-bit rotate without shld gives >3x performance
# improvement on P4, only ~7% degradation on other Intel CPUs and
# not worse performance on AMD. This is therefore preferred.
#
# Rotate the 128-bit value held in ($i0,$i1,$i2,$i3), most significant
# word first, left by $rot bits using shl/shr/or, and store selected words
# into the key schedule as soon as they are final.  A $rot of 0 stores
# without rotating.  $rnd is the index, in 8-byte units, of the first slot
# to write (relative to $key-128).  The trailing list names the registers
# to store: each of $i0..$i3 is written, in that order, only while it
# matches the next pending name.  $idx and $Tbl are used as scratch.
sub _rotl128 {
    my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
    my $slot=2*$rnd;            # 4-byte slot counter
    my $inv=32-$rot;            # complementary right-shift count

    # Write $_[0] to the next slot if it is the next register asked for.
    my $store=sub {
        &mov    (&DWP(-128+4*$slot++,$key),shift(@T)) if ($_[0] eq $T[0]);
    };

    if ($rot) {
        &mov    ($Tbl,$i0);     # keep the old top word for the wrap-around
        &shl    ($i0,$rot);
        &mov    ($idx,$i1);
        &shr    ($idx,$inv);
        &shl    ($i1,$rot);
        &or     ($i0,$idx);
        &mov    ($idx,$i2);
        &shl    ($i2,$rot);
        $store->($i0);
        &shr    ($idx,$inv);
        &or     ($i1,$idx);
        &shr    ($Tbl,$inv);
        &mov    ($idx,$i3);
        &shr    ($idx,$inv);
        $store->($i1);
        &shl    ($i3,$rot);
        &or     ($i2,$idx);
        &or     ($i3,$Tbl);
        $store->($i2);
        $store->($i3);
    } else {
        foreach my $reg ($i0,$i1,$i2,$i3) {
            $store->($reg);
        }
    }
}
532 |
|
---|
# Store up to four registers into consecutive 32-bit words of the key
# schedule.
#
#   _saveround($rnd,$base[,$bias],@regs)
#
# $rnd selects the destination in 8-byte units from $base.  If the first
# element after $base is a non-zero number it is taken as an extra byte
# displacement and removed from the register list.
sub _saveround {
    my ($rnd,$base,@regs)=@_;
    my $bias=0;
    $bias=shift(@regs) if (int($regs[0]));
    my $off=$bias+$rnd*8;

    &mov    (&DWP($off+0,$base),$regs[0]);
    foreach my $n (1..3) {
        last if ($n>$#regs);
        &mov    (&DWP($off+4*$n,$base),$regs[$n]);
    }
}
542 |
|
---|
# Load up to four registers from consecutive 32-bit words of the key
# schedule.
#
#   _loadround($rnd,$base[,$bias],@regs)
#
# $rnd selects the source in 8-byte units from $base.  If the first
# element after $base is a non-zero number it is taken as an extra byte
# displacement and removed from the register list.
sub _loadround {
    my ($rnd,$base,@regs)=@_;
    my $bias=0;
    $bias=shift(@regs) if (int($regs[0]));
    my $off=$bias+$rnd*8;

    &mov    ($regs[0],&DWP($off+0,$base));
    foreach my $n (1..3) {
        last if ($n>$#regs);
        &mov    ($regs[$n],&DWP($off+4*$n,$base));
    }
}
552 |
|
---|
# void Camellia_Ekeygen(
#               const int keyBitLength,
#               const Byte *rawKey,
#               KEY_TABLE_TYPE keyTable)
#
# Expand a 128-, 192- or 256-bit raw key into the key schedule.  The
# emitted routine leaves the grand round count (3 for 128-bit keys, 4
# otherwise) in eax and a pointer to byte offset 272 of the key table in
# edx.
#
# Generation-time note: @T is rotated with push/shift to express a 32-bit
# rotation by register renaming instead of by emitted code.  The "restore
# order" loops put it back before any code that assumes $T[0] is eax.
&function_begin("Camellia_Ekeygen");
{ my $step=0;           # index of the next 8-byte SIGMA constant

    &stack_push(4);                     # place for s[0-3]

    &mov    ($Tbl,&wparam(0));          # load arguments
    &mov    ($idx,&wparam(1));
    &mov    ($key,&wparam(2));

    &mov    ($T[0],&DWP(0,$idx));       # load 0-127 bits
    &mov    ($T[1],&DWP(4,$idx));
    &mov    ($T[2],&DWP(8,$idx));
    &mov    ($T[3],&DWP(12,$idx));

    &bswap  ($T[0]);
    &bswap  ($T[1]);
    &bswap  ($T[2]);
    &bswap  ($T[3]);

    &_saveround (0,$key,@T);            # KL<<<0

    &cmp    ($Tbl,128);
    &je     (&label("1st128"));

    &mov    ($T[0],&DWP(16,$idx));      # load 128-191 bits
    &mov    ($T[1],&DWP(20,$idx));
    &cmp    ($Tbl,192);
    &je     (&label("1st192"));
    &mov    ($T[2],&DWP(24,$idx));      # load 192-255 bits
    &mov    ($T[3],&DWP(28,$idx));
    &jmp    (&label("1st256"));
&set_label("1st192",4);
    # 192-bit key: the missing 64 bits are the complement of bits 128-191.
    &mov    ($T[2],$T[0]);
    &mov    ($T[3],$T[1]);
    &not    ($T[2]);
    &not    ($T[3]);
&set_label("1st256",4);
    &bswap  ($T[0]);
    &bswap  ($T[1]);
    &bswap  ($T[2]);
    &bswap  ($T[3]);

    &_saveround (4,$key,@T);            # temporary storage for KR!

    &xor    ($T[0],&DWP(0*8+0,$key));   # KR^KL
    &xor    ($T[1],&DWP(0*8+4,$key));
    &xor    ($T[2],&DWP(1*8+0,$key));
    &xor    ($T[3],&DWP(1*8+4,$key));

&set_label("1st128",4);
    # From here on $Tbl addresses the S-boxes and $key the SIGMA constants.
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
    &lea    ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));

    &mov    ($idx,&DWP($step*8,$key));  # prefetch SIGMA[0]
    &mov    (&swtmp(0),$T[0]);          # save s[0-3]
    &mov    (&swtmp(1),$T[1]);
    &mov    (&swtmp(2),$T[2]);
    &mov    (&swtmp(3),$T[3]);
    &Camellia_Feistel($step++);
    &Camellia_Feistel($step++);
    &mov    ($T[2],&swtmp(2));
    &mov    ($T[3],&swtmp(3));

    &mov    ($idx,&wparam(2));
    &xor    ($T[0],&DWP(0*8+0,$idx));   # ^KL
    &xor    ($T[1],&DWP(0*8+4,$idx));
    &xor    ($T[2],&DWP(1*8+0,$idx));
    &xor    ($T[3],&DWP(1*8+4,$idx));

    &mov    ($idx,&DWP($step*8,$key));  # prefetch SIGMA[4]
    &mov    (&swtmp(0),$T[0]);          # save s[0-3]
    &mov    (&swtmp(1),$T[1]);
    &mov    (&swtmp(2),$T[2]);
    &mov    (&swtmp(3),$T[3]);
    &Camellia_Feistel($step++);
    &Camellia_Feistel($step++);
    &mov    ($T[2],&swtmp(2));
    &mov    ($T[3],&swtmp(3));

    &mov    ($idx,&wparam(0));
    &cmp    ($idx,128);
    &jne    (&label("2nd256"));

    &mov    ($key,&wparam(2));
    &lea    ($key,&DWP(128,$key));      # size optimization

    ####### process KA
    &_saveround (2,$key,-128,@T);       # KA<<<0
    &_rotl128   (@T,15,6,@T);           # KA<<<15
    &_rotl128   (@T,15,8,@T);           # KA<<<(15+15=30)
    &_rotl128   (@T,15,12,$T[0],$T[1]); # KA<<<(30+15=45)
    &_rotl128   (@T,15,14,@T);          # KA<<<(45+15=60)
    push        (@T,shift(@T));         # rotl128(@T,32);
    &_rotl128   (@T,2,20,@T);           # KA<<<(60+32+2=94)
    &_rotl128   (@T,17,24,@T);          # KA<<<(94+17=111)

    ####### process KL
    &_loadround (0,$key,-128,@T);       # load KL
    &_rotl128   (@T,15,4,@T);           # KL<<<15
    &_rotl128   (@T,30,10,@T);          # KL<<<(15+30=45)
    &_rotl128   (@T,15,13,$T[2],$T[3]); # KL<<<(45+15=60)
    &_rotl128   (@T,17,16,@T);          # KL<<<(60+17=77)
    &_rotl128   (@T,17,18,@T);          # KL<<<(77+17=94)
    &_rotl128   (@T,17,22,@T);          # KL<<<(94+17=111)

    while ($T[0] ne "eax")              # restore order
    {   unshift (@T,pop(@T));   }

    &mov    ("eax",3);                  # 3 grandRounds
    &jmp    (&label("done"));

&set_label("2nd256",16);
    &mov    ($idx,&wparam(2));
    &_saveround (6,$idx,@T);            # temporary storage for KA!

    &xor    ($T[0],&DWP(4*8+0,$idx));   # KA^KR
    &xor    ($T[1],&DWP(4*8+4,$idx));
    &xor    ($T[2],&DWP(5*8+0,$idx));
    &xor    ($T[3],&DWP(5*8+4,$idx));

    &mov    ($idx,&DWP($step*8,$key));  # prefetch SIGMA[8]
    &mov    (&swtmp(0),$T[0]);          # save s[0-3]
    &mov    (&swtmp(1),$T[1]);
    &mov    (&swtmp(2),$T[2]);
    &mov    (&swtmp(3),$T[3]);
    &Camellia_Feistel($step++);
    &Camellia_Feistel($step++);
    &mov    ($T[2],&swtmp(2));
    &mov    ($T[3],&swtmp(3));

    &mov    ($key,&wparam(2));
    &lea    ($key,&DWP(128,$key));      # size optimization

    ####### process KB
    &_saveround (2,$key,-128,@T);       # KB<<<0
    &_rotl128   (@T,30,10,@T);          # KB<<<30
    &_rotl128   (@T,30,20,@T);          # KB<<<(30+30=60)
    push        (@T,shift(@T));         # rotl128(@T,32);
    &_rotl128   (@T,19,32,@T);          # KB<<<(60+32+19=111)

    ####### process KR
    &_loadround (4,$key,-128,@T);       # load KR
    &_rotl128   (@T,15,4,@T);           # KR<<<15
    &_rotl128   (@T,15,8,@T);           # KR<<<(15+15=30)
    &_rotl128   (@T,30,18,@T);          # KR<<<(30+30=60)
    push        (@T,shift(@T));         # rotl128(@T,32);
    &_rotl128   (@T,2,26,@T);           # KR<<<(60+32+2=94)

    ####### process KA
    &_loadround (6,$key,-128,@T);       # load KA
    &_rotl128   (@T,15,6,@T);           # KA<<<15
    &_rotl128   (@T,30,14,@T);          # KA<<<(15+30=45)
    push        (@T,shift(@T));         # rotl128(@T,32);
    &_rotl128   (@T,0,24,@T);           # KA<<<(45+32+0=77)
    &_rotl128   (@T,17,28,@T);          # KA<<<(77+17=94)

    ####### process KL
    &_loadround (0,$key,-128,@T);       # load KL
    push        (@T,shift(@T));         # rotl128(@T,32);
    &_rotl128   (@T,13,12,@T);          # KL<<<(32+13=45)
    &_rotl128   (@T,15,16,@T);          # KL<<<(45+15=60)
    &_rotl128   (@T,17,22,@T);          # KL<<<(60+17=77)
    push        (@T,shift(@T));         # rotl128(@T,32);
    &_rotl128   (@T,2,30,@T);           # KL<<<(77+32+2=111)

    while ($T[0] ne "eax")              # restore order
    {   unshift (@T,pop(@T));   }

    &mov    ("eax",4);                  # 4 grandRounds
&set_label("done");
    &lea    ("edx",&DWP(272-128,$key)); # end of key schedule
    &stack_pop(4);
}
&function_end("Camellia_Ekeygen");
734 |
|
---|
if ($OPENSSL) {
    # int Camellia_set_key (
    #           const unsigned char *userKey,
    #           int bits,
    #           CAMELLIA_KEY *key)
    #
    # Validates the arguments, calls Camellia_Ekeygen and stores the grand
    # round count it returns.  The emitted routine returns 0 on success,
    # -1 if either pointer is NULL and -2 if bits is not 128, 192 or 256.
    &function_begin_B("Camellia_set_key");
        &push   ("ebx");
        &mov    ("ecx",&wparam(0));         # userKey
        &mov    ("ebx",&wparam(1));         # bits
        &mov    ("edx",&wparam(2));         # key

        &mov    ("eax",-1);                 # result for a NULL pointer
        &test   ("ecx","ecx");
        &jz     (&label("done"));           # userKey==NULL?
        &test   ("edx","edx");
        &jz     (&label("done"));           # key==NULL?

        &mov    ("eax",-2);                 # result for a bad key length
        &cmp    ("ebx",256);
        &je     (&label("arg_ok"));         # bits==256?
        &cmp    ("ebx",192);
        &je     (&label("arg_ok"));         # bits==192?
        &cmp    ("ebx",128);
        &jne    (&label("done"));           # bits!=128?
    &set_label("arg_ok",4);

        # Camellia_Ekeygen takes (bits, userKey, key); push right to left.
        &push   ("edx");
        &push   ("ecx");
        &push   ("ebx");
        &call   ("Camellia_Ekeygen");
        &stack_pop(3);

        # eax holds grandRounds and edx points at where to put it
        &mov    (&DWP(0,"edx"),"eax");
        &xor    ("eax","eax");              # success
    &set_label("done",4);
        &pop    ("ebx");
        &ret    ();
    &function_end_B("Camellia_set_key");
}
775 |
|
---|
# Base 8-bit substitution table, 256 entries laid out 16 per row (the row
# comment gives the index of the first entry in that row).  The S1110,
# S4404, S0222 and S3033 helpers in this file build the 32-bit lookup
# table words from it.
@SBOX = (
    112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,   # 0x00
     35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,   # 0x10
    134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,   # 0x20
    166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,   # 0x30
    139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,   # 0x40
    223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,   # 0x50
     20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,   # 0x60
    254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,   # 0x70
    170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,   # 0x80
     16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,   # 0x90
    135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,   # 0xa0
     82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,   # 0xb0
    233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,   # 0xc0
    120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,   # 0xd0
    114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,   # 0xe0
     64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);  # 0xf0
|
---|
793 |
|
---|
# S1110($i): look up entry $i of @SBOX and replicate the byte into bits
# 31..24, 23..16 and 15..8 of the returned word; the low byte stays zero.
sub S1110 {
    my $i = shift;
    my $s = $SBOX[$i];      # scalar element, not the one-element slice @SBOX[$i]
    return $s<<24 | $s<<16 | $s<<8;
}
|
---|
# S4404($i): rotate the 8-bit index $i left by one bit, look that entry up
# in @SBOX, and replicate the byte into bits 31..24, 23..16 and 7..0 of the
# returned word; bits 15..8 stay zero.
sub S4404 {
    my $i = shift;
    my $rot = ($i<<1 | $i>>7) & 0xff;   # 8-bit rotate-left of the index
    my $s = $SBOX[$rot];                # scalar element, not the slice @SBOX[...]
    return $s<<24 | $s<<16 | $s;
}
|
---|
# S0222($i): look up entry $i of @SBOX, rotate that byte left by one bit,
# and replicate it into bits 23..16, 15..8 and 7..0 of the returned word;
# the top byte stays zero.
sub S0222 {
    my $i = shift;
    my $s = $SBOX[$i];                  # scalar element, not the slice @SBOX[$i]
    $s = ($s<<1 | $s>>7) & 0xff;        # 8-bit rotate-left of the table value
    return $s<<16 | $s<<8 | $s;
}
|
---|
# S3033($i): look up entry $i of @SBOX, rotate that byte right by one bit,
# and replicate it into bits 31..24, 15..8 and 7..0 of the returned word;
# bits 23..16 stay zero.
sub S3033 {
    my $i = shift;
    my $s = $SBOX[$i];                  # scalar element, not the slice @SBOX[$i]
    $s = ($s>>1 | $s<<7) & 0xff;        # 8-bit rotate-right of the table value
    return $s<<24 | $s<<8 | $s;
}
|
---|
798 |
|
---|
# Camellia_SIGMA: twelve 32-bit constants followed by four zero words.
&set_label("Camellia_SIGMA",64);
&data_word(
    0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
    0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
    0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
    0,          0,          0,          0);

# Camellia_SBOX: two runs of 256 word pairs.  Every data_word call emits one
# word from each generator of the pair, so within a run the entries of the
# two tables alternate (S1110/S4404 first, then S0222/S3033).
&set_label("Camellia_SBOX",64);
foreach my $pair ([\&S1110,\&S4404],[\&S0222,\&S3033]) {
    my ($even,$odd) = @$pair;
    for ($i=0;$i<256;$i++) {
        &data_word($even->($i),$odd->($i));
    }
}
|
---|
809 |
|
---|
810 | # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
|
---|
811 | # size_t length, const CAMELLIA_KEY *key,
|
---|
812 | # unsigned char *ivp,const int enc);
|
---|
{
# CBC driver around _x86_Camellia_encrypt and _x86_Camellia_decrypt.
#
# Stack frame layout.  The second offset column is shifted by 4, presumably
# the same slot as seen once a `call` has pushed its return address -- TODO
# confirm against the called subroutines.  $_end and $_esp come from outside
# this block and presumably name the 16(%esp) and 20(%esp) slots.
#       -4(%esp)    return address           0(%esp)
#        0(%esp)    s0                       4(%esp)
#        4(%esp)    s1                       8(%esp)
#        8(%esp)    s2                      12(%esp)
#       12(%esp)    s3                      16(%esp)
#       16(%esp)    end of key schedule     20(%esp)
#       20(%esp)    %esp backup
my $_inp = &DWP(24,"esp");      # copy of wparam(0)
my $_out = &DWP(28,"esp");      # copy of wparam(1)
my $_len = &DWP(32,"esp");      # copy of wparam(2)
my $_key = &DWP(36,"esp");      # copy of wparam(3)
my $_ivp = &DWP(40,"esp");      # copy of wparam(4)
my $ivec = &DWP(44,"esp");      # ivec[16]
my $_tmp = &DWP(44,"esp");      # volatile variable, intentionally the same slot as ivec
my ($s0,$s1,$s2,$s3) = @T;
my @blk = ($s0,$s1,$s2,$s3);

# Emit four consecutive dword loads: s0..s3 <- 0/4/8/12($ptr).
my $load_blk = sub {
    my ($ptr) = @_;
    foreach my $n (0..3) { &mov ($blk[$n],&DWP(4*$n,$ptr)); }
};

# Emit four consecutive dword stores: 0/4/8/12($ptr) <- s0..s3.
my $store_blk = sub {
    my ($ptr) = @_;
    foreach my $n (0..3) { &mov (&DWP(4*$n,$ptr),$blk[$n]); }
};

# Emit the post-decrypt sequence shared by both decrypt loops: byte-swap
# s0..s3 and xor them with the 16 bytes $key points at (the current iv).
my $unswap_xor_iv = sub {
    &bswap ($s0);
    &bswap ($s1);
    &bswap ($s2);
    &xor ($s0,&DWP(0,$key));            # xor iv
    &bswap ($s3);
    &xor ($s1,&DWP(4,$key));
    &xor ($s2,&DWP(8,$key));
    &xor ($s3,&DWP(12,$key));
};

&function_begin("Camellia_cbc_encrypt");
    &mov ($s2 eq "ecx"? $s2 : "",&wparam(2));   # load len (operand is "" unless $s2 is ecx)
    &cmp ($s2,0);
    &je (&label("enc_out"));                    # len==0: nothing to do

    &pushf ();
    &cld ();

    # load inp, out, key and ivp; len was loaded into $s2 above
    foreach my $ld ([$s0,0],[$s1,1],[$s3,3],[$Tbl,4]) {
        my ($reg,$n) = @$ld;
        &mov ($reg,&wparam($n));
    }

    # allocate aligned stack frame...
    &lea ($idx,&DWP(-64,"esp"));
    &and ($idx,-64);

    # ...and slide it just "above mod 1024" the key schedule, so that
    # cache associativity of 2 suffices
    &lea ($key,&DWP(-64-63,$s3));
    &sub ($key,$idx);
    &neg ($key);
    &and ($key,0x3C0);                  # modulo 1024, but aligned to cache-line
    &sub ($idx,$key);

    &mov ($key,&wparam(5));             # load enc

    &exch ("esp",$idx);
    &add ("esp",4);                     # reserve for return address!
    &mov ($_esp,$idx);                  # save %esp

    # keep copies of inp, out, len, key and ivp in the new frame
    foreach my $sv ([$_inp,$s0],[$_out,$s1],[$_len,$s2],[$_key,$s3],[$_ivp,$Tbl]) {
        &mov (@$sv);
    }

    &call (&label("pic_point"));        # make it PIC!
&set_label("pic_point");
    &blindpop($Tbl);
    &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    # touch the table: 32 iterations, four loads 32 bytes apart, 128-byte step
    &mov ($idx,32);
&set_label("prefetch_sbox",4);
    foreach my $n (0..3) { &mov ($blk[$n],&DWP(32*$n,$Tbl)); }
    &lea ($Tbl,&DWP(128,$Tbl));
    &dec ($idx);
    &jnz (&label("prefetch_sbox"));
    &mov ($s0,$_key);
    &sub ($Tbl,4096);                   # undo the 32*128 bytes advanced above
    &mov ($idx,$_inp);
    &mov ($s3,&DWP(272,$s0));           # load grandRounds

    &cmp ($key,0);
    &je (&label("DECRYPT"));            # enc==0

#----------------------------- ENCRYPT -----------------------------#
    &mov ($s2,$_len);
    &mov ($key,$_ivp);
    &shl ($s3,6);
    &lea ($s3,&DWP(0,$s0,$s3));         # key + grandRounds*64
    &mov ($_end,$s3);

    &test ($s2,0xFFFFFFF0);
    &jz (&label("enc_tail"));           # short input...

    &mov ($s0,&DWP(0,$key));            # load iv
    &mov ($s1,&DWP(4,$key));

&set_label("enc_loop",4);
    &mov ($s2,&DWP(8,$key));
    &mov ($s3,&DWP(12,$key));

    &xor ($s0,&DWP(0,$idx));            # xor input data
    &xor ($s1,&DWP(4,$idx));
    &xor ($s2,&DWP(8,$idx));
    &bswap ($s0);
    &xor ($s3,&DWP(12,$idx));
    &bswap ($s1);
    &mov ($key,$_key);                  # load key
    &bswap ($s2);
    &bswap ($s3);

    &call ("_x86_Camellia_encrypt");

    &mov ($idx,$_inp);                  # load inp
    &mov ($key,$_out);                  # load out

    &bswap ($s0);
    &bswap ($s1);
    &bswap ($s2);
    &mov (&DWP(0,$key),$s0);            # save output data
    &bswap ($s3);
    &mov (&DWP(4,$key),$s1);
    &mov (&DWP(8,$key),$s2);
    &mov (&DWP(12,$key),$s3);

    &mov ($s2,$_len);                   # load len

    &lea ($idx,&DWP(16,$idx));
    &mov ($_inp,$idx);                  # save inp

    &lea ($s3,&DWP(16,$key));
    &mov ($_out,$s3);                   # save out

    &sub ($s2,16);
    &test ($s2,0xFFFFFFF0);
    &mov ($_len,$s2);                   # save len
    &jnz (&label("enc_loop"));          # at least one more full block
    &test ($s2,15);
    &jnz (&label("enc_tail"));          # 1..15 bytes left over
    &mov ($idx,$_ivp);                  # load ivp
    &mov ($s2,&DWP(8,$key));            # restore last dwords
    &mov ($s3,&DWP(12,$key));
    $store_blk->($idx);                 # save ivec

    &mov ("esp",$_esp);
    &popf ();
&set_label("enc_out");
    &function_end_A();
    &pushf ();                          # kludge, never executed

&set_label("enc_tail",4);
    &mov ($s0,$key eq "edi" ? $key : "");
    &mov ($key,$_out);                  # load out
    &push ($s0);                        # push ivp
    &mov ($s1,16);
    &sub ($s1,$s2);                     # 16-len: bytes to zero-fill
    &cmp ($key,$idx);                   # compare with inp
    &je (&label("enc_in_place"));
    &align (4);
    &data_word(0xA4F3F689);             # rep movsb     # copy input
    &jmp (&label("enc_skip_in_place"));
&set_label("enc_in_place");
    &lea ($key,&DWP(0,$key,$s2));
&set_label("enc_skip_in_place");
    &mov ($s2,$s1);
    &xor ($s0,$s0);
    &align (4);
    &data_word(0xAAF3F689);             # rep stosb     # zero tail
    &pop ($key);                        # pop ivp

    &mov ($idx,$_out);                  # output as input
    &mov ($s0,&DWP(0,$key));
    &mov ($s1,&DWP(4,$key));
    &mov ($_len,16);                    # len=16
    &jmp (&label("enc_loop"));          # one more spin...

#----------------------------- DECRYPT -----------------------------#
&set_label("DECRYPT",16);
    &shl ($s3,6);
    &lea ($s3,&DWP(0,$s0,$s3));         # key + grandRounds*64
    &mov ($_end,$s0);
    &mov ($_key,$s3);

    &cmp ($idx,$_out);
    &je (&label("dec_in_place"));       # in-place processing...

    &mov ($key,$_ivp);                  # load ivp
    &mov ($_tmp,$key);

&set_label("dec_loop",4);
    &mov ($s0,&DWP(0,$idx));            # read input
    &mov ($s1,&DWP(4,$idx));
    &mov ($s2,&DWP(8,$idx));
    &bswap ($s0);
    &mov ($s3,&DWP(12,$idx));
    &bswap ($s1);
    &mov ($key,$_key);                  # load key
    &bswap ($s2);
    &bswap ($s3);

    &call ("_x86_Camellia_decrypt");

    &mov ($key,$_tmp);                  # load ivp
    &mov ($idx,$_len);                  # load len

    $unswap_xor_iv->();

    &sub ($idx,16);
    &jc (&label("dec_partial"));        # fewer than 16 bytes were left
    &mov ($_len,$idx);                  # save len
    &mov ($idx,$_inp);                  # load inp
    &mov ($key,$_out);                  # load out

    $store_blk->($key);                 # write output

    &mov ($_tmp,$idx);                  # save ivp
    &lea ($idx,&DWP(16,$idx));
    &mov ($_inp,$idx);                  # save inp

    &lea ($key,&DWP(16,$key));
    &mov ($_out,$key);                  # save out

    &jnz (&label("dec_loop"));          # flags still those of the sub above
    &mov ($key,$_tmp);                  # load temp ivp
&set_label("dec_end");
    &mov ($idx,$_ivp);                  # load user ivp
    $load_blk->($key);                  # load iv
    $store_blk->($idx);                 # copy back to user
    &jmp (&label("dec_out"));

&set_label("dec_partial",4);
    &lea ($key,$ivec);
    $store_blk->($key);                 # dump output to stack
    &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
    &mov ($idx eq "esi" ? $idx : "",$key);
    &mov ($key eq "edi" ? $key : "",$_out);     # load out
    &data_word(0xA4F3F689);             # rep movsb     # copy output
    &mov ($key,$_inp);                  # use inp as temp ivp
    &jmp (&label("dec_end"));

&set_label("dec_in_place",4);
&set_label("dec_in_place_loop");
    &lea ($key,$ivec);
    $load_blk->($idx);                  # read input

    &mov (&DWP(0,$key),$s0);            # copy to temp
    &mov (&DWP(4,$key),$s1);
    &mov (&DWP(8,$key),$s2);
    &bswap ($s0);
    &mov (&DWP(12,$key),$s3);
    &bswap ($s1);
    &mov ($key,$_key);                  # load key
    &bswap ($s2);
    &bswap ($s3);

    &call ("_x86_Camellia_decrypt");

    &mov ($key,$_ivp);                  # load ivp
    &mov ($idx,$_out);                  # load out

    $unswap_xor_iv->();

    $store_blk->($idx);                 # write output

    &lea ($idx,&DWP(16,$idx));
    &mov ($_out,$idx);                  # save out

    &lea ($idx,$ivec);
    $load_blk->($idx);                  # read temp
    $store_blk->($key);                 # copy iv

    &mov ($idx,$_inp);                  # load inp

    &lea ($idx,&DWP(16,$idx));
    &mov ($_inp,$idx);                  # save inp

    &mov ($s2,$_len);                   # load len
    &sub ($s2,16);
    &jc (&label("dec_in_place_partial"));
    &mov ($_len,$s2);                   # save len
    &jnz (&label("dec_in_place_loop"));
    &jmp (&label("dec_out"));

&set_label("dec_in_place_partial",4);
    # one can argue if this is actually required...
    &mov ($key eq "edi" ? $key : "",$_out);
    &lea ($idx eq "esi" ? $idx : "",$ivec);
    &lea ($key,&DWP(0,$key,$s2));
    &lea ($idx,&DWP(16,$idx,$s2));
    &neg ($s2 eq "ecx" ? $s2 : "");
    &data_word(0xA4F3F689);             # rep movsb     # restore tail

&set_label("dec_out",4);
    &mov ("esp",$_esp);
    &popf ();
&function_end("Camellia_cbc_encrypt");
}
|
---|
1145 |
|
---|
# Identification string for the generated module.
&asciz("Camellia for x86 by <appro\@openssl.org>");

# Hand control back to the perlasm framework to finish the output.
&asm_finish();

# Write errors on a buffered handle may only show up at close time,
# so a failed close must abort the script.
close(STDOUT) || die "error closing STDOUT: $!";
|
---|