1 | #! /usr/bin/env perl
|
---|
2 | # Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
|
---|
3 | #
|
---|
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use
|
---|
5 | # this file except in compliance with the License. You can obtain a copy
|
---|
6 | # in the file LICENSE in the source distribution or at
|
---|
7 | # https://www.openssl.org/source/license.html
|
---|
8 |
|
---|
9 | #
|
---|
10 | # ====================================================================
|
---|
11 | # Written by Andy Polyakov <[email protected]> for the OpenSSL
|
---|
12 | # project. The module is, however, dual licensed under OpenSSL and
|
---|
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further
|
---|
14 | # details see http://www.openssl.org/~appro/cryptogams/.
|
---|
15 | # ====================================================================
|
---|
16 | #
|
---|
17 | # Version 4.3.
|
---|
18 | #
|
---|
19 | # You might fail to appreciate this module performance from the first
|
---|
20 | # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
|
---|
21 | # to be *the* best Intel C compiler without -KPIC, performance appears
|
---|
22 | # to be virtually identical... But try to re-configure with shared
|
---|
23 | # library support... Aha! Intel compiler "suddenly" lags behind by 30%
|
---|
24 | # [on P4, more on others]:-) And if compared to position-independent
|
---|
25 | # code generated by GNU C, this code performs *more* than *twice* as
|
---|
26 | # fast! Yes, all this buzz about PIC means that unlike other hand-
|
---|
27 | # coded implementations, this one was explicitly designed to be safe
|
---|
28 | # to use even in shared library context... This also means that this
|
---|
29 | # code isn't necessarily absolutely fastest "ever," because in order
|
---|
30 | # to achieve position independence an extra register has to be
|
---|
31 | # off-loaded to stack, which affects the benchmark result.
|
---|
32 | #
|
---|
33 | # Special note about instruction choice. Do you recall RC4_INT code
|
---|
34 | # performing poorly on P4? It might be the time to figure out why.
|
---|
35 | # RC4_INT code implies effective address calculations in base+offset*4
|
---|
36 | # form. Trouble is that it seems that offset scaling turned to be
|
---|
37 | # critical path... At least eliminating scaling resulted in 2.8x RC4
|
---|
38 | # performance improvement [as you might recall]. As AES code is hungry
|
---|
39 | # for scaling too, I [try to] avoid the latter by favoring off-by-2
|
---|
40 | # shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
|
---|
41 | #
|
---|
42 | # As was shown by Dean Gaudet, the above note turned out to be
|
---|
43 | # void. Performance improvement with off-by-2 shifts was observed on
|
---|
44 | # intermediate implementation, which was spilling yet another register
|
---|
45 | # to stack... Final offset*4 code below runs just a tad faster on P4,
|
---|
46 | # but exhibits up to 10% improvement on other cores.
|
---|
47 | #
|
---|
48 | # Second version is "monolithic" replacement for aes_core.c, which in
|
---|
49 | # addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
|
---|
50 | # This made it possible to implement little-endian variant of the
|
---|
51 | # algorithm without modifying the base C code. Motivating factor for
|
---|
52 | # the undertaken effort was that it appeared that in tight IA-32
|
---|
53 | # register window little-endian flavor could achieve slightly higher
|
---|
54 | # Instruction Level Parallelism, and it indeed resulted in up to 15%
|
---|
55 | # better performance on most recent µ-archs...
|
---|
56 | #
|
---|
57 | # Third version adds AES_cbc_encrypt implementation, which resulted in
|
---|
58 | # up to 40% performance improvement of CBC benchmark results. 40% was
|
---|
59 | # observed on P4 core, where "overall" improvement coefficient, i.e. if
|
---|
60 | # compared to PIC generated by GCC and in CBC mode, was observed to be
|
---|
61 | # as large as 4x:-) CBC performance is virtually identical to ECB now
|
---|
62 | # and on some platforms even better, e.g. 17.6 "small" cycles/byte on
|
---|
63 | # Opteron, because certain function prologues and epilogues are
|
---|
64 | # effectively taken out of the loop...
|
---|
65 | #
|
---|
66 | # Version 3.2 implements compressed tables and prefetch of these tables
|
---|
67 | # in CBC[!] mode. Former means that 3/4 of table references are now
|
---|
68 | # misaligned, which unfortunately has negative impact on elder IA-32
|
---|
69 | # implementations, Pentium suffered 30% penalty, PIII - 10%.
|
---|
70 | #
|
---|
71 | # Version 3.3 avoids L1 cache aliasing between stack frame and
|
---|
72 | # S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
|
---|
73 | # latter is achieved by copying the key schedule to controlled place in
|
---|
74 | # stack. This unfortunately has rather strong impact on small block CBC
|
---|
75 | # performance, ~2x deterioration on 16-byte block if compared to 3.3.
|
---|
76 | #
|
---|
77 | # Version 3.5 checks if there is L1 cache aliasing between user-supplied
|
---|
78 | # key schedule and S-boxes and abstains from copying the former if
|
---|
79 | # there is none. This allows end-user to consciously retain small block
|
---|
80 | # performance by aligning key schedule in specific manner.
|
---|
81 | #
|
---|
82 | # Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
|
---|
83 | #
|
---|
84 | # Current ECB performance numbers for 128-bit key in CPU cycles per
|
---|
85 | # processed byte [measure commonly used by AES benchmarkers] are:
|
---|
86 | #
|
---|
87 | # small footprint fully unrolled
|
---|
88 | # P4 24 22
|
---|
89 | # AMD K8 20 19
|
---|
90 | # PIII 25 23
|
---|
91 | # Pentium 81 78
|
---|
92 | #
|
---|
93 | # Version 3.7 reimplements outer rounds as "compact." Meaning that
|
---|
94 | # first and last rounds reference compact 256 bytes S-box. This means
|
---|
95 | # that first round consumes a lot more CPU cycles and that encrypt
|
---|
96 | # and decrypt performance becomes asymmetric. Encrypt performance
|
---|
97 | # drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
|
---|
98 | # aggressively pre-fetched.
|
---|
99 | #
|
---|
100 | # Version 4.0 effectively rolls back to 3.6 and instead implements
|
---|
101 | # additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
|
---|
102 | # which use exclusively 256 byte S-box. These functions are to be
|
---|
103 | # called in modes not concealing plain text, such as ECB, or when
|
---|
104 | # we're asked to process smaller amount of data [or unconditionally
|
---|
105 | # on hyper-threading CPU]. Currently it's called unconditionally from
|
---|
106 | # AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
|
---|
107 | # still needs to be modified to switch between slower and faster
|
---|
108 | # mode when appropriate... But in either case benchmark landscape
|
---|
109 | # changes dramatically and below numbers are CPU cycles per processed
|
---|
110 | # byte for 128-bit key.
|
---|
111 | #
|
---|
112 | # ECB encrypt ECB decrypt CBC large chunk
|
---|
113 | # P4 52[54] 83[95] 23
|
---|
114 | # AMD K8 46[41] 66[70] 18
|
---|
115 | # PIII 41[50] 60[77] 24
|
---|
116 | # Core 2 31[36] 45[64] 18.5
|
---|
117 | # Atom 76[100] 96[138] 60
|
---|
118 | # Pentium 115 150 77
|
---|
119 | #
|
---|
120 | # Version 4.1 switches to compact S-box even in key schedule setup.
|
---|
121 | #
|
---|
122 | # Version 4.2 prefetches compact S-box in every SSE round or in other
|
---|
123 | # words every cache-line is *guaranteed* to be accessed within ~50
|
---|
124 | # cycles window. Why just SSE? Because it's needed on hyper-threading
|
---|
125 | # CPU! Which is also why it's prefetched with 64 byte stride. Best
|
---|
126 | # part is that it has no negative effect on performance:-)
|
---|
127 | #
|
---|
128 | # Version 4.3 implements switch between compact and non-compact block
|
---|
129 | # functions in AES_cbc_encrypt depending on how much data was asked
|
---|
130 | # to be processed in one stroke.
|
---|
131 | #
|
---|
132 | ######################################################################
|
---|
133 | # Timing attacks are classified in two classes: synchronous when
|
---|
134 | # attacker consciously initiates cryptographic operation and collects
|
---|
135 | # timing data of various character afterwards, and asynchronous when
|
---|
136 | # malicious code is executed on same CPU simultaneously with AES,
|
---|
137 | # instruments itself and performs statistical analysis of this data.
|
---|
138 | #
|
---|
139 | # As far as synchronous attacks go the root to the AES timing
|
---|
140 | # vulnerability is twofold. Firstly, of 256 S-box elements at most 160
|
---|
141 | # are referred to in single 128-bit block operation. Well, in C
|
---|
142 | # implementation with 4 distinct tables it's actually as little as 40
|
---|
143 | # references per 256 elements table, but anyway... Secondly, even
|
---|
144 | # though S-box elements are clustered into smaller amount of cache-
|
---|
145 | # lines, smaller than 160 and even 40, it turned out that for certain
|
---|
146 | # plain-text pattern[s] or simply put chosen plain-text and given key
|
---|
147 | # few cache-lines remain unaccessed during block operation. Now, if
|
---|
148 | # attacker can figure out this access pattern, he can deduce the key
|
---|
149 | # [or at least part of it]. The natural way to mitigate this kind of
|
---|
150 | # attacks is to minimize the amount of cache-lines in S-box and/or
|
---|
151 | # prefetch them to ensure that every one is accessed for more uniform
|
---|
152 | # timing. But note that *if* plain-text was concealed in such way that
|
---|
153 | # input to block function is distributed *uniformly*, then attack
|
---|
154 | # wouldn't apply. Now note that some encryption modes, most notably
|
---|
155 | # CBC, do mask the plain-text in this exact way [secure cipher output
|
---|
156 | # is distributed uniformly]. Yes, one still might find input that
|
---|
157 | # would reveal the information about given key, but if amount of
|
---|
158 | # candidate inputs to be tried is larger than amount of possible key
|
---|
159 | # combinations then attack becomes infeasible. This is why revised
|
---|
160 | # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
|
---|
161 | # of data is to be processed in one stroke. The current size limit of
|
---|
162 | # 512 bytes is chosen to provide same [diminishingly low] probability
|
---|
163 | # for cache-line to remain untouched in large chunk operation with
|
---|
164 | # large S-box as for single block operation with compact S-box and
|
---|
165 | # surely needs more careful consideration...
|
---|
166 | #
|
---|
167 | # As for asynchronous attacks. There are two flavours: attacker code
|
---|
168 | # being interleaved with AES on hyper-threading CPU at *instruction*
|
---|
169 | # level, and two processes time sharing single core. As for latter.
|
---|
170 | # Two vectors. 1. Given that attacker process has higher priority,
|
---|
171 | # yield execution to process performing AES just before timer fires
|
---|
172 | # off the scheduler, immediately regain control of CPU and analyze the
|
---|
173 | # cache state. For this attack to be efficient attacker would have to
|
---|
174 | # effectively slow down the operation by several *orders* of magnitude,
|
---|
175 | # by ratio of time slice to duration of handful of AES rounds, which
|
---|
176 | # is unlikely to remain unnoticed. Not to mention that this also means
|
---|
177 | # that he would spend correspondingly more time to collect enough
|
---|
178 | # statistical data to mount the attack. It's probably appropriate to
|
---|
179 | # say that if adversary reckons that this attack is beneficial and
|
---|
180 | # risks to be noticed, you probably have larger problems giving him the
|
---|
181 | # mere opportunity. In other words suggested code design expects you
|
---|
182 | # to preclude/mitigate this attack by overall system security design.
|
---|
183 | # 2. Attacker manages to make his code interrupt driven. In order for
|
---|
184 | # this kind of attack to be feasible, interrupt rate has to be high
|
---|
185 | # enough, again comparable to duration of handful of AES rounds. But
|
---|
186 | # is there interrupt source of such rate? Hardly, not even 1Gbps NIC
|
---|
187 | # generates interrupts at such raging rate...
|
---|
188 | #
|
---|
189 | # And now back to the former, hyper-threading CPU or more specifically
|
---|
190 | # Intel P4. Recall that asynchronous attack implies that malicious
|
---|
191 | # code instruments itself. And naturally instrumentation granularity
|
---|
192 | # has to be noticeably lower than duration of codepath accessing S-box.
|
---|
193 | # Given that all cache-lines are accessed during that time that is.
|
---|
194 | # Current implementation accesses *all* cache-lines within ~50 cycles
|
---|
195 | # window, which is actually *less* than RDTSC latency on Intel P4!
|
---|
196 |
|
---|
# Make x86asm.pl loadable: search the directory this script lives in and
# then ../../perlasm relative to it. If $0 carries no directory part the
# match fails and $dir is whatever $1 held before (normally undefined,
# i.e. an empty prefix).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present, names the output file.
# The three-argument form of open keeps the file name from ever being
# interpreted as part of the mode, and a failure to create the file now
# aborts the run instead of silently discarding the generated code.
$output = pop;
if ($output) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

# $x86only is true when the last remaining argument is the string "386".
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
&static_label("AES_Te");
&static_label("AES_Td");
|
---|
206 |
|
---|
# Register roles used throughout the generator: four 32-bit state words,
# the key schedule pointer, a scratch accumulator and the table pointer.
($s0,$s1,$s2,$s3) = ("eax","ebx","ecx","edx");
($key,$acc,$tbl)  = ("edi","esi","ebp");

# Stack frame as seen from inside the _[x86|sse]_AES_* routines. The
# frame itself is allocated by the caller; offset 0 is the return
# address pushed by the call.
$__ra  = &DWP(0,"esp");		# return address
$__s0  = &DWP(4,"esp");		# s0 backing store
$__s1  = &DWP(8,"esp");		# s1 backing store
$__s2  = &DWP(12,"esp");	# s2 backing store
$__s3  = &DWP(16,"esp");	# s3 backing store
$__key = &DWP(20,"esp");	# pointer to key schedule
$__end = &DWP(24,"esp");	# pointer to end of key schedule
$__tbl = &DWP(28,"esp");	# %ebp backing store

# The same frame as seen from the AES_[en|de]crypt routines: every slot
# is 4 bytes lower because no return address sits on top, so $_tbl here
# is the slot that the inner routines address as $__tbl.
$_tbl = &DWP(24,"esp");
$_esp = &DWP(28,"esp");
|
---|
230 |
|
---|
# Emit every argument as a pair of identical data words, i.e. calls
# data_word(x,x) for each x. Processing stops at the first undefined
# argument.
sub _data_word()
{   while (defined(my $word = shift)) {
	&data_word($word,$word);
    }
}
|
---|
232 |
|
---|
# Chunks smaller than $speed_limit are processed with the compact
# routine in CBC mode.
$speed_limit = 512;

# $small_footprint=1 code is ~5% slower [on recent µ-archs], but
# ~5 times smaller! Compact code is favoured to minimize cache
# contention and in hope to "collect" the 5% back in real-life
# applications...
$small_footprint = 1;

# Shift "vertically". Defaults to 0 because of its proof-of-concept
# status...
$vertical_spin = 0;

# Note that there is no decvert(), and that the last encryption round is
# performed with "horizontal" shifts. This is because the "vertical"
# implementation [one which groups shifts on a given $s[i] to form a
# "column," unlike the "horizontal" one, which groups shifts on different
# $s[i] to form a "row"] is work in progress. It was observed to run a
# few percent faster on Intel cores, but not on AMD. On AMD K8 core it is
# a whole 12% slower:-( So we face a trade-off... Shall it be resolved
# some day? Till then the code is considered experimental and by default
# remains dormant...
|
---|
252 |
|
---|
# encvert($te,$s0,$s1,$s2,$s3)
#
# Experimental "vertical" round body: all four byte lanes of one input
# word are consumed before moving on to the next word. Input words are
# processed in the order s0, s3, s2, s1; each byte indexes the table at
# $te with an 8-byte stride and a displacement of 0..3, and the lookups
# are accumulated (mov for s0, xor afterwards) into the four output
# words, which replace the inputs in place.
#
# Scratch: $acc and $key. s2 and s1 are parked at 4(%esp) and 8(%esp)
# while their registers are reused as accumulators, and $key is reloaded
# from $__key before returning.
sub encvert()
{   my ($te,$w0,$w1,$w2,$w3) = @_;
    my $ta = $acc;
    my $tb = $key;

	&mov	($ta,$w3);				# copy s3
	&mov	(&DWP(4,"esp"),$w2);			# save s2
	&mov	($tb,$w0);				# copy s0
	&mov	(&DWP(8,"esp"),$w1);			# save s1

	# --- lanes of s0 initialise all four accumulators
	&movz	($w2,&HB($w0));
	&and	($w0,0xFF);
	&mov	($w0,&DWP(0,$te,$w0,8));		# s0>>0
	&shr	($tb,16);
	&mov	($w3,&DWP(3,$te,$w2,8));		# s0>>8
	&movz	($w1,&HB($tb));
	&and	($tb,0xFF);
	&mov	($w2,&DWP(2,$te,$tb,8));		# s0>>16
	&mov	($tb,$ta);
	&mov	($w1,&DWP(1,$te,$w1,8));		# s0>>24

	# --- lanes of s3
	&and	($ta,0xFF);
	&xor	($w3,&DWP(0,$te,$ta,8));		# s3>>0
	&movz	($ta,&HB($tb));
	&shr	($tb,16);
	&xor	($w2,&DWP(3,$te,$ta,8));		# s3>>8
	&movz	($ta,&HB($tb));
	&and	($tb,0xFF);
	&xor	($w1,&DWP(2,$te,$tb,8));		# s3>>16
	&mov	($tb,&DWP(4,"esp"));			# restore s2
	&xor	($w0,&DWP(1,$te,$ta,8));		# s3>>24

	# --- lanes of s2
	&mov	($ta,$tb);
	&and	($tb,0xFF);
	&xor	($w2,&DWP(0,$te,$tb,8));		# s2>>0
	&movz	($tb,&HB($ta));
	&shr	($ta,16);
	&xor	($w1,&DWP(3,$te,$tb,8));		# s2>>8
	&movz	($tb,&HB($ta));
	&and	($ta,0xFF);
	&xor	($w0,&DWP(2,$te,$ta,8));		# s2>>16
	&mov	($ta,&DWP(8,"esp"));			# restore s1
	&xor	($w3,&DWP(1,$te,$tb,8));		# s2>>24

	# --- lanes of s1
	&mov	($tb,$ta);
	&and	($ta,0xFF);
	&xor	($w1,&DWP(0,$te,$ta,8));		# s1>>0
	&movz	($ta,&HB($tb));
	&shr	($tb,16);
	&xor	($w0,&DWP(3,$te,$ta,8));		# s1>>8
	&movz	($ta,&HB($tb));
	&and	($tb,0xFF);
	&xor	($w3,&DWP(2,$te,$tb,8));		# s1>>16
	&mov	($key,$__key);				# scratch is $key again
	&xor	($w2,&DWP(1,$te,$ta,8));		# s1>>24
}
|
---|
308 |
|
---|
309 | # Another experimental routine, which features "horizontal spin," but
|
---|
310 | # eliminates one reference to stack. Strangely enough runs slower...
|
---|
# Another experimental round body. It keeps the "horizontal spin" (one
# output word is collected at a time from bytes of different input
# words) but needs only a single stack slot, $__s0, for t[0]. According
# to the original author it nevertheless runs slower.
#
# Works directly on the global $s0..$s3 and takes no arguments. $key
# serves as the table index and $acc as the accumulator; $key is
# reloaded from $__key near the end. The trailing annotations list the
# byte numbers currently held by the register just written, with '*'
# marking the byte being extracted.
#
# NOTE(review): $te is not declared in this routine and no file-level
# definition is visible in this part of the file -- confirm where the
# table base is supposed to come from before enabling this code.
sub enchoriz()
{   my $idx = $key;
    my $sum = $acc;

	# ---- t[0] = bytes 0, 5, 10, 15
	&movz	($idx,&LB($s0));			#  3, 2, 1, 0*
	&rotr	($s2,8);				#  8,11,10, 9
	&mov	($sum,&DWP(0,$te,$idx,8));		#  0
	&movz	($idx,&HB($s1));			#  7, 6, 5*, 4
	&rotr	($s3,16);				# 13,12,15,14
	&xor	($sum,&DWP(3,$te,$idx,8));		#  5
	&movz	($idx,&HB($s2));			#  8,11,10*, 9
	&rotr	($s0,16);				#  1, 0, 3, 2
	&xor	($sum,&DWP(2,$te,$idx,8));		# 10
	&movz	($idx,&HB($s3));			# 13,12,15*,14
	&xor	($sum,&DWP(1,$te,$idx,8));		# 15, t[0] collected
	&mov	($__s0,$sum);				# t[0] saved

	# ---- t[1] = bytes 4, 14, 3, 9
	&movz	($idx,&LB($s1));			#  7, 6, 5, 4*
	&shr	($s1,16);				#  -, -, 7, 6
	&mov	($sum,&DWP(0,$te,$idx,8));		#  4
	&movz	($idx,&LB($s3));			# 13,12,15,14*
	&xor	($sum,&DWP(2,$te,$idx,8));		# 14
	&movz	($idx,&HB($s0));			#  1, 0, 3*, 2
	&and	($s3,0xffff0000);			# 13,12, -, -
	&xor	($sum,&DWP(1,$te,$idx,8));		#  3
	&movz	($idx,&LB($s2));			#  8,11,10, 9*
	&or	($s3,$s1);				# 13,12, 7, 6
	&xor	($sum,&DWP(3,$te,$idx,8));		#  9, t[1] collected
	&mov	($s1,$sum);				#  s[1]=t[1]

	# ---- t[2] = bytes 2, 7, 8, 13 (left in the accumulator)
	&movz	($idx,&LB($s0));			#  1, 0, 3, 2*
	&shr	($s2,16);				#  -, -, 8,11
	&mov	($sum,&DWP(2,$te,$idx,8));		#  2
	&movz	($idx,&HB($s3));			# 13,12, 7*, 6
	&xor	($sum,&DWP(1,$te,$idx,8));		#  7
	&movz	($idx,&HB($s2));			#  -, -, 8*,11
	&xor	($sum,&DWP(0,$te,$idx,8));		#  8
	&mov	($idx,$s3);
	&shr	($idx,24);				# 13
	&xor	($sum,&DWP(3,$te,$idx,8));		# 13, t[2] collected

	# ---- t[3] = bytes 11, 1, 6, 12 (built in $s2, finished in $s3)
	&movz	($idx,&LB($s2));			#  -, -, 8,11*
	&shr	($s0,24);				#  1*
	&mov	($s2,&DWP(1,$te,$idx,8));		# 11
	&xor	($s2,&DWP(3,$te,$s0,8));		#  1
	&mov	($s0,$__s0);				#  s[0]=t[0]
	&movz	($idx,&LB($s3));			# 13,12, 7, 6*
	&shr	($s3,16);				#   ,  ,13,12
	&xor	($s2,&DWP(2,$te,$idx,8));		#  6
	&mov	($key,$__key);				# index register is $key again
	&and	($s3,0xff);				#   ,  ,13,12*
	&mov	($s3,&DWP(0,$te,$s3,8));		# 12
	&xor	($s3,$s2);				#  s[3]=t[3] collected
	&mov	($s2,$sum);				#  s[2]=t[2]
}
|
---|
365 |
|
---|
366 | # More experimental code... SSE one... Even though this one eliminates
|
---|
367 | # *all* references to stack, it's not faster...
|
---|
# More experimental code... SSE one... Even though this one eliminates
# *all* references to stack, it's not faster...
#
# Takes no arguments; every operand is hard-coded. %eax and %ebx supply
# the bytes that are looked up, $acc (and %eax/%ebx/%edx themselves for
# three of the lookups) serves as the table index, and $tbl is the base
# of a table addressed with an 8-byte stride and a displacement of 0..3.
# %ecx collects t[0], t[1] and t[2] in turn, %edx collects t[3].
# Round-key words are taken from 16($key) [quad word, xor-ed into
# t[0,1]], 24($key) and 28($key). Results end up as t[0,1] in mm0 and
# t[2,3] in mm4, and %eax/%ebx are refilled along the way.
#
# The trailing annotations give the byte number being looked up, or the
# byte numbers held by the register just written.
# NOTE(review): presumably %eax/%ebx must already hold bytes 5,4,1,0 and
# 15,14,11,10 on entry (that is what the code leaves in them on exit) --
# no caller is visible in this part of the file, confirm.
sub sse_encbody()
{
	&movz	($acc,&LB("eax"));		#  0
	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  0
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("edx",&HB("eax"));		#  1
	&mov	("edx",&DWP(3,$tbl,"edx",8));	#  1
	&shr	("eax",16);			#  5, 4

	&movz	($acc,&LB("ebx"));		# 10
	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&movz	($acc,&HB("ebx"));		# 11
	&xor	("edx",&DWP(1,$tbl,$acc,8));	# 11
	&shr	("ebx",16);			# 15,14

	&movz	($acc,&HB("eax"));		#  5
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  5
	&movq	("mm3",QWP(16,$key));
	&movz	($acc,&HB("ebx"));		# 15
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	# 15
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  4
	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  4
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&movz	($acc,&LB("ebx"));		# 14
	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 14
	&movd	("ebx","mm6");			# 13,12, 9, 8

	&movz	($acc,&HB("eax"));		#  3
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  3
	&movz	($acc,&HB("ebx"));		#  9
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  9
	&movd	("mm1","ecx");			# t[1] collected

	&movz	($acc,&LB("eax"));		#  2
	&mov	("ecx",&DWP(2,$tbl,$acc,8));	#  2
	&shr	("eax",16);			#  7, 6
	&punpckldq	("mm0","mm1");		# t[0,1] collected
	&movz	($acc,&LB("ebx"));		#  8
	&xor	("ecx",&DWP(0,$tbl,$acc,8));	#  8
	&shr	("ebx",16);			# 13,12

	&movz	($acc,&HB("eax"));		#  7
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  7
	&pxor	("mm0","mm3");			# t[0,1] ^= quad word at 16($key)
	&movz	("eax",&LB("eax"));		#  6
	&xor	("edx",&DWP(2,$tbl,"eax",8));	#  6
	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
	&movz	($acc,&HB("ebx"));		# 13
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	# 13
	&xor	("ecx",&DWP(24,$key));		# t[2]
	&movd	("mm4","ecx");			# t[2] collected
	&movz	("ebx",&LB("ebx"));		# 12
	&xor	("edx",&DWP(0,$tbl,"ebx",8));	# 12
	&shr	("ecx",16);			# upper half of t[2] -> low 16 bits
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&mov	("ebx",&DWP(28,$key));		# t[3]
	&xor	("ebx","edx");
	&movd	("mm5","ebx");			# t[3] collected
	&and	("ebx",0xffff0000);		# keep upper half of t[3] ...
	&or	("ebx","ecx");			# ... and pair it with upper half of t[2]

	&punpckldq	("mm4","mm5");		# t[2,3] collected
}
|
---|
434 |
|
---|
435 | ######################################################################
|
---|
436 | # "Compact" block function
|
---|
437 | ######################################################################
|
---|
438 |
|
---|
# enccompact($i,$te,$sA,$sB,$sC,$sD[,$flag])
#
# Emits the code that assembles output word $i of a "compact" round from
# four byte-wide lookups at -128($te,index,1): byte 0 of $sA goes to bits
# 0-7, byte 1 of $sB to bits 8-15, byte 2 of $sC to bits 16-23 and
# byte 3 of $sD to bits 24-31.
#
# Where the result goes depends on $i:
#   0,1  stored to 4+4*$i(%esp), i.e. the $__s0/$__s1 slots;
#   2    left in $acc;
#   3    built in $sA itself, after which $sB and $sC are reloaded from
#        $__s0/$__s1 and $sD receives $acc (the word from $i==2).
# For $i==1 and $i==2 the routine also pre-shifts/masks input registers
# that later invocations consume, so the four invocations must be issued
# in the order 0,1,2,3 with the register list rotated each time.
#
# $key is the scratch register. For $i==3 it is reloaded from $__key --
# unless any extra argument is passed, in which case that reload is
# suppressed (used by the caller when $key is about to be reused as
# scratch anyway).
sub enccompact()
{   my $reload = \&mov;
    if ($#_>5) { splice(@_,6); $reload = sub {}; }

    my ($i,$te,@s) = @_;
    my $t   = $key;
    my $res = ($i==3) ? $s[0] : $acc;

	# ---- byte 0, taken from $s[0]
	if ($i==3)	{ &$reload($key,$__key); }		##%edx
	else		{ &mov	($res,$s[0]); }
	&and	($res,0xFF);
	&shr	($s[0],16)		if ($i==1);		#%ebx[1]
	&shr	($s[0],24)		if ($i==2);		#%ecx[2]
	&movz	($res,&BP(-128,$te,$res,1));

	# ---- byte 1, taken from $s[1]
	$t = $s[1]			if ($i==3);		##%eax
	&movz	($t,&HB($s[1]));
	&movz	($t,&BP(-128,$te,$t,1));
	&shl	($t,8);
	&xor	($res,$t);

	# ---- byte 2, taken from $s[2]
	if ($i==3)	{ $t = $s[2];
			  &mov	($s[1],$__s0); }		##%ebx
	else		{ &mov	($t,$s[2]);
			  &shr	($t,16); }
	&and	($s[1],0xFF)		if ($i==2);		#%edx[2]
	&and	($t,0xFF);
	&movz	($t,&BP(-128,$te,$t,1));
	&shl	($t,16);
	&xor	($res,$t);

	# ---- byte 3, taken from $s[3]
	if    ($i==3)	{ $t = $s[3];
			  &mov	($s[2],$__s1); }		##%ecx
	elsif ($i==2)	{ &movz	($t,&HB($s[3])); }		#%ebx[2]
	else		{ &mov	($t,$s[3]);
			  &shr	($t,24); }
	&movz	($t,&BP(-128,$te,$t,1));
	&shl	($t,24);
	&xor	($res,$t);

	# ---- deliver the result
	&mov	(&DWP(4+4*$i,"esp"),$res)	if ($i<2);
	&mov	($s[3],$acc)			if ($i==3);
	&comment();
}
|
---|
482 |
|
---|
# enctransform($i)
#
# Emits the linear mixing step for state word number $i (0..3 selects
# $s0..$s3), updating that register in place. With r0 the input word and
# r2 its byte-wise doubling (shift left by one, reduced with 0x1b in
# every byte whose top bit was set), the code computes
#
#	ROTATE(r2^r0,24) ^ r2 ^ ROTATE-RIGHT(r0,16) ^ ROTATE-RIGHT(r0,24)
#
# -- presumably the AES MixColumns transform of one column.
#
# On entry $tbl must hold 0x80808080. It is used as the top-bit mask and
# then as scratch, and is re-armed with 0x80808080 on exit except when
# $i==1. $key and $acc are clobbered.
sub enctransform()
{   my $i    = shift;
    my $word = ($s0,$s1,$s2,$s3)[$i];
    my $t0   = $tbl;
    my $t1   = $key;

	&and	($t0,$word);			# top bit of every byte
	&lea	($t1,&DWP(0,$word,$word));	# word+word
	&mov	($acc,$t0);
	&shr	($t0,7);
	&and	($t1,0xfefefefe);		# drop bits carried across bytes
	&sub	($acc,$t0);			# 0x7f in bytes that had top bit set
	&mov	($t0,$word);
	&and	($acc,0x1b1b1b1b);
	&rotr	($t0,16);
	&xor	($acc,$t1);			# r2
	&mov	($t1,$word);

	&xor	($word,$acc);			# r0 ^ r2
	&rotr	($t1,16+8);
	&xor	($acc,$t0);
	&rotl	($word,24);
	&xor	($acc,$t1);
	&mov	($t0,0x80808080)	if ($i!=1);	# re-arm mask for next call
	&xor	($word,$acc);			# ROTATE(r2^r0,24) ^ r2
}
|
---|
509 |
|
---|
# _x86_AES_encrypt_compact
#
# Block function built exclusively on the byte-wide table addressed as
# -128($tbl,index,1). State is passed and returned in $s0..$s3, $key
# points at the key schedule and $tbl at the table (offset by 128).
# The stack frame described by the $__* slots is provided by the caller;
# $__tbl must already hold the table pointer because $tbl is borrowed as
# a mask register inside the loop.
&function_begin_B("_x86_AES_encrypt_compact");
	# note that caller is expected to allocate stack frame for me!
	&mov	($__key,$key);			# save key

	&xor	($s0,&DWP(0,$key));		# xor with key
	&xor	($s1,&DWP(4,$key));
	&xor	($s2,&DWP(8,$key));
	&xor	($s3,&DWP(12,$key));

	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));	# presumably 2*rounds-2
	&lea	($acc,&DWP(0,$key,$acc,8));	# key + 8*that
	&mov	($__end,$acc);			# end of key schedule

	# prefetch Te4: touch the 256-byte table every 32 bytes, loading
	# alternately into $key and $acc; the loaded values are discarded
	foreach my $line (0..7) {
		&mov	(($key,$acc)[$line&1],&DWP(32*$line-128,$tbl));
	}

	&set_label("loop",16);

		# one full round: four compact lookups with the register
		# list rotated by one each time; the trailing 1 suppresses
		# the reload of $key after the fourth
		{   my @rot = ($s0,$s1,$s2,$s3);
		    foreach my $i (0..3) {
			&enccompact($i,$tbl,@rot,1);
			push(@rot,shift(@rot));
		    }
		}
		&mov	($tbl,0x80808080);	# mask expected by enctransform
		foreach my $col (2,3,0,1) {
			&enctransform($col);
		}
		&mov	($key,$__key);
		&mov	($tbl,$__tbl);		# table pointer back in place
		&add	($key,16);		# advance rd_key
		&xor	($s0,&DWP(0,$key));
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

	&cmp	($key,$__end);
	&mov	($__key,$key);
	&jb	(&label("loop"));

	# last round: lookups only, no enctransform; this time the fourth
	# call does reload $key from $__key
	{   my @rot = ($s0,$s1,$s2,$s3);
	    foreach my $i (0..3) {
		&enccompact($i,$tbl,@rot);
		push(@rot,shift(@rot));
	    }
	}

	# final round key sits 16 bytes past the last one used in the loop
	&xor	($s0,&DWP(16,$key));
	&xor	($s1,&DWP(20,$key));
	&xor	($s2,&DWP(24,$key));
	&xor	($s3,&DWP(28,$key));

	&ret	();
&function_end_B("_x86_AES_encrypt_compact");
|
---|
569 |
|
---|
570 | ######################################################################
|
---|
571 | # "Compact" SSE block function.
|
---|
572 | ######################################################################
|
---|
573 | #
|
---|
574 | # Performance is not actually extraordinary in comparison to pure
|
---|
575 | # x86 code. In particular encrypt performance is virtually the same.
|
---|
576 | # Decrypt performance on the other hand is 15-20% better on newer
|
---|
577 | # µ-archs [but we're thankful for *any* improvement here], and ~50%
|
---|
578 | # better on PIII:-) And additionally on the pros side this code
|
---|
579 | # eliminates redundant references to stack and thus relieves/
|
---|
580 | # minimizes the pressure on the memory bus.
|
---|
581 | #
|
---|
582 | # MMX register layout lsb
|
---|
583 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
---|
584 | # | mm4 | mm0 |
|
---|
585 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
---|
586 | # | s3 | s2 | s1 | s0 |
|
---|
587 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
---|
588 | # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
|
---|
589 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
|
---|
590 | #
|
---|
591 | # Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
|
---|
592 | # In these terms encryption and decryption "compact" permutation
|
---|
593 | # matrices can be depicted as following:
|
---|
594 | #
|
---|
595 | # encryption lsb # decryption lsb
|
---|
596 | # +----++----+----+----+----+ # +----++----+----+----+----+
|
---|
597 | # | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
|
---|
598 | # +----++----+----+----+----+ # +----++----+----+----+----+
|
---|
599 | # | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
|
---|
600 | # +----++----+----+----+----+ # +----++----+----+----+----+
|
---|
601 | # | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
|
---|
602 | # +----++----+----+----+----+ # +----++----+----+----+----+
|
---|
603 | # | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
|
---|
604 | # +----++----+----+----+----+ # +----++----+----+----+----+
|
---|
605 | #
|
---|
606 | ######################################################################
|
---|
607 | # Why not xmm registers? Short answer. It was actually tested and
|
---|
608 | # was not any faster, but *on the contrary*, most notably on Intel CPUs.
|
---|
609 | # Longer answer. Main advantage of using mm registers is that movd
|
---|
610 | # latency is lower, especially on Intel P4. While arithmetic
|
---|
611 | # instructions are twice as many, they can be scheduled every cycle
|
---|
612 | # and not every second one when they are operating on xmm register,
|
---|
613 | # so that "arithmetic throughput" remains virtually the same. And
|
---|
614 | # finally the code can be executed even on elder SSE-only CPUs:-)
|
---|
615 |
|
---|
# sse_enccompact: emit one byte-substitution + byte-regrouping pass for the
# MMX code path.
#
# On entry the 16-byte state is expected in mm0 (bytes 0..7) and mm4
# (bytes 8..15).  Each byte is replaced by a byte lookup at -128($tbl) and
# the results are regrouped according to the "encryption" layout drawn in
# the comment block that precedes this sub.  On exit t[0],t[1] are in mm0
# and t[2],t[3] are in mm4.
#
# The trailing "# n" comments name the state byte an instruction works on.
#
# Clobbers: eax, ebx, ecx, edx, $acc, mm1, mm2, mm5, mm6.  $key is used as
# scratch as well; it is parked in $__key first and reloaded at the end.
#
# Loads, shifts and ORs belonging to different bytes are interleaved on
# purpose, so keep the instruction order intact when editing.
sub sse_enccompact()
{
    &pshufw     ("mm1","mm0",0x08);             #  5, 4, 1, 0
    &pshufw     ("mm5","mm4",0x0d);             # 15,14,11,10
    &movd       ("eax","mm1");                  #  5, 4, 1, 0
    &movd       ("ebx","mm5");                  # 15,14,11,10
    &mov        ($__key,$key);                  # free $key for use as scratch

    &movz       ($acc,&LB("eax"));              #  0
    &movz       ("edx",&HB("eax"));             #  1
    &pshufw     ("mm2","mm0",0x0d);             #  7, 6, 3, 2
    &movz       ("ecx",&BP(-128,$tbl,$acc,1));  #  0
    &movz       ($key,&LB("ebx"));              # 10
    &movz       ("edx",&BP(-128,$tbl,"edx",1)); #  1
    &shr        ("eax",16);                     #  5, 4
    &shl        ("edx",8);                      #  1

    &movz       ($acc,&BP(-128,$tbl,$key,1));   # 10
    &movz       ($key,&HB("ebx"));              # 11
    &shl        ($acc,16);                      # 10
    &pshufw     ("mm6","mm4",0x08);             # 13,12, 9, 8
    &or         ("ecx",$acc);                   # 10
    &movz       ($acc,&BP(-128,$tbl,$key,1));   # 11
    &movz       ($key,&HB("eax"));              #  5
    &shl        ($acc,24);                      # 11
    &shr        ("ebx",16);                     # 15,14
    &or         ("edx",$acc);                   # 11

    &movz       ($acc,&BP(-128,$tbl,$key,1));   #  5
    &movz       ($key,&HB("ebx"));              # 15
    &shl        ($acc,8);                       #  5
    &or         ("ecx",$acc);                   #  5
    &movz       ($acc,&BP(-128,$tbl,$key,1));   # 15
    &movz       ($key,&LB("eax"));              #  4
    &shl        ($acc,24);                      # 15
    &or         ("ecx",$acc);                   # 15

    &movz       ($acc,&BP(-128,$tbl,$key,1));   #  4
    &movz       ($key,&LB("ebx"));              # 14
    &movd       ("eax","mm2");                  #  7, 6, 3, 2
    &movd       ("mm0","ecx");                  # t[0] collected
    &movz       ("ecx",&BP(-128,$tbl,$key,1));  # 14
    &movz       ($key,&HB("eax"));              #  3
    &shl        ("ecx",16);                     # 14
    &movd       ("ebx","mm6");                  # 13,12, 9, 8
    &or         ("ecx",$acc);                   # 14

    &movz       ($acc,&BP(-128,$tbl,$key,1));   #  3
    &movz       ($key,&HB("ebx"));              #  9
    &shl        ($acc,24);                      #  3
    &or         ("ecx",$acc);                   #  3
    &movz       ($acc,&BP(-128,$tbl,$key,1));   #  9
    &movz       ($key,&LB("ebx"));              #  8
    &shl        ($acc,8);                       #  9
    &shr        ("ebx",16);                     # 13,12
    &or         ("ecx",$acc);                   #  9

    &movz       ($acc,&BP(-128,$tbl,$key,1));   #  8
    &movz       ($key,&LB("eax"));              #  2
    &shr        ("eax",16);                     #  7, 6
    &movd       ("mm1","ecx");                  # t[1] collected
    &movz       ("ecx",&BP(-128,$tbl,$key,1));  #  2
    &movz       ($key,&HB("eax"));              #  7
    &shl        ("ecx",16);                     #  2
    &and        ("eax",0xff);                   #  6
    &or         ("ecx",$acc);                   #  2

    &punpckldq  ("mm0","mm1");                  # t[0,1] collected

    &movz       ($acc,&BP(-128,$tbl,$key,1));   #  7
    &movz       ($key,&HB("ebx"));              # 13
    &shl        ($acc,24);                      #  7
    &and        ("ebx",0xff);                   # 12
    &movz       ("eax",&BP(-128,$tbl,"eax",1)); #  6
    &or         ("ecx",$acc);                   #  7
    &shl        ("eax",16);                     #  6
    &movz       ($acc,&BP(-128,$tbl,$key,1));   # 13
    &or         ("edx","eax");                  #  6
    &shl        ($acc,8);                       # 13
    &movz       ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
    &or         ("ecx",$acc);                   # 13
    &or         ("edx","ebx");                  # 12
    &mov        ($key,$__key);                  # restore key pointer
    &movd       ("mm4","ecx");                  # t[2] collected
    &movd       ("mm5","edx");                  # t[3] collected

    &punpckldq  ("mm4","mm5");                  # t[2,3] collected
}
704 |
|
---|
# _sse_AES_encrypt_compact: MMX/SSE flavour of the compact block encryption.
# Only generated when $x86only is false.
#
# In:   mm0 = state bytes 0..7, mm4 = state bytes 8..15
#       $key = pointer to the key schedule (round count read from 240($key))
#       $tbl = biased pointer into a byte table (all lookups use -128($tbl))
#       esp  = frame prepared by the caller; 8(esp)..15(esp) are overwritten
#              with the 0x1b mask and $__end is written as well
# Out:  mm0/mm4 = encrypted block
# Uses: $acc, $s0..$s3 (the latter only as targets of prefetch loads) plus
#       everything sse_enccompact clobbers.
if (!$x86only) {
&function_begin_B("_sse_AES_encrypt_compact");
    &pxor   ("mm0",&QWP(0,$key));       #  7, 6, 5, 4, 3, 2, 1, 0
    &pxor   ("mm4",&QWP(8,$key));       # 15,14,13,12,11,10, 9, 8

    # note that caller is expected to allocate stack frame for me!
    &mov    ($acc,&DWP(240,$key));      # load key->rounds
    &lea    ($acc,&DWP(-2,$acc,$acc));  # 2*rounds-2
    &lea    ($acc,&DWP(0,$key,$acc,8)); # key + 16*(rounds-1)
    &mov    ($__end,$acc);              # end of key schedule

    # 0x1b replicated into the 8 bytes at 8(esp); the loop reloads it
    # into mm2 as the per-byte reduction mask
    &mov    ($s0,0x1b1b1b1b);           # magic constant
    &mov    (&DWP(8,"esp"),$s0);
    &mov    (&DWP(12,"esp"),$s0);

    # prefetch Te4: one dword load every 32 bytes across the 256-byte table
    &mov    ($s0,&DWP(0-128,$tbl));
    &mov    ($s1,&DWP(32-128,$tbl));
    &mov    ($s2,&DWP(64-128,$tbl));
    &mov    ($s3,&DWP(96-128,$tbl));
    &mov    ($s0,&DWP(128-128,$tbl));
    &mov    ($s1,&DWP(160-128,$tbl));
    &mov    ($s2,&DWP(192-128,$tbl));
    &mov    ($s3,&DWP(224-128,$tbl));

    &set_label("loop",16);
        &sse_enccompact();
        &add    ($key,16);
        &cmp    ($key,$__end);
        &ja     (&label("out"));        # past the last full round: skip mixing

        # mm3/mm7 = 0x1b in every byte whose top bit is set, mm0/mm4 = byte-wise
        # doubling of the state XORed with that mask (labelled r2 below)
        &movq   ("mm2",&QWP(8,"esp"));
        &pxor   ("mm3","mm3");          &pxor   ("mm7","mm7");
        &movq   ("mm1","mm0");          &movq   ("mm5","mm4");      # r0
        &pcmpgtb("mm3","mm0");          &pcmpgtb("mm7","mm4");
        &pand   ("mm3","mm2");          &pand   ("mm7","mm2");
        &pshufw ("mm2","mm0",0xb1);     &pshufw ("mm6","mm4",0xb1); # ROTATE(r0,16)
        &paddb  ("mm0","mm0");          &paddb  ("mm4","mm4");
        &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");      # = r2
        &pshufw ("mm3","mm2",0xb1);     &pshufw ("mm7","mm6",0xb1); # r0
        &pxor   ("mm1","mm0");          &pxor   ("mm5","mm4");      # r0^r2
        &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");      # ^= ROTATE(r0,16)

        &movq   ("mm2","mm3");          &movq   ("mm6","mm7");
        &pslld  ("mm3",8);              &pslld  ("mm7",8);
        &psrld  ("mm2",24);             &psrld  ("mm6",24);
        &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");      # ^= r0<<8
        &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");      # ^= r0>>24

        # the integer loads below only touch the table (results are unused);
        # they are interleaved with the MMX work
        &movq   ("mm3","mm1");          &movq   ("mm7","mm5");
        &movq   ("mm2",&QWP(0,$key));   &movq   ("mm6",&QWP(8,$key));
        &psrld  ("mm1",8);              &psrld  ("mm5",8);
        &mov    ($s0,&DWP(0-128,$tbl));
        &pslld  ("mm3",24);             &pslld  ("mm7",24);
        &mov    ($s1,&DWP(64-128,$tbl));
        &pxor   ("mm0","mm1");          &pxor   ("mm4","mm5");      # ^= (r2^r0)<<8
        &mov    ($s2,&DWP(128-128,$tbl));
        &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");      # ^= (r2^r0)>>24
        &mov    ($s3,&DWP(192-128,$tbl));

        &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");      # add round key
    &jmp    (&label("loop"));

    &set_label("out",16);
    &pxor   ("mm0",&QWP(0,$key));       # final round key
    &pxor   ("mm4",&QWP(8,$key));

    &ret    ();
&function_end_B("_sse_AES_encrypt_compact");
}
775 |
|
---|
776 | ######################################################################
|
---|
777 | # Vanilla block function.
|
---|
778 | ######################################################################
|
---|
779 |
|
---|
# encstep($i, $te, @s): emit the code computing output word $i of one
# table-driven round.
#
#   $i   word number, 0..3
#   $te  register holding the table base
#   @s   the four state registers, rotated by the caller so that $s[0]
#        supplies byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
#
# The emitted code forms, with dword loads at the given byte offsets,
#   out = [$te+0+8*b0($s[0])] ^ [$te+3+8*b1($s[1])]
#       ^ [$te+2+8*b2($s[2])] ^ [$te+1+8*b3($s[3])]
#
# Where the result goes:
#   $i==0,1  stored to 4+4*$i(%esp)
#   $i==2    left in $acc
#   $i==3    built directly in $s[0]; as the other inputs die they are
#            reloaded in place ($s[1] from $__s0, $s[2] from $__s1,
#            $s[3] from $acc)
#
# $key doubles as the scratch register for $i<3, which is why step 3 starts
# by reloading it from $__key.  Step 3 uses the dying input registers
# themselves as scratch.
sub encstep()
{ my ($i,$te,@s) = @_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

    # lines marked with #%e?x[i] denote "reordered" instructions...
    # (a register is pre-shifted or pre-masked during step 1 or 2 so that a
    # later step can use it without the shift/mask of its own)
    if ($i==3)  {   &mov    ($key,$__key);              }##%edx
    else        {   &mov    ($out,$s[0]);
                    &and    ($out,0xFF);                }
    if ($i==1)  {   &shr    ($s[0],16);                 }#%ebx[1]
    if ($i==2)  {   &shr    ($s[0],24);                 }#%ecx[2]
                    &mov    ($out,&DWP(0,$te,$out,8));

    if ($i==3)  {   $tmp=$s[1];                         }##%eax
                    &movz   ($tmp,&HB($s[1]));
                    &xor    ($out,&DWP(3,$te,$tmp,8));

    if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],$__s0);     }##%ebx
    else        {   &mov    ($tmp,$s[2]);
                    &shr    ($tmp,16);                  }
    if ($i==2)  {   &and    ($s[1],0xFF);               }#%edx[2]
                    &and    ($tmp,0xFF);
                    &xor    ($out,&DWP(2,$te,$tmp,8));

    if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],$__s1);     }##%ecx
    elsif($i==2){   &movz   ($tmp,&HB($s[3]));          }#%ebx[2]
    else        {   &mov    ($tmp,$s[3]);
                    &shr    ($tmp,24);                  }
                    &xor    ($out,&DWP(1,$te,$tmp,8));
    if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);  }
    if ($i==3)  {   &mov    ($s[3],$acc);               }
                    &comment();
}
813 |
|
---|
# enclast($i, $te, @s): emit the code computing output word $i of the final
# round.  Arguments and register conventions are the same as for encstep().
#
# Instead of XORing whole table words, one byte lane is masked out of each
# looked-up dword and the four lanes are combined:
#   out = ([$te+2+8*b0($s[0])] & 0x000000ff)
#       ^ ([$te+0+8*b1($s[1])] & 0x0000ff00)
#       ^ ([$te+0+8*b2($s[2])] & 0x00ff0000)
#       ^ ([$te+2+8*b3($s[3])] & 0xff000000)
#
# Results are delivered exactly as in encstep(): words 0 and 1 go to
# 4+4*$i(%esp), word 2 stays in $acc, word 3 is built in $s[0] while the
# other three registers are reloaded from $__s0, $__s1 and $acc.
sub enclast()
{ my ($i,$te,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

    if ($i==3)  {   &mov    ($key,$__key);              }##%edx
    else        {   &mov    ($out,$s[0]);               }
                    &and    ($out,0xFF);
    if ($i==1)  {   &shr    ($s[0],16);                 }#%ebx[1]
    if ($i==2)  {   &shr    ($s[0],24);                 }#%ecx[2]
                    &mov    ($out,&DWP(2,$te,$out,8));
                    &and    ($out,0x000000ff);

    if ($i==3)  {   $tmp=$s[1];                         }##%eax
                    &movz   ($tmp,&HB($s[1]));
                    &mov    ($tmp,&DWP(0,$te,$tmp,8));
                    &and    ($tmp,0x0000ff00);
                    &xor    ($out,$tmp);

    if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],$__s0);     }##%ebx
    else        {   &mov    ($tmp,$s[2]);
                    &shr    ($tmp,16);                  }
    if ($i==2)  {   &and    ($s[1],0xFF);               }#%edx[2]
                    &and    ($tmp,0xFF);
                    &mov    ($tmp,&DWP(0,$te,$tmp,8));
                    &and    ($tmp,0x00ff0000);
                    &xor    ($out,$tmp);

    if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],$__s1);     }##%ecx
    elsif($i==2){   &movz   ($tmp,&HB($s[3]));          }#%ebx[2]
    else        {   &mov    ($tmp,$s[3]);
                    &shr    ($tmp,24);                  }
                    &mov    ($tmp,&DWP(2,$te,$tmp,8));
                    &and    ($tmp,0xff000000);
                    &xor    ($out,$tmp);
    if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);  }
    if ($i==3)  {   &mov    ($s[3],$acc);               }
}
852 |
|
---|
# _x86_AES_encrypt: integer-only, table-driven block encryption.
#
# In:   $s0..$s3 = the four input words, $key = key schedule pointer
#       (round count read from 240($key)), $tbl = table base.
#       The caller provides the stack frame ($__key, $__end, 4(%esp)... are
#       written here and by encstep/enclast).
# Out:  $s0..$s3 = encrypted block.
#
# Two code shapes are generated:
#   $small_footprint   one round body in a loop that runs until $key
#                      reaches $__end = key + 16*(rounds-1)
#   otherwise          fully unrolled; 14- and 12-round keys first run two
#                      extra rounds each and then fall through into the
#                      common 9-round "10rounds" tail
# In both shapes the last round is emitted with enclast() and followed by
# the final round-key XOR.
&function_begin_B("_x86_AES_encrypt");
    if ($vertical_spin) {
        # I need high parts of volatile registers to be accessible...
        &exch   ($s1="edi",$key="ebx");
        &mov    ($s2="esi",$acc="ecx");
    }

    # note that caller is expected to allocate stack frame for me!
    &mov    ($__key,$key);              # save key

    &xor    ($s0,&DWP(0,$key));         # xor with key
    &xor    ($s1,&DWP(4,$key));
    &xor    ($s2,&DWP(8,$key));
    &xor    ($s3,&DWP(12,$key));

    &mov    ($acc,&DWP(240,$key));      # load key->rounds

    if ($small_footprint) {
        &lea    ($acc,&DWP(-2,$acc,$acc));      # 2*rounds-2
        &lea    ($acc,&DWP(0,$key,$acc,8));     # key + 16*(rounds-1)
        &mov    ($__end,$acc);          # end of key schedule

        &set_label("loop",16);
        if ($vertical_spin) {
            &encvert($tbl,$s0,$s1,$s2,$s3);
        } else {
            &encstep(0,$tbl,$s0,$s1,$s2,$s3);
            &encstep(1,$tbl,$s1,$s2,$s3,$s0);
            &encstep(2,$tbl,$s2,$s3,$s0,$s1);
            &encstep(3,$tbl,$s3,$s0,$s1,$s2);
        }
        &add    ($key,16);              # advance rd_key
        &xor    ($s0,&DWP(0,$key));
        &xor    ($s1,&DWP(4,$key));
        &xor    ($s2,&DWP(8,$key));
        &xor    ($s3,&DWP(12,$key));
        &cmp    ($key,$__end);
        &mov    ($__key,$key);          # mov leaves the cmp flags for jb
        &jb     (&label("loop"));
    }
    else {
        &cmp    ($acc,10);
        &jle    (&label("10rounds"));
        &cmp    ($acc,12);
        &jle    (&label("12rounds"));

    &set_label("14rounds",4);
        for ($i=1;$i<3;$i++) {
            if ($vertical_spin) {
                &encvert($tbl,$s0,$s1,$s2,$s3);
            } else {
                &encstep(0,$tbl,$s0,$s1,$s2,$s3);
                &encstep(1,$tbl,$s1,$s2,$s3,$s0);
                &encstep(2,$tbl,$s2,$s3,$s0,$s1);
                &encstep(3,$tbl,$s3,$s0,$s1,$s2);
            }
            &xor    ($s0,&DWP(16*$i+0,$key));
            &xor    ($s1,&DWP(16*$i+4,$key));
            &xor    ($s2,&DWP(16*$i+8,$key));
            &xor    ($s3,&DWP(16*$i+12,$key));
        }
        &add    ($key,32);
        &mov    ($__key,$key);          # advance rd_key
    &set_label("12rounds",4);
        for ($i=1;$i<3;$i++) {
            if ($vertical_spin) {
                &encvert($tbl,$s0,$s1,$s2,$s3);
            } else {
                &encstep(0,$tbl,$s0,$s1,$s2,$s3);
                &encstep(1,$tbl,$s1,$s2,$s3,$s0);
                &encstep(2,$tbl,$s2,$s3,$s0,$s1);
                &encstep(3,$tbl,$s3,$s0,$s1,$s2);
            }
            &xor    ($s0,&DWP(16*$i+0,$key));
            &xor    ($s1,&DWP(16*$i+4,$key));
            &xor    ($s2,&DWP(16*$i+8,$key));
            &xor    ($s3,&DWP(16*$i+12,$key));
        }
        &add    ($key,32);
        &mov    ($__key,$key);          # advance rd_key
    &set_label("10rounds",4);
        for ($i=1;$i<10;$i++) {
            if ($vertical_spin) {
                &encvert($tbl,$s0,$s1,$s2,$s3);
            } else {
                &encstep(0,$tbl,$s0,$s1,$s2,$s3);
                &encstep(1,$tbl,$s1,$s2,$s3,$s0);
                &encstep(2,$tbl,$s2,$s3,$s0,$s1);
                &encstep(3,$tbl,$s3,$s0,$s1,$s2);
            }
            &xor    ($s0,&DWP(16*$i+0,$key));
            &xor    ($s1,&DWP(16*$i+4,$key));
            &xor    ($s2,&DWP(16*$i+8,$key));
            &xor    ($s3,&DWP(16*$i+12,$key));
        }
    }

    if ($vertical_spin) {
        # "reincarnate" some registers for "horizontal" spin...
        &mov    ($s1="ebx",$key="edi");
        &mov    ($s2="ecx",$acc="esi");
    }
    &enclast(0,$tbl,$s0,$s1,$s2,$s3);
    &enclast(1,$tbl,$s1,$s2,$s3,$s0);
    &enclast(2,$tbl,$s2,$s3,$s0,$s1);
    &enclast(3,$tbl,$s3,$s0,$s1,$s2);

    # last round key: the next 16 bytes when looping, 160 bytes on in the
    # unrolled shape because $key was not advanced inside the 10rounds tail
    &add    ($key,$small_footprint?16:160);
    &xor    ($s0,&DWP(0,$key));
    &xor    ($s1,&DWP(4,$key));
    &xor    ($s2,&DWP(8,$key));
    &xor    ($s3,&DWP(12,$key));

    &ret    ();
967 |
|
---|
# AES_Te: 256-entry encryption table, 64-byte aligned, placed right behind
# the code of _x86_AES_encrypt.
# encstep()/enclast() address it as &DWP(n,$te,idx,8), i.e. with an 8-byte
# stride per entry and n = 0..3, so _data_word() presumably emits 8 bytes
# per value - confirm against its definition elsewhere in this file.
&set_label("AES_Te",64);    # Yes! I keep it in the code segment!
    &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
    &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
    &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
    &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
    &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
    &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
    &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
    &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
    &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
    &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
    &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
    &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
    &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
    &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
    &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
    &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
    &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
    &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
    &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
    &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
    &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
    &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
    &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
    &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
    &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
    &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
    &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
    &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
    &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
    &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
    &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
    &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
    &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
    &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
    &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
    &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
    &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
    &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
    &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
    &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
    &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
    &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
    &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
    &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
    &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
    &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
    &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
    &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
    &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
    &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
    &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
    &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
    &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
    &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
    &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
    &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
    &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
    &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
    &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
    &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
    &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
    &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
    &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
    &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1033 |
|
---|
#Te4    # four copies of Te4 to choose from to avoid L1 aliasing
#
# The 256-byte table is written down once and emitted four times, eight
# bytes per &data_byte() call, so the copies cannot drift apart.  The
# emitted data is the same as with four spelled-out copies.  AES_encrypt
# picks one copy at run time (offset 0x000/0x100/0x200/0x300) and biases
# $tbl by +128, which is why the lookups use -128($tbl).
{
    my @te4 = (
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
        0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
        0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
        0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
        0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
        0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
        0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
        0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
        0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
        0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
        0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
        0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
        0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
        0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
        0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
        0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
        0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
        0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
        0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
        0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
        0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
        0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
    );

    for my $copy (1..4) {
        for (my $off = 0; $off < @te4; $off += 8) {
            &data_byte(@te4[$off .. $off+7]);
        }
    }
}
#rcon:
    &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
    &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
    &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
    &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
&function_end_B("_x86_AES_encrypt");
1172 |
|
---|
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Public entry point.  Emitted code, in order:
#   1. loads inp and key, remembers the incoming %esp in $s0
#   2. carves a 64-byte aligned frame whose position depends on the key
#      schedule address (see the comments below) and stores the original
#      %esp in $_esp
#   3. obtains its own address with call/pop so that AES_Te can be
#      addressed position-independently
#   4. points $tbl at one of the four Te4 copies, biased by +128
#   5. unless $x86only, tests bit 25 of the dword that picmeup() made $s0
#      point at (OPENSSL_ia32cap_P) and takes the MMX path if it is set
#   6. otherwise falls back to the integer path
# Both paths restore %esp from $_esp before writing the 16 output bytes.
#
# Note that the integer path calls _x86_AES_encrypt_compact, which is not
# defined in this part of the file, rather than _x86_AES_encrypt.
&function_begin("AES_encrypt");
    &mov    ($acc,&wparam(0));          # load inp
    &mov    ($key,&wparam(2));          # load key

    &mov    ($s0,"esp");
    &sub    ("esp",36);
    &and    ("esp",-64);                # align to cache-line

    # place stack frame just "above" the key schedule
    &lea    ($s1,&DWP(-64-63,$key));
    &sub    ($s1,"esp");
    &neg    ($s1);
    &and    ($s1,0x3C0);    # modulo 1024, but aligned to cache-line
    &sub    ("esp",$s1);
    &add    ("esp",4);      # 4 is reserved for caller's return address
    &mov    ($_esp,$s0);                # save stack pointer

    &call   (&label("pic_point"));      # make it PIC!
    &set_label("pic_point");
    &blindpop($tbl);
    &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
    &lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));

    # pick Te4 copy which can't "overlap" with stack frame or key schedule
    # ($s1 & 0x300 selects one of four offsets 256 bytes apart; the added
    # 2048 skips from AES_Te to the first copy, 128 is the lookup bias)
    &lea    ($s1,&DWP(768-4,"esp"));
    &sub    ($s1,$tbl);
    &and    ($s1,0x300);
    &lea    ($tbl,&DWP(2048+128,$tbl,$s1));

                    if (!$x86only) {
    &bt     (&DWP(0,$s0),25);           # check for SSE bit
    &jnc    (&label("x86"));

    &movq   ("mm0",&QWP(0,$acc));
    &movq   ("mm4",&QWP(8,$acc));
    &call   ("_sse_AES_encrypt_compact");
    &mov    ("esp",$_esp);              # restore stack pointer
    &mov    ($acc,&wparam(1));          # load out
    &movq   (&QWP(0,$acc),"mm0");       # write output data
    &movq   (&QWP(8,$acc),"mm4");
    &emms   ();
    &function_end_A();
                    }
    &set_label("x86",16);
    &mov    ($_tbl,$tbl);
    &mov    ($s0,&DWP(0,$acc));         # load input data
    &mov    ($s1,&DWP(4,$acc));
    &mov    ($s2,&DWP(8,$acc));
    &mov    ($s3,&DWP(12,$acc));
    &call   ("_x86_AES_encrypt_compact");
    &mov    ("esp",$_esp);              # restore stack pointer
    &mov    ($acc,&wparam(1));          # load out
    &mov    (&DWP(0,$acc),$s0);         # write output data
    &mov    (&DWP(4,$acc),$s1);
    &mov    (&DWP(8,$acc),$s2);
    &mov    (&DWP(12,$acc),$s3);
&function_end("AES_encrypt");
1231 |
|
---|
1232 | #--------------------------------------------------------------------#
|
---|
1233 |
|
---|
1234 | ######################################################################
|
---|
1235 | # "Compact" block function
|
---|
1236 | ######################################################################
|
---|
1237 |
|
---|
# deccompact($i, $td, @s [, extra...]): emit the code computing output word
# $i of one "compact" decryption step from byte lookups at -128($td).
#
#   $i   word number, 0..3
#   $td  register holding the (biased) byte-table pointer
#   @s   the four state registers, rotated by the caller so that $s[0]
#        supplies byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
#
#   out = T[b0($s[0])] ^ T[b1($s[1])]<<8 ^ T[b2($s[2])]<<16 ^ T[b3($s[3])]<<24
#
# Words 0 and 1 are stored to 4+4*$i(%esp), word 2 stays in $acc and word 3
# is built in $s[0].  During step 3 the word-2 result is moved from $acc
# into $s[1]; $key, $s[2] and $s[3] are reloaded from $__key, $__s1 and
# $__s0 through $Fn.
#
# Any argument beyond the usual six is discarded and turns $Fn into a no-op,
# which suppresses those three reloads.
sub deccompact()
{ my $Fn = \&mov;
  while ($#_>5) { pop(@_); $Fn=sub{}; }
  my ($i,$td,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

    # $Fn is used in first compact round and its purpose is to
    # void restoration of some values from stack, so that after
    # 4xdeccompact with extra argument $key, $s0 and $s1 values
    # are left there...
    if($i==3)   {   &$Fn    ($key,$__key);              }
    else        {   &mov    ($out,$s[0]);               }
                    &and    ($out,0xFF);
                    &movz   ($out,&BP(-128,$td,$out,1));

    if ($i==3)  {   $tmp=$s[1];                         }
                    &movz   ($tmp,&HB($s[1]));
                    &movz   ($tmp,&BP(-128,$td,$tmp,1));
                    &shl    ($tmp,8);
                    &xor    ($out,$tmp);

    if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],$acc);      }
    else        {   &mov    ($tmp,$s[2]);               }
                    &shr    ($tmp,16);
                    &and    ($tmp,0xFF);
                    &movz   ($tmp,&BP(-128,$td,$tmp,1));
                    &shl    ($tmp,16);
                    &xor    ($out,$tmp);

    if ($i==3)  {   $tmp=$s[3]; &$Fn ($s[2],$__s1);     }
    else        {   &mov    ($tmp,$s[3]);               }
                    &shr    ($tmp,24);
                    &movz   ($tmp,&BP(-128,$td,$tmp,1));
                    &shl    ($tmp,24);
                    &xor    ($out,$tmp);
    if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);  }
    if ($i==3)  {   &$Fn    ($s[3],$__s0);              }
}
1277 |
|
---|
# dectransform(i)
#
# Emit the in-register column transform for state word $s[$i].
# With tp1 = $s[$i], the code derives tp2, tp4 and tp8 by three rounds
# of packed byte doubling (shift left by one inside each byte, then xor
# 0x1b into every byte whose top bit was set) and leaves
#
#	$s[$i] = ROTATE(tp1,8) ^ (tp2^tp1) ^ (tp4^tp1) ^ tp8
#	         ^ ROTATE(tp8^tp2^tp1,24) ^ ROTATE(tp8^tp4^tp1,16)
#	         ^ ROTATE(tp8,8)
#
# Register use: $key (as $tmp), $acc and $tbl (as tp8) are clobbered;
# the caller visible in this file reloads $key and $tbl from
# $__key/$__tbl afterwards. tp2/tp4 borrow two other state registers.
#
# Side effects by index:
#	$i==2	$s[0] reloaded from $__s0, result stored to DWP(12,"esp")
#	$i==3	$s[1] reloaded from $__s1, result stored to DWP(16,"esp")
#	$i==1	$s[2]/$s[3] reloaded from $__s2/$__s3
#
# must be called with 2,3,0,1 as argument sequence!!!
# (each step borrows state registers that are either not yet loaded or
# already saved; presumably $__s2/$__s3 name the slots written by the
# $i==2 and $i==3 steps - confirm against their definitions)
sub dectransform()
{ my @s = ($s0,$s1,$s2,$s3);
  my $i = shift;
  my $tmp = $key;
  # element access, not one-element slices, so no "better written as"
  # warning is raised under -w
  my $tp2 = $s[($i+2)%4]; $tp2 = $s[2] if ($i==1);
  my $tp4 = $s[($i+3)%4]; $tp4 = $s[3] if ($i==1);
  my $tp8 = $tbl;

	# tp2 = packed double of tp1
	&mov	($tmp,0x80808080);
	&and	($tmp,$s[$i]);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp2,&DWP(0,$s[$i],$s[$i]));
	&sub	($acc,$tmp);		# 0x7f in every byte with top bit set
	&and	($tp2,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&xor	($tp2,$acc);
	&mov	($tmp,0x80808080);

	# tp4 = packed double of tp2
	&and	($tmp,$tp2);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp4,&DWP(0,$tp2,$tp2));
	&sub	($acc,$tmp);
	&and	($tp4,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &xor	($tp2,$s[$i]);	# tp2^tp1
	&xor	($tp4,$acc);
	&mov	($tmp,0x80808080);

	# tp8 = packed double of tp4
	&and	($tmp,$tp4);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp8,&DWP(0,$tp4,$tp4));
	&sub	($acc,$tmp);
	&and	($tp8,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &xor	($tp4,$s[$i]);	# tp4^tp1
	 &rotl	($s[$i],8);	# = ROTATE(tp1,8)
	&xor	($tp8,$acc);

	# fold everything into $s[$i]
	&xor	($s[$i],$tp2);
	&xor	($tp2,$tp8);
	&xor	($s[$i],$tp4);
	&xor	($tp4,$tp8);
	&rotl	($tp2,24);
	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
	&rotl	($tp4,16);
	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
	&rotl	($tp8,8);
	&xor	($s[$i],$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
	 &mov	($s[0],$__s0)			if($i==2); #prefetch $s0
	 &mov	($s[1],$__s1)			if($i==3); #prefetch $s1
	 &mov	($s[2],$__s2)			if($i==1);
	&xor	($s[$i],$tp8);	# ^= ROTATE(tp8,8)

	 &mov	($s[3],$__s3)			if($i==1);
	 &mov	(&DWP(4+4*$i,"esp"),$s[$i])	if($i>=2);
}
|
---|
1338 |
|
---|
# _x86_AES_decrypt_compact: integer-only compact decryption routine.
# The state lives in $s0..$s3, $key points at the key schedule and $tbl
# at the byte table used by deccompact. The generator-side loops below
# only fold repeated statements; the instruction stream is emitted in
# the same order as a fully written-out sequence.
&function_begin_B("_x86_AES_decrypt_compact");
{
	my @state = ($s0,$s1,$s2,$s3);
	# per-word register rotation handed to deccompact
	my @order = (	[$s0,$s3,$s2,$s1],
			[$s1,$s0,$s3,$s2],
			[$s2,$s1,$s0,$s3],
			[$s3,$s2,$s1,$s0]	);

	# note that caller is expected to allocate stack frame for me!
	&mov	($__key,$key);			# save key

	foreach my $n (0..3) {			# xor with key
		&xor	($state[$n],&DWP(4*$n,$key));
	}

	&mov	($acc,&DWP(240,$key));		# load key->rounds

	&lea	($acc,&DWP(-2,$acc,$acc));	# 2*rounds-2
	&lea	($acc,&DWP(0,$key,$acc,8));	# key+16*(rounds-1)
	&mov	($__end,$acc);			# end of key schedule

	# prefetch Td4: touch every 32nd byte, alternating $key/$acc as
	# the throw-away destination
	foreach my $n (0..7) {
		&mov	(($n % 2) ? $acc : $key, &DWP(32*$n-128,$tbl));
	}

	&set_label("loop",16);

	# the trailing extra argument suppresses the stack reloads
	foreach my $n (0..3) {
		&deccompact($n,$tbl,@{$order[$n]},1);
	}
	foreach my $n (2,3,0,1) {
		&dectransform($n);
	}
	&mov	($key,$__key);
	&mov	($tbl,$__tbl);
	&add	($key,16);			# advance rd_key
	foreach my $n (0..3) {
		&xor	($state[$n],&DWP(4*$n,$key));
	}

	&cmp	($key,$__end);
	&mov	($__key,$key);
	&jb	(&label("loop"));

	# final round: lookups only, with the stack reloads enabled
	foreach my $n (0..3) {
		&deccompact($n,$tbl,@{$order[$n]});
	}

	foreach my $n (0..3) {
		&xor	($state[$n],&DWP(16+4*$n,$key));
	}

	&ret	();
}
&function_end_B("_x86_AES_decrypt_compact");
|
---|
1398 |
|
---|
1399 | ######################################################################
|
---|
1400 | # "Compact" SSE block function.
|
---|
1401 | ######################################################################
|
---|
1402 |
|
---|
# sse_deccompact: emit one table-lookup pass over the 16 state bytes held
# in mm0 and mm4.
#
# Each byte is pulled out through eax/ebx, replaced by the byte found at
# -128($tbl,byte,1), shifted to its destination position and combined
# into ecx or edx. The assembled words come back as t[0],t[1] in mm0 and
# t[2],t[3] in mm4. Source byte indices per word, low byte first:
#	t[0]: 0,13,10,7    t[1]: 4,1,14,11
#	t[2]: 8,5,2,15     t[3]: 12,9,6,3
#
# $key is borrowed as an index register, so it is saved to $__key at the
# top and reloaded near the end. $acc, eax, ebx, ecx, edx and mm1, mm2,
# mm5, mm6 are overwritten. The number in each trailing comment is the
# state-byte index that instruction is working on.
1403 | sub sse_deccompact()
|
---|
1404 | {
|
---|
1405 | &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
|
---|
1406 | &pshufw ("mm5","mm4",0x09); # 13,12,11,10
|
---|
1407 | &movd ("eax","mm1"); # 7, 6, 1, 0
|
---|
1408 | &movd ("ebx","mm5"); # 13,12,11,10
|
---|
# $key becomes scratch from here on; keep the real pointer on the stack
1409 | &mov ($__key,$key);
|
---|
1410 |
|
---|
1411 | &movz ($acc,&LB("eax")); # 0
|
---|
1412 | &movz ("edx",&HB("eax")); # 1
|
---|
1413 | &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
|
---|
1414 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
|
---|
1415 | &movz ($key,&LB("ebx")); # 10
|
---|
1416 | &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
|
---|
1417 | &shr ("eax",16); # 7, 6
|
---|
1418 | &shl ("edx",8); # 1
|
---|
1419 |
|
---|
1420 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
|
---|
1421 | &movz ($key,&HB("ebx")); # 11
|
---|
1422 | &shl ($acc,16); # 10
|
---|
1423 | &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
|
---|
1424 | &or ("ecx",$acc); # 10
|
---|
1425 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
|
---|
1426 | &movz ($key,&HB("eax")); # 7
|
---|
1427 | &shl ($acc,24); # 11
|
---|
1428 | &shr ("ebx",16); # 13,12
|
---|
1429 | &or ("edx",$acc); # 11
|
---|
1430 |
|
---|
1431 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
|
---|
1432 | &movz ($key,&HB("ebx")); # 13
|
---|
1433 | &shl ($acc,24); # 7
|
---|
1434 | &or ("ecx",$acc); # 7
|
---|
1435 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
|
---|
1436 | &movz ($key,&LB("eax")); # 6
|
---|
1437 | &shl ($acc,8); # 13
|
---|
1438 | &movd ("eax","mm2"); # 3, 2, 5, 4
|
---|
1439 | &or ("ecx",$acc); # 13
|
---|
1440 |
|
---|
1441 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 6
|
---|
1442 | &movz ($key,&LB("ebx")); # 12
|
---|
1443 | &shl ($acc,16); # 6
|
---|
1444 | &movd ("ebx","mm6"); # 9, 8,15,14
|
---|
# ecx is parked in mm0 and immediately reused to start t[3]
1445 | &movd ("mm0","ecx"); # t[0] collected
|
---|
1446 | &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
|
---|
1447 | &movz ($key,&LB("eax")); # 4
|
---|
1448 | &or ("ecx",$acc); # 12
|
---|
1449 |
|
---|
1450 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
|
---|
1451 | &movz ($key,&LB("ebx")); # 14
|
---|
1452 | &or ("edx",$acc); # 4
|
---|
1453 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 14
|
---|
1454 | &movz ($key,&HB("eax")); # 5
|
---|
1455 | &shl ($acc,16); # 14
|
---|
1456 | &shr ("eax",16); # 3, 2
|
---|
1457 | &or ("edx",$acc); # 14
|
---|
1458 |
|
---|
1459 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
|
---|
1460 | &movz ($key,&HB("ebx")); # 15
|
---|
1461 | &shr ("ebx",16); # 9, 8
|
---|
1462 | &shl ($acc,8); # 5
|
---|
# edx is parked in mm1 and immediately reused to start t[2]
1463 | &movd ("mm1","edx"); # t[1] collected
|
---|
1464 | &movz ("edx",&BP(-128,$tbl,$key,1)); # 15
|
---|
1465 | &movz ($key,&HB("ebx")); # 9
|
---|
1466 | &shl ("edx",24); # 15
|
---|
1467 | &and ("ebx",0xff); # 8
|
---|
1468 | &or ("edx",$acc); # 15
|
---|
1469 |
|
---|
1470 | &punpckldq ("mm0","mm1"); # t[0,1] collected
|
---|
1471 |
|
---|
1472 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
|
---|
1473 | &movz ($key,&LB("eax")); # 2
|
---|
1474 | &shl ($acc,8); # 9
|
---|
1475 | &movz ("eax",&HB("eax")); # 3
|
---|
1476 | &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
|
---|
1477 | &or ("ecx",$acc); # 9
|
---|
1478 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 2
|
---|
1479 | &or ("edx","ebx"); # 8
|
---|
1480 | &shl ($acc,16); # 2
|
---|
1481 | &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
|
---|
1482 | &or ("edx",$acc); # 2
|
---|
1483 | &shl ("eax",24); # 3
|
---|
1484 | &or ("ecx","eax"); # 3
|
---|
# all lookups done: give $key its pointer value back
1485 | &mov ($key,$__key);
|
---|
1486 | &movd ("mm4","edx"); # t[2] collected
|
---|
1487 | &movd ("mm5","ecx"); # t[3] collected
|
---|
1488 |
|
---|
1489 | &punpckldq ("mm4","mm5"); # t[2,3] collected
|
---|
1490 | }
|
---|
1491 |
|
---|
# _sse_AES_decrypt_compact: MMX/SSE flavour of the compact decryption
# routine; generated only when $x86only is false.
#
# In:    mm0/mm4 = state, $key = key schedule pointer, $tbl = byte
#        table pointer as consumed by sse_deccompact.
# Out:   mm0/mm4 = processed state.
# Stack: DWP(8,"esp") and DWP(12,"esp") are overwritten with 0x1b1b1b1b
#        so the pair can be read back as one quadword; $__end is written.
#
# Loop shape: each pass runs sse_deccompact and advances $key by 16. It
# leaves through "out" once $key is above $__end (unsigned compare);
# otherwise it applies the column transform and xors in the round key
# at $key before jumping back.
1492 | if (!$x86only) {
|
---|
1493 | &function_begin_B("_sse_AES_decrypt_compact");
|
---|
1494 | &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
|
---|
1495 | &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
|
---|
1496 |
|
---|
1497 | # note that caller is expected to allocate stack frame for me!
|
---|
# $__end = $key + 8*(2*rounds-2), i.e. $key + 16*(rounds-1)
1498 | &mov ($acc,&DWP(240,$key)); # load key->rounds
|
---|
1499 | &lea ($acc,&DWP(-2,$acc,$acc));
|
---|
1500 | &lea ($acc,&DWP(0,$key,$acc,8));
|
---|
1501 | &mov ($__end,$acc); # end of key schedule
|
---|
1502 |
|
---|
# two adjacent dwords form the 64-bit 0x1b mask loaded by movq below
1503 | &mov ($s0,0x1b1b1b1b); # magic constant
|
---|
1504 | &mov (&DWP(8,"esp"),$s0);
|
---|
1505 | &mov (&DWP(12,"esp"),$s0);
|
---|
1506 |
|
---|
1507 | # prefetch Td4
|
---|
1508 | &mov ($s0,&DWP(0-128,$tbl));
|
---|
1509 | &mov ($s1,&DWP(32-128,$tbl));
|
---|
1510 | &mov ($s2,&DWP(64-128,$tbl));
|
---|
1511 | &mov ($s3,&DWP(96-128,$tbl));
|
---|
1512 | &mov ($s0,&DWP(128-128,$tbl));
|
---|
1513 | &mov ($s1,&DWP(160-128,$tbl));
|
---|
1514 | &mov ($s2,&DWP(192-128,$tbl));
|
---|
1515 | &mov ($s3,&DWP(224-128,$tbl));
|
---|
1516 |
|
---|
1517 | &set_label("loop",16);
|
---|
1518 | &sse_deccompact();
|
---|
1519 | &add ($key,16);
|
---|
1520 | &cmp ($key,$__end);
|
---|
1521 | &ja (&label("out"));
|
---|
1522 |
|
---|
# Left column works on mm0 (scratch mm1-mm3), right column mirrors it
# on mm4 (scratch mm5-mm7). tp0 is the value left by sse_deccompact.
# NOTE(review): the third argument 1 passed to movq on the second line
# is not explained in this chunk - confirm against the perlasm helper.
1523 | # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
|
---|
1524 | &movq ("mm3","mm0"); &movq ("mm7","mm4");
|
---|
1525 | &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
|
---|
1526 | &movq ("mm1","mm0"); &movq ("mm5","mm4");
|
---|
1527 | &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
|
---|
1528 | &pslld ("mm2",8); &pslld ("mm6",8);
|
---|
1529 | &psrld ("mm3",8); &psrld ("mm7",8);
|
---|
1530 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
|
---|
1531 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
|
---|
1532 | &pslld ("mm2",16); &pslld ("mm6",16);
|
---|
1533 | &psrld ("mm3",16); &psrld ("mm7",16);
|
---|
1534 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
|
---|
1535 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
|
---|
1536 |
|
---|
# packed doubling: pcmpgtb against zero gives 0xff in every byte whose
# top bit is set; masked with 0x1b it is xored into the paddb-doubled
# value
1537 | &movq ("mm3",&QWP(8,"esp"));
|
---|
1538 | &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
|
---|
1539 | &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
|
---|
1540 | &pand ("mm2","mm3"); &pand ("mm6","mm3");
|
---|
1541 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
|
---|
1542 | &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
|
---|
1543 | &movq ("mm3","mm1"); &movq ("mm7","mm5");
|
---|
1544 | &movq ("mm2","mm1"); &movq ("mm6","mm5");
|
---|
1545 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
|
---|
1546 | &pslld ("mm3",24); &pslld ("mm7",24);
|
---|
1547 | &psrld ("mm2",8); &psrld ("mm6",8);
|
---|
1548 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
|
---|
1549 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
|
---|
1550 |
|
---|
# same doubling step again; the 0x1b mask now sits in mm2
1551 | &movq ("mm2",&QWP(8,"esp"));
|
---|
1552 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
|
---|
1553 | &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
|
---|
1554 | &pand ("mm3","mm2"); &pand ("mm7","mm2");
|
---|
1555 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
|
---|
1556 | &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
|
---|
1557 | &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
|
---|
1558 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
|
---|
1559 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
|
---|
1560 |
|
---|
# third doubling step; mm2 still holds the 0x1b mask
1561 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
|
---|
1562 | &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
|
---|
1563 | &pand ("mm3","mm2"); &pand ("mm7","mm2");
|
---|
1564 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
|
---|
1565 | &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
|
---|
1566 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
|
---|
1567 | &movq ("mm3","mm1"); &movq ("mm7","mm5");
|
---|
1568 | &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
|
---|
1569 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
|
---|
1570 | &pslld ("mm1",8); &pslld ("mm5",8);
|
---|
1571 | &psrld ("mm3",8); &psrld ("mm7",8);
|
---|
# round key for this pass is fetched early into mm2/mm6
1572 | &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
|
---|
1573 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
|
---|
1574 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
|
---|
# NOTE(review): the four loads into $s0..$s3 interleaved below touch the
# table at offsets 0, 64, 128 and 192; nothing in this block consumes
# the loaded values, so this looks like a prefetch - confirm.
1575 | &mov ($s0,&DWP(0-128,$tbl));
|
---|
1576 | &pslld ("mm1",16); &pslld ("mm5",16);
|
---|
1577 | &mov ($s1,&DWP(64-128,$tbl));
|
---|
1578 | &psrld ("mm3",16); &psrld ("mm7",16);
|
---|
1579 | &mov ($s2,&DWP(128-128,$tbl));
|
---|
1580 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
|
---|
1581 | &mov ($s3,&DWP(192-128,$tbl));
|
---|
1582 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
|
---|
1583 |
|
---|
# add the round key fetched above
1584 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
|
---|
1585 | &jmp (&label("loop"));
|
---|
1586 |
|
---|
# last pass: no column transform, only the final round-key xor
1587 | &set_label("out",16);
|
---|
1588 | &pxor ("mm0",&QWP(0,$key));
|
---|
1589 | &pxor ("mm4",&QWP(8,$key));
|
---|
1590 |
|
---|
1591 | &ret ();
|
---|
1592 | &function_end_B("_sse_AES_decrypt_compact");
|
---|
1593 | }
|
---|
1594 |
|
---|
1595 | ######################################################################
|
---|
1596 | # Vanilla block function.
|
---|
1597 | ######################################################################
|
---|
1598 |
|
---|
# decstep(i, td, a, b, c, d)
#
# Emit the instructions producing output word $i (0..3) of a regular
# decryption round. Each of the four source bytes indexes the table at
# $td with an 8-byte stride; the dword is read at byte offset 0, 3, 2
# and 1 for source byte 0, 1, 2 and 3 respectively, and the four dwords
# are xored together.
#
#	$i==0,1	result stored to DWP(4+4*$i,"esp")
#	$i==2	result left in $acc
#	$i==3	result built in $s[0]; $key is reloaded from $__key,
#		$s[1] takes $acc, $s[2]/$s[3] are reloaded from
#		$__s1/$__s0
#
# The emitted instruction order is deliberate - see the note inside.
sub decstep()
{ my ($i,$td,@s) = @_;
  my $final = ($i==3);
  my $idx   = $key;			# scratch index register
  my $word  = $final ? $s[0] : $acc;	# register the result grows in

	# no instructions are reordered, as performance appears
	# optimal... or rather that all attempts to reorder didn't
	# result in better performance [which by the way is not a
	# bit lower than encryption].

	# source byte 0
	if ($final) {
		&mov	($key,$__key);
	} else {
		&mov	($word,$s[0]);
	}
	&and	($word,0xFF);
	&mov	($word,&DWP(0,$td,$word,8));

	# source byte 1
	$idx = $s[1] if ($final);
	&movz	($idx,&HB($s[1]));
	&xor	($word,&DWP(3,$td,$idx,8));

	# source byte 2
	if ($final) {
		$idx = $s[2];
		&mov	($s[1],$acc);
	} else {
		&mov	($idx,$s[2]);
	}
	&shr	($idx,16);
	&and	($idx,0xFF);
	&xor	($word,&DWP(2,$td,$idx,8));

	# source byte 3
	if ($final) {
		$idx = $s[3];
		&mov	($s[2],$__s1);
	} else {
		&mov	($idx,$s[3]);
	}
	&shr	($idx,24);
	&xor	($word,&DWP(1,$td,$idx,8));

	&mov	(&DWP(4+4*$i,"esp"),$word)	if ($i<2);
	&mov	($s[3],$__s0)			if ($final);
	&comment();
}
|
---|
1631 |
|
---|
# declast(i, td, a, b, c, d)
#
# Emit the last-round counterpart of decstep for output word $i (0..3).
# Byte selection is the same (byte 0 of $s[0], byte 1 of $s[1], byte 2
# of $s[2], byte 3 of $s[3]), but every byte goes through a plain byte
# table at 0($td,index,1) and is shifted into place with shl 8/16/24.
#
# $td handling: the $i==0 step moves $td forward by 2048 bytes (via
# +2048+128, eight loads 32 bytes apart as a prefetch, then -128) and
# the $i==3 step moves it back by 2048, so the four steps must be
# issued as 0,1,2,3. Presumably the byte table follows the 2048 bytes
# of AES_Td data - confirm against the table layout.
#
#	$i==0,1	result stored to DWP(4+4*$i,"esp")
#	$i==2	result left in $acc
#	$i==3	result built in $s[0]; $key is reloaded from $__key,
#		$s[1] takes $acc, $s[2]/$s[3] are reloaded from
#		$__s1/$__s0
sub declast()
{ my ($i,$td,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# prefetch: $tmp (= $key) and $acc are only throw-away targets
	if($i==0)   {	&lea	($td,&DWP(2048+128,$td));
			&mov	($tmp,&DWP(0-128,$td));
			&mov	($acc,&DWP(32-128,$td));
			&mov	($tmp,&DWP(64-128,$td));
			&mov	($acc,&DWP(96-128,$td));
			&mov	($tmp,&DWP(128-128,$td));
			&mov	($acc,&DWP(160-128,$td));
			&mov	($tmp,&DWP(192-128,$td));
			&mov	($acc,&DWP(224-128,$td));
			&lea	($td,&DWP(-128,$td));		}
	if($i==3)   {	&mov	($key,$__key);			}
	else        {	&mov	($out,$s[0]);			}
			&and	($out,0xFF);
			&movz	($out,&BP(0,$td,$out,1));

	# byte 1: for $i==3 the source register doubles as the scratch
	if ($i==3)  {	$tmp=$s[1];				}
			&movz	($tmp,&HB($s[1]));
			&movz	($tmp,&BP(0,$td,$tmp,1));
			&shl	($tmp,8);
			&xor	($out,$tmp);

	# byte 2: $s[1] is free by now, so word 2 moves there from $acc
	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}
	else        {	&mov	($tmp,$s[2]);			}
			&shr	($tmp,16);
			&and	($tmp,0xFF);
			&movz	($tmp,&BP(0,$td,$tmp,1));
			&shl	($tmp,16);
			&xor	($out,$tmp);

	# byte 3: shr by 24 leaves only the top byte, no mask needed
	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}
	else        {	&mov	($tmp,$s[3]);			}
			&shr	($tmp,24);
			&movz	($tmp,&BP(0,$td,$tmp,1));
			&shl	($tmp,24);
			&xor	($out,$tmp);
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],$__s0);
			&lea	($td,&DWP(-2048,$td));		}
}
|
---|
1676 |
|
---|
1677 | &function_begin_B("_x86_AES_decrypt");
|
---|
1678 | # note that caller is expected to allocate stack frame for me!
|
---|
1679 | &mov ($__key,$key); # save key
|
---|
1680 |
|
---|
1681 | &xor ($s0,&DWP(0,$key)); # xor with key
|
---|
1682 | &xor ($s1,&DWP(4,$key));
|
---|
1683 | &xor ($s2,&DWP(8,$key));
|
---|
1684 | &xor ($s3,&DWP(12,$key));
|
---|
1685 |
|
---|
1686 | &mov ($acc,&DWP(240,$key)); # load key->rounds
|
---|
1687 |
|
---|
1688 | if ($small_footprint) {
|
---|
1689 | &lea ($acc,&DWP(-2,$acc,$acc));
|
---|
1690 | &lea ($acc,&DWP(0,$key,$acc,8));
|
---|
1691 | &mov ($__end,$acc); # end of key schedule
|
---|
1692 | &set_label("loop",16);
|
---|
1693 | &decstep(0,$tbl,$s0,$s3,$s2,$s1);
|
---|
1694 | &decstep(1,$tbl,$s1,$s0,$s3,$s2);
|
---|
1695 | &decstep(2,$tbl,$s2,$s1,$s0,$s3);
|
---|
1696 | &decstep(3,$tbl,$s3,$s2,$s1,$s0);
|
---|
1697 | &add ($key,16); # advance rd_key
|
---|
1698 | &xor ($s0,&DWP(0,$key));
|
---|
1699 | &xor ($s1,&DWP(4,$key));
|
---|
1700 | &xor ($s2,&DWP(8,$key));
|
---|
1701 | &xor ($s3,&DWP(12,$key));
|
---|
1702 | &cmp ($key,$__end);
|
---|
1703 | &mov ($__key,$key);
|
---|
1704 | &jb (&label("loop"));
|
---|
1705 | }
|
---|
1706 | else {
|
---|
1707 | &cmp ($acc,10);
|
---|
1708 | &jle (&label("10rounds"));
|
---|
1709 | &cmp ($acc,12);
|
---|
1710 | &jle (&label("12rounds"));
|
---|
1711 |
|
---|
1712 | &set_label("14rounds",4);
|
---|
1713 | for ($i=1;$i<3;$i++) {
|
---|
1714 | &decstep(0,$tbl,$s0,$s3,$s2,$s1);
|
---|
1715 | &decstep(1,$tbl,$s1,$s0,$s3,$s2);
|
---|
1716 | &decstep(2,$tbl,$s2,$s1,$s0,$s3);
|
---|
1717 | &decstep(3,$tbl,$s3,$s2,$s1,$s0);
|
---|
1718 | &xor ($s0,&DWP(16*$i+0,$key));
|
---|
1719 | &xor ($s1,&DWP(16*$i+4,$key));
|
---|
1720 | &xor ($s2,&DWP(16*$i+8,$key));
|
---|
1721 | &xor ($s3,&DWP(16*$i+12,$key));
|
---|
1722 | }
|
---|
1723 | &add ($key,32);
|
---|
1724 | &mov ($__key,$key); # advance rd_key
|
---|
1725 | &set_label("12rounds",4);
|
---|
1726 | for ($i=1;$i<3;$i++) {
|
---|
1727 | &decstep(0,$tbl,$s0,$s3,$s2,$s1);
|
---|
1728 | &decstep(1,$tbl,$s1,$s0,$s3,$s2);
|
---|
1729 | &decstep(2,$tbl,$s2,$s1,$s0,$s3);
|
---|
1730 | &decstep(3,$tbl,$s3,$s2,$s1,$s0);
|
---|
1731 | &xor ($s0,&DWP(16*$i+0,$key));
|
---|
1732 | &xor ($s1,&DWP(16*$i+4,$key));
|
---|
1733 | &xor ($s2,&DWP(16*$i+8,$key));
|
---|
1734 | &xor ($s3,&DWP(16*$i+12,$key));
|
---|
1735 | }
|
---|
1736 | &add ($key,32);
|
---|
1737 | &mov ($__key,$key); # advance rd_key
|
---|
1738 | &set_label("10rounds",4);
|
---|
1739 | for ($i=1;$i<10;$i++) {
|
---|
1740 | &decstep(0,$tbl,$s0,$s3,$s2,$s1);
|
---|
1741 | &decstep(1,$tbl,$s1,$s0,$s3,$s2);
|
---|
1742 | &decstep(2,$tbl,$s2,$s1,$s0,$s3);
|
---|
1743 | &decstep(3,$tbl,$s3,$s2,$s1,$s0);
|
---|
1744 | &xor ($s0,&DWP(16*$i+0,$key));
|
---|
1745 | &xor ($s1,&DWP(16*$i+4,$key));
|
---|
1746 | &xor ($s2,&DWP(16*$i+8,$key));
|
---|
1747 | &xor ($s3,&DWP(16*$i+12,$key));
|
---|
1748 | }
|
---|
1749 | }
|
---|
1750 |
|
---|
1751 | &declast(0,$tbl,$s0,$s3,$s2,$s1);
|
---|
1752 | &declast(1,$tbl,$s1,$s0,$s3,$s2);
|
---|
1753 | &declast(2,$tbl,$s2,$s1,$s0,$s3);
|
---|
1754 | &declast(3,$tbl,$s3,$s2,$s1,$s0);
|
---|
1755 |
|
---|
1756 | &add ($key,$small_footprint?16:160);
|
---|
1757 | &xor ($s0,&DWP(0,$key));
|
---|
1758 | &xor ($s1,&DWP(4,$key));
|
---|
1759 | &xor ($s2,&DWP(8,$key));
|
---|
1760 | &xor ($s3,&DWP(12,$key));
|
---|
1761 |
|
---|
1762 | &ret ();
|
---|
1763 |
|
---|
1764 | &set_label("AES_Td",64); # Yes! I keep it in the code segment!
|
---|
1765 | &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
|
---|
1766 | &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
|
---|
1767 | &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
|
---|
1768 | &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
|
---|
1769 | &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
|
---|
1770 | &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
|
---|
1771 | &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
|
---|
1772 | &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
|
---|
1773 | &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
|
---|
1774 | &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
|
---|
1775 | &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
|
---|
1776 | &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
|
---|
1777 | &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
|
---|
1778 | &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
|
---|
1779 | &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
|
---|
1780 | &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
|
---|
1781 | &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
|
---|
1782 | &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
|
---|
1783 | &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
|
---|
1784 | &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
|
---|
1785 | &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
|
---|
1786 | &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
|
---|
1787 | &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
|
---|
1788 | &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
|
---|
1789 | &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
|
---|
1790 | &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
|
---|
1791 | &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
|
---|
1792 | &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
|
---|
1793 | &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
|
---|
1794 | &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
|
---|
1795 | &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
|
---|
1796 | &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
|
---|
1797 | &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
|
---|
1798 | &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
|
---|
1799 | &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
|
---|
1800 | &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
|
---|
1801 | &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
|
---|
1802 | &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
|
---|
1803 | &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
|
---|
1804 | &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
|
---|
1805 | &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
|
---|
1806 | &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
|
---|
1807 | &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
|
---|
1808 | &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
|
---|
1809 | &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
|
---|
1810 | &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
|
---|
1811 | &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
|
---|
1812 | &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
|
---|
1813 | &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
|
---|
1814 | &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
|
---|
1815 | &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
|
---|
1816 | &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
|
---|
1817 | &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
|
---|
1818 | &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
|
---|
1819 | &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
|
---|
1820 | &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
|
---|
1821 | &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
|
---|
1822 | &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
|
---|
1823 | &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
|
---|
1824 | &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
|
---|
1825 | &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
|
---|
1826 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
|
---|
1827 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
|
---|
1828 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
|
---|
1829 |
|
---|
1830 | #Td4: # four copies of Td4 to choose from to avoid L1 aliasing
|
---|
1831 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
|
---|
1832 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
|
---|
1833 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
|
---|
1834 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
|
---|
1835 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
|
---|
1836 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
|
---|
1837 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
|
---|
1838 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
|
---|
1839 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
|
---|
1840 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
|
---|
1841 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
|
---|
1842 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
|
---|
1843 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
|
---|
1844 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
|
---|
1845 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
|
---|
1846 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
|
---|
1847 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
|
---|
1848 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
|
---|
1849 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
|
---|
1850 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
|
---|
1851 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
|
---|
1852 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
|
---|
1853 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
|
---|
1854 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
|
---|
1855 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
|
---|
1856 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
|
---|
1857 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
|
---|
1858 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
|
---|
1859 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
|
---|
1860 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
|
---|
1861 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
|
---|
1862 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
|
---|
1863 |
|
---|
1864 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
|
---|
1865 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
|
---|
1866 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
|
---|
1867 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
|
---|
1868 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
|
---|
1869 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
|
---|
1870 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
|
---|
1871 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
|
---|
1872 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
|
---|
1873 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
|
---|
1874 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
|
---|
1875 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
|
---|
1876 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
|
---|
1877 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
|
---|
1878 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
|
---|
1879 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
|
---|
1880 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
|
---|
1881 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
|
---|
1882 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
|
---|
1883 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
|
---|
1884 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
|
---|
1885 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
|
---|
1886 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
|
---|
1887 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
|
---|
1888 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
|
---|
1889 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
|
---|
1890 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
|
---|
1891 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
|
---|
1892 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
|
---|
1893 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
|
---|
1894 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
|
---|
1895 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
|
---|
1896 |
|
---|
1897 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
|
---|
1898 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
|
---|
1899 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
|
---|
1900 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
|
---|
1901 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
|
---|
1902 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
|
---|
1903 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
|
---|
1904 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
|
---|
1905 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
|
---|
1906 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
|
---|
1907 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
|
---|
1908 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
|
---|
1909 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
|
---|
1910 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
|
---|
1911 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
|
---|
1912 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
|
---|
1913 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
|
---|
1914 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
|
---|
1915 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
|
---|
1916 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
|
---|
1917 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
|
---|
1918 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
|
---|
1919 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
|
---|
1920 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
|
---|
1921 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
|
---|
1922 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
|
---|
1923 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
|
---|
1924 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
|
---|
1925 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
|
---|
1926 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
|
---|
1927 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
|
---|
1928 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
|
---|
1929 |
|
---|
1930 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
|
---|
1931 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
|
---|
1932 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
|
---|
1933 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
|
---|
1934 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
|
---|
1935 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
|
---|
1936 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
|
---|
1937 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
|
---|
1938 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
|
---|
1939 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
|
---|
1940 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
|
---|
1941 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
|
---|
1942 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
|
---|
1943 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
|
---|
1944 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
|
---|
1945 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
|
---|
1946 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
|
---|
1947 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
|
---|
1948 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
|
---|
1949 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
|
---|
1950 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
|
---|
1951 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
|
---|
1952 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
|
---|
1953 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
|
---|
1954 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
|
---|
1955 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
|
---|
1956 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
|
---|
1957 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
|
---|
1958 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
|
---|
1959 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
|
---|
1960 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
|
---|
1961 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
|
---|
1962 | &function_end_B("_x86_AES_decrypt");
|
---|
1963 |
|
---|
1964 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
|
---|
# Emits the single-block decrypt entry point: 16 bytes are read from inp,
# run through one of the "compact" decrypt routines and 16 bytes are
# written to out.  Unless $x86only is set, bit 25 of OPENSSL_ia32cap_P
# selects the MMX-register variant (_sse_AES_decrypt_compact); otherwise
# the integer variant (_x86_AES_decrypt_compact) is called.
1965 | &function_begin("AES_decrypt");
|
---|
1966 | &mov ($acc,&wparam(0)); # load inp
|
---|
1967 | &mov ($key,&wparam(2)); # load key
|
---|
1968 |
|
---|
1969 | &mov ($s0,"esp"); # remember incoming %esp; stored to $_esp below
|
---|
1970 | &sub ("esp",36);
|
---|
1971 | &and ("esp",-64); # align to cache-line
|
---|
1972 |
|
---|
1973 | # place stack frame just "above" the key schedule
|
---|
1974 | &lea ($s1,&DWP(-64-63,$key));
|
---|
1975 | &sub ($s1,"esp");
|
---|
1976 | &neg ($s1);
|
---|
1977 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
|
---|
1978 | &sub ("esp",$s1);
|
---|
1979 | &add ("esp",4); # 4 is reserved for caller's return address
|
---|
1980 | &mov ($_esp,$s0); # save stack pointer
|
---|
1981 |
|
---|
1982 | &call (&label("pic_point")); # make it PIC!
|
---|
1983 | &set_label("pic_point");
|
---|
1984 | &blindpop($tbl); # $tbl = address of pic_point (return address of the call above)
|
---|
1985 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
|
---|
1986 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl)); # $tbl = &AES_Td
|
---|
1987 |
|
---|
1988 | # pick Td4 copy which can't "overlap" with stack frame or key schedule
|
---|
1989 | &lea ($s1,&DWP(768-4,"esp"));
|
---|
1990 | &sub ($s1,$tbl);
|
---|
1991 | &and ($s1,0x300); # keeps bits 8-9: offset is 0, 256, 512 or 768
|
---|
1992 | &lea ($tbl,&DWP(2048+128,$tbl,$s1));
|
---|
1993 |
|
---|
1994 | if (!$x86only) {
|
---|
1995 | &bt (&DWP(0,$s0),25); # check for SSE bit
|
---|
1996 | &jnc (&label("x86")); # bit clear: use the integer-only path
|
---|
1997 |
|
---|
1998 | &movq ("mm0",&QWP(0,$acc)); # input bytes 0..7
|
---|
1999 | &movq ("mm4",&QWP(8,$acc)); # input bytes 8..15
|
---|
2000 | &call ("_sse_AES_decrypt_compact");
|
---|
2001 | &mov ("esp",$_esp); # restore stack pointer
|
---|
2002 | &mov ($acc,&wparam(1)); # load out
|
---|
2003 | &movq (&QWP(0,$acc),"mm0"); # write output data
|
---|
2004 | &movq (&QWP(8,$acc),"mm4");
|
---|
2005 | &emms ();
|
---|
2006 | &function_end_A(); # NOTE(review): presumably emits the return for this path only; code generation continues at "x86"
|
---|
2007 | }
|
---|
2008 | &set_label("x86",16);
|
---|
2009 | &mov ($_tbl,$tbl); # keep the table pointer in the frame
|
---|
2010 | &mov ($s0,&DWP(0,$acc)); # load input data
|
---|
2011 | &mov ($s1,&DWP(4,$acc));
|
---|
2012 | &mov ($s2,&DWP(8,$acc));
|
---|
2013 | &mov ($s3,&DWP(12,$acc));
|
---|
2014 | &call ("_x86_AES_decrypt_compact");
|
---|
2015 | &mov ("esp",$_esp); # restore stack pointer
|
---|
2016 | &mov ($acc,&wparam(1)); # load out
|
---|
2017 | &mov (&DWP(0,$acc),$s0); # write output data
|
---|
2018 | &mov (&DWP(4,$acc),$s1);
|
---|
2019 | &mov (&DWP(8,$acc),$s2);
|
---|
2020 | &mov (&DWP(12,$acc),$s3);
|
---|
2021 | &function_end("AES_decrypt");
|
---|
2022 |
|
---|
2023 | # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
|
---|
2024 | # size_t length, const AES_KEY *key,
|
---|
2025 | # unsigned char *ivp,const int enc);
|
---|
# CBC-mode driver.  A non-zero enc argument selects the AES_Te tables
# (encrypt), zero selects AES_Td (decrypt).  The scope opened below holds
# the stack-slot operands that are private to this routine.
2026 | {
|
---|
2027 | # stack frame layout
|
---|
2028 | # -4(%esp) # return address 0(%esp)
|
---|
2029 | # 0(%esp) # s0 backing store 4(%esp)
|
---|
2030 | # 4(%esp) # s1 backing store 8(%esp)
|
---|
2031 | # 8(%esp) # s2 backing store 12(%esp)
|
---|
2032 | # 12(%esp) # s3 backing store 16(%esp)
|
---|
2033 | # 16(%esp) # key backup 20(%esp)
|
---|
2034 | # 20(%esp) # end of key schedule 24(%esp)
|
---|
2035 | # 24(%esp) # %ebp backup 28(%esp)
|
---|
2036 | # 28(%esp) # %esp backup
|
---|
2037 | my $_inp=&DWP(32,"esp"); # copy of wparam(0)
|
---|
2038 | my $_out=&DWP(36,"esp"); # copy of wparam(1)
|
---|
2039 | my $_len=&DWP(40,"esp"); # copy of wparam(2)
|
---|
2040 | my $_key=&DWP(44,"esp"); # copy of wparam(3)
|
---|
2041 | my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
|
---|
2042 | my $_tmp=&DWP(52,"esp"); # volatile variable
|
---|
2043 | #
|
---|
2044 | my $ivec=&DWP(60,"esp"); # ivec[16]
|
---|
2045 | my $aes_key=&DWP(76,"esp"); # copy of aes_key
|
---|
2046 | my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds (dword at offset 240 of the on-stack copy; tested against 0 to tell whether a copy was made)
|
---|
2047 |
|
---|
2048 | &function_begin("AES_cbc_encrypt");
|
---|
2049 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len (destination operand is "" unless $s2 is ecx)
|
---|
2050 | &cmp ($s2,0);
|
---|
2051 | &je (&label("drop_out")); # len == 0: nothing to process
|
---|
2052 |
|
---|
2053 | &call (&label("pic_point")); # make it PIC!
|
---|
2054 | &set_label("pic_point");
|
---|
2055 | &blindpop($tbl);
|
---|
2056 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
|
---|
2057 |
|
---|
2058 | &cmp (&wparam(5),0); # enc flag; lea below leaves the flags intact for the jne
|
---|
2059 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
|
---|
2060 | &jne (&label("picked_te")); # enc != 0: keep AES_Te
|
---|
2061 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl)); # enc == 0: rebase to AES_Td
|
---|
2062 | &set_label("picked_te");
|
---|
2063 |
|
---|
2064 | # one can argue if this is required
|
---|
2065 | &pushf (); # matched by a popf on every exit path below
|
---|
2066 | &cld ();
|
---|
2067 |
|
---|
2068 | &cmp ($s2,$speed_limit);
|
---|
2069 | &jb (&label("slow_way")); # len below $speed_limit: take the slow routine
|
---|
2070 | &test ($s2,15);
|
---|
2071 | &jnz (&label("slow_way")); # len not a multiple of 16: take the slow routine
|
---|
2072 | if (!$x86only) {
|
---|
2073 | &bt (&DWP(0,$s0),28); # check for hyper-threading bit
|
---|
2074 | &jc (&label("slow_way"));
|
---|
2075 | }
|
---|
2076 | # pre-allocate aligned stack frame...
|
---|
2077 | &lea ($acc,&DWP(-80-244,"esp"));
|
---|
2078 | &and ($acc,-64);
|
---|
2079 |
|
---|
2080 | # ... and make sure it doesn't alias with $tbl modulo 4096
|
---|
2081 | &mov ($s0,$tbl);
|
---|
2082 | &lea ($s1,&DWP(2048+256,$tbl));
|
---|
2083 | &mov ($s3,$acc);
|
---|
2084 | &and ($s0,0xfff); # s = %ebp&0xfff
|
---|
2085 | &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
|
---|
2086 | &and ($s3,0xfff); # p = %esp&0xfff
|
---|
2087 |
|
---|
2088 | &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e);
|
---|
2089 | &jb (&label("tbl_break_out"));
|
---|
2090 | &sub ($s3,$s1);
|
---|
2091 | &sub ($acc,$s3);
|
---|
2092 | &jmp (&label("tbl_ok"));
|
---|
2093 | &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
|
---|
2094 | &sub ($s3,$s0);
|
---|
2095 | &and ($s3,0xfff);
|
---|
2096 | &add ($s3,384); # 384 is the "framesz" term of the formula above
|
---|
2097 | &sub ($acc,$s3);
|
---|
2098 | &set_label("tbl_ok",4);
|
---|
2099 |
|
---|
2100 | &lea ($s3,&wparam(0)); # obtain pointer to parameter block
|
---|
2101 | &exch ("esp",$acc); # allocate stack frame
|
---|
2102 | &add ("esp",4); # reserve for return address!
|
---|
2103 | &mov ($_tbl,$tbl); # save %ebp
|
---|
2104 | &mov ($_esp,$acc); # save %esp
|
---|
2105 |
|
---|
2106 | &mov ($s0,&DWP(0,$s3)); # load inp
|
---|
2107 | &mov ($s1,&DWP(4,$s3)); # load out
|
---|
2108 | #&mov ($s2,&DWP(8,$s3)); # load len
|
---|
2109 | &mov ($key,&DWP(12,$s3)); # load key
|
---|
2110 | &mov ($acc,&DWP(16,$s3)); # load ivp
|
---|
2111 | &mov ($s3,&DWP(20,$s3)); # load enc flag
|
---|
2112 |
|
---|
2113 | &mov ($_inp,$s0); # save copy of inp
|
---|
2114 | &mov ($_out,$s1); # save copy of out
|
---|
2115 | &mov ($_len,$s2); # save copy of len
|
---|
2116 | &mov ($_key,$key); # save copy of key
|
---|
2117 | &mov ($_ivp,$acc); # save copy of ivp
|
---|
2118 |
|
---|
2119 | &mov ($mark,0); # copy of aes_key->rounds = 0;
|
---|
2120 | # do we copy key schedule to stack?
|
---|
# The schedule is copied when (key-$tbl)&0xfff is below 2048+256 or at
# least 4096-244, i.e. for exactly the values that fail both jb tests below.
2121 | &mov ($s1 eq "ebx" ? $s1 : "",$key);
|
---|
2122 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); # dword count for the rep movsd below (244 bytes)
|
---|
2123 | &sub ($s1,$tbl);
|
---|
2124 | &mov ("esi",$key); # copy source: caller's key schedule
|
---|
2125 | &and ($s1,0xfff);
|
---|
2126 | &lea ("edi",$aes_key); # copy destination: on-stack slot
|
---|
2127 | &cmp ($s1,2048+256);
|
---|
2128 | &jb (&label("do_copy"));
|
---|
2129 | &cmp ($s1,4096-244);
|
---|
2130 | &jb (&label("skip_copy"));
|
---|
2131 | &set_label("do_copy",4);
|
---|
2132 | &mov ($_key,"edi"); # from here on $_key refers to the on-stack copy
|
---|
2133 | &data_word(0xA5F3F689); # rep movsd (bytes 89 F6 F3 A5: "mov esi,esi" filler, then F3 A5); the copy covers $mark
|
---|
2134 | &set_label("skip_copy");
|
---|
2135 |
|
---|
2136 | &mov ($key,16); # 16 iterations x 128 bytes = 2048 bytes of table touched
|
---|
2137 | &set_label("prefetch_tbl",4);
|
---|
2138 | &mov ($s0,&DWP(0,$tbl)); # loaded values are not used; they are overwritten before being read
|
---|
2139 | &mov ($s1,&DWP(32,$tbl));
|
---|
2140 | &mov ($s2,&DWP(64,$tbl));
|
---|
2141 | &mov ($acc,&DWP(96,$tbl));
|
---|
2142 | &lea ($tbl,&DWP(128,$tbl));
|
---|
2143 | &sub ($key,1);
|
---|
2144 | &jnz (&label("prefetch_tbl"));
|
---|
2145 | &sub ($tbl,2048); # rewind $tbl to the start of the table
|
---|
2146 |
|
---|
2147 | &mov ($acc,$_inp);
|
---|
2148 | &mov ($key,$_ivp);
|
---|
2149 |
|
---|
2150 | &cmp ($s3,0); # enc flag loaded from the parameter block above
|
---|
2151 | &je (&label("fast_decrypt"));
|
---|
2152 |
|
---|
2153 | #----------------------------- ENCRYPT -----------------------------#
|
---|
# On entry $acc = inp and $key = ivp.  Each iteration xors one 16-byte
# input block into the chaining value held in $s0..$s3, encrypts it and
# stores it to out; the stored block is the next chaining value.
2154 | &mov ($s0,&DWP(0,$key)); # load iv
|
---|
2155 | &mov ($s1,&DWP(4,$key));
|
---|
2156 |
|
---|
2157 | &set_label("fast_enc_loop",16);
|
---|
2158 | &mov ($s2,&DWP(8,$key)); # on re-entry $key points at the block just written to out
|
---|
2159 | &mov ($s3,&DWP(12,$key));
|
---|
2160 |
|
---|
2161 | &xor ($s0,&DWP(0,$acc)); # xor input data
|
---|
2162 | &xor ($s1,&DWP(4,$acc));
|
---|
2163 | &xor ($s2,&DWP(8,$acc));
|
---|
2164 | &xor ($s3,&DWP(12,$acc));
|
---|
2165 |
|
---|
2166 | &mov ($key,$_key); # load key
|
---|
2167 | &call ("_x86_AES_encrypt");
|
---|
2168 |
|
---|
2169 | &mov ($acc,$_inp); # load inp
|
---|
2170 | &mov ($key,$_out); # load out
|
---|
2171 |
|
---|
2172 | &mov (&DWP(0,$key),$s0); # save output data
|
---|
2173 | &mov (&DWP(4,$key),$s1);
|
---|
2174 | &mov (&DWP(8,$key),$s2);
|
---|
2175 | &mov (&DWP(12,$key),$s3);
|
---|
2176 |
|
---|
2177 | &lea ($acc,&DWP(16,$acc)); # advance inp
|
---|
2178 | &mov ($s2,$_len); # load len
|
---|
2179 | &mov ($_inp,$acc); # save inp
|
---|
2180 | &lea ($s3,&DWP(16,$key)); # advance out
|
---|
2181 | &mov ($_out,$s3); # save out
|
---|
2182 | &sub ($s2,16); # decrease len
|
---|
2183 | &mov ($_len,$s2); # save len
|
---|
2184 | &jnz (&label("fast_enc_loop")); # ZF comes from the sub; the mov in between leaves flags alone
|
---|
2185 | &mov ($acc,$_ivp); # load ivp
|
---|
2186 | &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
|
---|
2187 | &mov ($s3,&DWP(12,$key));
|
---|
2188 | &mov (&DWP(0,$acc),$s0); # save ivec
|
---|
2189 | &mov (&DWP(4,$acc),$s1);
|
---|
2190 | &mov (&DWP(8,$acc),$s2);
|
---|
2191 | &mov (&DWP(12,$acc),$s3);
|
---|
2192 |
|
---|
2193 | &cmp ($mark,0); # was the key schedule copied?
|
---|
2194 | &mov ("edi",$_key); # stosd destination; mov keeps the cmp flags for the je
|
---|
2195 | &je (&label("skip_ezero"));
|
---|
2196 | # zero copy of key schedule
|
---|
2197 | &mov ("ecx",240/4); # 60 dwords = 240 bytes
|
---|
2198 | &xor ("eax","eax");
|
---|
2199 | &align (4);
|
---|
2200 | &data_word(0xABF3F689); # rep stosd
|
---|
2201 | &set_label("skip_ezero");
|
---|
2202 | &mov ("esp",$_esp);
|
---|
2203 | &popf ();
|
---|
2204 | &set_label("drop_out"); # the len == 0 early exit lands here, after the popf
|
---|
2205 | &function_end_A();
|
---|
2206 | &pushf (); # kludge, never executed
|
---|
2207 |
|
---|
2208 | #----------------------------- DECRYPT -----------------------------#
|
---|
# On entry $acc = inp and $key = ivp.  Two loops follow: one for
# inp != out, which chains through a pointer kept in $_tmp, and one for
# in-place operation, which saves each input block in $ivec before it is
# overwritten.
2209 | &set_label("fast_decrypt",16);
|
---|
2210 |
|
---|
2211 | &cmp ($acc,$_out);
|
---|
2212 | &je (&label("fast_dec_in_place")); # in-place processing...
|
---|
2213 |
|
---|
2214 | &mov ($_tmp,$key); # $_tmp = pointer to the current iv (starts at ivp)
|
---|
2215 |
|
---|
2216 | &align (4);
|
---|
2217 | &set_label("fast_dec_loop",16);
|
---|
2218 | &mov ($s0,&DWP(0,$acc)); # read input
|
---|
2219 | &mov ($s1,&DWP(4,$acc));
|
---|
2220 | &mov ($s2,&DWP(8,$acc));
|
---|
2221 | &mov ($s3,&DWP(12,$acc));
|
---|
2222 |
|
---|
2223 | &mov ($key,$_key); # load key
|
---|
2224 | &call ("_x86_AES_decrypt");
|
---|
2225 |
|
---|
2226 | &mov ($key,$_tmp); # load ivp
|
---|
2227 | &mov ($acc,$_len); # load len (not read before $acc is reloaded with inp below)
|
---|
2228 | &xor ($s0,&DWP(0,$key)); # xor iv
|
---|
2229 | &xor ($s1,&DWP(4,$key));
|
---|
2230 | &xor ($s2,&DWP(8,$key));
|
---|
2231 | &xor ($s3,&DWP(12,$key));
|
---|
2232 |
|
---|
2233 | &mov ($key,$_out); # load out
|
---|
2234 | &mov ($acc,$_inp); # load inp
|
---|
2235 |
|
---|
2236 | &mov (&DWP(0,$key),$s0); # write output
|
---|
2237 | &mov (&DWP(4,$key),$s1);
|
---|
2238 | &mov (&DWP(8,$key),$s2);
|
---|
2239 | &mov (&DWP(12,$key),$s3);
|
---|
2240 |
|
---|
2241 | &mov ($s2,$_len); # load len
|
---|
2242 | &mov ($_tmp,$acc); # save ivp (the input block just consumed is the next iv)
|
---|
2243 | &lea ($acc,&DWP(16,$acc)); # advance inp
|
---|
2244 | &mov ($_inp,$acc); # save inp
|
---|
2245 | &lea ($key,&DWP(16,$key)); # advance out
|
---|
2246 | &mov ($_out,$key); # save out
|
---|
2247 | &sub ($s2,16); # decrease len
|
---|
2248 | &mov ($_len,$s2); # save len
|
---|
2249 | &jnz (&label("fast_dec_loop"));
|
---|
2250 | &mov ($key,$_tmp); # load temp ivp
|
---|
2251 | &mov ($acc,$_ivp); # load user ivp
|
---|
2252 | &mov ($s0,&DWP(0,$key)); # load iv
|
---|
2253 | &mov ($s1,&DWP(4,$key));
|
---|
2254 | &mov ($s2,&DWP(8,$key));
|
---|
2255 | &mov ($s3,&DWP(12,$key));
|
---|
2256 | &mov (&DWP(0,$acc),$s0); # copy back to user
|
---|
2257 | &mov (&DWP(4,$acc),$s1);
|
---|
2258 | &mov (&DWP(8,$acc),$s2);
|
---|
2259 | &mov (&DWP(12,$acc),$s3);
|
---|
2260 | &jmp (&label("fast_dec_out"));
|
---|
2261 |
|
---|
2262 | &set_label("fast_dec_in_place",16);
|
---|
2263 | &set_label("fast_dec_in_place_loop");
|
---|
2264 | &mov ($s0,&DWP(0,$acc)); # read input
|
---|
2265 | &mov ($s1,&DWP(4,$acc));
|
---|
2266 | &mov ($s2,&DWP(8,$acc));
|
---|
2267 | &mov ($s3,&DWP(12,$acc));
|
---|
2268 |
|
---|
2269 | &lea ($key,$ivec);
|
---|
2270 | &mov (&DWP(0,$key),$s0); # copy to temp
|
---|
2271 | &mov (&DWP(4,$key),$s1);
|
---|
2272 | &mov (&DWP(8,$key),$s2);
|
---|
2273 | &mov (&DWP(12,$key),$s3);
|
---|
2274 |
|
---|
2275 | &mov ($key,$_key); # load key
|
---|
2276 | &call ("_x86_AES_decrypt");
|
---|
2277 |
|
---|
2278 | &mov ($key,$_ivp); # load ivp
|
---|
2279 | &mov ($acc,$_out); # load out
|
---|
2280 | &xor ($s0,&DWP(0,$key)); # xor iv
|
---|
2281 | &xor ($s1,&DWP(4,$key));
|
---|
2282 | &xor ($s2,&DWP(8,$key));
|
---|
2283 | &xor ($s3,&DWP(12,$key));
|
---|
2284 |
|
---|
2285 | &mov (&DWP(0,$acc),$s0); # write output
|
---|
2286 | &mov (&DWP(4,$acc),$s1);
|
---|
2287 | &mov (&DWP(8,$acc),$s2);
|
---|
2288 | &mov (&DWP(12,$acc),$s3);
|
---|
2289 |
|
---|
2290 | &lea ($acc,&DWP(16,$acc)); # advance out
|
---|
2291 | &mov ($_out,$acc); # save out
|
---|
2292 |
|
---|
2293 | &lea ($acc,$ivec);
|
---|
2294 | &mov ($s0,&DWP(0,$acc)); # read temp
|
---|
2295 | &mov ($s1,&DWP(4,$acc));
|
---|
2296 | &mov ($s2,&DWP(8,$acc));
|
---|
2297 | &mov ($s3,&DWP(12,$acc));
|
---|
2298 |
|
---|
2299 | &mov (&DWP(0,$key),$s0); # copy iv ($key still holds the user ivp)
|
---|
2300 | &mov (&DWP(4,$key),$s1);
|
---|
2301 | &mov (&DWP(8,$key),$s2);
|
---|
2302 | &mov (&DWP(12,$key),$s3);
|
---|
2303 |
|
---|
2304 | &mov ($acc,$_inp); # load inp
|
---|
2305 | &mov ($s2,$_len); # load len
|
---|
2306 | &lea ($acc,&DWP(16,$acc)); # advance inp
|
---|
2307 | &mov ($_inp,$acc); # save inp
|
---|
2308 | &sub ($s2,16); # decrease len
|
---|
2309 | &mov ($_len,$s2); # save len
|
---|
2310 | &jnz (&label("fast_dec_in_place_loop"));
|
---|
2311 |
|
---|
2312 | &set_label("fast_dec_out",4);
|
---|
2313 | &cmp ($mark,0); # was the key schedule copied?
|
---|
2314 | &mov ("edi",$_key);
|
---|
2315 | &je (&label("skip_dzero"));
|
---|
2316 | # zero copy of key schedule
|
---|
2317 | &mov ("ecx",240/4); # 60 dwords = 240 bytes
|
---|
2318 | &xor ("eax","eax");
|
---|
2319 | &align (4);
|
---|
2320 | &data_word(0xABF3F689); # rep stosd
|
---|
2321 | &set_label("skip_dzero");
|
---|
2322 | &mov ("esp",$_esp);
|
---|
2323 | &popf ();
|
---|
2324 | &function_end_A();
|
---|
2325 | &pushf (); # kludge, never executed
|
---|
2326 |
|
---|
2327 | #--------------------------- SLOW ROUTINE ---------------------------#
|
---|
# Reached when len is below $speed_limit, when len is not a multiple of
# 16, or (unless $x86only) when bit 28 of OPENSSL_ia32cap_P is set.  This
# path calls the *_compact cipher routines.
2328 | &set_label("slow_way",16);
|
---|
2329 |
|
---|
2330 | &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
|
---|
2331 | &mov ($key,&wparam(3)); # load key
|
---|
2332 |
|
---|
2333 | # pre-allocate aligned stack frame...
|
---|
2334 | &lea ($acc,&DWP(-80,"esp"));
|
---|
2335 | &and ($acc,-64);
|
---|
2336 |
|
---|
2337 | # ... and make sure it doesn't alias with $key modulo 1024
|
---|
2338 | &lea ($s1,&DWP(-80-63,$key));
|
---|
2339 | &sub ($s1,$acc);
|
---|
2340 | &neg ($s1);
|
---|
2341 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
|
---|
2342 | &sub ($acc,$s1);
|
---|
2343 |
|
---|
2344 | # pick S-box copy which can't overlap with stack frame or $key
|
---|
2345 | &lea ($s1,&DWP(768,$acc));
|
---|
2346 | &sub ($s1,$tbl);
|
---|
2347 | &and ($s1,0x300); # keeps bits 8-9: offset is 0, 256, 512 or 768
|
---|
2348 | &lea ($tbl,&DWP(2048+128,$tbl,$s1));
|
---|
2349 |
|
---|
2350 | &lea ($s3,&wparam(0)); # pointer to parameter block
|
---|
2351 |
|
---|
2352 | &exch ("esp",$acc); # switch to the new frame; $acc now holds the old %esp
|
---|
2353 | &add ("esp",4); # reserve for return address!
|
---|
2354 | &mov ($_tbl,$tbl); # save %ebp
|
---|
2355 | &mov ($_esp,$acc); # save %esp
|
---|
2356 | &mov ($_tmp,$s0); # save OPENSSL_ia32cap
|
---|
2357 |
|
---|
2358 | &mov ($s0,&DWP(0,$s3)); # load inp
|
---|
2359 | &mov ($s1,&DWP(4,$s3)); # load out
|
---|
2360 | #&mov ($s2,&DWP(8,$s3)); # load len
|
---|
2361 | #&mov ($key,&DWP(12,$s3)); # load key
|
---|
2362 | &mov ($acc,&DWP(16,$s3)); # load ivp
|
---|
2363 | &mov ($s3,&DWP(20,$s3)); # load enc flag
|
---|
2364 |
|
---|
2365 | &mov ($_inp,$s0); # save copy of inp
|
---|
2366 | &mov ($_out,$s1); # save copy of out
|
---|
2367 | &mov ($_len,$s2); # save copy of len
|
---|
2368 | &mov ($_key,$key); # save copy of key
|
---|
2369 | &mov ($_ivp,$acc); # save copy of ivp
|
---|
2370 |
|
---|
2371 | &mov ($key,$acc); # $key = ivp
|
---|
2372 | &mov ($acc,$s0); # $acc = inp
|
---|
2373 |
|
---|
2374 | &cmp ($s3,0); # enc flag == 0 selects decryption
|
---|
2375 | &je (&label("slow_decrypt"));
|
---|
2376 |
|
---|
2377 | #--------------------------- SLOW ENCRYPT ---------------------------#
|
---|
# On entry $acc = inp, $key = ivp, $s1 = out and $s2 = len.  Whole blocks
# go through the MMX-register loop or the integer loop.  A trailing
# partial block is copied to out, padded with zero bytes to 16 bytes, and
# then encrypted in place by one more pass of the integer loop.
2378 | &cmp ($s2,16);
|
---|
2379 | &mov ($s3,$s1); # $s3 = out, as slow_enc_tail expects
|
---|
2380 | &jb (&label("slow_enc_tail")); # fewer than 16 bytes in total
|
---|
2381 |
|
---|
2382 | if (!$x86only) {
|
---|
2383 | &bt ($_tmp,25); # check for SSE bit
|
---|
2384 | &jnc (&label("slow_enc_x86"));
|
---|
2385 |
|
---|
2386 | &movq ("mm0",&QWP(0,$key)); # load iv
|
---|
2387 | &movq ("mm4",&QWP(8,$key));
|
---|
2388 |
|
---|
2389 | &set_label("slow_enc_loop_sse",16);
|
---|
2390 | &pxor ("mm0",&QWP(0,$acc)); # xor input data
|
---|
2391 | &pxor ("mm4",&QWP(8,$acc));
|
---|
2392 |
|
---|
2393 | &mov ($key,$_key);
|
---|
2394 | &call ("_sse_AES_encrypt_compact");
|
---|
2395 |
|
---|
2396 | &mov ($acc,$_inp); # load inp
|
---|
2397 | &mov ($key,$_out); # load out
|
---|
2398 | &mov ($s2,$_len); # load len
|
---|
2399 |
|
---|
2400 | &movq (&QWP(0,$key),"mm0"); # save output data
|
---|
2401 | &movq (&QWP(8,$key),"mm4");
|
---|
2402 |
|
---|
2403 | &lea ($acc,&DWP(16,$acc)); # advance inp
|
---|
2404 | &mov ($_inp,$acc); # save inp
|
---|
2405 | &lea ($s3,&DWP(16,$key)); # advance out
|
---|
2406 | &mov ($_out,$s3); # save out
|
---|
2407 | &sub ($s2,16); # decrease len
|
---|
2408 | &cmp ($s2,16);
|
---|
2409 | &mov ($_len,$s2); # save len
|
---|
2410 | &jae (&label("slow_enc_loop_sse")); # at least one more whole block
|
---|
2411 | &test ($s2,15);
|
---|
2412 | &jnz (&label("slow_enc_tail")); # 1..15 bytes left over
|
---|
2413 | &mov ($acc,$_ivp); # load ivp
|
---|
2414 | &movq (&QWP(0,$acc),"mm0"); # save ivec
|
---|
2415 | &movq (&QWP(8,$acc),"mm4");
|
---|
2416 | &emms ();
|
---|
2417 | &mov ("esp",$_esp);
|
---|
2418 | &popf ();
|
---|
2419 | &function_end_A();
|
---|
2420 | &pushf (); # kludge, never executed
|
---|
2421 | }
|
---|
2422 | &set_label("slow_enc_x86",16);
|
---|
2423 | &mov ($s0,&DWP(0,$key)); # load iv
|
---|
2424 | &mov ($s1,&DWP(4,$key));
|
---|
2425 |
|
---|
2426 | &set_label("slow_enc_loop_x86",4);
|
---|
2427 | &mov ($s2,&DWP(8,$key));
|
---|
2428 | &mov ($s3,&DWP(12,$key));
|
---|
2429 |
|
---|
2430 | &xor ($s0,&DWP(0,$acc)); # xor input data
|
---|
2431 | &xor ($s1,&DWP(4,$acc));
|
---|
2432 | &xor ($s2,&DWP(8,$acc));
|
---|
2433 | &xor ($s3,&DWP(12,$acc));
|
---|
2434 |
|
---|
2435 | &mov ($key,$_key); # load key
|
---|
2436 | &call ("_x86_AES_encrypt_compact");
|
---|
2437 |
|
---|
2438 | &mov ($acc,$_inp); # load inp
|
---|
2439 | &mov ($key,$_out); # load out
|
---|
2440 |
|
---|
2441 | &mov (&DWP(0,$key),$s0); # save output data
|
---|
2442 | &mov (&DWP(4,$key),$s1);
|
---|
2443 | &mov (&DWP(8,$key),$s2);
|
---|
2444 | &mov (&DWP(12,$key),$s3);
|
---|
2445 |
|
---|
2446 | &mov ($s2,$_len); # load len
|
---|
2447 | &lea ($acc,&DWP(16,$acc)); # advance inp
|
---|
2448 | &mov ($_inp,$acc); # save inp
|
---|
2449 | &lea ($s3,&DWP(16,$key)); # advance out
|
---|
2450 | &mov ($_out,$s3); # save out
|
---|
2451 | &sub ($s2,16); # decrease len
|
---|
2452 | &cmp ($s2,16);
|
---|
2453 | &mov ($_len,$s2); # save len
|
---|
2454 | &jae (&label("slow_enc_loop_x86")); # at least one more whole block
|
---|
2455 | &test ($s2,15);
|
---|
2456 | &jnz (&label("slow_enc_tail")); # 1..15 bytes left over
|
---|
2457 | &mov ($acc,$_ivp); # load ivp
|
---|
2458 | &mov ($s2,&DWP(8,$key)); # restore last dwords
|
---|
2459 | &mov ($s3,&DWP(12,$key));
|
---|
2460 | &mov (&DWP(0,$acc),$s0); # save ivec
|
---|
2461 | &mov (&DWP(4,$acc),$s1);
|
---|
2462 | &mov (&DWP(8,$acc),$s2);
|
---|
2463 | &mov (&DWP(12,$acc),$s3);
|
---|
2464 |
|
---|
2465 | &mov ("esp",$_esp);
|
---|
2466 | &popf ();
|
---|
2467 | &function_end_A();
|
---|
2468 | &pushf (); # kludge, never executed
|
---|
2469 |
|
---|
2470 | &set_label("slow_enc_tail",16);
|
---|
2471 | &emms () if (!$x86only); # the tail block is finished by the integer loop
|
---|
2472 | &mov ($key eq "edi"? $key:"",$s3); # load out to edi
|
---|
2473 | &mov ($s1,16);
|
---|
2474 | &sub ($s1,$s2); # $s1 = 16 - remaining bytes = number of pad bytes
|
---|
2475 | &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
|
---|
2476 | &je (&label("enc_in_place"));
|
---|
2477 | &align (4);
|
---|
2478 | &data_word(0xA4F3F689); # rep movsb # copy input
|
---|
2479 | &jmp (&label("enc_skip_in_place"));
|
---|
2480 | &set_label("enc_in_place");
|
---|
2481 | &lea ($key,&DWP(0,$key,$s2)); # in place: no copy needed, step past the tail bytes
|
---|
2482 | &set_label("enc_skip_in_place");
|
---|
2483 | &mov ($s2,$s1); # stosb count = number of pad bytes
|
---|
2484 | &xor ($s0,$s0); # pad byte value is zero
|
---|
2485 | &align (4);
|
---|
2486 | &data_word(0xAAF3F689); # rep stosb # zero tail
|
---|
2487 |
|
---|
2488 | &mov ($key,$_ivp); # restore ivp
|
---|
2489 | &mov ($acc,$s3); # output as input
|
---|
2490 | &mov ($s0,&DWP(0,$key));
|
---|
2491 | &mov ($s1,&DWP(4,$key));
|
---|
2492 | &mov ($_len,16); # len=16
|
---|
2493 | &jmp (&label("slow_enc_loop_x86")); # one more spin...
|
---|
2494 |
|
---|
2495 | #--------------------------- SLOW DECRYPT ---------------------------#
|
---|
# On entry $acc = inp.  Each pass decrypts one 16-byte input block, xors
# it with the iv stored at the user's ivp and then overwrites that iv with
# the input block.  When fewer than 16 bytes of output remain, the result
# is staged in $ivec and only the remaining bytes are copied to out.
2496 | &set_label("slow_decrypt",16);
|
---|
2497 | if (!$x86only) {
|
---|
2498 | &bt ($_tmp,25); # check for SSE bit
|
---|
2499 | &jnc (&label("slow_dec_loop_x86"));
|
---|
2500 |
|
---|
2501 | &set_label("slow_dec_loop_sse",4);
|
---|
2502 | &movq ("mm0",&QWP(0,$acc)); # read input
|
---|
2503 | &movq ("mm4",&QWP(8,$acc));
|
---|
2504 |
|
---|
2505 | &mov ($key,$_key);
|
---|
2506 | &call ("_sse_AES_decrypt_compact");
|
---|
2507 |
|
---|
2508 | &mov ($acc,$_inp); # load inp
|
---|
2509 | &lea ($s0,$ivec); # $s0 = staging buffer for a partial final block
|
---|
2510 | &mov ($s1,$_out); # load out
|
---|
2511 | &mov ($s2,$_len); # load len
|
---|
2512 | &mov ($key,$_ivp); # load ivp
|
---|
2513 |
|
---|
2514 | &movq ("mm1",&QWP(0,$acc)); # re-read input
|
---|
2515 | &movq ("mm5",&QWP(8,$acc));
|
---|
2516 |
|
---|
2517 | &pxor ("mm0",&QWP(0,$key)); # xor iv
|
---|
2518 | &pxor ("mm4",&QWP(8,$key));
|
---|
2519 |
|
---|
2520 | &movq (&QWP(0,$key),"mm1"); # copy input to iv
|
---|
2521 | &movq (&QWP(8,$key),"mm5");
|
---|
2522 |
|
---|
2523 | &sub ($s2,16); # decrease len
|
---|
2524 | &jc (&label("slow_dec_partial_sse")); # borrow: fewer than 16 bytes were left
|
---|
2525 |
|
---|
2526 | &movq (&QWP(0,$s1),"mm0"); # write output
|
---|
2527 | &movq (&QWP(8,$s1),"mm4");
|
---|
2528 |
|
---|
2529 | &lea ($s1,&DWP(16,$s1)); # advance out
|
---|
2530 | &mov ($_out,$s1); # save out
|
---|
2531 | &lea ($acc,&DWP(16,$acc)); # advance inp
|
---|
2532 | &mov ($_inp,$acc); # save inp
|
---|
2533 | &mov ($_len,$s2); # save len
|
---|
2534 | &jnz (&label("slow_dec_loop_sse")); # ZF still from the sub above (movq/lea/mov leave flags alone)
|
---|
2535 | &emms ();
|
---|
2536 | &mov ("esp",$_esp);
|
---|
2537 | &popf ();
|
---|
2538 | &function_end_A();
|
---|
2539 | &pushf (); # kludge, never executed
|
---|
2540 |
|
---|
2541 | &set_label("slow_dec_partial_sse",16);
|
---|
2542 | &movq (&QWP(0,$s0),"mm0"); # save output to temp
|
---|
2543 | &movq (&QWP(8,$s0),"mm4");
|
---|
2544 | &emms ();
|
---|
2545 |
|
---|
2546 | &add ($s2 eq "ecx" ? "ecx":"",16); # undo the subtraction: ecx = bytes remaining (below 16)
|
---|
2547 | &mov ("edi",$s1); # out
|
---|
2548 | &mov ("esi",$s0); # temp
|
---|
2549 | &align (4);
|
---|
2550 | &data_word(0xA4F3F689); # rep movsb # copy partial output
|
---|
2551 |
|
---|
2552 | &mov ("esp",$_esp);
|
---|
2553 | &popf ();
|
---|
2554 | &function_end_A();
|
---|
2555 | &pushf (); # kludge, never executed
|
---|
2556 | }
|
---|
2557 | &set_label("slow_dec_loop_x86",16);
|
---|
2558 | &mov ($s0,&DWP(0,$acc)); # read input
|
---|
2559 | &mov ($s1,&DWP(4,$acc));
|
---|
2560 | &mov ($s2,&DWP(8,$acc));
|
---|
2561 | &mov ($s3,&DWP(12,$acc));
|
---|
2562 |
|
---|
2563 | &lea ($key,$ivec);
|
---|
2564 | &mov (&DWP(0,$key),$s0); # copy to temp
|
---|
2565 | &mov (&DWP(4,$key),$s1);
|
---|
2566 | &mov (&DWP(8,$key),$s2);
|
---|
2567 | &mov (&DWP(12,$key),$s3);
|
---|
2568 |
|
---|
2569 | &mov ($key,$_key); # load key
|
---|
2570 | &call ("_x86_AES_decrypt_compact");
|
---|
2571 |
|
---|
2572 | &mov ($key,$_ivp); # load ivp
|
---|
2573 | &mov ($acc,$_len); # load len
|
---|
2574 | &xor ($s0,&DWP(0,$key)); # xor iv
|
---|
2575 | &xor ($s1,&DWP(4,$key));
|
---|
2576 | &xor ($s2,&DWP(8,$key));
|
---|
2577 | &xor ($s3,&DWP(12,$key));
|
---|
2578 |
|
---|
2579 | &sub ($acc,16);
|
---|
2580 | &jc (&label("slow_dec_partial_x86")); # borrow: fewer than 16 bytes were left; $_len is not updated on this path
|
---|
2581 |
|
---|
2582 | &mov ($_len,$acc); # save len
|
---|
2583 | &mov ($acc,$_out); # load out
|
---|
2584 |
|
---|
2585 | &mov (&DWP(0,$acc),$s0); # write output
|
---|
2586 | &mov (&DWP(4,$acc),$s1);
|
---|
2587 | &mov (&DWP(8,$acc),$s2);
|
---|
2588 | &mov (&DWP(12,$acc),$s3);
|
---|
2589 |
|
---|
2590 | &lea ($acc,&DWP(16,$acc)); # advance out
|
---|
2591 | &mov ($_out,$acc); # save out
|
---|
2592 |
|
---|
2593 | &lea ($acc,$ivec);
|
---|
2594 | &mov ($s0,&DWP(0,$acc)); # read temp
|
---|
2595 | &mov ($s1,&DWP(4,$acc));
|
---|
2596 | &mov ($s2,&DWP(8,$acc));
|
---|
2597 | &mov ($s3,&DWP(12,$acc));
|
---|
2598 |
|
---|
2599 | &mov (&DWP(0,$key),$s0); # copy it to iv
|
---|
2600 | &mov (&DWP(4,$key),$s1);
|
---|
2601 | &mov (&DWP(8,$key),$s2);
|
---|
2602 | &mov (&DWP(12,$key),$s3);
|
---|
2603 |
|
---|
2604 | &mov ($acc,$_inp); # load inp
|
---|
2605 | &lea ($acc,&DWP(16,$acc)); # advance inp
|
---|
2606 | &mov ($_inp,$acc); # save inp
|
---|
2607 | &jnz (&label("slow_dec_loop_x86")); # ZF still from the sub above; only mov/lea were emitted in between
|
---|
2608 | &mov ("esp",$_esp);
|
---|
2609 | &popf ();
|
---|
2610 | &function_end_A();
|
---|
2611 | &pushf (); # kludge, never executed
|
---|
2612 |
|
---|
2613 | &set_label("slow_dec_partial_x86",16);
|
---|
2614 | &lea ($acc,$ivec);
|
---|
2615 | &mov (&DWP(0,$acc),$s0); # save output to temp
|
---|
2616 | &mov (&DWP(4,$acc),$s1);
|
---|
2617 | &mov (&DWP(8,$acc),$s2);
|
---|
2618 | &mov (&DWP(12,$acc),$s3);
|
---|
2619 |
|
---|
2620 | &mov ($acc,$_inp);
|
---|
2621 | &mov ($s0,&DWP(0,$acc)); # re-read input
|
---|
2622 | &mov ($s1,&DWP(4,$acc));
|
---|
2623 | &mov ($s2,&DWP(8,$acc));
|
---|
2624 | &mov ($s3,&DWP(12,$acc));
|
---|
2625 |
|
---|
2626 | &mov (&DWP(0,$key),$s0); # copy it to iv
|
---|
2627 | &mov (&DWP(4,$key),$s1);
|
---|
2628 | &mov (&DWP(8,$key),$s2);
|
---|
2629 | &mov (&DWP(12,$key),$s3);
|
---|
2630 |
|
---|
2631 | &mov ("ecx",$_len); # bytes remaining (below 16)
|
---|
2632 | &mov ("edi",$_out);
|
---|
2633 | &lea ("esi",$ivec);
|
---|
2634 | &align (4);
|
---|
2635 | &data_word(0xA4F3F689); # rep movsb # copy partial output
|
---|
2636 |
|
---|
2637 | &mov ("esp",$_esp);
|
---|
2638 | &popf ();
|
---|
2639 | &function_end("AES_cbc_encrypt");
|
---|
2640 | }
|
---|
2641 |
|
---|
2642 | #------------------------------------------------------------------#
|
---|
2643 |
|
---|
2644 | sub enckey()
|
---|
# Emits one key-expansion step.  Each of the four bytes of edx indexes a
# byte table at -128($tbl); the looked-up bytes are shifted into place and
# xored into eax, followed by the dword at 1024-128($tbl,ecx,4).
# In:       eax = word being updated, edx = word supplying the index bytes
#           (the visible "10loop" caller loads rk[0] and rk[3]), ecx = rcon
#           index, $tbl = table base biased by +128.
# Out:      eax = updated word.
# Clobbers: ebx, esi, edx (shifted right by 16) and the flags.
2645 | {
|
---|
2646 | &movz ("esi",&LB("edx")); # rk[i]>>0
|
---|
2647 | &movz ("ebx",&BP(-128,$tbl,"esi",1));
|
---|
2648 | &movz ("esi",&HB("edx")); # rk[i]>>8
|
---|
2649 | &shl ("ebx",24); # lookup of byte 0 goes to bits 24..31
|
---|
2650 | &xor ("eax","ebx");
|
---|
2651 |
|
---|
2652 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); # lookup of byte 1 stays in bits 0..7
|
---|
2653 | &shr ("edx",16); # bring bytes 2 and 3 down into dl/dh
|
---|
2654 | &movz ("esi",&LB("edx")); # rk[i]>>16
|
---|
2655 | &xor ("eax","ebx");
|
---|
2656 |
|
---|
2657 | &movz ("ebx",&BP(-128,$tbl,"esi",1));
|
---|
2658 | &movz ("esi",&HB("edx")); # rk[i]>>24
|
---|
2659 | &shl ("ebx",8); # lookup of byte 2 goes to bits 8..15
|
---|
2660 | &xor ("eax","ebx");
|
---|
2661 |
|
---|
2662 | &movz ("ebx",&BP(-128,$tbl,"esi",1));
|
---|
2663 | &shl ("ebx",16); # lookup of byte 3 goes to bits 16..23
|
---|
2664 | &xor ("eax","ebx");
|
---|
2665 |
|
---|
2666 | &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
|
---|
2667 | }
|
---|
2668 |
|
---|
# _x86_AES_set_encrypt_key -- internal worker that expands a user key into
# the encryption key schedule.  It is reached via "call" from the public
# AES_set_encrypt_key/AES_set_decrypt_key entry points.
#
# Arguments as read below:
#   wparam(1) -> esi : user supplied key
#   wparam(2) -> ecx : number of bits in key (128, 192 or 256)
#   wparam(3) -> edi : key schedule to fill in
# NOTE(review): the indices are one higher than the positions in the public
# prototype; presumably the caller's return address occupies wparam(0) --
# confirm against the perlasm wparam()/function_begin() definitions.
#
# Result in eax:
#    0  success
#   -1  user key or key schedule pointer is NULL
#   -2  unsupported number of bits
#
# On success the number of rounds (10/12/14) is stored at byte offset 240
# from the start of the schedule; the per-size offsets used below
# (80, 72, 48) are relative to edi after it has been advanced by the loop
# (10*16, 7*24 and 6*32 bytes respectively).
&function_begin("_x86_AES_set_encrypt_key");
	&mov	("esi",&wparam(1));		# user supplied key
	&mov	("edi",&wparam(3));		# private key schedule

	&test	("esi",-1);
	&jz	(&label("badpointer"));
	&test	("edi",-1);
	&jz	(&label("badpointer"));

	# position-independent way to obtain the table address
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($tbl);
	&lea	($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
	&lea	($tbl,&DWP(2048+128,$tbl));	# bias by +128, see enckey()

	# prefetch Te4
	&mov	("eax",&DWP(0-128,$tbl));
	&mov	("ebx",&DWP(32-128,$tbl));
	&mov	("ecx",&DWP(64-128,$tbl));
	&mov	("edx",&DWP(96-128,$tbl));
	&mov	("eax",&DWP(128-128,$tbl));
	&mov	("ebx",&DWP(160-128,$tbl));
	&mov	("ecx",&DWP(192-128,$tbl));
	&mov	("edx",&DWP(224-128,$tbl));

	&mov	("ecx",&wparam(2));		# number of bits in key
	&cmp	("ecx",128);
	&je	(&label("10rounds"));
	&cmp	("ecx",192);
	&je	(&label("12rounds"));
	&cmp	("ecx",256);
	&je	(&label("14rounds"));
	&mov	("eax",-2);			# invalid number of bits
	&jmp	(&label("exit"));

    &set_label("10rounds");
	&mov	("eax",&DWP(0,"esi"));		# copy first 4 dwords
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edx",&DWP(12,"esi"));
	&mov	(&DWP(0,"edi"),"eax");
	&mov	(&DWP(4,"edi"),"ebx");
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	# eax/edx already hold rk[0]/rk[3] from the copy above, so the first
	# iteration enters past the two loads at the top of the loop
	&xor	("ecx","ecx");
	&jmp	(&label("10shortcut"));

	&align	(4);
	&set_label("10loop");
		&mov	("eax",&DWP(0,"edi"));		# rk[0]
		&mov	("edx",&DWP(12,"edi"));		# rk[3]
	&set_label("10shortcut");
		&enckey	();

		&mov	(&DWP(16,"edi"),"eax");		# rk[4]
		&xor	("eax",&DWP(4,"edi"));
		&mov	(&DWP(20,"edi"),"eax");		# rk[5]
		&xor	("eax",&DWP(8,"edi"));
		&mov	(&DWP(24,"edi"),"eax");		# rk[6]
		&xor	("eax",&DWP(12,"edi"));
		&mov	(&DWP(28,"edi"),"eax");		# rk[7]
		&inc	("ecx");
		&add	("edi",16);
		&cmp	("ecx",10);
	&jl	(&label("10loop"));

	&mov	(&DWP(80,"edi"),10);		# setup number of rounds
	&xor	("eax","eax");
	&jmp	(&label("exit"));

    &set_label("12rounds");
	&mov	("eax",&DWP(0,"esi"));		# copy first 6 dwords
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edx",&DWP(12,"esi"));
	&mov	(&DWP(0,"edi"),"eax");
	&mov	(&DWP(4,"edi"),"ebx");
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");
	&mov	("ecx",&DWP(16,"esi"));
	&mov	("edx",&DWP(20,"esi"));
	&mov	(&DWP(16,"edi"),"ecx");
	&mov	(&DWP(20,"edi"),"edx");

	# eax/edx already hold rk[0]/rk[5] from the copy above
	&xor	("ecx","ecx");
	&jmp	(&label("12shortcut"));

	&align	(4);
	&set_label("12loop");
		&mov	("eax",&DWP(0,"edi"));		# rk[0]
		&mov	("edx",&DWP(20,"edi"));		# rk[5]
	&set_label("12shortcut");
		&enckey	();

		&mov	(&DWP(24,"edi"),"eax");		# rk[6]
		&xor	("eax",&DWP(4,"edi"));
		&mov	(&DWP(28,"edi"),"eax");		# rk[7]
		&xor	("eax",&DWP(8,"edi"));
		&mov	(&DWP(32,"edi"),"eax");		# rk[8]
		&xor	("eax",&DWP(12,"edi"));
		&mov	(&DWP(36,"edi"),"eax");		# rk[9]

		# the final pass stops after four words
		&cmp	("ecx",7);
		&je	(&label("12break"));
		&inc	("ecx");

		&xor	("eax",&DWP(16,"edi"));
		&mov	(&DWP(40,"edi"),"eax");		# rk[10]
		&xor	("eax",&DWP(20,"edi"));
		&mov	(&DWP(44,"edi"),"eax");		# rk[11]

		&add	("edi",24);
	&jmp	(&label("12loop"));

	&set_label("12break");
	&mov	(&DWP(72,"edi"),12);		# setup number of rounds
	&xor	("eax","eax");
	&jmp	(&label("exit"));

    &set_label("14rounds");
	&mov	("eax",&DWP(0,"esi"));		# copy first 8 dwords
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edx",&DWP(12,"esi"));
	&mov	(&DWP(0,"edi"),"eax");
	&mov	(&DWP(4,"edi"),"ebx");
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");
	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("edx",&DWP(28,"esi"));
	&mov	(&DWP(16,"edi"),"eax");
	&mov	(&DWP(20,"edi"),"ebx");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	# edx already holds rk[7] from the copy above; eax was reused for
	# rk[4], so rk[0] is reloaded after the shortcut label
	&xor	("ecx","ecx");
	&jmp	(&label("14shortcut"));

	&align	(4);
	&set_label("14loop");
		&mov	("edx",&DWP(28,"edi"));		# rk[7]
	&set_label("14shortcut");
		&mov	("eax",&DWP(0,"edi"));		# rk[0]

		&enckey	();

		&mov	(&DWP(32,"edi"),"eax");		# rk[8]
		&xor	("eax",&DWP(4,"edi"));
		&mov	(&DWP(36,"edi"),"eax");		# rk[9]
		&xor	("eax",&DWP(8,"edi"));
		&mov	(&DWP(40,"edi"),"eax");		# rk[10]
		&xor	("eax",&DWP(12,"edi"));
		&mov	(&DWP(44,"edi"),"eax");		# rk[11]

		# the final pass stops after four words
		&cmp	("ecx",6);
		&je	(&label("14break"));
		&inc	("ecx");

		# second half of the 256-bit step: table lookup of rk[11]
		# without byte rotation and without rcon
		&mov	("edx","eax");
		&mov	("eax",&DWP(16,"edi"));		# rk[4]
		&movz	("esi",&LB("edx"));		# rk[11]>>0
		&movz	("ebx",&BP(-128,$tbl,"esi",1));
		&movz	("esi",&HB("edx"));		# rk[11]>>8
		&xor	("eax","ebx");

		&movz	("ebx",&BP(-128,$tbl,"esi",1));
		&shr	("edx",16);
		&shl	("ebx",8);
		&movz	("esi",&LB("edx"));		# rk[11]>>16
		&xor	("eax","ebx");

		&movz	("ebx",&BP(-128,$tbl,"esi",1));
		&movz	("esi",&HB("edx"));		# rk[11]>>24
		&shl	("ebx",16);
		&xor	("eax","ebx");

		&movz	("ebx",&BP(-128,$tbl,"esi",1));
		&shl	("ebx",24);
		&xor	("eax","ebx");

		&mov	(&DWP(48,"edi"),"eax");		# rk[12]
		&xor	("eax",&DWP(20,"edi"));
		&mov	(&DWP(52,"edi"),"eax");		# rk[13]
		&xor	("eax",&DWP(24,"edi"));
		&mov	(&DWP(56,"edi"),"eax");		# rk[14]
		&xor	("eax",&DWP(28,"edi"));
		&mov	(&DWP(60,"edi"),"eax");		# rk[15]

		&add	("edi",32);
	&jmp	(&label("14loop"));

	&set_label("14break");
	&mov	(&DWP(48,"edi"),14);		# setup number of rounds
	&xor	("eax","eax");
	&jmp	(&label("exit"));

	&set_label("badpointer");
	&mov	("eax",-1);
	&set_label("exit");
&function_end("_x86_AES_set_encrypt_key");

# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
#                        AES_KEY *key)
#
# Public entry point: a bare call to _x86_AES_set_encrypt_key followed by
# ret.  No registers are pushed and eax is not touched after the call, so
# the worker's result is returned unchanged.
&function_begin_B("AES_set_encrypt_key");
	&call	("_x86_AES_set_encrypt_key");
	&ret	();
&function_end_B("AES_set_encrypt_key");

# deckey($i,$key,$tp1,$tp2,$tp4,$tp8) -- emit code that transforms one
# 32-bit word of the key schedule in place.
#
#   $i    word index within the current 16-byte chunk (0..3)
#   $key  register pointing at the chunk
#   $tp1  register that already holds word $i on entry
#   $tp2, $tp4, $tp8  scratch registers; they receive the word multiplied
#         by 2, 4 and 8 in GF(2^8), computed on all four bytes at once
#         (0x80808080 picks the bytes that overflow, 0xfefefefe drops the
#         bits carried across byte boundaries, 0x1b1b1b1b is the reduction
#         constant)
#
# The value stored to word $i is
#     0x0e*x ^ ROTATE(0x09*x,8) ^ ROTATE(0x0d*x,16) ^ ROTATE(0x0b*x,24)
# with x the original word, i.e. the inverse MixColumns transform of x.
#
# Word $i+1 is loaded in the middle of the computation and left in $tp2,
# so the caller has to pass this call's $tp2 as the next call's $tp1.
# $tbl is used as an extra scratch register ($tmp) and $acc is clobbered.
sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
  my $tmp = $tbl;

	# tp2 = 2*tp1
	&mov	($tmp,0x80808080);
	&and	($tmp,$tp1);
	&lea	($tp2,&DWP(0,$tp1,$tp1));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&sub	($acc,$tmp);
	&and	($tp2,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&xor	($tp2,$acc);
	&mov	($tmp,0x80808080);

	# tp4 = 2*tp2
	&and	($tmp,$tp2);
	&lea	($tp4,&DWP(0,$tp2,$tp2));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&sub	($acc,$tmp);
	&and	($tp4,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &xor	($tp2,$tp1);	# tp2^tp1
	&xor	($tp4,$acc);
	&mov	($tmp,0x80808080);

	# tp8 = 2*tp4
	&and	($tmp,$tp4);
	&lea	($tp8,&DWP(0,$tp4,$tp4));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	 &xor	($tp4,$tp1);	# tp4^tp1
	&sub	($acc,$tmp);
	&and	($tp8,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &rotl	($tp1,8);	# = ROTATE(tp1,8)
	&xor	($tp8,$acc);

	&mov	($tmp,&DWP(4*($i+1),$key));	# modulo-scheduled load

	&xor	($tp1,$tp2);
	&xor	($tp2,$tp8);
	&xor	($tp1,$tp4);
	&rotl	($tp2,24);
	&xor	($tp4,$tp8);
	&xor	($tp1,$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
	&rotl	($tp4,16);
	&xor	($tp1,$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
	&rotl	($tp8,8);
	&xor	($tp1,$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
	&mov	($tp2,$tmp);	# hand the next word over to the next call
	&xor	($tp1,$tp8);	# ^= ROTATE(tp8,8)

	&mov	(&DWP(4*$i,$key),$tp1);
}

# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
#                        AES_KEY *key)
#
# Builds the decryption key schedule in three steps:
#   1. call _x86_AES_set_encrypt_key; a non-zero result is returned to the
#      caller as is, before any register has been pushed;
#   2. reverse the order of the 16-byte round keys in place;
#   3. run deckey() over every round key except the first and the last.
# Returns 0 in eax on success.
#
# After the four pushes wparam(2) is the key schedule pointer (third
# argument of the prototype above).  That stack slot is later overwritten
# with the end pointer of the permute loop.
# NOTE(review): &function_end is expected to restore the pushed registers
# and return -- confirm against the perlasm definition.
&function_begin_B("AES_set_decrypt_key");
	&call	("_x86_AES_set_encrypt_key");
	&cmp	("eax",0);
	&je	(&label("proceed"));
	&ret	();

    &set_label("proceed");
	&push	("ebp");
	&push	("ebx");
	&push	("esi");
	&push	("edi");

	&mov	("esi",&wparam(2));
	&mov	("ecx",&DWP(240,"esi"));	# pull number of rounds
	&lea	("ecx",&DWP(0,"","ecx",4));
	&lea	("edi",&DWP(0,"esi","ecx",4));	# pointer to last chunk

	# esi walks up and edi walks down 16 bytes per pass, starting
	# rounds*16 bytes apart.  The exit test is an equality test, so the
	# pointers must meet exactly; that holds for the even round counts
	# (10/12/14) stored by _x86_AES_set_encrypt_key.
	&set_label("invert",4);			# invert order of chunks
		&mov	("eax",&DWP(0,"esi"));
		&mov	("ebx",&DWP(4,"esi"));
		&mov	("ecx",&DWP(0,"edi"));
		&mov	("edx",&DWP(4,"edi"));
		&mov	(&DWP(0,"edi"),"eax");
		&mov	(&DWP(4,"edi"),"ebx");
		&mov	(&DWP(0,"esi"),"ecx");
		&mov	(&DWP(4,"esi"),"edx");
		&mov	("eax",&DWP(8,"esi"));
		&mov	("ebx",&DWP(12,"esi"));
		&mov	("ecx",&DWP(8,"edi"));
		&mov	("edx",&DWP(12,"edi"));
		&mov	(&DWP(8,"edi"),"eax");
		&mov	(&DWP(12,"edi"),"ebx");
		&mov	(&DWP(8,"esi"),"ecx");
		&mov	(&DWP(12,"esi"),"edx");
		&add	("esi",16);
		&sub	("edi",16);
		&cmp	("esi","edi");
	&jne	(&label("invert"));

	&mov	($key,&wparam(2));
	&mov	($acc,&DWP(240,$key));		# pull number of rounds
	&lea	($acc,&DWP(-2,$acc,$acc));	# 2*rounds-2
	&lea	($acc,&DWP(0,$key,$acc,8));	# key+16*(rounds-1)
	&mov	(&wparam(2),$acc);		# end pointer, replaces the argument

	&mov	($s0,&DWP(16,$key));		# modulo-scheduled load
	&set_label("permute",4);		# permute the key schedule
		&add	($key,16);
		# each call leaves the next word in its second register
		# argument, hence the rotating register assignment
		&deckey	(0,$key,$s0,$s1,$s2,$s3);
		&deckey	(1,$key,$s1,$s2,$s3,$s0);
		&deckey	(2,$key,$s2,$s3,$s0,$s1);
		&deckey	(3,$key,$s3,$s0,$s1,$s2);
		&cmp	($key,&wparam(2));
	&jb	(&label("permute"));

	&xor	("eax","eax");			# return success
&function_end("AES_set_decrypt_key");
# Identification string, placed after the last function of the module.
&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# The script writes to STDOUT; fail loudly if closing it reports an error.
close STDOUT or die "error closing STDOUT: $!";