/*
 * From strongswan.git: src/libstrongswan/plugins/aesni/aesni_ecb.c
 * (page captured at commit "charon-nm: Set DPD/close action to restart and
 * enable indefinite keying tries")
 */
1 /*
2 * Copyright (C) 2015 Martin Willi
3 * Copyright (C) 2015 revosec AG
4 *
5 * Copyright (C) 2019 Andreas Steffen
6 * HSR Hochschule fuer Technik Rapperswil
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the
10 * Free Software Foundation; either version 2 of the License, or (at your
11 * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 * for more details.
17 */
18
19 #include "aesni_ecb.h"
20 #include "aesni_key.h"
21
/**
 * Pipeline parallelism we use for ECB encryption/decryption
 */
#define ECB_PARALLELISM 4

typedef struct private_aesni_ecb_t private_aesni_ecb_t;

/**
 * ECB en/decryption method type: processes the given number of AES blocks
 * from in to out using the supplied key schedule
 */
typedef void (*aesni_ecb_fn_t)(aesni_key_t*, u_int, u_char*, u_char*);

/**
 * Private data of an aesni_ecb_t object.
 */
struct private_aesni_ecb_t {

	/**
	 * Public aesni_ecb_t interface.
	 */
	aesni_ecb_t public;

	/**
	 * Key size in bytes (16, 24 or 32)
	 */
	u_int key_size;

	/**
	 * Encryption key schedule, NULL until set_key() succeeded
	 */
	aesni_key_t *ekey;

	/**
	 * Decryption key schedule, NULL until set_key() succeeded
	 */
	aesni_key_t *dkey;

	/**
	 * Encryption method, selected by key size
	 */
	aesni_ecb_fn_t encrypt;

	/**
	 * Decryption method, selected by key size
	 */
	aesni_ecb_fn_t decrypt;
};
69
70 /**
71 * AES-128 ECB encryption
72 */
73 static void encrypt_ecb128(aesni_key_t *key, u_int blocks, u_char *in,
74 u_char *out)
75 {
76 __m128i *ks, *bi, *bo;
77 __m128i t1, t2, t3, t4;
78 u_int i, pblocks;
79
80 ks = key->schedule;
81 bi = (__m128i*)in;
82 bo = (__m128i*)out;
83 pblocks = blocks - (blocks % ECB_PARALLELISM);
84
85 for (i = 0; i < pblocks; i += ECB_PARALLELISM)
86 {
87 t1 = _mm_loadu_si128(bi + i + 0);
88 t2 = _mm_loadu_si128(bi + i + 1);
89 t3 = _mm_loadu_si128(bi + i + 2);
90 t4 = _mm_loadu_si128(bi + i + 3);
91
92 t1 = _mm_xor_si128(t1, ks[0]);
93 t2 = _mm_xor_si128(t2, ks[0]);
94 t3 = _mm_xor_si128(t3, ks[0]);
95 t4 = _mm_xor_si128(t4, ks[0]);
96
97 t1 = _mm_aesenc_si128(t1, ks[1]);
98 t2 = _mm_aesenc_si128(t2, ks[1]);
99 t3 = _mm_aesenc_si128(t3, ks[1]);
100 t4 = _mm_aesenc_si128(t4, ks[1]);
101 t1 = _mm_aesenc_si128(t1, ks[2]);
102 t2 = _mm_aesenc_si128(t2, ks[2]);
103 t3 = _mm_aesenc_si128(t3, ks[2]);
104 t4 = _mm_aesenc_si128(t4, ks[2]);
105 t1 = _mm_aesenc_si128(t1, ks[3]);
106 t2 = _mm_aesenc_si128(t2, ks[3]);
107 t3 = _mm_aesenc_si128(t3, ks[3]);
108 t4 = _mm_aesenc_si128(t4, ks[3]);
109 t1 = _mm_aesenc_si128(t1, ks[4]);
110 t2 = _mm_aesenc_si128(t2, ks[4]);
111 t3 = _mm_aesenc_si128(t3, ks[4]);
112 t4 = _mm_aesenc_si128(t4, ks[4]);
113 t1 = _mm_aesenc_si128(t1, ks[5]);
114 t2 = _mm_aesenc_si128(t2, ks[5]);
115 t3 = _mm_aesenc_si128(t3, ks[5]);
116 t4 = _mm_aesenc_si128(t4, ks[5]);
117 t1 = _mm_aesenc_si128(t1, ks[6]);
118 t2 = _mm_aesenc_si128(t2, ks[6]);
119 t3 = _mm_aesenc_si128(t3, ks[6]);
120 t4 = _mm_aesenc_si128(t4, ks[6]);
121 t1 = _mm_aesenc_si128(t1, ks[7]);
122 t2 = _mm_aesenc_si128(t2, ks[7]);
123 t3 = _mm_aesenc_si128(t3, ks[7]);
124 t4 = _mm_aesenc_si128(t4, ks[7]);
125 t1 = _mm_aesenc_si128(t1, ks[8]);
126 t2 = _mm_aesenc_si128(t2, ks[8]);
127 t3 = _mm_aesenc_si128(t3, ks[8]);
128 t4 = _mm_aesenc_si128(t4, ks[8]);
129 t1 = _mm_aesenc_si128(t1, ks[9]);
130 t2 = _mm_aesenc_si128(t2, ks[9]);
131 t3 = _mm_aesenc_si128(t3, ks[9]);
132 t4 = _mm_aesenc_si128(t4, ks[9]);
133
134 t1 = _mm_aesenclast_si128(t1, ks[10]);
135 t2 = _mm_aesenclast_si128(t2, ks[10]);
136 t3 = _mm_aesenclast_si128(t3, ks[10]);
137 t4 = _mm_aesenclast_si128(t4, ks[10]);
138
139 _mm_storeu_si128(bo + i + 0, t1);
140 _mm_storeu_si128(bo + i + 1, t2);
141 _mm_storeu_si128(bo + i + 2, t3);
142 _mm_storeu_si128(bo + i + 3, t4);
143 }
144
145 for (i = pblocks; i < blocks; i++)
146 {
147 t1 = _mm_loadu_si128(bi + i);
148 t1 = _mm_xor_si128(t1, ks[0]);
149
150 t1 = _mm_aesenc_si128(t1, ks[1]);
151 t1 = _mm_aesenc_si128(t1, ks[2]);
152 t1 = _mm_aesenc_si128(t1, ks[3]);
153 t1 = _mm_aesenc_si128(t1, ks[4]);
154 t1 = _mm_aesenc_si128(t1, ks[5]);
155 t1 = _mm_aesenc_si128(t1, ks[6]);
156 t1 = _mm_aesenc_si128(t1, ks[7]);
157 t1 = _mm_aesenc_si128(t1, ks[8]);
158 t1 = _mm_aesenc_si128(t1, ks[9]);
159
160 t1 = _mm_aesenclast_si128(t1, ks[10]);
161 _mm_storeu_si128(bo + i, t1);
162 }
163 }
164
165 /**
166 * AES-128 ECB decryption
167 */
168 static void decrypt_ecb128(aesni_key_t *key, u_int blocks, u_char *in,
169 u_char *out)
170 {
171 __m128i *ks, *bi, *bo;
172 __m128i t1, t2, t3, t4;
173 u_int i, pblocks;
174
175 ks = key->schedule;
176 bi = (__m128i*)in;
177 bo = (__m128i*)out;
178 pblocks = blocks - (blocks % ECB_PARALLELISM);
179
180 for (i = 0; i < pblocks; i += ECB_PARALLELISM)
181 {
182 t1 = _mm_loadu_si128(bi + i + 0);
183 t2 = _mm_loadu_si128(bi + i + 1);
184 t3 = _mm_loadu_si128(bi + i + 2);
185 t4 = _mm_loadu_si128(bi + i + 3);
186
187 t1 = _mm_xor_si128(t1, ks[0]);
188 t2 = _mm_xor_si128(t2, ks[0]);
189 t3 = _mm_xor_si128(t3, ks[0]);
190 t4 = _mm_xor_si128(t4, ks[0]);
191
192 t1 = _mm_aesdec_si128(t1, ks[1]);
193 t2 = _mm_aesdec_si128(t2, ks[1]);
194 t3 = _mm_aesdec_si128(t3, ks[1]);
195 t4 = _mm_aesdec_si128(t4, ks[1]);
196 t1 = _mm_aesdec_si128(t1, ks[2]);
197 t2 = _mm_aesdec_si128(t2, ks[2]);
198 t3 = _mm_aesdec_si128(t3, ks[2]);
199 t4 = _mm_aesdec_si128(t4, ks[2]);
200 t1 = _mm_aesdec_si128(t1, ks[3]);
201 t2 = _mm_aesdec_si128(t2, ks[3]);
202 t3 = _mm_aesdec_si128(t3, ks[3]);
203 t4 = _mm_aesdec_si128(t4, ks[3]);
204 t1 = _mm_aesdec_si128(t1, ks[4]);
205 t2 = _mm_aesdec_si128(t2, ks[4]);
206 t3 = _mm_aesdec_si128(t3, ks[4]);
207 t4 = _mm_aesdec_si128(t4, ks[4]);
208 t1 = _mm_aesdec_si128(t1, ks[5]);
209 t2 = _mm_aesdec_si128(t2, ks[5]);
210 t3 = _mm_aesdec_si128(t3, ks[5]);
211 t4 = _mm_aesdec_si128(t4, ks[5]);
212 t1 = _mm_aesdec_si128(t1, ks[6]);
213 t2 = _mm_aesdec_si128(t2, ks[6]);
214 t3 = _mm_aesdec_si128(t3, ks[6]);
215 t4 = _mm_aesdec_si128(t4, ks[6]);
216 t1 = _mm_aesdec_si128(t1, ks[7]);
217 t2 = _mm_aesdec_si128(t2, ks[7]);
218 t3 = _mm_aesdec_si128(t3, ks[7]);
219 t4 = _mm_aesdec_si128(t4, ks[7]);
220 t1 = _mm_aesdec_si128(t1, ks[8]);
221 t2 = _mm_aesdec_si128(t2, ks[8]);
222 t3 = _mm_aesdec_si128(t3, ks[8]);
223 t4 = _mm_aesdec_si128(t4, ks[8]);
224 t1 = _mm_aesdec_si128(t1, ks[9]);
225 t2 = _mm_aesdec_si128(t2, ks[9]);
226 t3 = _mm_aesdec_si128(t3, ks[9]);
227 t4 = _mm_aesdec_si128(t4, ks[9]);
228
229 t1 = _mm_aesdeclast_si128(t1, ks[10]);
230 t2 = _mm_aesdeclast_si128(t2, ks[10]);
231 t3 = _mm_aesdeclast_si128(t3, ks[10]);
232 t4 = _mm_aesdeclast_si128(t4, ks[10]);
233
234 _mm_storeu_si128(bo + i + 0, t1);
235 _mm_storeu_si128(bo + i + 1, t2);
236 _mm_storeu_si128(bo + i + 2, t3);
237 _mm_storeu_si128(bo + i + 3, t4);
238 }
239
240 for (i = pblocks; i < blocks; i++)
241 {
242 t1 = _mm_loadu_si128(bi + i);
243 t1 = _mm_xor_si128(t1, ks[0]);
244
245 t1 = _mm_aesdec_si128(t1, ks[1]);
246 t1 = _mm_aesdec_si128(t1, ks[2]);
247 t1 = _mm_aesdec_si128(t1, ks[3]);
248 t1 = _mm_aesdec_si128(t1, ks[4]);
249 t1 = _mm_aesdec_si128(t1, ks[5]);
250 t1 = _mm_aesdec_si128(t1, ks[6]);
251 t1 = _mm_aesdec_si128(t1, ks[7]);
252 t1 = _mm_aesdec_si128(t1, ks[8]);
253 t1 = _mm_aesdec_si128(t1, ks[9]);
254
255 t1 = _mm_aesdeclast_si128(t1, ks[10]);
256 _mm_storeu_si128(bo + i, t1);
257 }
258 }
259
260 /**
261 * AES-192 ECB encryption
262 */
263 static void encrypt_ecb192(aesni_key_t *key, u_int blocks, u_char *in,
264 u_char *out)
265 {
266 __m128i *ks, *bi, *bo;
267 __m128i t1, t2, t3, t4;
268 u_int i, pblocks;
269
270 ks = key->schedule;
271 bi = (__m128i*)in;
272 bo = (__m128i*)out;
273 pblocks = blocks - (blocks % ECB_PARALLELISM);
274
275 for (i = 0; i < pblocks; i += ECB_PARALLELISM)
276 {
277 t1 = _mm_loadu_si128(bi + i + 0);
278 t2 = _mm_loadu_si128(bi + i + 1);
279 t3 = _mm_loadu_si128(bi + i + 2);
280 t4 = _mm_loadu_si128(bi + i + 3);
281
282 t1 = _mm_xor_si128(t1, ks[0]);
283 t2 = _mm_xor_si128(t2, ks[0]);
284 t3 = _mm_xor_si128(t3, ks[0]);
285 t4 = _mm_xor_si128(t4, ks[0]);
286
287 t1 = _mm_aesenc_si128(t1, ks[1]);
288 t2 = _mm_aesenc_si128(t2, ks[1]);
289 t3 = _mm_aesenc_si128(t3, ks[1]);
290 t4 = _mm_aesenc_si128(t4, ks[1]);
291 t1 = _mm_aesenc_si128(t1, ks[2]);
292 t2 = _mm_aesenc_si128(t2, ks[2]);
293 t3 = _mm_aesenc_si128(t3, ks[2]);
294 t4 = _mm_aesenc_si128(t4, ks[2]);
295 t1 = _mm_aesenc_si128(t1, ks[3]);
296 t2 = _mm_aesenc_si128(t2, ks[3]);
297 t3 = _mm_aesenc_si128(t3, ks[3]);
298 t4 = _mm_aesenc_si128(t4, ks[3]);
299 t1 = _mm_aesenc_si128(t1, ks[4]);
300 t2 = _mm_aesenc_si128(t2, ks[4]);
301 t3 = _mm_aesenc_si128(t3, ks[4]);
302 t4 = _mm_aesenc_si128(t4, ks[4]);
303 t1 = _mm_aesenc_si128(t1, ks[5]);
304 t2 = _mm_aesenc_si128(t2, ks[5]);
305 t3 = _mm_aesenc_si128(t3, ks[5]);
306 t4 = _mm_aesenc_si128(t4, ks[5]);
307 t1 = _mm_aesenc_si128(t1, ks[6]);
308 t2 = _mm_aesenc_si128(t2, ks[6]);
309 t3 = _mm_aesenc_si128(t3, ks[6]);
310 t4 = _mm_aesenc_si128(t4, ks[6]);
311 t1 = _mm_aesenc_si128(t1, ks[7]);
312 t2 = _mm_aesenc_si128(t2, ks[7]);
313 t3 = _mm_aesenc_si128(t3, ks[7]);
314 t4 = _mm_aesenc_si128(t4, ks[7]);
315 t1 = _mm_aesenc_si128(t1, ks[8]);
316 t2 = _mm_aesenc_si128(t2, ks[8]);
317 t3 = _mm_aesenc_si128(t3, ks[8]);
318 t4 = _mm_aesenc_si128(t4, ks[8]);
319 t1 = _mm_aesenc_si128(t1, ks[9]);
320 t2 = _mm_aesenc_si128(t2, ks[9]);
321 t3 = _mm_aesenc_si128(t3, ks[9]);
322 t4 = _mm_aesenc_si128(t4, ks[9]);
323 t1 = _mm_aesenc_si128(t1, ks[10]);
324 t2 = _mm_aesenc_si128(t2, ks[10]);
325 t3 = _mm_aesenc_si128(t3, ks[10]);
326 t4 = _mm_aesenc_si128(t4, ks[10]);
327 t1 = _mm_aesenc_si128(t1, ks[11]);
328 t2 = _mm_aesenc_si128(t2, ks[11]);
329 t3 = _mm_aesenc_si128(t3, ks[11]);
330 t4 = _mm_aesenc_si128(t4, ks[11]);
331
332 t1 = _mm_aesenclast_si128(t1, ks[12]);
333 t2 = _mm_aesenclast_si128(t2, ks[12]);
334 t3 = _mm_aesenclast_si128(t3, ks[12]);
335 t4 = _mm_aesenclast_si128(t4, ks[12]);
336
337 _mm_storeu_si128(bo + i + 0, t1);
338 _mm_storeu_si128(bo + i + 1, t2);
339 _mm_storeu_si128(bo + i + 2, t3);
340 _mm_storeu_si128(bo + i + 3, t4);
341 }
342
343 for (i = pblocks; i < blocks; i++)
344 {
345 t1 = _mm_loadu_si128(bi + i);
346 t1 = _mm_xor_si128(t1, ks[0]);
347
348 t1 = _mm_aesenc_si128(t1, ks[1]);
349 t1 = _mm_aesenc_si128(t1, ks[2]);
350 t1 = _mm_aesenc_si128(t1, ks[3]);
351 t1 = _mm_aesenc_si128(t1, ks[4]);
352 t1 = _mm_aesenc_si128(t1, ks[5]);
353 t1 = _mm_aesenc_si128(t1, ks[6]);
354 t1 = _mm_aesenc_si128(t1, ks[7]);
355 t1 = _mm_aesenc_si128(t1, ks[8]);
356 t1 = _mm_aesenc_si128(t1, ks[9]);
357 t1 = _mm_aesenc_si128(t1, ks[10]);
358 t1 = _mm_aesenc_si128(t1, ks[11]);
359
360 t1 = _mm_aesenclast_si128(t1, ks[12]);
361 _mm_storeu_si128(bo + i, t1);
362 }
363 }
364
365 /**
366 * AES-192 ECB decryption
367 */
368 static void decrypt_ecb192(aesni_key_t *key, u_int blocks, u_char *in,
369 u_char *out)
370 {
371 __m128i *ks, *bi, *bo;
372 __m128i t1, t2, t3, t4;
373 u_int i, pblocks;
374
375 ks = key->schedule;
376 bi = (__m128i*)in;
377 bo = (__m128i*)out;
378 pblocks = blocks - (blocks % ECB_PARALLELISM);
379
380 for (i = 0; i < pblocks; i += ECB_PARALLELISM)
381 {
382 t1 = _mm_loadu_si128(bi + i + 0);
383 t2 = _mm_loadu_si128(bi + i + 1);
384 t3 = _mm_loadu_si128(bi + i + 2);
385 t4 = _mm_loadu_si128(bi + i + 3);
386
387 t1 = _mm_xor_si128(t1, ks[0]);
388 t2 = _mm_xor_si128(t2, ks[0]);
389 t3 = _mm_xor_si128(t3, ks[0]);
390 t4 = _mm_xor_si128(t4, ks[0]);
391
392 t1 = _mm_aesdec_si128(t1, ks[1]);
393 t2 = _mm_aesdec_si128(t2, ks[1]);
394 t3 = _mm_aesdec_si128(t3, ks[1]);
395 t4 = _mm_aesdec_si128(t4, ks[1]);
396 t1 = _mm_aesdec_si128(t1, ks[2]);
397 t2 = _mm_aesdec_si128(t2, ks[2]);
398 t3 = _mm_aesdec_si128(t3, ks[2]);
399 t4 = _mm_aesdec_si128(t4, ks[2]);
400 t1 = _mm_aesdec_si128(t1, ks[3]);
401 t2 = _mm_aesdec_si128(t2, ks[3]);
402 t3 = _mm_aesdec_si128(t3, ks[3]);
403 t4 = _mm_aesdec_si128(t4, ks[3]);
404 t1 = _mm_aesdec_si128(t1, ks[4]);
405 t2 = _mm_aesdec_si128(t2, ks[4]);
406 t3 = _mm_aesdec_si128(t3, ks[4]);
407 t4 = _mm_aesdec_si128(t4, ks[4]);
408 t1 = _mm_aesdec_si128(t1, ks[5]);
409 t2 = _mm_aesdec_si128(t2, ks[5]);
410 t3 = _mm_aesdec_si128(t3, ks[5]);
411 t4 = _mm_aesdec_si128(t4, ks[5]);
412 t1 = _mm_aesdec_si128(t1, ks[6]);
413 t2 = _mm_aesdec_si128(t2, ks[6]);
414 t3 = _mm_aesdec_si128(t3, ks[6]);
415 t4 = _mm_aesdec_si128(t4, ks[6]);
416 t1 = _mm_aesdec_si128(t1, ks[7]);
417 t2 = _mm_aesdec_si128(t2, ks[7]);
418 t3 = _mm_aesdec_si128(t3, ks[7]);
419 t4 = _mm_aesdec_si128(t4, ks[7]);
420 t1 = _mm_aesdec_si128(t1, ks[8]);
421 t2 = _mm_aesdec_si128(t2, ks[8]);
422 t3 = _mm_aesdec_si128(t3, ks[8]);
423 t4 = _mm_aesdec_si128(t4, ks[8]);
424 t1 = _mm_aesdec_si128(t1, ks[9]);
425 t2 = _mm_aesdec_si128(t2, ks[9]);
426 t3 = _mm_aesdec_si128(t3, ks[9]);
427 t4 = _mm_aesdec_si128(t4, ks[9]);
428 t1 = _mm_aesdec_si128(t1, ks[10]);
429 t2 = _mm_aesdec_si128(t2, ks[10]);
430 t3 = _mm_aesdec_si128(t3, ks[10]);
431 t4 = _mm_aesdec_si128(t4, ks[10]);
432 t1 = _mm_aesdec_si128(t1, ks[11]);
433 t2 = _mm_aesdec_si128(t2, ks[11]);
434 t3 = _mm_aesdec_si128(t3, ks[11]);
435 t4 = _mm_aesdec_si128(t4, ks[11]);
436
437 t1 = _mm_aesdeclast_si128(t1, ks[12]);
438 t2 = _mm_aesdeclast_si128(t2, ks[12]);
439 t3 = _mm_aesdeclast_si128(t3, ks[12]);
440 t4 = _mm_aesdeclast_si128(t4, ks[12]);
441
442 _mm_storeu_si128(bo + i + 0, t1);
443 _mm_storeu_si128(bo + i + 1, t2);
444 _mm_storeu_si128(bo + i + 2, t3);
445 _mm_storeu_si128(bo + i + 3, t4);
446 }
447
448 for (i = pblocks; i < blocks; i++)
449 {
450 t1 = _mm_loadu_si128(bi + i);
451 t1 = _mm_xor_si128(t1, ks[0]);
452
453 t1 = _mm_aesdec_si128(t1, ks[1]);
454 t1 = _mm_aesdec_si128(t1, ks[2]);
455 t1 = _mm_aesdec_si128(t1, ks[3]);
456 t1 = _mm_aesdec_si128(t1, ks[4]);
457 t1 = _mm_aesdec_si128(t1, ks[5]);
458 t1 = _mm_aesdec_si128(t1, ks[6]);
459 t1 = _mm_aesdec_si128(t1, ks[7]);
460 t1 = _mm_aesdec_si128(t1, ks[8]);
461 t1 = _mm_aesdec_si128(t1, ks[9]);
462 t1 = _mm_aesdec_si128(t1, ks[10]);
463 t1 = _mm_aesdec_si128(t1, ks[11]);
464
465 t1 = _mm_aesdeclast_si128(t1, ks[12]);
466 _mm_storeu_si128(bo + i, t1);
467 }
468 }
469
470 /**
471 * AES-256 ECB encryption
472 */
473 static void encrypt_ecb256(aesni_key_t *key, u_int blocks, u_char *in,
474 u_char *out)
475 {
476 __m128i *ks, *bi, *bo;
477 __m128i t1, t2, t3, t4;
478 u_int i, pblocks;
479
480 ks = key->schedule;
481 bi = (__m128i*)in;
482 bo = (__m128i*)out;
483 pblocks = blocks - (blocks % ECB_PARALLELISM);
484
485 for (i = 0; i < pblocks; i += ECB_PARALLELISM)
486 {
487 t1 = _mm_loadu_si128(bi + i + 0);
488 t2 = _mm_loadu_si128(bi + i + 1);
489 t3 = _mm_loadu_si128(bi + i + 2);
490 t4 = _mm_loadu_si128(bi + i + 3);
491
492 t1 = _mm_xor_si128(t1, ks[0]);
493 t2 = _mm_xor_si128(t2, ks[0]);
494 t3 = _mm_xor_si128(t3, ks[0]);
495 t4 = _mm_xor_si128(t4, ks[0]);
496
497 t1 = _mm_aesenc_si128(t1, ks[1]);
498 t2 = _mm_aesenc_si128(t2, ks[1]);
499 t3 = _mm_aesenc_si128(t3, ks[1]);
500 t4 = _mm_aesenc_si128(t4, ks[1]);
501 t1 = _mm_aesenc_si128(t1, ks[2]);
502 t2 = _mm_aesenc_si128(t2, ks[2]);
503 t3 = _mm_aesenc_si128(t3, ks[2]);
504 t4 = _mm_aesenc_si128(t4, ks[2]);
505 t1 = _mm_aesenc_si128(t1, ks[3]);
506 t2 = _mm_aesenc_si128(t2, ks[3]);
507 t3 = _mm_aesenc_si128(t3, ks[3]);
508 t4 = _mm_aesenc_si128(t4, ks[3]);
509 t1 = _mm_aesenc_si128(t1, ks[4]);
510 t2 = _mm_aesenc_si128(t2, ks[4]);
511 t3 = _mm_aesenc_si128(t3, ks[4]);
512 t4 = _mm_aesenc_si128(t4, ks[4]);
513 t1 = _mm_aesenc_si128(t1, ks[5]);
514 t2 = _mm_aesenc_si128(t2, ks[5]);
515 t3 = _mm_aesenc_si128(t3, ks[5]);
516 t4 = _mm_aesenc_si128(t4, ks[5]);
517 t1 = _mm_aesenc_si128(t1, ks[6]);
518 t2 = _mm_aesenc_si128(t2, ks[6]);
519 t3 = _mm_aesenc_si128(t3, ks[6]);
520 t4 = _mm_aesenc_si128(t4, ks[6]);
521 t1 = _mm_aesenc_si128(t1, ks[7]);
522 t2 = _mm_aesenc_si128(t2, ks[7]);
523 t3 = _mm_aesenc_si128(t3, ks[7]);
524 t4 = _mm_aesenc_si128(t4, ks[7]);
525 t1 = _mm_aesenc_si128(t1, ks[8]);
526 t2 = _mm_aesenc_si128(t2, ks[8]);
527 t3 = _mm_aesenc_si128(t3, ks[8]);
528 t4 = _mm_aesenc_si128(t4, ks[8]);
529 t1 = _mm_aesenc_si128(t1, ks[9]);
530 t2 = _mm_aesenc_si128(t2, ks[9]);
531 t3 = _mm_aesenc_si128(t3, ks[9]);
532 t4 = _mm_aesenc_si128(t4, ks[9]);
533 t1 = _mm_aesenc_si128(t1, ks[10]);
534 t2 = _mm_aesenc_si128(t2, ks[10]);
535 t3 = _mm_aesenc_si128(t3, ks[10]);
536 t4 = _mm_aesenc_si128(t4, ks[10]);
537 t1 = _mm_aesenc_si128(t1, ks[11]);
538 t2 = _mm_aesenc_si128(t2, ks[11]);
539 t3 = _mm_aesenc_si128(t3, ks[11]);
540 t4 = _mm_aesenc_si128(t4, ks[11]);
541 t1 = _mm_aesenc_si128(t1, ks[12]);
542 t2 = _mm_aesenc_si128(t2, ks[12]);
543 t3 = _mm_aesenc_si128(t3, ks[12]);
544 t4 = _mm_aesenc_si128(t4, ks[12]);
545 t1 = _mm_aesenc_si128(t1, ks[13]);
546 t2 = _mm_aesenc_si128(t2, ks[13]);
547 t3 = _mm_aesenc_si128(t3, ks[13]);
548 t4 = _mm_aesenc_si128(t4, ks[13]);
549
550 t1 = _mm_aesenclast_si128(t1, ks[14]);
551 t2 = _mm_aesenclast_si128(t2, ks[14]);
552 t3 = _mm_aesenclast_si128(t3, ks[14]);
553 t4 = _mm_aesenclast_si128(t4, ks[14]);
554
555 _mm_storeu_si128(bo + i + 0, t1);
556 _mm_storeu_si128(bo + i + 1, t2);
557 _mm_storeu_si128(bo + i + 2, t3);
558 _mm_storeu_si128(bo + i + 3, t4);
559 }
560
561 for (i = pblocks; i < blocks; i++)
562 {
563 t1 = _mm_loadu_si128(bi + i);
564 t1 = _mm_xor_si128(t1, ks[0]);
565
566 t1 = _mm_aesenc_si128(t1, ks[1]);
567 t1 = _mm_aesenc_si128(t1, ks[2]);
568 t1 = _mm_aesenc_si128(t1, ks[3]);
569 t1 = _mm_aesenc_si128(t1, ks[4]);
570 t1 = _mm_aesenc_si128(t1, ks[5]);
571 t1 = _mm_aesenc_si128(t1, ks[6]);
572 t1 = _mm_aesenc_si128(t1, ks[7]);
573 t1 = _mm_aesenc_si128(t1, ks[8]);
574 t1 = _mm_aesenc_si128(t1, ks[9]);
575 t1 = _mm_aesenc_si128(t1, ks[10]);
576 t1 = _mm_aesenc_si128(t1, ks[11]);
577 t1 = _mm_aesenc_si128(t1, ks[12]);
578 t1 = _mm_aesenc_si128(t1, ks[13]);
579
580 t1 = _mm_aesenclast_si128(t1, ks[14]);
581 _mm_storeu_si128(bo + i, t1);
582 }
583 }
584
585 /**
586 * AES-256 ECB decryption
587 */
588 static void decrypt_ecb256(aesni_key_t *key, u_int blocks, u_char *in,
589 u_char *out)
590 {
591 __m128i *ks, *bi, *bo;
592 __m128i t1, t2, t3, t4;
593 u_int i, pblocks;
594
595 ks = key->schedule;
596 bi = (__m128i*)in;
597 bo = (__m128i*)out;
598 pblocks = blocks - (blocks % ECB_PARALLELISM);
599
600 for (i = 0; i < pblocks; i += ECB_PARALLELISM)
601 {
602 t1 = _mm_loadu_si128(bi + i + 0);
603 t2 = _mm_loadu_si128(bi + i + 1);
604 t3 = _mm_loadu_si128(bi + i + 2);
605 t4 = _mm_loadu_si128(bi + i + 3);
606
607 t1 = _mm_xor_si128(t1, ks[0]);
608 t2 = _mm_xor_si128(t2, ks[0]);
609 t3 = _mm_xor_si128(t3, ks[0]);
610 t4 = _mm_xor_si128(t4, ks[0]);
611
612 t1 = _mm_aesdec_si128(t1, ks[1]);
613 t2 = _mm_aesdec_si128(t2, ks[1]);
614 t3 = _mm_aesdec_si128(t3, ks[1]);
615 t4 = _mm_aesdec_si128(t4, ks[1]);
616 t1 = _mm_aesdec_si128(t1, ks[2]);
617 t2 = _mm_aesdec_si128(t2, ks[2]);
618 t3 = _mm_aesdec_si128(t3, ks[2]);
619 t4 = _mm_aesdec_si128(t4, ks[2]);
620 t1 = _mm_aesdec_si128(t1, ks[3]);
621 t2 = _mm_aesdec_si128(t2, ks[3]);
622 t3 = _mm_aesdec_si128(t3, ks[3]);
623 t4 = _mm_aesdec_si128(t4, ks[3]);
624 t1 = _mm_aesdec_si128(t1, ks[4]);
625 t2 = _mm_aesdec_si128(t2, ks[4]);
626 t3 = _mm_aesdec_si128(t3, ks[4]);
627 t4 = _mm_aesdec_si128(t4, ks[4]);
628 t1 = _mm_aesdec_si128(t1, ks[5]);
629 t2 = _mm_aesdec_si128(t2, ks[5]);
630 t3 = _mm_aesdec_si128(t3, ks[5]);
631 t4 = _mm_aesdec_si128(t4, ks[5]);
632 t1 = _mm_aesdec_si128(t1, ks[6]);
633 t2 = _mm_aesdec_si128(t2, ks[6]);
634 t3 = _mm_aesdec_si128(t3, ks[6]);
635 t4 = _mm_aesdec_si128(t4, ks[6]);
636 t1 = _mm_aesdec_si128(t1, ks[7]);
637 t2 = _mm_aesdec_si128(t2, ks[7]);
638 t3 = _mm_aesdec_si128(t3, ks[7]);
639 t4 = _mm_aesdec_si128(t4, ks[7]);
640 t1 = _mm_aesdec_si128(t1, ks[8]);
641 t2 = _mm_aesdec_si128(t2, ks[8]);
642 t3 = _mm_aesdec_si128(t3, ks[8]);
643 t4 = _mm_aesdec_si128(t4, ks[8]);
644 t1 = _mm_aesdec_si128(t1, ks[9]);
645 t2 = _mm_aesdec_si128(t2, ks[9]);
646 t3 = _mm_aesdec_si128(t3, ks[9]);
647 t4 = _mm_aesdec_si128(t4, ks[9]);
648 t1 = _mm_aesdec_si128(t1, ks[10]);
649 t2 = _mm_aesdec_si128(t2, ks[10]);
650 t3 = _mm_aesdec_si128(t3, ks[10]);
651 t4 = _mm_aesdec_si128(t4, ks[10]);
652 t1 = _mm_aesdec_si128(t1, ks[11]);
653 t2 = _mm_aesdec_si128(t2, ks[11]);
654 t3 = _mm_aesdec_si128(t3, ks[11]);
655 t4 = _mm_aesdec_si128(t4, ks[11]);
656 t1 = _mm_aesdec_si128(t1, ks[12]);
657 t2 = _mm_aesdec_si128(t2, ks[12]);
658 t3 = _mm_aesdec_si128(t3, ks[12]);
659 t4 = _mm_aesdec_si128(t4, ks[12]);
660 t1 = _mm_aesdec_si128(t1, ks[13]);
661 t2 = _mm_aesdec_si128(t2, ks[13]);
662 t3 = _mm_aesdec_si128(t3, ks[13]);
663 t4 = _mm_aesdec_si128(t4, ks[13]);
664
665 t1 = _mm_aesdeclast_si128(t1, ks[14]);
666 t2 = _mm_aesdeclast_si128(t2, ks[14]);
667 t3 = _mm_aesdeclast_si128(t3, ks[14]);
668 t4 = _mm_aesdeclast_si128(t4, ks[14]);
669
670 _mm_storeu_si128(bo + i + 0, t1);
671 _mm_storeu_si128(bo + i + 1, t2);
672 _mm_storeu_si128(bo + i + 2, t3);
673 _mm_storeu_si128(bo + i + 3, t4);
674 }
675
676 for (i = pblocks; i < blocks; i++)
677 {
678 t1 = _mm_loadu_si128(bi + i);
679 t1 = _mm_xor_si128(t1, ks[0]);
680
681 t1 = _mm_aesdec_si128(t1, ks[1]);
682 t1 = _mm_aesdec_si128(t1, ks[2]);
683 t1 = _mm_aesdec_si128(t1, ks[3]);
684 t1 = _mm_aesdec_si128(t1, ks[4]);
685 t1 = _mm_aesdec_si128(t1, ks[5]);
686 t1 = _mm_aesdec_si128(t1, ks[6]);
687 t1 = _mm_aesdec_si128(t1, ks[7]);
688 t1 = _mm_aesdec_si128(t1, ks[8]);
689 t1 = _mm_aesdec_si128(t1, ks[9]);
690 t1 = _mm_aesdec_si128(t1, ks[10]);
691 t1 = _mm_aesdec_si128(t1, ks[11]);
692 t1 = _mm_aesdec_si128(t1, ks[12]);
693 t1 = _mm_aesdec_si128(t1, ks[13]);
694
695 t1 = _mm_aesdeclast_si128(t1, ks[14]);
696 _mm_storeu_si128(bo + i, t1);
697 }
698 }
699
700 /**
701 * Do inline or allocated de/encryption using key schedule
702 */
703 static bool crypt(aesni_ecb_fn_t fn, aesni_key_t *key, chunk_t data,
704 chunk_t *out)
705 {
706 u_char *buf;
707
708 if (!key || data.len % AES_BLOCK_SIZE)
709 {
710 return FALSE;
711 }
712 if (out)
713 {
714 *out = chunk_alloc(data.len);
715 buf = out->ptr;
716 }
717 else
718 {
719 buf = data.ptr;
720 }
721 fn(key, data.len / AES_BLOCK_SIZE, data.ptr, buf);
722 return TRUE;
723 }
724
/* crypter_t.encrypt: the IV argument is ignored, ECB uses no IV */
METHOD(crypter_t, encrypt, bool,
	private_aesni_ecb_t *this, chunk_t data, chunk_t iv, chunk_t *encrypted)
{
	return crypt(this->encrypt, this->ekey, data, encrypted);
}
730
/* crypter_t.decrypt: the IV argument is ignored, ECB uses no IV */
METHOD(crypter_t, decrypt, bool,
	private_aesni_ecb_t *this, chunk_t data, chunk_t iv, chunk_t *decrypted)
{
	return crypt(this->decrypt, this->dkey, data, decrypted);
}
736
/* crypter_t.get_block_size */
METHOD(crypter_t, get_block_size, size_t,
	private_aesni_ecb_t *this)
{
	return AES_BLOCK_SIZE;
}
742
/* crypter_t.get_iv_size: ECB mode does not use an IV */
METHOD(crypter_t, get_iv_size, size_t,
	private_aesni_ecb_t *this)
{
	return 0;
}
748
/* crypter_t.get_key_size */
METHOD(crypter_t, get_key_size, size_t,
	private_aesni_ecb_t *this)
{
	return this->key_size;
}
754
/* crypter_t.set_key: derive fresh en-/decryption key schedules */
METHOD(crypter_t, set_key, bool,
	private_aesni_ecb_t *this, chunk_t key)
{
	if (key.len != this->key_size)
	{
		return FALSE;
	}

	/* release any schedules from a previous key */
	DESTROY_IF(this->ekey);
	DESTROY_IF(this->dkey);

	this->ekey = aesni_key_create(TRUE, key);
	this->dkey = aesni_key_create(FALSE, key);

	/* fail if either schedule could not be created */
	return this->ekey && this->dkey;
}
771
/* crypter_t.destroy: release key schedules and the aligned instance */
METHOD(crypter_t, destroy, void,
	private_aesni_ecb_t *this)
{
	DESTROY_IF(this->ekey);
	DESTROY_IF(this->dkey);
	/* free_align() matches the INIT_ALIGN() in aesni_ecb_create() */
	free_align(this);
}
779
780 /**
781 * See header
782 */
783 aesni_ecb_t *aesni_ecb_create(encryption_algorithm_t algo, size_t key_size)
784 {
785 private_aesni_ecb_t *this;
786
787 if (algo != ENCR_AES_ECB)
788 {
789 return NULL;
790 }
791 switch (key_size)
792 {
793 case 0:
794 key_size = 16;
795 break;
796 case 16:
797 case 24:
798 case 32:
799 break;
800 default:
801 return NULL;
802 }
803
804 INIT_ALIGN(this, sizeof(__m128i),
805 .public = {
806 .crypter = {
807 .encrypt = _encrypt,
808 .decrypt = _decrypt,
809 .get_block_size = _get_block_size,
810 .get_iv_size = _get_iv_size,
811 .get_key_size = _get_key_size,
812 .set_key = _set_key,
813 .destroy = _destroy,
814 },
815 },
816 .key_size = key_size,
817 );
818
819 switch (key_size)
820 {
821 case 16:
822 this->encrypt = encrypt_ecb128;
823 this->decrypt = decrypt_ecb128;
824 break;
825 case 24:
826 this->encrypt = encrypt_ecb192;
827 this->decrypt = decrypt_ecb192;
828 break;
829 case 32:
830 this->encrypt = encrypt_ecb256;
831 this->decrypt = decrypt_ecb256;
832 break;
833 }
834
835 return &this->public;
836 }