/*
 * Copyright (C) 2015 Martin Willi
 * Copyright (C) 2015 revosec AG
 *
 * Based on public domain code by Andrew Moon and Daniel J. Bernstein.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 */

#include "chapoly_drv_ssse3.h"

#ifdef __SSSE3__

#include <utils/cpu_feature.h>

#include <tmmintrin.h>

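/* ChaCha20 performs 20 rounds, implemented as 10 iterations of a column
 * round followed by a diagonal round */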
#define CHACHA_DOUBLEROUNDS 10

typedef struct private_chapoly_drv_ssse3_t private_chapoly_drv_ssse3_t;

/**
 * Private data of a chapoly_drv_ssse3_t object.
 */
struct private_chapoly_drv_ssse3_t {

	/**
	 * Public chapoly_drv_ssse3_t interface.
	 */
	chapoly_drv_t public;

	/**
	 * ChaCha20 state matrix, as 128-bit vectors
	 */
	__m128i m[4];

	/**
	 * Poly1305 update key
	 */
	u_int32_t r[5];

	/**
	 * Poly1305 state
	 */
	u_int32_t h[5];

	/**
	 * Poly1305 finalize key
	 */
	u_int32_t s[4];
};

/**
 * Read a 32-bit integer from an unaligned address
 */
static inline u_int32_t ru32(void *p)
{
	u_int32_t ret;

	memcpy(&ret, p, sizeof(ret));
	return ret;
}

/**
 * Write a 32-bit word to an unaligned address
 */
static inline void wu32(void *p, u_int32_t v)
{
	memcpy(p, &v, sizeof(v));
}

/**
 * Shift a 64-bit unsigned integer v right by n bits, truncated to 32 bits
 */
static inline u_int32_t sr(u_int64_t v, u_char n)
{
	return v >> n;
}

/**
 * AND two values, using a native integer size >= sizeof(u_int32_t)
 */
static inline u_long and(u_long v, u_long mask)
{
	return v & mask;
}

/**
 * r = shuffle(a ^ b, s)
 */
static inline __m128i sfflxor32(__m128i a, __m128i b, __m128i s)
{
	return _mm_shuffle_epi8(_mm_xor_si128(a, b), s);
}

/**
 * r = rotl32(a ^ b, r)
 */
static inline __m128i rotlxor32(__m128i a, __m128i b, u_char r)
{
	a = _mm_xor_si128(a, b);
	return _mm_or_si128(_mm_slli_epi32(a, r), _mm_srli_epi32(a, 32 - r));
}

/**
 * XOR a ChaCha20 keystream block into data, incrementing the block counter
 */
static void chacha_block_xor(private_chapoly_drv_ssse3_t *this, void *data)
{
	__m128i x0, x1, x2, x3, r8, r16, *out = data;
	u_int i;

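	/* pshufb masks rotating each 32-bit word left by 8 and 16 bits; a
	 * byte shuffle is cheaper than the shift/shift/or sequence */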
	r8 = _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
	r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);

	x0 = this->m[0];
	x1 = this->m[1];
	x2 = this->m[2];
	x3 = this->m[3];

	for (i = 0; i < CHACHA_DOUBLEROUNDS; i++)
	{
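		/* column round: each step below runs the ChaCha20 quarter round
		 * on all four columns at once. Per column this is the usual:
		 *   a += b; d = rotl32(d ^ a, 16);
		 *   c += d; b = rotl32(b ^ c, 12);
		 *   a += b; d = rotl32(d ^ a,  8);
		 *   c += d; b = rotl32(b ^ c,  7);
		 */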
		x0 = _mm_add_epi32(x0, x1);
		x3 = sfflxor32(x3, x0, r16);

		x2 = _mm_add_epi32(x2, x3);
		x1 = rotlxor32(x1, x2, 12);

		x0 = _mm_add_epi32(x0, x1);
		x3 = sfflxor32(x3, x0, r8);

		x2 = _mm_add_epi32(x2, x3);
		x1 = rotlxor32(x1, x2, 7);

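		/* rotate rows left by 1, 2 and 3 words to move the diagonals
		 * into columns */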
		x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1));
		x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
		x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3));

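		/* diagonal round: the same quarter round on the rotated state */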
		x0 = _mm_add_epi32(x0, x1);
		x3 = sfflxor32(x3, x0, r16);

		x2 = _mm_add_epi32(x2, x3);
		x1 = rotlxor32(x1, x2, 12);

		x0 = _mm_add_epi32(x0, x1);
		x3 = sfflxor32(x3, x0, r8);

		x2 = _mm_add_epi32(x2, x3);
		x1 = rotlxor32(x1, x2, 7);

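		/* rotate rows back to their original positions */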
		x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3));
		x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
		x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1));
	}

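	/* add the initial state, then XOR the keystream into the data block */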
	x0 = _mm_add_epi32(x0, this->m[0]);
	x1 = _mm_add_epi32(x1, this->m[1]);
	x2 = _mm_add_epi32(x2, this->m[2]);
	x3 = _mm_add_epi32(x3, this->m[3]);
	x0 = _mm_xor_si128(x0, _mm_loadu_si128(out + 0));
	x1 = _mm_xor_si128(x1, _mm_loadu_si128(out + 1));
	x2 = _mm_xor_si128(x2, _mm_loadu_si128(out + 2));
	x3 = _mm_xor_si128(x3, _mm_loadu_si128(out + 3));
	_mm_storeu_si128(out + 0, x0);
	_mm_storeu_si128(out + 1, x1);
	_mm_storeu_si128(out + 2, x2);
	_mm_storeu_si128(out + 3, x3);

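	/* increment the 32-bit block counter in the first word of row 3 */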
	this->m[3] = _mm_add_epi32(this->m[3], _mm_set_epi32(0, 0, 0, 1));
}

/**
 * XOR four ChaCha20 keystream blocks into data, incrementing the block counter
 */
static void chacha_4block_xor(private_chapoly_drv_ssse3_t *this, void *data)
{
	__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc, xd, xe, xf;
	__m128i r8, r16, ctrinc, t, *out = data;
	u_int32_t *m = (u_int32_t*)this->m;
	u_int i;

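	/* four blocks are processed in parallel: register xN holds word N of
	 * all four block states, so each SIMD lane is one block */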
	r8 = _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
	r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
	ctrinc = _mm_set_epi32(3, 2, 1, 0);

	x0 = _mm_set1_epi32(m[ 0]);
	x1 = _mm_set1_epi32(m[ 1]);
	x2 = _mm_set1_epi32(m[ 2]);
	x3 = _mm_set1_epi32(m[ 3]);
	x4 = _mm_set1_epi32(m[ 4]);
	x5 = _mm_set1_epi32(m[ 5]);
	x6 = _mm_set1_epi32(m[ 6]);
	x7 = _mm_set1_epi32(m[ 7]);
	x8 = _mm_set1_epi32(m[ 8]);
	x9 = _mm_set1_epi32(m[ 9]);
	xa = _mm_set1_epi32(m[10]);
	xb = _mm_set1_epi32(m[11]);
	xc = _mm_set1_epi32(m[12]);
	xd = _mm_set1_epi32(m[13]);
	xe = _mm_set1_epi32(m[14]);
	xf = _mm_set1_epi32(m[15]);

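	/* give each of the four states its own block counter */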
	xc = _mm_add_epi32(xc, ctrinc);

	for (i = 0; i < CHACHA_DOUBLEROUNDS; i++)
	{
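		/* column round on all four states at once */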
		x0 = _mm_add_epi32(x0, x4); xc = sfflxor32(xc, x0, r16);
		x1 = _mm_add_epi32(x1, x5); xd = sfflxor32(xd, x1, r16);
		x2 = _mm_add_epi32(x2, x6); xe = sfflxor32(xe, x2, r16);
		x3 = _mm_add_epi32(x3, x7); xf = sfflxor32(xf, x3, r16);

		x8 = _mm_add_epi32(x8, xc); x4 = rotlxor32(x4, x8, 12);
		x9 = _mm_add_epi32(x9, xd); x5 = rotlxor32(x5, x9, 12);
		xa = _mm_add_epi32(xa, xe); x6 = rotlxor32(x6, xa, 12);
		xb = _mm_add_epi32(xb, xf); x7 = rotlxor32(x7, xb, 12);

		x0 = _mm_add_epi32(x0, x4); xc = sfflxor32(xc, x0, r8);
		x1 = _mm_add_epi32(x1, x5); xd = sfflxor32(xd, x1, r8);
		x2 = _mm_add_epi32(x2, x6); xe = sfflxor32(xe, x2, r8);
		x3 = _mm_add_epi32(x3, x7); xf = sfflxor32(xf, x3, r8);

		x8 = _mm_add_epi32(x8, xc); x4 = rotlxor32(x4, x8, 7);
		x9 = _mm_add_epi32(x9, xd); x5 = rotlxor32(x5, x9, 7);
		xa = _mm_add_epi32(xa, xe); x6 = rotlxor32(x6, xa, 7);
		xb = _mm_add_epi32(xb, xf); x7 = rotlxor32(x7, xb, 7);

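		/* diagonal round: with one word per register, the diagonals are
		 * addressed by permuting register indices instead of shuffling */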
		x0 = _mm_add_epi32(x0, x5); xf = sfflxor32(xf, x0, r16);
		x1 = _mm_add_epi32(x1, x6); xc = sfflxor32(xc, x1, r16);
		x2 = _mm_add_epi32(x2, x7); xd = sfflxor32(xd, x2, r16);
		x3 = _mm_add_epi32(x3, x4); xe = sfflxor32(xe, x3, r16);

		xa = _mm_add_epi32(xa, xf); x5 = rotlxor32(x5, xa, 12);
		xb = _mm_add_epi32(xb, xc); x6 = rotlxor32(x6, xb, 12);
		x8 = _mm_add_epi32(x8, xd); x7 = rotlxor32(x7, x8, 12);
		x9 = _mm_add_epi32(x9, xe); x4 = rotlxor32(x4, x9, 12);

		x0 = _mm_add_epi32(x0, x5); xf = sfflxor32(xf, x0, r8);
		x1 = _mm_add_epi32(x1, x6); xc = sfflxor32(xc, x1, r8);
		x2 = _mm_add_epi32(x2, x7); xd = sfflxor32(xd, x2, r8);
		x3 = _mm_add_epi32(x3, x4); xe = sfflxor32(xe, x3, r8);

		xa = _mm_add_epi32(xa, xf); x5 = rotlxor32(x5, xa, 7);
		xb = _mm_add_epi32(xb, xc); x6 = rotlxor32(x6, xb, 7);
		x8 = _mm_add_epi32(x8, xd); x7 = rotlxor32(x7, x8, 7);
		x9 = _mm_add_epi32(x9, xe); x4 = rotlxor32(x4, x9, 7);
	}

	x0 = _mm_add_epi32(x0, _mm_set1_epi32(m[ 0]));
	x1 = _mm_add_epi32(x1, _mm_set1_epi32(m[ 1]));
	x2 = _mm_add_epi32(x2, _mm_set1_epi32(m[ 2]));
	x3 = _mm_add_epi32(x3, _mm_set1_epi32(m[ 3]));
	x4 = _mm_add_epi32(x4, _mm_set1_epi32(m[ 4]));
	x5 = _mm_add_epi32(x5, _mm_set1_epi32(m[ 5]));
	x6 = _mm_add_epi32(x6, _mm_set1_epi32(m[ 6]));
	x7 = _mm_add_epi32(x7, _mm_set1_epi32(m[ 7]));
	x8 = _mm_add_epi32(x8, _mm_set1_epi32(m[ 8]));
	x9 = _mm_add_epi32(x9, _mm_set1_epi32(m[ 9]));
	xa = _mm_add_epi32(xa, _mm_set1_epi32(m[10]));
	xb = _mm_add_epi32(xb, _mm_set1_epi32(m[11]));
	xc = _mm_add_epi32(xc, _mm_set1_epi32(m[12]));
	xd = _mm_add_epi32(xd, _mm_set1_epi32(m[13]));
	xe = _mm_add_epi32(xe, _mm_set1_epi32(m[14]));
	xf = _mm_add_epi32(xf, _mm_set1_epi32(m[15]));

	xc = _mm_add_epi32(xc, ctrinc);

	/* transpose state matrix by interleaving 32-, then 64-bit words */
	t = x0; x0 = _mm_unpacklo_epi32(t, x1);
			x1 = _mm_unpackhi_epi32(t, x1);
	t = x2; x2 = _mm_unpacklo_epi32(t, x3);
			x3 = _mm_unpackhi_epi32(t, x3);
	t = x4; x4 = _mm_unpacklo_epi32(t, x5);
			x5 = _mm_unpackhi_epi32(t, x5);
	t = x6; x6 = _mm_unpacklo_epi32(t, x7);
			x7 = _mm_unpackhi_epi32(t, x7);
	t = x8; x8 = _mm_unpacklo_epi32(t, x9);
			x9 = _mm_unpackhi_epi32(t, x9);
	t = xa; xa = _mm_unpacklo_epi32(t, xb);
			xb = _mm_unpackhi_epi32(t, xb);
	t = xc; xc = _mm_unpacklo_epi32(t, xd);
			xd = _mm_unpackhi_epi32(t, xd);
	t = xe; xe = _mm_unpacklo_epi32(t, xf);
			xf = _mm_unpackhi_epi32(t, xf);

	t = x0; x0 = _mm_unpacklo_epi64(t, x2);
			x2 = _mm_unpackhi_epi64(t, x2);
	t = x1; x1 = _mm_unpacklo_epi64(t, x3);
			x3 = _mm_unpackhi_epi64(t, x3);
	t = x4; x4 = _mm_unpacklo_epi64(t, x6);
			x6 = _mm_unpackhi_epi64(t, x6);
	t = x5; x5 = _mm_unpacklo_epi64(t, x7);
			x7 = _mm_unpackhi_epi64(t, x7);
	t = x8; x8 = _mm_unpacklo_epi64(t, xa);
			xa = _mm_unpackhi_epi64(t, xa);
	t = x9; x9 = _mm_unpacklo_epi64(t, xb);
			xb = _mm_unpackhi_epi64(t, xb);
	t = xc; xc = _mm_unpacklo_epi64(t, xe);
			xe = _mm_unpackhi_epi64(t, xe);
	t = xd; xd = _mm_unpacklo_epi64(t, xf);
			xf = _mm_unpackhi_epi64(t, xf);

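	/* after the transpose, block 0 sits in x0/x4/x8/xc, block 1 in
	 * x2/x6/xa/xe, block 2 in x1/x5/x9/xd and block 3 in x3/x7/xb/xf,
	 * hence the interleaved out offsets below */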
	x0 = _mm_xor_si128(_mm_loadu_si128(out + 0), x0);
	x1 = _mm_xor_si128(_mm_loadu_si128(out + 8), x1);
	x2 = _mm_xor_si128(_mm_loadu_si128(out + 4), x2);
	x3 = _mm_xor_si128(_mm_loadu_si128(out + 12), x3);
	x4 = _mm_xor_si128(_mm_loadu_si128(out + 1), x4);
	x5 = _mm_xor_si128(_mm_loadu_si128(out + 9), x5);
	x6 = _mm_xor_si128(_mm_loadu_si128(out + 5), x6);
	x7 = _mm_xor_si128(_mm_loadu_si128(out + 13), x7);
	x8 = _mm_xor_si128(_mm_loadu_si128(out + 2), x8);
	x9 = _mm_xor_si128(_mm_loadu_si128(out + 10), x9);
	xa = _mm_xor_si128(_mm_loadu_si128(out + 6), xa);
	xb = _mm_xor_si128(_mm_loadu_si128(out + 14), xb);
	xc = _mm_xor_si128(_mm_loadu_si128(out + 3), xc);
	xd = _mm_xor_si128(_mm_loadu_si128(out + 11), xd);
	xe = _mm_xor_si128(_mm_loadu_si128(out + 7), xe);
	xf = _mm_xor_si128(_mm_loadu_si128(out + 15), xf);

	_mm_storeu_si128(out + 0, x0);
	_mm_storeu_si128(out + 8, x1);
	_mm_storeu_si128(out + 4, x2);
	_mm_storeu_si128(out + 12, x3);
	_mm_storeu_si128(out + 1, x4);
	_mm_storeu_si128(out + 9, x5);
	_mm_storeu_si128(out + 5, x6);
	_mm_storeu_si128(out + 13, x7);
	_mm_storeu_si128(out + 2, x8);
	_mm_storeu_si128(out + 10, x9);
	_mm_storeu_si128(out + 6, xa);
	_mm_storeu_si128(out + 14, xb);
	_mm_storeu_si128(out + 3, xc);
	_mm_storeu_si128(out + 11, xd);
	_mm_storeu_si128(out + 7, xe);
	_mm_storeu_si128(out + 15, xf);

	this->m[3] = _mm_add_epi32(this->m[3], _mm_set_epi32(0, 0, 0, 4));
}

METHOD(chapoly_drv_t, set_key, bool,
	private_chapoly_drv_ssse3_t *this, u_char *constant, u_char *key,
	u_char *salt)
{
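	/* state layout: row 0 holds the 128-bit constant, rows 1-2 the 256-bit
	 * key, row 3 the 32-bit block counter followed by the 96-bit nonce;
	 * the salt forms the first nonce word, init() adds the IV */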
	this->m[0] = _mm_loadu_si128((__m128i*)constant);
	this->m[1] = _mm_loadu_si128((__m128i*)key + 0);
	this->m[2] = _mm_loadu_si128((__m128i*)key + 1);
	this->m[3] = _mm_set_epi32(0, 0, ru32(salt), 0);

	return TRUE;
}

METHOD(chapoly_drv_t, init, bool,
	private_chapoly_drv_ssse3_t *this, u_char *iv)
{
	u_char key[CHACHA_BLOCK_SIZE];

	this->m[3] = _mm_or_si128(
					_mm_set_epi32(ru32(iv + 4), ru32(iv + 0), 0, 0),
					_mm_and_si128(this->m[3], _mm_set_epi32(0, 0, ~0, 0)));

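	/* derive the Poly1305 one-time key from the first ChaCha20 keystream
	 * block of this nonce */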
	memset(key, 0, CHACHA_BLOCK_SIZE);
	chacha_block_xor(this, key);

	/* clamp r &= 0x0ffffffc0ffffffc0ffffffc0fffffff while splitting it
	 * into five 26-bit limbs; the masks apply both at once */
	this->r[0] = (ru32(key + 0) >> 0) & 0x3ffffff;
	this->r[1] = (ru32(key + 3) >> 2) & 0x3ffff03;
	this->r[2] = (ru32(key + 6) >> 4) & 0x3ffc0ff;
	this->r[3] = (ru32(key + 9) >> 6) & 0x3f03fff;
	this->r[4] = (ru32(key + 12) >> 8) & 0x00fffff;

	/* h = 0 */
	memwipe(this->h, sizeof(this->h));

	this->s[0] = ru32(key + 16);
	this->s[1] = ru32(key + 20);
	this->s[2] = ru32(key + 24);
	this->s[3] = ru32(key + 28);

	return TRUE;
}

/**
 * r[127:64] = h[95:64] * a, r[63:0] = h[31:0] * b
 */
static inline __m128i mul2(__m128i h, u_int32_t a, u_int32_t b)
{
	return _mm_mul_epu32(h, _mm_set_epi32(0, a, 0, b));
}

/**
 * c = a[127:64] + a[63:0] + b[127:64] + b[63:0]
 * z = x[127:64] + x[63:0] + y[127:64] + y[63:0]
 */
static inline void sum2(__m128i a, __m128i b, __m128i x, __m128i y,
						u_int64_t *c, u_int64_t *z)
{
	__m128i r, s;

	a = _mm_add_epi64(a, b);
	x = _mm_add_epi64(x, y);
	r = _mm_unpacklo_epi64(x, a);
	s = _mm_unpackhi_epi64(x, a);
	r = _mm_add_epi64(r, s);

	_mm_storel_epi64((__m128i*)z, r);
	_mm_storel_epi64((__m128i*)c, _mm_srli_si128(r, 8));
}

METHOD(chapoly_drv_t, poly, bool,
	private_chapoly_drv_ssse3_t *this, u_char *data, u_int blocks)
{
	u_int32_t r0, r1, r2, r3, r4;
	u_int32_t s1, s2, s3, s4;
	u_int32_t h0, h1, h2, h3, h4;
	u_int64_t d0, d1, d2, d3, d4;
	__m128i h01, h23, h44;
	__m128i x0, x1, y0, y1, z0;
	u_int32_t t0, t1;

	r0 = this->r[0];
	r1 = this->r[1];
	r2 = this->r[2];
	r3 = this->r[3];
	r4 = this->r[4];

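	/* s = 5 * r folds the mod 2^130-5 reduction into the multiplication:
	 * partial products past 2^130 re-enter at the bottom times five */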
	s1 = r1 * 5;
	s2 = r2 * 5;
	s3 = r3 * 5;
	s4 = r4 * 5;

	h0 = this->h[0];
	h1 = this->h[1];
	h2 = this->h[2];
	h3 = this->h[3];
	h4 = this->h[4];

	while (blocks--)
	{
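		/* pack the accumulator limbs pairwise, so mul2() can do two
		 * 32x32->64 bit multiplications per instruction */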
		h01 = _mm_set_epi32(0, h0, 0, h1);
		h23 = _mm_set_epi32(0, h2, 0, h3);
		h44 = _mm_set_epi32(0, h4, 0, h4);

		/* h += m[i] */
		t0 = (ru32(data + 0) >> 0) & 0x3ffffff;
		t1 = (ru32(data + 3) >> 2) & 0x3ffffff;
		h01 = _mm_add_epi32(h01, _mm_set_epi32(0, t0, 0, t1));
		t0 = (ru32(data + 6) >> 4) & 0x3ffffff;
		t1 = (ru32(data + 9) >> 6) & 0x3ffffff;
		h23 = _mm_add_epi32(h23, _mm_set_epi32(0, t0, 0, t1));
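		/* the top limb additionally gets the 2^128 padding bit */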
		t0 = (ru32(data + 12) >> 8) | (1 << 24);
		h44 = _mm_add_epi32(h44, _mm_set_epi32(0, t0, 0, t0));

		/* h *= r */
		x0 = mul2(h01, r0, s4);
		x1 = mul2(h01, r1, r0);
		y0 = mul2(h23, s3, s2);
		y1 = mul2(h23, s4, s3);
		z0 = mul2(h44, s1, s2);
		y0 = _mm_add_epi64(y0, _mm_srli_si128(z0, 8));
		y1 = _mm_add_epi64(y1, _mm_slli_si128(z0, 8));
		sum2(x0, y0, x1, y1, &d0, &d1);

		x0 = mul2(h01, r2, r1);
		x1 = mul2(h01, r3, r2);
		y0 = mul2(h23, r0, s4);
		y1 = mul2(h23, r1, r0);
		z0 = mul2(h44, s3, s4);
		y0 = _mm_add_epi64(y0, _mm_srli_si128(z0, 8));
		y1 = _mm_add_epi64(y1, _mm_slli_si128(z0, 8));
		sum2(x0, y0, x1, y1, &d2, &d3);

		x0 = mul2(h01, r4, r3);
		y0 = mul2(h23, r2, r1);
		z0 = mul2(h44, r0, 0);
		y0 = _mm_add_epi64(y0, z0);
		x0 = _mm_add_epi64(x0, y0);
		x0 = _mm_add_epi64(x0, _mm_srli_si128(x0, 8));
		_mm_storel_epi64((__m128i*)&d4, x0);

		/* (partial) h %= p */
		d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
		d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
		d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
		d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
		h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
		h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;

		data += POLY_BLOCK_SIZE;
	}

	this->h[0] = h0;
	this->h[1] = h1;
	this->h[2] = h2;
	this->h[3] = h3;
	this->h[4] = h4;

	return TRUE;
}

METHOD(chapoly_drv_t, chacha, bool,
	private_chapoly_drv_ssse3_t *this, u_char *stream)
{
	memset(stream, 0, CHACHA_BLOCK_SIZE);
	chacha_block_xor(this, stream);

	return TRUE;
}

METHOD(chapoly_drv_t, encrypt, bool,
	private_chapoly_drv_ssse3_t *this, u_char *data, u_int blocks)
{
	u_int i;

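	/* four 64-byte ChaCha20 blocks cover sixteen 16-byte Poly1305 blocks;
	 * when encrypting, the MAC is computed over the fresh ciphertext */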
	while (blocks >= 4)
	{
		chacha_4block_xor(this, data);
		poly(this, data, 16);
		data += CHACHA_BLOCK_SIZE * 4;
		blocks -= 4;
	}
	for (i = 0; i < blocks; i++)
	{
		chacha_block_xor(this, data);
		poly(this, data, 4);
		data += CHACHA_BLOCK_SIZE;
	}
	return TRUE;
}

METHOD(chapoly_drv_t, decrypt, bool,
	private_chapoly_drv_ssse3_t *this, u_char *data, u_int blocks)
{
	u_int i;

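	/* when decrypting, authenticate the ciphertext before it is
	 * overwritten with the plaintext */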
	while (blocks >= 4)
	{
		poly(this, data, 16);
		chacha_4block_xor(this, data);
		data += CHACHA_BLOCK_SIZE * 4;
		blocks -= 4;
	}
	for (i = 0; i < blocks; i++)
	{
		poly(this, data, 4);
		chacha_block_xor(this, data);
		data += CHACHA_BLOCK_SIZE;
	}
	return TRUE;
}

METHOD(chapoly_drv_t, finish, bool,
	private_chapoly_drv_ssse3_t *this, u_char *mac)
{
	u_int32_t h0, h1, h2, h3, h4;
	u_int32_t g0, g1, g2, g3, g4;
	u_int32_t mask;
	u_int64_t f = 0;

	/* fully carry h */
	h0 = this->h[0];
	h1 = this->h[1];
	h2 = this->h[2];
	h3 = this->h[3];
	h4 = this->h[4];

	h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
	h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
	h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
	h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
	h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;

	/* compute h + -p */
	g0 = h0 + 5;
	g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
	g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
	g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
	g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;

	/* select h if h < p, or h + -p if h >= p */
	mask = (g4 >> ((sizeof(u_int32_t) * 8) - 1)) - 1;
	g0 &= mask;
	g1 &= mask;
	g2 &= mask;
	g3 &= mask;
	g4 &= mask;
	mask = ~mask;
	h0 = (h0 & mask) | g0;
	h1 = (h1 & mask) | g1;
	h2 = (h2 & mask) | g2;
	h3 = (h3 & mask) | g3;
	h4 = (h4 & mask) | g4;

	/* h = h % (2^128) */
	h0 = (h0 >> 0) | (h1 << 26);
	h1 = (h1 >> 6) | (h2 << 20);
	h2 = (h2 >> 12) | (h3 << 14);
	h3 = (h3 >> 18) | (h4 << 8);

	/* mac = (h + s) % (2^128) */
	f = (f >> 32) + h0 + this->s[0]; wu32(mac + 0, f);
	f = (f >> 32) + h1 + this->s[1]; wu32(mac + 4, f);
	f = (f >> 32) + h2 + this->s[2]; wu32(mac + 8, f);
	f = (f >> 32) + h3 + this->s[3]; wu32(mac + 12, f);

	return TRUE;
}

METHOD(chapoly_drv_t, destroy, void,
	private_chapoly_drv_ssse3_t *this)
{
	memwipe(this->m, sizeof(this->m));
	memwipe(this->h, sizeof(this->h));
	memwipe(this->r, sizeof(this->r));
	memwipe(this->s, sizeof(this->s));
	free_align(this);
}

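/*
 * Typical call sequence for a driver instance (a sketch only; chapoly_drv.h
 * defines the authoritative interface, and callers may interleave poly()
 * calls for associated data):
 *   drv->set_key(drv, constant, key, salt);
 *   drv->init(drv, iv);
 *   drv->encrypt(drv, data, blocks);
 *   drv->finish(drv, mac);
 */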
/**
 * See header
 */
chapoly_drv_t *chapoly_drv_ssse3_create()
{
	private_chapoly_drv_ssse3_t *this;

	if (!cpu_feature_available(CPU_FEATURE_SSSE3))
	{
		return NULL;
	}

	INIT_ALIGN(this, sizeof(__m128i),
		.public = {
			.set_key = _set_key,
			.init = _init,
			.poly = _poly,
			.chacha = _chacha,
			.encrypt = _encrypt,
			.decrypt = _decrypt,
			.finish = _finish,
			.destroy = _destroy,
		},
	);

	return &this->public;
}

#else /* !__SSSE3__ */

chapoly_drv_t *chapoly_drv_ssse3_create()
{
	return NULL;
}

#endif /* !__SSSE3__ */