chapoly: Process two Poly1305 blocks in parallel in SSSE3 driver
[strongswan.git] / src / libstrongswan / plugins / chapoly / chapoly_drv_ssse3.c
1 /*
2 * Copyright (C) 2015 Martin Willi
3 * Copyright (C) 2015 revosec AG
4 *
5 * Based on public domain code by Andrew Moon and Daniel J. Bernstein.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the
9 * Free Software Foundation; either version 2 of the License, or (at your
10 * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 * for more details.
16 */
17
18 #include "chapoly_drv_ssse3.h"
19
20 #ifdef __SSSE3__
21
22 #include <utils/cpu_feature.h>
23
24 #include <tmmintrin.h>
25
26 #define CHACHA_DOUBLEROUNDS 10
27
typedef struct private_chapoly_drv_ssse3_t private_chapoly_drv_ssse3_t;

/**
 * Private data of an chapoly_drv_ssse3_t object.
 */
struct private_chapoly_drv_ssse3_t {

	/**
	 * Public chapoly_drv_ssse3_t interface.
	 */
	chapoly_drv_t public;

	/**
	 * ChaCha20 state matrix, as four 128-bit vectors holding the rows:
	 * m[0] = constant, m[1..2] = key, m[3] = counter/salt/IV
	 */
	__m128i m[4];

	/**
	 * Poly1305 update key r, stored as five 26-bit limbs
	 */
	u_int32_t r[5];

	/**
	 * Poly1305 update key r^2, precomputed by make_u() so poly2() can
	 * process two blocks per iteration, stored as five 26-bit limbs
	 */
	u_int32_t u[5];

	/**
	 * Poly1305 accumulator state h, as five 26-bit limbs
	 */
	u_int32_t h[5];

	/**
	 * Poly1305 finalize key s, added to h in finish()
	 */
	u_int32_t s[4];
};
65
/**
 * Load a 32-bit word from a possibly unaligned address
 */
static inline u_int32_t ru32(void *p)
{
	u_int32_t value;

	memcpy(&value, p, sizeof(value));
	return value;
}
76
77 /**
78 * Write a 32-bit word to an unaligned address
79 */
80 static inline void wu32(void *p, u_int32_t v)
81 {
82 memcpy(p, &v, sizeof(v));
83 }
84
85 /**
86 * Shift a 64-bit unsigned integer v right by n bits, clamp to 32 bit
87 */
88 static inline u_int32_t sr(u_int64_t v, u_char n)
89 {
90 return v >> n;
91 }
92
93 /**
94 * AND two values, using a native integer size >= sizeof(u_int32_t)
95 */
96 static inline u_long and(u_long v, u_long mask)
97 {
98 return v & mask;
99 }
100
/**
 * r = shuffle(a ^ b, s)
 *
 * With the r8/r16 byte-shuffle masks used by the callers, the single
 * SSSE3 pshufb implements the ChaCha20 rotations by 8 and 16 bits of
 * each 32-bit word.
 */
static inline __m128i sfflxor32(__m128i a, __m128i b, __m128i s)
{
	return _mm_shuffle_epi8(_mm_xor_si128(a, b), s);
}
108
/**
 * r = rotl32(a ^ b, r)
 *
 * Rotation is built from two shifts and an OR; r is expected in 1..31
 * (callers use 7 and 12) for the shift pair to form a rotation.
 */
static inline __m128i rotlxor32(__m128i a, __m128i b, u_char r)
{
	a = _mm_xor_si128(a, b);
	return _mm_or_si128(_mm_slli_epi32(a, r), _mm_srli_epi32(a, 32 - r));
}
117
/**
 * XOR a Chacha20 keystream block into data, increment counter
 *
 * Each state row lives in one 128-bit vector, so a quarterround is
 * applied to all four columns at once; row shuffles line up the
 * diagonals between the two half-rounds.
 */
static void chacha_block_xor(private_chapoly_drv_ssse3_t *this, void *data)
{
	__m128i x0, x1, x2, x3, r8, r16, *out = data;
	u_int i;

	/* byte-shuffle masks rotating each 32-bit word left by 8 / 16 bits */
	r8 = _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
	r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);

	/* work on a copy; the original state is added back after the rounds */
	x0 = this->m[0];
	x1 = this->m[1];
	x2 = this->m[2];
	x3 = this->m[3];

	for (i = 0 ; i < CHACHA_DOUBLEROUNDS; i++)
	{
		/* column round: quarterround on the four columns in parallel */
		x0 = _mm_add_epi32(x0, x1);
		x3 = sfflxor32(x3, x0, r16);

		x2 = _mm_add_epi32(x2, x3);
		x1 = rotlxor32(x1, x2, 12);

		x0 = _mm_add_epi32(x0, x1);
		x3 = sfflxor32(x3, x0, r8);

		x2 = _mm_add_epi32(x2, x3);
		x1 = rotlxor32(x1, x2, 7);

		/* rotate rows so the diagonals become columns */
		x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1));
		x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
		x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3));

		/* diagonal round: same quarterround on the shuffled state */
		x0 = _mm_add_epi32(x0, x1);
		x3 = sfflxor32(x3, x0, r16);

		x2 = _mm_add_epi32(x2, x3);
		x1 = rotlxor32(x1, x2, 12);

		x0 = _mm_add_epi32(x0, x1);
		x3 = sfflxor32(x3, x0, r8);

		x2 = _mm_add_epi32(x2, x3);
		x1 = rotlxor32(x1, x2, 7);

		/* undo the diagonalization */
		x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3));
		x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
		x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1));
	}

	/* add initial state, then XOR the keystream into the data in place */
	x0 = _mm_add_epi32(x0, this->m[0]);
	x1 = _mm_add_epi32(x1, this->m[1]);
	x2 = _mm_add_epi32(x2, this->m[2]);
	x3 = _mm_add_epi32(x3, this->m[3]);
	x0 = _mm_xor_si128(x0, _mm_loadu_si128(out + 0));
	x1 = _mm_xor_si128(x1, _mm_loadu_si128(out + 1));
	x2 = _mm_xor_si128(x2, _mm_loadu_si128(out + 2));
	x3 = _mm_xor_si128(x3, _mm_loadu_si128(out + 3));
	_mm_storeu_si128(out + 0, x0);
	_mm_storeu_si128(out + 1, x1);
	_mm_storeu_si128(out + 2, x2);
	_mm_storeu_si128(out + 3, x3);

	/* increment the block counter in the lowest word of m[3] */
	this->m[3] = _mm_add_epi32(this->m[3], _mm_set_epi32(0, 0, 0, 1));
}
184
/**
 * XOR four Chacha20 keystream blocks into data, increment counter
 *
 * Word-sliced layout: each of the 16 vectors x0..xf holds the same state
 * word of four consecutive blocks, so the quarterrounds of all four
 * blocks run in parallel and the diagonal round needs no shuffles —
 * it just picks different register combinations.
 */
static void chacha_4block_xor(private_chapoly_drv_ssse3_t *this, void *data)
{
	__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc, xd, xe, xf;
	__m128i r8, r16, ctrinc, t, *out = data;
	/* scalar view of the state to broadcast individual words */
	u_int32_t *m = (u_int32_t*)this->m;
	u_int i;

	/* byte-shuffle masks rotating each 32-bit word left by 8 / 16 bits */
	r8 = _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
	r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
	ctrinc = _mm_set_epi32(3, 2, 1, 0);

	/* broadcast every state word into all four lanes of its vector */
	x0 = _mm_set1_epi32(m[ 0]);
	x1 = _mm_set1_epi32(m[ 1]);
	x2 = _mm_set1_epi32(m[ 2]);
	x3 = _mm_set1_epi32(m[ 3]);
	x4 = _mm_set1_epi32(m[ 4]);
	x5 = _mm_set1_epi32(m[ 5]);
	x6 = _mm_set1_epi32(m[ 6]);
	x7 = _mm_set1_epi32(m[ 7]);
	x8 = _mm_set1_epi32(m[ 8]);
	x9 = _mm_set1_epi32(m[ 9]);
	xa = _mm_set1_epi32(m[10]);
	xb = _mm_set1_epi32(m[11]);
	xc = _mm_set1_epi32(m[12]);
	xd = _mm_set1_epi32(m[13]);
	xe = _mm_set1_epi32(m[14]);
	xf = _mm_set1_epi32(m[15]);

	/* give the four blocks consecutive counter values ctr+0..ctr+3 */
	xc = _mm_add_epi32(xc, ctrinc);

	for (i = 0 ; i < CHACHA_DOUBLEROUNDS; i++)
	{
		/* column round over the four column quadruples */
		x0 = _mm_add_epi32(x0, x4); xc = sfflxor32(xc, x0, r16);
		x1 = _mm_add_epi32(x1, x5); xd = sfflxor32(xd, x1, r16);
		x2 = _mm_add_epi32(x2, x6); xe = sfflxor32(xe, x2, r16);
		x3 = _mm_add_epi32(x3, x7); xf = sfflxor32(xf, x3, r16);

		x8 = _mm_add_epi32(x8, xc); x4 = rotlxor32(x4, x8, 12);
		x9 = _mm_add_epi32(x9, xd); x5 = rotlxor32(x5, x9, 12);
		xa = _mm_add_epi32(xa, xe); x6 = rotlxor32(x6, xa, 12);
		xb = _mm_add_epi32(xb, xf); x7 = rotlxor32(x7, xb, 12);

		x0 = _mm_add_epi32(x0, x4); xc = sfflxor32(xc, x0, r8);
		x1 = _mm_add_epi32(x1, x5); xd = sfflxor32(xd, x1, r8);
		x2 = _mm_add_epi32(x2, x6); xe = sfflxor32(xe, x2, r8);
		x3 = _mm_add_epi32(x3, x7); xf = sfflxor32(xf, x3, r8);

		x8 = _mm_add_epi32(x8, xc); x4 = rotlxor32(x4, x8, 7);
		x9 = _mm_add_epi32(x9, xd); x5 = rotlxor32(x5, x9, 7);
		xa = _mm_add_epi32(xa, xe); x6 = rotlxor32(x6, xa, 7);
		xb = _mm_add_epi32(xb, xf); x7 = rotlxor32(x7, xb, 7);

		/* diagonal round: the same operations on the diagonal quadruples */
		x0 = _mm_add_epi32(x0, x5); xf = sfflxor32(xf, x0, r16);
		x1 = _mm_add_epi32(x1, x6); xc = sfflxor32(xc, x1, r16);
		x2 = _mm_add_epi32(x2, x7); xd = sfflxor32(xd, x2, r16);
		x3 = _mm_add_epi32(x3, x4); xe = sfflxor32(xe, x3, r16);

		xa = _mm_add_epi32(xa, xf); x5 = rotlxor32(x5, xa, 12);
		xb = _mm_add_epi32(xb, xc); x6 = rotlxor32(x6, xb, 12);
		x8 = _mm_add_epi32(x8, xd); x7 = rotlxor32(x7, x8, 12);
		x9 = _mm_add_epi32(x9, xe); x4 = rotlxor32(x4, x9, 12);

		x0 = _mm_add_epi32(x0, x5); xf = sfflxor32(xf, x0, r8);
		x1 = _mm_add_epi32(x1, x6); xc = sfflxor32(xc, x1, r8);
		x2 = _mm_add_epi32(x2, x7); xd = sfflxor32(xd, x2, r8);
		x3 = _mm_add_epi32(x3, x4); xe = sfflxor32(xe, x3, r8);

		xa = _mm_add_epi32(xa, xf); x5 = rotlxor32(x5, xa, 7);
		xb = _mm_add_epi32(xb, xc); x6 = rotlxor32(x6, xb, 7);
		x8 = _mm_add_epi32(x8, xd); x7 = rotlxor32(x7, x8, 7);
		x9 = _mm_add_epi32(x9, xe); x4 = rotlxor32(x4, x9, 7);
	}

	/* add the initial state back to every block */
	x0 = _mm_add_epi32(x0, _mm_set1_epi32(m[ 0]));
	x1 = _mm_add_epi32(x1, _mm_set1_epi32(m[ 1]));
	x2 = _mm_add_epi32(x2, _mm_set1_epi32(m[ 2]));
	x3 = _mm_add_epi32(x3, _mm_set1_epi32(m[ 3]));
	x4 = _mm_add_epi32(x4, _mm_set1_epi32(m[ 4]));
	x5 = _mm_add_epi32(x5, _mm_set1_epi32(m[ 5]));
	x6 = _mm_add_epi32(x6, _mm_set1_epi32(m[ 6]));
	x7 = _mm_add_epi32(x7, _mm_set1_epi32(m[ 7]));
	x8 = _mm_add_epi32(x8, _mm_set1_epi32(m[ 8]));
	x9 = _mm_add_epi32(x9, _mm_set1_epi32(m[ 9]));
	xa = _mm_add_epi32(xa, _mm_set1_epi32(m[10]));
	xb = _mm_add_epi32(xb, _mm_set1_epi32(m[11]));
	xc = _mm_add_epi32(xc, _mm_set1_epi32(m[12]));
	xd = _mm_add_epi32(xd, _mm_set1_epi32(m[13]));
	xe = _mm_add_epi32(xe, _mm_set1_epi32(m[14]));
	xf = _mm_add_epi32(xf, _mm_set1_epi32(m[15]));

	/* re-add the per-block counter offsets; m[12] above added the base */
	xc = _mm_add_epi32(xc, ctrinc);

	/* transpose state matrix by interleaving 32-, then 64-bit words */
	t = x0; x0 = _mm_unpacklo_epi32(t, x1);
			x1 = _mm_unpackhi_epi32(t, x1);
	t = x2; x2 = _mm_unpacklo_epi32(t, x3);
			x3 = _mm_unpackhi_epi32(t, x3);
	t = x4; x4 = _mm_unpacklo_epi32(t, x5);
			x5 = _mm_unpackhi_epi32(t, x5);
	t = x6; x6 = _mm_unpacklo_epi32(t, x7);
			x7 = _mm_unpackhi_epi32(t, x7);
	t = x8; x8 = _mm_unpacklo_epi32(t, x9);
			x9 = _mm_unpackhi_epi32(t, x9);
	t = xa; xa = _mm_unpacklo_epi32(t, xb);
			xb = _mm_unpackhi_epi32(t, xb);
	t = xc; xc = _mm_unpacklo_epi32(t, xd);
			xd = _mm_unpackhi_epi32(t, xd);
	t = xe; xe = _mm_unpacklo_epi32(t, xf);
			xf = _mm_unpackhi_epi32(t, xf);

	t = x0; x0 = _mm_unpacklo_epi64(t, x2);
			x2 = _mm_unpackhi_epi64(t, x2);
	t = x1; x1 = _mm_unpacklo_epi64(t, x3);
			x3 = _mm_unpackhi_epi64(t, x3);
	t = x4; x4 = _mm_unpacklo_epi64(t, x6);
			x6 = _mm_unpackhi_epi64(t, x6);
	t = x5; x5 = _mm_unpacklo_epi64(t, x7);
			x7 = _mm_unpackhi_epi64(t, x7);
	t = x8; x8 = _mm_unpacklo_epi64(t, xa);
			xa = _mm_unpackhi_epi64(t, xa);
	t = x9; x9 = _mm_unpacklo_epi64(t, xb);
			xb = _mm_unpackhi_epi64(t, xb);
	t = xc; xc = _mm_unpacklo_epi64(t, xe);
			xe = _mm_unpackhi_epi64(t, xe);
	t = xd; xd = _mm_unpacklo_epi64(t, xf);
			xf = _mm_unpackhi_epi64(t, xf);

	/* XOR the keystream into the data; the scattered out indices place
	 * each transposed vector at its row within the right 64-byte block */
	x0 = _mm_xor_si128(_mm_loadu_si128(out +  0), x0);
	x1 = _mm_xor_si128(_mm_loadu_si128(out +  8), x1);
	x2 = _mm_xor_si128(_mm_loadu_si128(out +  4), x2);
	x3 = _mm_xor_si128(_mm_loadu_si128(out + 12), x3);
	x4 = _mm_xor_si128(_mm_loadu_si128(out +  1), x4);
	x5 = _mm_xor_si128(_mm_loadu_si128(out +  9), x5);
	x6 = _mm_xor_si128(_mm_loadu_si128(out +  5), x6);
	x7 = _mm_xor_si128(_mm_loadu_si128(out + 13), x7);
	x8 = _mm_xor_si128(_mm_loadu_si128(out +  2), x8);
	x9 = _mm_xor_si128(_mm_loadu_si128(out + 10), x9);
	xa = _mm_xor_si128(_mm_loadu_si128(out +  6), xa);
	xb = _mm_xor_si128(_mm_loadu_si128(out + 14), xb);
	xc = _mm_xor_si128(_mm_loadu_si128(out +  3), xc);
	xd = _mm_xor_si128(_mm_loadu_si128(out + 11), xd);
	xe = _mm_xor_si128(_mm_loadu_si128(out +  7), xe);
	xf = _mm_xor_si128(_mm_loadu_si128(out + 15), xf);

	_mm_storeu_si128(out +  0, x0);
	_mm_storeu_si128(out +  8, x1);
	_mm_storeu_si128(out +  4, x2);
	_mm_storeu_si128(out + 12, x3);
	_mm_storeu_si128(out +  1, x4);
	_mm_storeu_si128(out +  9, x5);
	_mm_storeu_si128(out +  5, x6);
	_mm_storeu_si128(out + 13, x7);
	_mm_storeu_si128(out +  2, x8);
	_mm_storeu_si128(out + 10, x9);
	_mm_storeu_si128(out +  6, xa);
	_mm_storeu_si128(out + 14, xb);
	_mm_storeu_si128(out +  3, xc);
	_mm_storeu_si128(out + 11, xd);
	_mm_storeu_si128(out +  7, xe);
	_mm_storeu_si128(out + 15, xf);

	/* four blocks consumed, advance the counter accordingly */
	this->m[3] = _mm_add_epi32(this->m[3], _mm_set_epi32(0, 0, 0, 4));
}
351
METHOD(chapoly_drv_t, set_key, bool,
	private_chapoly_drv_ssse3_t *this, u_char *constant, u_char *key,
	u_char *salt)
{
	/* lay out the ChaCha20 state rows:
	 *   m[0] = 16 byte constant, m[1..2] = 32 byte key,
	 *   m[3] = counter (0) | salt | IV words (IV is set later in init()) */
	this->m[0] = _mm_loadu_si128((__m128i*)constant);
	this->m[1] = _mm_loadu_si128((__m128i*)key + 0);
	this->m[2] = _mm_loadu_si128((__m128i*)key + 1);
	this->m[3] = _mm_set_epi32(0, 0, ru32(salt), 0);

	return TRUE;
}
363
/**
 * r[127:64] = h[95:64] * a, r[63:0] = h[31:0] * b
 *
 * _mm_mul_epu32 multiplies the low 32 bits of each 64-bit lane, giving
 * two independent full 64-bit products in a single instruction.
 */
static inline __m128i mul2(__m128i h, u_int32_t a, u_int32_t b)
{
	return _mm_mul_epu32(h, _mm_set_epi32(0, a, 0, b));
}
371
/**
 * c = a[127:64] + a[63:0] + b[127:64] + b[63:0]
 * z = x[127:64] + x[63:0] + y[127:64] + y[63:0]
 *
 * Computes two horizontal sums at once and writes each 64-bit result
 * through the given pointers.
 */
static inline void sum2(__m128i a, __m128i b, __m128i x, __m128i y,
						u_int64_t *c, u_int64_t *z)
{
	__m128i r, s;

	a = _mm_add_epi64(a, b);
	x = _mm_add_epi64(x, y);
	/* interleave lanes so each sum's two halves end up in one register */
	r = _mm_unpacklo_epi64(x, a);
	s = _mm_unpackhi_epi64(x, a);
	r = _mm_add_epi64(r, s);

	/* low lane -> *z, high lane -> *c */
	_mm_storel_epi64((__m128i*)z, r);
	_mm_storel_epi64((__m128i*)c, _mm_srli_si128(r, 8));
}
390
/**
 * r = a[127:64] + b[127:64] + c[127:64] + d[127:64] + e[127:64]
 *   + a[63:0] + b[63:0] + c[63:0] + d[63:0] + e[63:0]
 *
 * Horizontal sum over both 64-bit lanes of five vectors, used to
 * accumulate the partial products of one Poly1305 result limb.
 */
static inline u_int64_t sum5(__m128i a, __m128i b, __m128i c,
							 __m128i d, __m128i e)
{
	u_int64_t r;

	/* tree-add the five vectors, then fold high lane onto low lane */
	a = _mm_add_epi64(a, b);
	c = _mm_add_epi64(c, d);
	a = _mm_add_epi64(a, e);
	a = _mm_add_epi64(a, c);

	a = _mm_add_epi64(a, _mm_srli_si128(a, 8));
	_mm_storel_epi64((__m128i*)&r, a);

	return r;
}
410
/**
 * Make second Poly1305 key u = r^2
 *
 * Squares r with the same 26-bit limb schoolbook multiplication used in
 * poly1(), two partial products per mul2() call. The s[i] = r[i] * 5
 * factors implement the 2^130 = 5 (mod p) wrap of high limbs.
 */
static void make_u(private_chapoly_drv_ssse3_t *this)
{
	__m128i r01, r23, r44, x0, x1, y0, y1, z0;
	u_int32_t r0, r1, r2, r3, r4;
	u_int32_t u0, u1, u2, u3, u4;
	u_int32_t s1, s2, s3, s4;
	u_int64_t d0, d1, d2, d3, d4;

	r0 = this->r[0];
	r1 = this->r[1];
	r2 = this->r[2];
	r3 = this->r[3];
	r4 = this->r[4];

	/* pre-multiplied limbs for the modular wrap-around terms */
	s1 = r1 * 5;
	s2 = r2 * 5;
	s3 = r3 * 5;
	s4 = r4 * 5;

	/* pack limb pairs for two-at-a-time multiplication */
	r01 = _mm_set_epi32(0, r0, 0, r1);
	r23 = _mm_set_epi32(0, r2, 0, r3);
	r44 = _mm_set_epi32(0, r4, 0, r4);

	/* u = r^2 */
	/* partial products for result limbs d0 and d1 */
	x0 = mul2(r01, r0, s4);
	x1 = mul2(r01, r1, r0);
	y0 = mul2(r23, s3, s2);
	y1 = mul2(r23, s4, s3);
	z0 = mul2(r44, s1, s2);
	y0 = _mm_add_epi64(y0, _mm_srli_si128(z0, 8));
	y1 = _mm_add_epi64(y1, _mm_slli_si128(z0, 8));
	sum2(x0, y0, x1, y1, &d0, &d1);

	/* partial products for result limbs d2 and d3 */
	x0 = mul2(r01, r2, r1);
	x1 = mul2(r01, r3, r2);
	y0 = mul2(r23, r0, s4);
	y1 = mul2(r23, r1, r0);
	z0 = mul2(r44, s3, s4);
	y0 = _mm_add_epi64(y0, _mm_srli_si128(z0, 8));
	y1 = _mm_add_epi64(y1, _mm_slli_si128(z0, 8));
	sum2(x0, y0, x1, y1, &d2, &d3);

	/* partial products for result limb d4 */
	x0 = mul2(r01, r4, r3);
	y0 = mul2(r23, r2, r1);
	z0 = mul2(r44, r0, 0);
	y0 = _mm_add_epi64(y0, z0);
	x0 = _mm_add_epi64(x0, y0);
	x0 = _mm_add_epi64(x0, _mm_srli_si128(x0, 8));
	_mm_storel_epi64((__m128i*)&d4, x0);

	/* (partial) r %= p: propagate carries, wrap the top carry times 5 */
	d1 += sr(d0, 26);     u0 = and(d0, 0x3ffffff);
	d2 += sr(d1, 26);     u1 = and(d1, 0x3ffffff);
	d3 += sr(d2, 26);     u2 = and(d2, 0x3ffffff);
	d4 += sr(d3, 26);     u3 = and(d3, 0x3ffffff);
	u0 += sr(d4, 26) * 5; u4 = and(d4, 0x3ffffff);
	u1 += u0 >> 26;       u0 &= 0x3ffffff;

	this->u[0] = u0;
	this->u[1] = u1;
	this->u[2] = u2;
	this->u[3] = u3;
	this->u[4] = u4;
}
478
METHOD(chapoly_drv_t, init, bool,
	private_chapoly_drv_ssse3_t *this, u_char *iv)
{
	u_char key[CHACHA_BLOCK_SIZE];

	/* place the 64-bit IV in the two upper words of m[3], keep the salt
	 * word and reset the block counter (lowest word) to zero */
	this->m[3] = _mm_or_si128(
					_mm_set_epi32(ru32(iv + 4), ru32(iv + 0), 0, 0),
					_mm_and_si128(this->m[3], _mm_set_epi32(0, 0, ~0, 0)));

	/* derive the one-time Poly1305 key from the first keystream block */
	memset(key, 0, CHACHA_BLOCK_SIZE);
	chacha_block_xor(this, key);

	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff; the masks below combine
	 * this clamp with splitting r into five 26-bit limbs */
	this->r[0] = (ru32(key +  0) >> 0) & 0x3ffffff;
	this->r[1] = (ru32(key +  3) >> 2) & 0x3ffff03;
	this->r[2] = (ru32(key +  6) >> 4) & 0x3ffc0ff;
	this->r[3] = (ru32(key +  9) >> 6) & 0x3f03fff;
	this->r[4] = (ru32(key + 12) >> 8) & 0x00fffff;

	/* precompute u = r^2 for the two-block update in poly2() */
	make_u(this);

	/* h = 0 */
	memwipe(this->h, sizeof(this->h));

	/* s is the second half of the derived Poly1305 key */
	this->s[0] = ru32(key + 16);
	this->s[1] = ru32(key + 20);
	this->s[2] = ru32(key + 24);
	this->s[3] = ru32(key + 28);

	return TRUE;
}
510
/**
 * Update Poly1305 for a multiple of two blocks
 *
 * Evaluates h = (h + c1) * r^2 + c2 * r per iteration: mul2()'s high
 * lane multiplies the h limbs by u (= r^2) while the low lane multiplies
 * the second block c by r, and sum5() adds both lanes of all five
 * partial products into one result limb.
 */
static void poly2(private_chapoly_drv_ssse3_t *this, u_char *data, u_int dblks)
{
	u_int32_t r0, r1, r2, r3, r4, u0, u1, u2, u3, u4;
	u_int32_t s1, s2, s3, s4, v1, v2, v3, v4;
	__m128i hc0, hc1, hc2, hc3, hc4;
	u_int32_t h0, h1, h2, h3, h4;
	u_int32_t c0, c1, c2, c3, c4;
	u_int64_t d0, d1, d2, d3, d4;
	u_int i;

	r0 = this->r[0];
	r1 = this->r[1];
	r2 = this->r[2];
	r3 = this->r[3];
	r4 = this->r[4];

	/* wrap-around factors for r (2^130 = 5 mod p) */
	s1 = r1 * 5;
	s2 = r2 * 5;
	s3 = r3 * 5;
	s4 = r4 * 5;

	u0 = this->u[0];
	u1 = this->u[1];
	u2 = this->u[2];
	u3 = this->u[3];
	u4 = this->u[4];

	/* wrap-around factors for u = r^2 */
	v1 = u1 * 5;
	v2 = u2 * 5;
	v3 = u3 * 5;
	v4 = u4 * 5;

	h0 = this->h[0];
	h1 = this->h[1];
	h2 = this->h[2];
	h3 = this->h[3];
	h4 = this->h[4];

	/* h = (h + c1) * r^2 + c2 * r */
	for (i = 0; i < dblks; i++)
	{
		/* h += m[i], split into 26-bit limbs with the 2^128 pad bit set */
		h0 += (ru32(data +  0) >> 0) & 0x3ffffff;
		h1 += (ru32(data +  3) >> 2) & 0x3ffffff;
		h2 += (ru32(data +  6) >> 4) & 0x3ffffff;
		h3 += (ru32(data +  9) >> 6) & 0x3ffffff;
		h4 += (ru32(data + 12) >> 8) | (1 << 24);
		data += POLY_BLOCK_SIZE;

		/* c = m[i + 1] */
		c0 = (ru32(data +  0) >> 0) & 0x3ffffff;
		c1 = (ru32(data +  3) >> 2) & 0x3ffffff;
		c2 = (ru32(data +  6) >> 4) & 0x3ffffff;
		c3 = (ru32(data +  9) >> 6) & 0x3ffffff;
		c4 = (ru32(data + 12) >> 8) | (1 << 24);
		data += POLY_BLOCK_SIZE;

		/* pair each h limb (high lane) with the matching c limb (low) */
		hc0 = _mm_set_epi32(0, h0, 0, c0);
		hc1 = _mm_set_epi32(0, h1, 0, c1);
		hc2 = _mm_set_epi32(0, h2, 0, c2);
		hc3 = _mm_set_epi32(0, h3, 0, c3);
		hc4 = _mm_set_epi32(0, h4, 0, c4);

		/* h = h * r^2 + c * r */
		d0 = sum5(mul2(hc0, u0, r0),
				  mul2(hc1, v4, s4),
				  mul2(hc2, v3, s3),
				  mul2(hc3, v2, s2),
				  mul2(hc4, v1, s1));
		d1 = sum5(mul2(hc0, u1, r1),
				  mul2(hc1, u0, r0),
				  mul2(hc2, v4, s4),
				  mul2(hc3, v3, s3),
				  mul2(hc4, v2, s2));
		d2 = sum5(mul2(hc0, u2, r2),
				  mul2(hc1, u1, r1),
				  mul2(hc2, u0, r0),
				  mul2(hc3, v4, s4),
				  mul2(hc4, v3, s3));
		d3 = sum5(mul2(hc0, u3, r3),
				  mul2(hc1, u2, r2),
				  mul2(hc2, u1, r1),
				  mul2(hc3, u0, r0),
				  mul2(hc4, v4, s4));
		d4 = sum5(mul2(hc0, u4, r4),
				  mul2(hc1, u3, r3),
				  mul2(hc2, u2, r2),
				  mul2(hc3, u1, r1),
				  mul2(hc4, u0, r0));

		/* (partial) h %= p: carry chain, wrap top carry times 5 */
		d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
		d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
		d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
		d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
		h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
		h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
	}

	this->h[0] = h0;
	this->h[1] = h1;
	this->h[2] = h2;
	this->h[3] = h3;
	this->h[4] = h4;
}
619
/**
 * Update Poly1305 for a single block
 *
 * Evaluates h = (h + m) * r, with the limbs packed pairwise into vectors
 * so mul2() produces two 64-bit partial products per call.
 */
static void poly1(private_chapoly_drv_ssse3_t *this, u_char *data)
{
	u_int32_t r0, r1, r2, r3, r4;
	u_int32_t s1, s2, s3, s4;
	u_int32_t h0, h1, h2, h3, h4;
	u_int64_t d0, d1, d2, d3, d4;
	__m128i h01, h23, h44;
	__m128i x0, x1, y0, y1, z0;
	u_int32_t t0, t1;

	r0 = this->r[0];
	r1 = this->r[1];
	r2 = this->r[2];
	r3 = this->r[3];
	r4 = this->r[4];

	/* wrap-around factors for the 2^130 = 5 (mod p) reduction */
	s1 = r1 * 5;
	s2 = r2 * 5;
	s3 = r3 * 5;
	s4 = r4 * 5;

	h0 = this->h[0];
	h1 = this->h[1];
	h2 = this->h[2];
	h3 = this->h[3];
	h4 = this->h[4];

	/* pack the accumulator limbs pairwise */
	h01 = _mm_set_epi32(0, h0, 0, h1);
	h23 = _mm_set_epi32(0, h2, 0, h3);
	h44 = _mm_set_epi32(0, h4, 0, h4);

	/* h += m[i], split into 26-bit limbs with the 2^128 pad bit set */
	t0 = (ru32(data +  0) >> 0) & 0x3ffffff;
	t1 = (ru32(data +  3) >> 2) & 0x3ffffff;
	h01 = _mm_add_epi32(h01, _mm_set_epi32(0, t0, 0, t1));
	t0 = (ru32(data +  6) >> 4) & 0x3ffffff;
	t1 = (ru32(data +  9) >> 6) & 0x3ffffff;
	h23 = _mm_add_epi32(h23, _mm_set_epi32(0, t0, 0, t1));
	t0 = (ru32(data + 12) >> 8) | (1 << 24);
	h44 = _mm_add_epi32(h44, _mm_set_epi32(0, t0, 0, t0));

	/* h *= r */
	/* partial products for result limbs d0 and d1 */
	x0 = mul2(h01, r0, s4);
	x1 = mul2(h01, r1, r0);
	y0 = mul2(h23, s3, s2);
	y1 = mul2(h23, s4, s3);
	z0 = mul2(h44, s1, s2);
	y0 = _mm_add_epi64(y0, _mm_srli_si128(z0, 8));
	y1 = _mm_add_epi64(y1, _mm_slli_si128(z0, 8));
	sum2(x0, y0, x1, y1, &d0, &d1);

	/* partial products for result limbs d2 and d3 */
	x0 = mul2(h01, r2, r1);
	x1 = mul2(h01, r3, r2);
	y0 = mul2(h23, r0, s4);
	y1 = mul2(h23, r1, r0);
	z0 = mul2(h44, s3, s4);
	y0 = _mm_add_epi64(y0, _mm_srli_si128(z0, 8));
	y1 = _mm_add_epi64(y1, _mm_slli_si128(z0, 8));
	sum2(x0, y0, x1, y1, &d2, &d3);

	/* partial products for result limb d4 */
	x0 = mul2(h01, r4, r3);
	y0 = mul2(h23, r2, r1);
	z0 = mul2(h44, r0, 0);
	y0 = _mm_add_epi64(y0, z0);
	x0 = _mm_add_epi64(x0, y0);
	x0 = _mm_add_epi64(x0, _mm_srli_si128(x0, 8));
	_mm_storel_epi64((__m128i*)&d4, x0);

	/* (partial) h %= p: carry chain, wrap top carry times 5 */
	d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
	d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
	d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
	d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
	h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
	h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;

	this->h[0] = h0;
	this->h[1] = h1;
	this->h[2] = h2;
	this->h[3] = h3;
	this->h[4] = h4;
}
705
706 METHOD(chapoly_drv_t, poly, bool,
707 private_chapoly_drv_ssse3_t *this, u_char *data, u_int blocks)
708 {
709 poly2(this, data, blocks / 2);
710 if (blocks-- % 2)
711 {
712 poly1(this, data + POLY_BLOCK_SIZE * blocks);
713 }
714 return TRUE;
715 }
716
717 METHOD(chapoly_drv_t, chacha, bool,
718 private_chapoly_drv_ssse3_t *this, u_char *stream)
719 {
720 memset(stream, 0, CHACHA_BLOCK_SIZE);
721 chacha_block_xor(this, stream);
722
723 return TRUE;
724 }
725
726 METHOD(chapoly_drv_t, encrypt, bool,
727 private_chapoly_drv_ssse3_t *this, u_char *data, u_int blocks)
728 {
729 while (blocks >= 4)
730 {
731 chacha_4block_xor(this, data);
732 poly2(this, data, 8);
733 data += CHACHA_BLOCK_SIZE * 4;
734 blocks -= 4;
735 }
736 while (blocks--)
737 {
738 chacha_block_xor(this, data);
739 poly2(this, data, 2);
740 data += CHACHA_BLOCK_SIZE;
741 }
742 return TRUE;
743 }
744
745 METHOD(chapoly_drv_t, decrypt, bool,
746 private_chapoly_drv_ssse3_t *this, u_char *data, u_int blocks)
747 {
748 while (blocks >= 4)
749 {
750 poly2(this, data, 8);
751 chacha_4block_xor(this, data);
752 data += CHACHA_BLOCK_SIZE * 4;
753 blocks -= 4;
754 }
755 while (blocks--)
756 {
757 poly2(this, data, 2);
758 chacha_block_xor(this, data);
759 data += CHACHA_BLOCK_SIZE;
760 }
761 return TRUE;
762 }
763
METHOD(chapoly_drv_t, finish, bool,
	private_chapoly_drv_ssse3_t *this, u_char *mac)
{
	u_int32_t h0, h1, h2, h3, h4;
	u_int32_t g0, g1, g2, g3, g4;
	u_int32_t mask;
	u_int64_t f = 0;

	/* fully carry h */
	h0 = this->h[0];
	h1 = this->h[1];
	h2 = this->h[2];
	h3 = this->h[3];
	h4 = this->h[4];

	h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
	h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
	h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
	h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
	h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;

	/* compute h + -p */
	g0 = h0 + 5;
	g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
	g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
	g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
	g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;

	/* select h if h < p, or h + -p if h >= p, in constant time: the
	 * top bit of g4 is set exactly when the subtraction underflowed
	 * (h < p), making mask all-zero; otherwise mask is all-ones */
	mask = (g4 >> ((sizeof(u_int32_t) * 8) - 1)) - 1;
	g0 &= mask;
	g1 &= mask;
	g2 &= mask;
	g3 &= mask;
	g4 &= mask;
	mask = ~mask;
	h0 = (h0 & mask) | g0;
	h1 = (h1 & mask) | g1;
	h2 = (h2 & mask) | g2;
	h3 = (h3 & mask) | g3;
	h4 = (h4 & mask) | g4;

	/* h = h % (2^128): repack the 26-bit limbs into four 32-bit words */
	h0 = (h0 >>  0) | (h1 << 26);
	h1 = (h1 >>  6) | (h2 << 20);
	h2 = (h2 >> 12) | (h3 << 14);
	h3 = (h3 >> 18) | (h4 <<  8);

	/* mac = (h + s) % (2^128), carrying through f across the words */
	f = (f >> 32) + h0 + this->s[0]; wu32(mac +  0, f);
	f = (f >> 32) + h1 + this->s[1]; wu32(mac +  4, f);
	f = (f >> 32) + h2 + this->s[2]; wu32(mac +  8, f);
	f = (f >> 32) + h3 + this->s[3]; wu32(mac + 12, f);

	return TRUE;
}
820
METHOD(chapoly_drv_t, destroy, void,
	private_chapoly_drv_ssse3_t *this)
{
	/* wipe all key material and state before releasing the memory;
	 * memwipe() avoids the compiler eliding the clears */
	memwipe(this->m, sizeof(this->m));
	memwipe(this->h, sizeof(this->h));
	memwipe(this->r, sizeof(this->r));
	memwipe(this->u, sizeof(this->u));
	memwipe(this->s, sizeof(this->s));
	free_align(this);
}
831
832 /**
833 * See header
834 */
835 chapoly_drv_t *chapoly_drv_ssse3_create()
836 {
837 private_chapoly_drv_ssse3_t *this;
838
839 if (!cpu_feature_available(CPU_FEATURE_SSSE3))
840 {
841 return FALSE;
842 }
843
844 INIT_ALIGN(this, sizeof(__m128i),
845 .public = {
846 .set_key = _set_key,
847 .init = _init,
848 .poly = _poly,
849 .chacha = _chacha,
850 .encrypt = _encrypt,
851 .decrypt = _decrypt,
852 .finish = _finish,
853 .destroy = _destroy,
854 },
855 );
856
857 return &this->public;
858 }
859
860 #else /* !__SSSE3__ */
861
/**
 * See header: stub used when the plugin is built without SSSE3 support
 */
chapoly_drv_t *chapoly_drv_ssse3_create()
{
	return NULL;
}
866
867 #endif /* !__SSSE3__ */