aesni: Avoid loading AES/GHASH round keys into local variables
author: Martin Willi <martin@revosec.ch>
Tue, 14 Apr 2015 10:38:18 +0000 (12:38 +0200)
committer: Martin Willi <martin@revosec.ch>
Wed, 15 Apr 2015 11:44:40 +0000 (13:44 +0200)
The performance impact is not measurable, as the compiler loads these variables
into xmm registers in unrolled loops anyway.

However, we avoid loading these sensitive keys onto the stack, which happens
for larger key schedules where the register count is insufficient. If that key
material is never on the stack, we do not need to wipe it explicitly after
crypto operations.

src/libstrongswan/plugins/aesni/aesni_cbc.c
src/libstrongswan/plugins/aesni/aesni_ccm.c
src/libstrongswan/plugins/aesni/aesni_cmac.c
src/libstrongswan/plugins/aesni/aesni_ctr.c
src/libstrongswan/plugins/aesni/aesni_gcm.c
src/libstrongswan/plugins/aesni/aesni_xcbc.c

index f2fce0f..78ada76 100644 (file)
@@ -70,22 +70,10 @@ struct private_aesni_cbc_t {
 static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
                                                   u_char *iv, u_char *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i t, fb, *bi, *bo;
+       __m128i *ks, t, fb, *bi, *bo;
        int i;
 
-       k0 = key->schedule[0];
-       k1 = key->schedule[1];
-       k2 = key->schedule[2];
-       k3 = key->schedule[3];
-       k4 = key->schedule[4];
-       k5 = key->schedule[5];
-       k6 = key->schedule[6];
-       k7 = key->schedule[7];
-       k8 = key->schedule[8];
-       k9 = key->schedule[9];
-       k10 = key->schedule[10];
-
+       ks = key->schedule;
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
@@ -94,19 +82,19 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
        {
                t = _mm_loadu_si128(bi + i);
                fb = _mm_xor_si128(t, fb);
-               fb = _mm_xor_si128(fb, k0);
-
-               fb = _mm_aesenc_si128(fb, k1);
-               fb = _mm_aesenc_si128(fb, k2);
-               fb = _mm_aesenc_si128(fb, k3);
-               fb = _mm_aesenc_si128(fb, k4);
-               fb = _mm_aesenc_si128(fb, k5);
-               fb = _mm_aesenc_si128(fb, k6);
-               fb = _mm_aesenc_si128(fb, k7);
-               fb = _mm_aesenc_si128(fb, k8);
-               fb = _mm_aesenc_si128(fb, k9);
-
-               fb = _mm_aesenclast_si128(fb, k10);
+               fb = _mm_xor_si128(fb, ks[0]);
+
+               fb = _mm_aesenc_si128(fb, ks[1]);
+               fb = _mm_aesenc_si128(fb, ks[2]);
+               fb = _mm_aesenc_si128(fb, ks[3]);
+               fb = _mm_aesenc_si128(fb, ks[4]);
+               fb = _mm_aesenc_si128(fb, ks[5]);
+               fb = _mm_aesenc_si128(fb, ks[6]);
+               fb = _mm_aesenc_si128(fb, ks[7]);
+               fb = _mm_aesenc_si128(fb, ks[8]);
+               fb = _mm_aesenc_si128(fb, ks[9]);
+
+               fb = _mm_aesenclast_si128(fb, ks[10]);
                _mm_storeu_si128(bo + i, fb);
        }
 }
@@ -117,24 +105,12 @@ static void encrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
 static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
                                                   u_char *iv, u_char *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i last, *bi, *bo;
+       __m128i *ks, last, *bi, *bo;
        __m128i t1, t2, t3, t4;
        __m128i f1, f2, f3, f4;
        u_int i, pblocks;
 
-       k0 = key->schedule[0];
-       k1 = key->schedule[1];
-       k2 = key->schedule[2];
-       k3 = key->schedule[3];
-       k4 = key->schedule[4];
-       k5 = key->schedule[5];
-       k6 = key->schedule[6];
-       k7 = key->schedule[7];
-       k8 = key->schedule[8];
-       k9 = key->schedule[9];
-       k10 = key->schedule[10];
-
+       ks = key->schedule;
        bi = (__m128i*)in;
        bo = (__m128i*)out;
        pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@@ -153,52 +129,52 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
                f4 = t3;
                last = t4;
 
-               t1 = _mm_xor_si128(t1, k0);
-               t2 = _mm_xor_si128(t2, k0);
-               t3 = _mm_xor_si128(t3, k0);
-               t4 = _mm_xor_si128(t4, k0);
-
-               t1 = _mm_aesdec_si128(t1, k1);
-               t2 = _mm_aesdec_si128(t2, k1);
-               t3 = _mm_aesdec_si128(t3, k1);
-               t4 = _mm_aesdec_si128(t4, k1);
-               t1 = _mm_aesdec_si128(t1, k2);
-               t2 = _mm_aesdec_si128(t2, k2);
-               t3 = _mm_aesdec_si128(t3, k2);
-               t4 = _mm_aesdec_si128(t4, k2);
-               t1 = _mm_aesdec_si128(t1, k3);
-               t2 = _mm_aesdec_si128(t2, k3);
-               t3 = _mm_aesdec_si128(t3, k3);
-               t4 = _mm_aesdec_si128(t4, k3);
-               t1 = _mm_aesdec_si128(t1, k4);
-               t2 = _mm_aesdec_si128(t2, k4);
-               t3 = _mm_aesdec_si128(t3, k4);
-               t4 = _mm_aesdec_si128(t4, k4);
-               t1 = _mm_aesdec_si128(t1, k5);
-               t2 = _mm_aesdec_si128(t2, k5);
-               t3 = _mm_aesdec_si128(t3, k5);
-               t4 = _mm_aesdec_si128(t4, k5);
-               t1 = _mm_aesdec_si128(t1, k6);
-               t2 = _mm_aesdec_si128(t2, k6);
-               t3 = _mm_aesdec_si128(t3, k6);
-               t4 = _mm_aesdec_si128(t4, k6);
-               t1 = _mm_aesdec_si128(t1, k7);
-               t2 = _mm_aesdec_si128(t2, k7);
-               t3 = _mm_aesdec_si128(t3, k7);
-               t4 = _mm_aesdec_si128(t4, k7);
-               t1 = _mm_aesdec_si128(t1, k8);
-               t2 = _mm_aesdec_si128(t2, k8);
-               t3 = _mm_aesdec_si128(t3, k8);
-               t4 = _mm_aesdec_si128(t4, k8);
-               t1 = _mm_aesdec_si128(t1, k9);
-               t2 = _mm_aesdec_si128(t2, k9);
-               t3 = _mm_aesdec_si128(t3, k9);
-               t4 = _mm_aesdec_si128(t4, k9);
-
-               t1 = _mm_aesdeclast_si128(t1, k10);
-               t2 = _mm_aesdeclast_si128(t2, k10);
-               t3 = _mm_aesdeclast_si128(t3, k10);
-               t4 = _mm_aesdeclast_si128(t4, k10);
+               t1 = _mm_xor_si128(t1, ks[0]);
+               t2 = _mm_xor_si128(t2, ks[0]);
+               t3 = _mm_xor_si128(t3, ks[0]);
+               t4 = _mm_xor_si128(t4, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t2 = _mm_aesdec_si128(t2, ks[1]);
+               t3 = _mm_aesdec_si128(t3, ks[1]);
+               t4 = _mm_aesdec_si128(t4, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t2 = _mm_aesdec_si128(t2, ks[2]);
+               t3 = _mm_aesdec_si128(t3, ks[2]);
+               t4 = _mm_aesdec_si128(t4, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t2 = _mm_aesdec_si128(t2, ks[3]);
+               t3 = _mm_aesdec_si128(t3, ks[3]);
+               t4 = _mm_aesdec_si128(t4, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t2 = _mm_aesdec_si128(t2, ks[4]);
+               t3 = _mm_aesdec_si128(t3, ks[4]);
+               t4 = _mm_aesdec_si128(t4, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t2 = _mm_aesdec_si128(t2, ks[5]);
+               t3 = _mm_aesdec_si128(t3, ks[5]);
+               t4 = _mm_aesdec_si128(t4, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t2 = _mm_aesdec_si128(t2, ks[6]);
+               t3 = _mm_aesdec_si128(t3, ks[6]);
+               t4 = _mm_aesdec_si128(t4, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t2 = _mm_aesdec_si128(t2, ks[7]);
+               t3 = _mm_aesdec_si128(t3, ks[7]);
+               t4 = _mm_aesdec_si128(t4, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t2 = _mm_aesdec_si128(t2, ks[8]);
+               t3 = _mm_aesdec_si128(t3, ks[8]);
+               t4 = _mm_aesdec_si128(t4, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t2 = _mm_aesdec_si128(t2, ks[9]);
+               t3 = _mm_aesdec_si128(t3, ks[9]);
+               t4 = _mm_aesdec_si128(t4, ks[9]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[10]);
+               t2 = _mm_aesdeclast_si128(t2, ks[10]);
+               t3 = _mm_aesdeclast_si128(t3, ks[10]);
+               t4 = _mm_aesdeclast_si128(t4, ks[10]);
                t1 = _mm_xor_si128(t1, f1);
                t2 = _mm_xor_si128(t2, f2);
                t3 = _mm_xor_si128(t3, f3);
@@ -213,19 +189,19 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
        for (i = pblocks; i < blocks; i++)
        {
                last = _mm_loadu_si128(bi + i);
-               t1 = _mm_xor_si128(last, k0);
-
-               t1 = _mm_aesdec_si128(t1, k1);
-               t1 = _mm_aesdec_si128(t1, k2);
-               t1 = _mm_aesdec_si128(t1, k3);
-               t1 = _mm_aesdec_si128(t1, k4);
-               t1 = _mm_aesdec_si128(t1, k5);
-               t1 = _mm_aesdec_si128(t1, k6);
-               t1 = _mm_aesdec_si128(t1, k7);
-               t1 = _mm_aesdec_si128(t1, k8);
-               t1 = _mm_aesdec_si128(t1, k9);
-
-               t1 = _mm_aesdeclast_si128(t1, k10);
+               t1 = _mm_xor_si128(last, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[10]);
                t1 = _mm_xor_si128(t1, f1);
                _mm_storeu_si128(bo + i, t1);
                f1 = last;
@@ -238,24 +214,10 @@ static void decrypt_cbc128(aesni_key_t *key, u_int blocks, u_char *in,
 static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
                                                   u_char *iv, u_char *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-       __m128i t, fb, *bi, *bo;
+       __m128i *ks, t, fb, *bi, *bo;
        int i;
 
-       k0 = key->schedule[0];
-       k1 = key->schedule[1];
-       k2 = key->schedule[2];
-       k3 = key->schedule[3];
-       k4 = key->schedule[4];
-       k5 = key->schedule[5];
-       k6 = key->schedule[6];
-       k7 = key->schedule[7];
-       k8 = key->schedule[8];
-       k9 = key->schedule[9];
-       k10 = key->schedule[10];
-       k11 = key->schedule[11];
-       k12 = key->schedule[12];
-
+       ks = key->schedule;
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
@@ -264,21 +226,21 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
        {
                t = _mm_loadu_si128(bi + i);
                fb = _mm_xor_si128(t, fb);
-               fb = _mm_xor_si128(fb, k0);
-
-               fb = _mm_aesenc_si128(fb, k1);
-               fb = _mm_aesenc_si128(fb, k2);
-               fb = _mm_aesenc_si128(fb, k3);
-               fb = _mm_aesenc_si128(fb, k4);
-               fb = _mm_aesenc_si128(fb, k5);
-               fb = _mm_aesenc_si128(fb, k6);
-               fb = _mm_aesenc_si128(fb, k7);
-               fb = _mm_aesenc_si128(fb, k8);
-               fb = _mm_aesenc_si128(fb, k9);
-               fb = _mm_aesenc_si128(fb, k10);
-               fb = _mm_aesenc_si128(fb, k11);
-
-               fb = _mm_aesenclast_si128(fb, k12);
+               fb = _mm_xor_si128(fb, ks[0]);
+
+               fb = _mm_aesenc_si128(fb, ks[1]);
+               fb = _mm_aesenc_si128(fb, ks[2]);
+               fb = _mm_aesenc_si128(fb, ks[3]);
+               fb = _mm_aesenc_si128(fb, ks[4]);
+               fb = _mm_aesenc_si128(fb, ks[5]);
+               fb = _mm_aesenc_si128(fb, ks[6]);
+               fb = _mm_aesenc_si128(fb, ks[7]);
+               fb = _mm_aesenc_si128(fb, ks[8]);
+               fb = _mm_aesenc_si128(fb, ks[9]);
+               fb = _mm_aesenc_si128(fb, ks[10]);
+               fb = _mm_aesenc_si128(fb, ks[11]);
+
+               fb = _mm_aesenclast_si128(fb, ks[12]);
                _mm_storeu_si128(bo + i, fb);
        }
 }
@@ -289,26 +251,12 @@ static void encrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
 static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
                                                   u_char *iv, u_char *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-       __m128i last, *bi, *bo;
+       __m128i *ks, last, *bi, *bo;
        __m128i t1, t2, t3, t4;
        __m128i f1, f2, f3, f4;
        u_int i, pblocks;
 
-       k0 = key->schedule[0];
-       k1 = key->schedule[1];
-       k2 = key->schedule[2];
-       k3 = key->schedule[3];
-       k4 = key->schedule[4];
-       k5 = key->schedule[5];
-       k6 = key->schedule[6];
-       k7 = key->schedule[7];
-       k8 = key->schedule[8];
-       k9 = key->schedule[9];
-       k10 = key->schedule[10];
-       k11 = key->schedule[11];
-       k12 = key->schedule[12];
-
+       ks = key->schedule;
        bi = (__m128i*)in;
        bo = (__m128i*)out;
        pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@@ -327,60 +275,60 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
                f4 = t3;
                last = t4;
 
-               t1 = _mm_xor_si128(t1, k0);
-               t2 = _mm_xor_si128(t2, k0);
-               t3 = _mm_xor_si128(t3, k0);
-               t4 = _mm_xor_si128(t4, k0);
-
-               t1 = _mm_aesdec_si128(t1, k1);
-               t2 = _mm_aesdec_si128(t2, k1);
-               t3 = _mm_aesdec_si128(t3, k1);
-               t4 = _mm_aesdec_si128(t4, k1);
-               t1 = _mm_aesdec_si128(t1, k2);
-               t2 = _mm_aesdec_si128(t2, k2);
-               t3 = _mm_aesdec_si128(t3, k2);
-               t4 = _mm_aesdec_si128(t4, k2);
-               t1 = _mm_aesdec_si128(t1, k3);
-               t2 = _mm_aesdec_si128(t2, k3);
-               t3 = _mm_aesdec_si128(t3, k3);
-               t4 = _mm_aesdec_si128(t4, k3);
-               t1 = _mm_aesdec_si128(t1, k4);
-               t2 = _mm_aesdec_si128(t2, k4);
-               t3 = _mm_aesdec_si128(t3, k4);
-               t4 = _mm_aesdec_si128(t4, k4);
-               t1 = _mm_aesdec_si128(t1, k5);
-               t2 = _mm_aesdec_si128(t2, k5);
-               t3 = _mm_aesdec_si128(t3, k5);
-               t4 = _mm_aesdec_si128(t4, k5);
-               t1 = _mm_aesdec_si128(t1, k6);
-               t2 = _mm_aesdec_si128(t2, k6);
-               t3 = _mm_aesdec_si128(t3, k6);
-               t4 = _mm_aesdec_si128(t4, k6);
-               t1 = _mm_aesdec_si128(t1, k7);
-               t2 = _mm_aesdec_si128(t2, k7);
-               t3 = _mm_aesdec_si128(t3, k7);
-               t4 = _mm_aesdec_si128(t4, k7);
-               t1 = _mm_aesdec_si128(t1, k8);
-               t2 = _mm_aesdec_si128(t2, k8);
-               t3 = _mm_aesdec_si128(t3, k8);
-               t4 = _mm_aesdec_si128(t4, k8);
-               t1 = _mm_aesdec_si128(t1, k9);
-               t2 = _mm_aesdec_si128(t2, k9);
-               t3 = _mm_aesdec_si128(t3, k9);
-               t4 = _mm_aesdec_si128(t4, k9);
-               t1 = _mm_aesdec_si128(t1, k10);
-               t2 = _mm_aesdec_si128(t2, k10);
-               t3 = _mm_aesdec_si128(t3, k10);
-               t4 = _mm_aesdec_si128(t4, k10);
-               t1 = _mm_aesdec_si128(t1, k11);
-               t2 = _mm_aesdec_si128(t2, k11);
-               t3 = _mm_aesdec_si128(t3, k11);
-               t4 = _mm_aesdec_si128(t4, k11);
-
-               t1 = _mm_aesdeclast_si128(t1, k12);
-               t2 = _mm_aesdeclast_si128(t2, k12);
-               t3 = _mm_aesdeclast_si128(t3, k12);
-               t4 = _mm_aesdeclast_si128(t4, k12);
+               t1 = _mm_xor_si128(t1, ks[0]);
+               t2 = _mm_xor_si128(t2, ks[0]);
+               t3 = _mm_xor_si128(t3, ks[0]);
+               t4 = _mm_xor_si128(t4, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t2 = _mm_aesdec_si128(t2, ks[1]);
+               t3 = _mm_aesdec_si128(t3, ks[1]);
+               t4 = _mm_aesdec_si128(t4, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t2 = _mm_aesdec_si128(t2, ks[2]);
+               t3 = _mm_aesdec_si128(t3, ks[2]);
+               t4 = _mm_aesdec_si128(t4, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t2 = _mm_aesdec_si128(t2, ks[3]);
+               t3 = _mm_aesdec_si128(t3, ks[3]);
+               t4 = _mm_aesdec_si128(t4, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t2 = _mm_aesdec_si128(t2, ks[4]);
+               t3 = _mm_aesdec_si128(t3, ks[4]);
+               t4 = _mm_aesdec_si128(t4, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t2 = _mm_aesdec_si128(t2, ks[5]);
+               t3 = _mm_aesdec_si128(t3, ks[5]);
+               t4 = _mm_aesdec_si128(t4, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t2 = _mm_aesdec_si128(t2, ks[6]);
+               t3 = _mm_aesdec_si128(t3, ks[6]);
+               t4 = _mm_aesdec_si128(t4, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t2 = _mm_aesdec_si128(t2, ks[7]);
+               t3 = _mm_aesdec_si128(t3, ks[7]);
+               t4 = _mm_aesdec_si128(t4, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t2 = _mm_aesdec_si128(t2, ks[8]);
+               t3 = _mm_aesdec_si128(t3, ks[8]);
+               t4 = _mm_aesdec_si128(t4, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t2 = _mm_aesdec_si128(t2, ks[9]);
+               t3 = _mm_aesdec_si128(t3, ks[9]);
+               t4 = _mm_aesdec_si128(t4, ks[9]);
+               t1 = _mm_aesdec_si128(t1, ks[10]);
+               t2 = _mm_aesdec_si128(t2, ks[10]);
+               t3 = _mm_aesdec_si128(t3, ks[10]);
+               t4 = _mm_aesdec_si128(t4, ks[10]);
+               t1 = _mm_aesdec_si128(t1, ks[11]);
+               t2 = _mm_aesdec_si128(t2, ks[11]);
+               t3 = _mm_aesdec_si128(t3, ks[11]);
+               t4 = _mm_aesdec_si128(t4, ks[11]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[12]);
+               t2 = _mm_aesdeclast_si128(t2, ks[12]);
+               t3 = _mm_aesdeclast_si128(t3, ks[12]);
+               t4 = _mm_aesdeclast_si128(t4, ks[12]);
                t1 = _mm_xor_si128(t1, f1);
                t2 = _mm_xor_si128(t2, f2);
                t3 = _mm_xor_si128(t3, f3);
@@ -395,21 +343,21 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
        for (i = pblocks; i < blocks; i++)
        {
                last = _mm_loadu_si128(bi + i);
-               t1 = _mm_xor_si128(last, k0);
-
-               t1 = _mm_aesdec_si128(t1, k1);
-               t1 = _mm_aesdec_si128(t1, k2);
-               t1 = _mm_aesdec_si128(t1, k3);
-               t1 = _mm_aesdec_si128(t1, k4);
-               t1 = _mm_aesdec_si128(t1, k5);
-               t1 = _mm_aesdec_si128(t1, k6);
-               t1 = _mm_aesdec_si128(t1, k7);
-               t1 = _mm_aesdec_si128(t1, k8);
-               t1 = _mm_aesdec_si128(t1, k9);
-               t1 = _mm_aesdec_si128(t1, k10);
-               t1 = _mm_aesdec_si128(t1, k11);
-
-               t1 = _mm_aesdeclast_si128(t1, k12);
+               t1 = _mm_xor_si128(last, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t1 = _mm_aesdec_si128(t1, ks[10]);
+               t1 = _mm_aesdec_si128(t1, ks[11]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[12]);
                t1 = _mm_xor_si128(t1, f1);
                _mm_storeu_si128(bo + i, t1);
                f1 = last;
@@ -422,26 +370,10 @@ static void decrypt_cbc192(aesni_key_t *key, u_int blocks, u_char *in,
 static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
                                                   u_char *iv, u_char *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-       __m128i t, fb, *bi, *bo;
+       __m128i *ks, t, fb, *bi, *bo;
        int i;
 
-       k0 = key->schedule[0];
-       k1 = key->schedule[1];
-       k2 = key->schedule[2];
-       k3 = key->schedule[3];
-       k4 = key->schedule[4];
-       k5 = key->schedule[5];
-       k6 = key->schedule[6];
-       k7 = key->schedule[7];
-       k8 = key->schedule[8];
-       k9 = key->schedule[9];
-       k10 = key->schedule[10];
-       k11 = key->schedule[11];
-       k12 = key->schedule[12];
-       k13 = key->schedule[13];
-       k14 = key->schedule[14];
-
+       ks = key->schedule;
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
@@ -450,23 +382,23 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
        {
                t = _mm_loadu_si128(bi + i);
                fb = _mm_xor_si128(t, fb);
-               fb = _mm_xor_si128(fb, k0);
-
-               fb = _mm_aesenc_si128(fb, k1);
-               fb = _mm_aesenc_si128(fb, k2);
-               fb = _mm_aesenc_si128(fb, k3);
-               fb = _mm_aesenc_si128(fb, k4);
-               fb = _mm_aesenc_si128(fb, k5);
-               fb = _mm_aesenc_si128(fb, k6);
-               fb = _mm_aesenc_si128(fb, k7);
-               fb = _mm_aesenc_si128(fb, k8);
-               fb = _mm_aesenc_si128(fb, k9);
-               fb = _mm_aesenc_si128(fb, k10);
-               fb = _mm_aesenc_si128(fb, k11);
-               fb = _mm_aesenc_si128(fb, k12);
-               fb = _mm_aesenc_si128(fb, k13);
-
-               fb = _mm_aesenclast_si128(fb, k14);
+               fb = _mm_xor_si128(fb, ks[0]);
+
+               fb = _mm_aesenc_si128(fb, ks[1]);
+               fb = _mm_aesenc_si128(fb, ks[2]);
+               fb = _mm_aesenc_si128(fb, ks[3]);
+               fb = _mm_aesenc_si128(fb, ks[4]);
+               fb = _mm_aesenc_si128(fb, ks[5]);
+               fb = _mm_aesenc_si128(fb, ks[6]);
+               fb = _mm_aesenc_si128(fb, ks[7]);
+               fb = _mm_aesenc_si128(fb, ks[8]);
+               fb = _mm_aesenc_si128(fb, ks[9]);
+               fb = _mm_aesenc_si128(fb, ks[10]);
+               fb = _mm_aesenc_si128(fb, ks[11]);
+               fb = _mm_aesenc_si128(fb, ks[12]);
+               fb = _mm_aesenc_si128(fb, ks[13]);
+
+               fb = _mm_aesenclast_si128(fb, ks[14]);
                _mm_storeu_si128(bo + i, fb);
        }
 }
@@ -477,28 +409,12 @@ static void encrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
 static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
                                                   u_char *iv, u_char *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-       __m128i last, *bi, *bo;
+       __m128i *ks, last, *bi, *bo;
        __m128i t1, t2, t3, t4;
        __m128i f1, f2, f3, f4;
        u_int i, pblocks;
 
-       k0 = key->schedule[0];
-       k1 = key->schedule[1];
-       k2 = key->schedule[2];
-       k3 = key->schedule[3];
-       k4 = key->schedule[4];
-       k5 = key->schedule[5];
-       k6 = key->schedule[6];
-       k7 = key->schedule[7];
-       k8 = key->schedule[8];
-       k9 = key->schedule[9];
-       k10 = key->schedule[10];
-       k11 = key->schedule[11];
-       k12 = key->schedule[12];
-       k13 = key->schedule[13];
-       k14 = key->schedule[14];
-
+       ks = key->schedule;
        bi = (__m128i*)in;
        bo = (__m128i*)out;
        pblocks = blocks - (blocks % CBC_DECRYPT_PARALLELISM);
@@ -517,68 +433,68 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
                f4 = t3;
                last = t4;
 
-               t1 = _mm_xor_si128(t1, k0);
-               t2 = _mm_xor_si128(t2, k0);
-               t3 = _mm_xor_si128(t3, k0);
-               t4 = _mm_xor_si128(t4, k0);
-
-               t1 = _mm_aesdec_si128(t1, k1);
-               t2 = _mm_aesdec_si128(t2, k1);
-               t3 = _mm_aesdec_si128(t3, k1);
-               t4 = _mm_aesdec_si128(t4, k1);
-               t1 = _mm_aesdec_si128(t1, k2);
-               t2 = _mm_aesdec_si128(t2, k2);
-               t3 = _mm_aesdec_si128(t3, k2);
-               t4 = _mm_aesdec_si128(t4, k2);
-               t1 = _mm_aesdec_si128(t1, k3);
-               t2 = _mm_aesdec_si128(t2, k3);
-               t3 = _mm_aesdec_si128(t3, k3);
-               t4 = _mm_aesdec_si128(t4, k3);
-               t1 = _mm_aesdec_si128(t1, k4);
-               t2 = _mm_aesdec_si128(t2, k4);
-               t3 = _mm_aesdec_si128(t3, k4);
-               t4 = _mm_aesdec_si128(t4, k4);
-               t1 = _mm_aesdec_si128(t1, k5);
-               t2 = _mm_aesdec_si128(t2, k5);
-               t3 = _mm_aesdec_si128(t3, k5);
-               t4 = _mm_aesdec_si128(t4, k5);
-               t1 = _mm_aesdec_si128(t1, k6);
-               t2 = _mm_aesdec_si128(t2, k6);
-               t3 = _mm_aesdec_si128(t3, k6);
-               t4 = _mm_aesdec_si128(t4, k6);
-               t1 = _mm_aesdec_si128(t1, k7);
-               t2 = _mm_aesdec_si128(t2, k7);
-               t3 = _mm_aesdec_si128(t3, k7);
-               t4 = _mm_aesdec_si128(t4, k7);
-               t1 = _mm_aesdec_si128(t1, k8);
-               t2 = _mm_aesdec_si128(t2, k8);
-               t3 = _mm_aesdec_si128(t3, k8);
-               t4 = _mm_aesdec_si128(t4, k8);
-               t1 = _mm_aesdec_si128(t1, k9);
-               t2 = _mm_aesdec_si128(t2, k9);
-               t3 = _mm_aesdec_si128(t3, k9);
-               t4 = _mm_aesdec_si128(t4, k9);
-               t1 = _mm_aesdec_si128(t1, k10);
-               t2 = _mm_aesdec_si128(t2, k10);
-               t3 = _mm_aesdec_si128(t3, k10);
-               t4 = _mm_aesdec_si128(t4, k10);
-               t1 = _mm_aesdec_si128(t1, k11);
-               t2 = _mm_aesdec_si128(t2, k11);
-               t3 = _mm_aesdec_si128(t3, k11);
-               t4 = _mm_aesdec_si128(t4, k11);
-               t1 = _mm_aesdec_si128(t1, k12);
-               t2 = _mm_aesdec_si128(t2, k12);
-               t3 = _mm_aesdec_si128(t3, k12);
-               t4 = _mm_aesdec_si128(t4, k12);
-               t1 = _mm_aesdec_si128(t1, k13);
-               t2 = _mm_aesdec_si128(t2, k13);
-               t3 = _mm_aesdec_si128(t3, k13);
-               t4 = _mm_aesdec_si128(t4, k13);
-
-               t1 = _mm_aesdeclast_si128(t1, k14);
-               t2 = _mm_aesdeclast_si128(t2, k14);
-               t3 = _mm_aesdeclast_si128(t3, k14);
-               t4 = _mm_aesdeclast_si128(t4, k14);
+               t1 = _mm_xor_si128(t1, ks[0]);
+               t2 = _mm_xor_si128(t2, ks[0]);
+               t3 = _mm_xor_si128(t3, ks[0]);
+               t4 = _mm_xor_si128(t4, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t2 = _mm_aesdec_si128(t2, ks[1]);
+               t3 = _mm_aesdec_si128(t3, ks[1]);
+               t4 = _mm_aesdec_si128(t4, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t2 = _mm_aesdec_si128(t2, ks[2]);
+               t3 = _mm_aesdec_si128(t3, ks[2]);
+               t4 = _mm_aesdec_si128(t4, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t2 = _mm_aesdec_si128(t2, ks[3]);
+               t3 = _mm_aesdec_si128(t3, ks[3]);
+               t4 = _mm_aesdec_si128(t4, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t2 = _mm_aesdec_si128(t2, ks[4]);
+               t3 = _mm_aesdec_si128(t3, ks[4]);
+               t4 = _mm_aesdec_si128(t4, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t2 = _mm_aesdec_si128(t2, ks[5]);
+               t3 = _mm_aesdec_si128(t3, ks[5]);
+               t4 = _mm_aesdec_si128(t4, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t2 = _mm_aesdec_si128(t2, ks[6]);
+               t3 = _mm_aesdec_si128(t3, ks[6]);
+               t4 = _mm_aesdec_si128(t4, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t2 = _mm_aesdec_si128(t2, ks[7]);
+               t3 = _mm_aesdec_si128(t3, ks[7]);
+               t4 = _mm_aesdec_si128(t4, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t2 = _mm_aesdec_si128(t2, ks[8]);
+               t3 = _mm_aesdec_si128(t3, ks[8]);
+               t4 = _mm_aesdec_si128(t4, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t2 = _mm_aesdec_si128(t2, ks[9]);
+               t3 = _mm_aesdec_si128(t3, ks[9]);
+               t4 = _mm_aesdec_si128(t4, ks[9]);
+               t1 = _mm_aesdec_si128(t1, ks[10]);
+               t2 = _mm_aesdec_si128(t2, ks[10]);
+               t3 = _mm_aesdec_si128(t3, ks[10]);
+               t4 = _mm_aesdec_si128(t4, ks[10]);
+               t1 = _mm_aesdec_si128(t1, ks[11]);
+               t2 = _mm_aesdec_si128(t2, ks[11]);
+               t3 = _mm_aesdec_si128(t3, ks[11]);
+               t4 = _mm_aesdec_si128(t4, ks[11]);
+               t1 = _mm_aesdec_si128(t1, ks[12]);
+               t2 = _mm_aesdec_si128(t2, ks[12]);
+               t3 = _mm_aesdec_si128(t3, ks[12]);
+               t4 = _mm_aesdec_si128(t4, ks[12]);
+               t1 = _mm_aesdec_si128(t1, ks[13]);
+               t2 = _mm_aesdec_si128(t2, ks[13]);
+               t3 = _mm_aesdec_si128(t3, ks[13]);
+               t4 = _mm_aesdec_si128(t4, ks[13]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[14]);
+               t2 = _mm_aesdeclast_si128(t2, ks[14]);
+               t3 = _mm_aesdeclast_si128(t3, ks[14]);
+               t4 = _mm_aesdeclast_si128(t4, ks[14]);
                t1 = _mm_xor_si128(t1, f1);
                t2 = _mm_xor_si128(t2, f2);
                t3 = _mm_xor_si128(t3, f3);
@@ -593,23 +509,23 @@ static void decrypt_cbc256(aesni_key_t *key, u_int blocks, u_char *in,
        for (i = pblocks; i < blocks; i++)
        {
                last = _mm_loadu_si128(bi + i);
-               t1 = _mm_xor_si128(last, k0);
-
-               t1 = _mm_aesdec_si128(t1, k1);
-               t1 = _mm_aesdec_si128(t1, k2);
-               t1 = _mm_aesdec_si128(t1, k3);
-               t1 = _mm_aesdec_si128(t1, k4);
-               t1 = _mm_aesdec_si128(t1, k5);
-               t1 = _mm_aesdec_si128(t1, k6);
-               t1 = _mm_aesdec_si128(t1, k7);
-               t1 = _mm_aesdec_si128(t1, k8);
-               t1 = _mm_aesdec_si128(t1, k9);
-               t1 = _mm_aesdec_si128(t1, k10);
-               t1 = _mm_aesdec_si128(t1, k11);
-               t1 = _mm_aesdec_si128(t1, k12);
-               t1 = _mm_aesdec_si128(t1, k13);
-
-               t1 = _mm_aesdeclast_si128(t1, k14);
+               t1 = _mm_xor_si128(last, ks[0]);
+
+               t1 = _mm_aesdec_si128(t1, ks[1]);
+               t1 = _mm_aesdec_si128(t1, ks[2]);
+               t1 = _mm_aesdec_si128(t1, ks[3]);
+               t1 = _mm_aesdec_si128(t1, ks[4]);
+               t1 = _mm_aesdec_si128(t1, ks[5]);
+               t1 = _mm_aesdec_si128(t1, ks[6]);
+               t1 = _mm_aesdec_si128(t1, ks[7]);
+               t1 = _mm_aesdec_si128(t1, ks[8]);
+               t1 = _mm_aesdec_si128(t1, ks[9]);
+               t1 = _mm_aesdec_si128(t1, ks[10]);
+               t1 = _mm_aesdec_si128(t1, ks[11]);
+               t1 = _mm_aesdec_si128(t1, ks[12]);
+               t1 = _mm_aesdec_si128(t1, ks[13]);
+
+               t1 = _mm_aesdeclast_si128(t1, ks[14]);
                t1 = _mm_xor_si128(t1, f1);
                _mm_storeu_si128(bo + i, t1);
                f1 = last;
index 0e4a24f..d523bc1 100644 (file)
@@ -159,17 +159,18 @@ static void build_ctr(private_aesni_ccm_t *this, u_int32_t i, u_char *iv,
 static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
                                                  u_int16_t alen, u_char *assoc)
 {
-       __m128i b, t, c;
+       __m128i *ks, b, t, c;
        u_int i, round, blocks, rem;
 
+       ks = this->key->schedule;
        build_b0(this, len, alen, iv, &b);
        c = _mm_loadu_si128(&b);
-       c = _mm_xor_si128(c, this->key->schedule[0]);
+       c = _mm_xor_si128(c, ks[0]);
        for (round = 1; round < this->key->rounds; round++)
        {
-               c = _mm_aesenc_si128(c, this->key->schedule[round]);
+               c = _mm_aesenc_si128(c, ks[round]);
        }
-       c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
+       c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
 
        if (alen)
        {
@@ -200,12 +201,12 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
                                t = _mm_loadu_si128(((__m128i*)(assoc - sizeof(alen))) + i);
                        }
                        c = _mm_xor_si128(t, c);
-                       c = _mm_xor_si128(c, this->key->schedule[0]);
+                       c = _mm_xor_si128(c, ks[0]);
                        for (round = 1; round < this->key->rounds; round++)
                        {
-                               c = _mm_aesenc_si128(c, this->key->schedule[round]);
+                               c = _mm_aesenc_si128(c, ks[round]);
                        }
-                       c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
+                       c = _mm_aesenclast_si128(c, ks[this->key->rounds]);
                }
        }
        return c;
@@ -217,18 +218,19 @@ static __m128i icv_header(private_aesni_ccm_t *this, size_t len, u_char *iv,
 static void crypt_icv(private_aesni_ccm_t *this, u_char *iv,
                                          __m128i c, u_char *icv)
 {
-       __m128i b, t;
+       __m128i *ks, b, t;
        u_int round;
 
+       ks = this->key->schedule;
        build_ctr(this, 0, iv, &b);
 
        t = _mm_loadu_si128(&b);
-       t = _mm_xor_si128(t, this->key->schedule[0]);
+       t = _mm_xor_si128(t, ks[0]);
        for (round = 1; round < this->key->rounds; round++)
        {
-               t = _mm_aesenc_si128(t, this->key->schedule[round]);
+               t = _mm_aesenc_si128(t, ks[round]);
        }
-       t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+       t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
 
        t = _mm_xor_si128(t, c);
 
@@ -258,23 +260,24 @@ static inline __m128i increment_be(__m128i x)
 static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
                                                           void *in, void *out, __m128i c)
 {
-       __m128i t, b, d;
+       __m128i *ks, t, b, d;
        u_int round;
 
+       ks = key->schedule;
        memset(&b, 0, sizeof(b));
        memcpy(&b, in, rem);
        d = _mm_loadu_si128(&b);
 
        c = _mm_xor_si128(d, c);
-       c = _mm_xor_si128(c, key->schedule[0]);
-       t = _mm_xor_si128(state, key->schedule[0]);
+       c = _mm_xor_si128(c, ks[0]);
+       t = _mm_xor_si128(state, ks[0]);
        for (round = 1; round < key->rounds; round++)
        {
-               c = _mm_aesenc_si128(c, key->schedule[round]);
-               t = _mm_aesenc_si128(t, key->schedule[round]);
+               c = _mm_aesenc_si128(c, ks[round]);
+               t = _mm_aesenc_si128(t, ks[round]);
        }
-       c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
-       t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
+       c = _mm_aesenclast_si128(c, ks[key->rounds]);
+       t = _mm_aesenclast_si128(t, ks[key->rounds]);
 
        t = _mm_xor_si128(t, d);
        _mm_storeu_si128(&b, t);
@@ -290,31 +293,32 @@ static __m128i encrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
 static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
                                                           void *in, void *out, __m128i c)
 {
-       __m128i t, b, d;
+       __m128i *ks, t, b, d;
        u_int round;
 
+       ks = key->schedule;
        memset(&b, 0, sizeof(b));
        memcpy(&b, in, rem);
        d = _mm_loadu_si128(&b);
 
-       t = _mm_xor_si128(state, key->schedule[0]);
+       t = _mm_xor_si128(state, ks[0]);
        for (round = 1; round < key->rounds; round++)
        {
-               t = _mm_aesenc_si128(t, key->schedule[round]);
+               t = _mm_aesenc_si128(t, ks[round]);
        }
-       t = _mm_aesenclast_si128(t, key->schedule[key->rounds]);
+       t = _mm_aesenclast_si128(t, ks[key->rounds]);
        t = _mm_xor_si128(t, d);
        _mm_storeu_si128(&b, t);
 
        memset((u_char*)&b + rem, 0, sizeof(b) - rem);
        t = _mm_loadu_si128(&b);
        c = _mm_xor_si128(t, c);
-       c = _mm_xor_si128(c, key->schedule[0]);
+       c = _mm_xor_si128(c, ks[0]);
        for (round = 1; round < key->rounds; round++)
        {
-               c = _mm_aesenc_si128(c, key->schedule[round]);
+               c = _mm_aesenc_si128(c, ks[round]);
        }
-       c = _mm_aesenclast_si128(c, key->schedule[key->rounds]);
+       c = _mm_aesenclast_si128(c, ks[key->rounds]);
 
        memcpy(out, &b, rem);
 
@@ -328,8 +332,7 @@ static void encrypt_ccm128(private_aesni_ccm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i d, t, c, b, state, *bi, *bo;
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
        u_int blocks, rem, i;
 
        c = icv_header(this, len, iv, alen, assoc);
@@ -340,47 +343,37 @@ static void encrypt_ccm128(private_aesni_ccm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
+       ks = this->key->schedule;
 
        for (i = 0; i < blocks; i++)
        {
                d = _mm_loadu_si128(bi + i);
 
                c = _mm_xor_si128(d, c);
-               c = _mm_xor_si128(c, k0);
-               t = _mm_xor_si128(state, k0);
-
-               c = _mm_aesenc_si128(c, k1);
-               t = _mm_aesenc_si128(t, k1);
-               c = _mm_aesenc_si128(c, k2);
-               t = _mm_aesenc_si128(t, k2);
-               c = _mm_aesenc_si128(c, k3);
-               t = _mm_aesenc_si128(t, k3);
-               c = _mm_aesenc_si128(c, k4);
-               t = _mm_aesenc_si128(t, k4);
-               c = _mm_aesenc_si128(c, k5);
-               t = _mm_aesenc_si128(t, k5);
-               c = _mm_aesenc_si128(c, k6);
-               t = _mm_aesenc_si128(t, k6);
-               c = _mm_aesenc_si128(c, k7);
-               t = _mm_aesenc_si128(t, k7);
-               c = _mm_aesenc_si128(c, k8);
-               t = _mm_aesenc_si128(t, k8);
-               c = _mm_aesenc_si128(c, k9);
-               t = _mm_aesenc_si128(t, k9);
-
-               c = _mm_aesenclast_si128(c, k10);
-               t = _mm_aesenclast_si128(t, k10);
+               c = _mm_xor_si128(c, ks[0]);
+               t = _mm_xor_si128(state, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               t = _mm_aesenc_si128(t, ks[9]);
+
+               c = _mm_aesenclast_si128(c, ks[10]);
+               t = _mm_aesenclast_si128(t, ks[10]);
 
                t = _mm_xor_si128(t, d);
                _mm_storeu_si128(bo + i, t);
@@ -402,8 +395,7 @@ static void decrypt_ccm128(private_aesni_ccm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i d, t, c, b, state, *bi, *bo;
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
        u_int blocks, rem, i;
 
        c = icv_header(this, len, iv, alen, assoc);
@@ -414,52 +406,42 @@ static void decrypt_ccm128(private_aesni_ccm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
+       ks = this->key->schedule;
 
        for (i = 0; i < blocks; i++)
        {
                d = _mm_loadu_si128(bi + i);
 
-               t = _mm_xor_si128(state, k0);
+               t = _mm_xor_si128(state, ks[0]);
 
-               t = _mm_aesenc_si128(t, k1);
-               t = _mm_aesenc_si128(t, k2);
-               t = _mm_aesenc_si128(t, k3);
-               t = _mm_aesenc_si128(t, k4);
-               t = _mm_aesenc_si128(t, k5);
-               t = _mm_aesenc_si128(t, k6);
-               t = _mm_aesenc_si128(t, k7);
-               t = _mm_aesenc_si128(t, k8);
-               t = _mm_aesenc_si128(t, k9);
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
 
-               t = _mm_aesenclast_si128(t, k10);
+               t = _mm_aesenclast_si128(t, ks[10]);
                t = _mm_xor_si128(t, d);
                _mm_storeu_si128(bo + i, t);
 
                c = _mm_xor_si128(t, c);
-               c = _mm_xor_si128(c, k0);
+               c = _mm_xor_si128(c, ks[0]);
 
-               c = _mm_aesenc_si128(c, k1);
-               c = _mm_aesenc_si128(c, k2);
-               c = _mm_aesenc_si128(c, k3);
-               c = _mm_aesenc_si128(c, k4);
-               c = _mm_aesenc_si128(c, k5);
-               c = _mm_aesenc_si128(c, k6);
-               c = _mm_aesenc_si128(c, k7);
-               c = _mm_aesenc_si128(c, k8);
-               c = _mm_aesenc_si128(c, k9);
+               c = _mm_aesenc_si128(c, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
 
-               c = _mm_aesenclast_si128(c, k10);
+               c = _mm_aesenclast_si128(c, ks[10]);
 
                state = increment_be(state);
        }
@@ -478,8 +460,7 @@ static void encrypt_ccm192(private_aesni_ccm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-       __m128i d, t, c, b, state, *bi, *bo;
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
        u_int blocks, rem, i;
 
        c = icv_header(this, len, iv, alen, assoc);
@@ -490,53 +471,41 @@ static void encrypt_ccm192(private_aesni_ccm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
+       ks = this->key->schedule;
 
        for (i = 0; i < blocks; i++)
        {
                d = _mm_loadu_si128(bi + i);
 
                c = _mm_xor_si128(d, c);
-               c = _mm_xor_si128(c, k0);
-               t = _mm_xor_si128(state, k0);
-
-               c = _mm_aesenc_si128(c, k1);
-               t = _mm_aesenc_si128(t, k1);
-               c = _mm_aesenc_si128(c, k2);
-               t = _mm_aesenc_si128(t, k2);
-               c = _mm_aesenc_si128(c, k3);
-               t = _mm_aesenc_si128(t, k3);
-               c = _mm_aesenc_si128(c, k4);
-               t = _mm_aesenc_si128(t, k4);
-               c = _mm_aesenc_si128(c, k5);
-               t = _mm_aesenc_si128(t, k5);
-               c = _mm_aesenc_si128(c, k6);
-               t = _mm_aesenc_si128(t, k6);
-               c = _mm_aesenc_si128(c, k7);
-               t = _mm_aesenc_si128(t, k7);
-               c = _mm_aesenc_si128(c, k8);
-               t = _mm_aesenc_si128(t, k8);
-               c = _mm_aesenc_si128(c, k9);
-               t = _mm_aesenc_si128(t, k9);
-               c = _mm_aesenc_si128(c, k10);
-               t = _mm_aesenc_si128(t, k10);
-               c = _mm_aesenc_si128(c, k11);
-               t = _mm_aesenc_si128(t, k11);
-
-               c = _mm_aesenclast_si128(c, k12);
-               t = _mm_aesenclast_si128(t, k12);
+               c = _mm_xor_si128(c, ks[0]);
+               t = _mm_xor_si128(state, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               c = _mm_aesenc_si128(c, ks[10]);
+               t = _mm_aesenc_si128(t, ks[10]);
+               c = _mm_aesenc_si128(c, ks[11]);
+               t = _mm_aesenc_si128(t, ks[11]);
+
+               c = _mm_aesenclast_si128(c, ks[12]);
+               t = _mm_aesenclast_si128(t, ks[12]);
 
                t = _mm_xor_si128(t, d);
                _mm_storeu_si128(bo + i, t);
@@ -558,8 +527,7 @@ static void decrypt_ccm192(private_aesni_ccm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-       __m128i d, t, c, b, state, *bi, *bo;
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
        u_int blocks, rem, i;
 
        c = icv_header(this, len, iv, alen, assoc);
@@ -570,58 +538,46 @@ static void decrypt_ccm192(private_aesni_ccm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
+       ks = this->key->schedule;
 
        for (i = 0; i < blocks; i++)
        {
                d = _mm_loadu_si128(bi + i);
 
-               t = _mm_xor_si128(state, k0);
-
-               t = _mm_aesenc_si128(t, k1);
-               t = _mm_aesenc_si128(t, k2);
-               t = _mm_aesenc_si128(t, k3);
-               t = _mm_aesenc_si128(t, k4);
-               t = _mm_aesenc_si128(t, k5);
-               t = _mm_aesenc_si128(t, k6);
-               t = _mm_aesenc_si128(t, k7);
-               t = _mm_aesenc_si128(t, k8);
-               t = _mm_aesenc_si128(t, k9);
-               t = _mm_aesenc_si128(t, k10);
-               t = _mm_aesenc_si128(t, k11);
-
-               t = _mm_aesenclast_si128(t, k12);
+               t = _mm_xor_si128(state, ks[0]);
+
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               t = _mm_aesenc_si128(t, ks[10]);
+               t = _mm_aesenc_si128(t, ks[11]);
+
+               t = _mm_aesenclast_si128(t, ks[12]);
                t = _mm_xor_si128(t, d);
                _mm_storeu_si128(bo + i, t);
 
                c = _mm_xor_si128(t, c);
-               c = _mm_xor_si128(c, k0);
-
-               c = _mm_aesenc_si128(c, k1);
-               c = _mm_aesenc_si128(c, k2);
-               c = _mm_aesenc_si128(c, k3);
-               c = _mm_aesenc_si128(c, k4);
-               c = _mm_aesenc_si128(c, k5);
-               c = _mm_aesenc_si128(c, k6);
-               c = _mm_aesenc_si128(c, k7);
-               c = _mm_aesenc_si128(c, k8);
-               c = _mm_aesenc_si128(c, k9);
-               c = _mm_aesenc_si128(c, k10);
-               c = _mm_aesenc_si128(c, k11);
-
-               c = _mm_aesenclast_si128(c, k12);
+               c = _mm_xor_si128(c, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               c = _mm_aesenc_si128(c, ks[10]);
+               c = _mm_aesenc_si128(c, ks[11]);
+
+               c = _mm_aesenclast_si128(c, ks[12]);
 
                state = increment_be(state);
        }
@@ -640,8 +596,7 @@ static void encrypt_ccm256(private_aesni_ccm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-       __m128i d, t, c, b, state, *bi, *bo;
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
        u_int blocks, rem, i;
 
        c = icv_header(this, len, iv, alen, assoc);
@@ -652,59 +607,45 @@ static void encrypt_ccm256(private_aesni_ccm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
-       k13 = this->key->schedule[13];
-       k14 = this->key->schedule[14];
+       ks = this->key->schedule;
 
        for (i = 0; i < blocks; i++)
        {
                d = _mm_loadu_si128(bi + i);
 
                c = _mm_xor_si128(d, c);
-               c = _mm_xor_si128(c, k0);
-               t = _mm_xor_si128(state, k0);
-
-               c = _mm_aesenc_si128(c, k1);
-               t = _mm_aesenc_si128(t, k1);
-               c = _mm_aesenc_si128(c, k2);
-               t = _mm_aesenc_si128(t, k2);
-               c = _mm_aesenc_si128(c, k3);
-               t = _mm_aesenc_si128(t, k3);
-               c = _mm_aesenc_si128(c, k4);
-               t = _mm_aesenc_si128(t, k4);
-               c = _mm_aesenc_si128(c, k5);
-               t = _mm_aesenc_si128(t, k5);
-               c = _mm_aesenc_si128(c, k6);
-               t = _mm_aesenc_si128(t, k6);
-               c = _mm_aesenc_si128(c, k7);
-               t = _mm_aesenc_si128(t, k7);
-               c = _mm_aesenc_si128(c, k8);
-               t = _mm_aesenc_si128(t, k8);
-               c = _mm_aesenc_si128(c, k9);
-               t = _mm_aesenc_si128(t, k9);
-               c = _mm_aesenc_si128(c, k10);
-               t = _mm_aesenc_si128(t, k10);
-               c = _mm_aesenc_si128(c, k11);
-               t = _mm_aesenc_si128(t, k11);
-               c = _mm_aesenc_si128(c, k12);
-               t = _mm_aesenc_si128(t, k12);
-               c = _mm_aesenc_si128(c, k13);
-               t = _mm_aesenc_si128(t, k13);
-
-               c = _mm_aesenclast_si128(c, k14);
-               t = _mm_aesenclast_si128(t, k14);
+               c = _mm_xor_si128(c, ks[0]);
+               t = _mm_xor_si128(state, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               c = _mm_aesenc_si128(c, ks[10]);
+               t = _mm_aesenc_si128(t, ks[10]);
+               c = _mm_aesenc_si128(c, ks[11]);
+               t = _mm_aesenc_si128(t, ks[11]);
+               c = _mm_aesenc_si128(c, ks[12]);
+               t = _mm_aesenc_si128(t, ks[12]);
+               c = _mm_aesenc_si128(c, ks[13]);
+               t = _mm_aesenc_si128(t, ks[13]);
+
+               c = _mm_aesenclast_si128(c, ks[14]);
+               t = _mm_aesenclast_si128(t, ks[14]);
 
                t = _mm_xor_si128(t, d);
                _mm_storeu_si128(bo + i, t);
@@ -726,8 +667,7 @@ static void decrypt_ccm256(private_aesni_ccm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-       __m128i d, t, c, b, state, *bi, *bo;
+       __m128i *ks, d, t, c, b, state, *bi, *bo;
        u_int blocks, rem, i;
 
        c = icv_header(this, len, iv, alen, assoc);
@@ -738,64 +678,50 @@ static void decrypt_ccm256(private_aesni_ccm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
-       k13 = this->key->schedule[13];
-       k14 = this->key->schedule[14];
+       ks = this->key->schedule;
 
        for (i = 0; i < blocks; i++)
        {
                d = _mm_loadu_si128(bi + i);
 
-               t = _mm_xor_si128(state, k0);
-
-               t = _mm_aesenc_si128(t, k1);
-               t = _mm_aesenc_si128(t, k2);
-               t = _mm_aesenc_si128(t, k3);
-               t = _mm_aesenc_si128(t, k4);
-               t = _mm_aesenc_si128(t, k5);
-               t = _mm_aesenc_si128(t, k6);
-               t = _mm_aesenc_si128(t, k7);
-               t = _mm_aesenc_si128(t, k8);
-               t = _mm_aesenc_si128(t, k9);
-               t = _mm_aesenc_si128(t, k10);
-               t = _mm_aesenc_si128(t, k11);
-               t = _mm_aesenc_si128(t, k12);
-               t = _mm_aesenc_si128(t, k13);
-
-               t = _mm_aesenclast_si128(t, k14);
+               t = _mm_xor_si128(state, ks[0]);
+
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               t = _mm_aesenc_si128(t, ks[10]);
+               t = _mm_aesenc_si128(t, ks[11]);
+               t = _mm_aesenc_si128(t, ks[12]);
+               t = _mm_aesenc_si128(t, ks[13]);
+
+               t = _mm_aesenclast_si128(t, ks[14]);
                t = _mm_xor_si128(t, d);
                _mm_storeu_si128(bo + i, t);
 
                c = _mm_xor_si128(t, c);
-               c = _mm_xor_si128(c, k0);
-
-               c = _mm_aesenc_si128(c, k1);
-               c = _mm_aesenc_si128(c, k2);
-               c = _mm_aesenc_si128(c, k3);
-               c = _mm_aesenc_si128(c, k4);
-               c = _mm_aesenc_si128(c, k5);
-               c = _mm_aesenc_si128(c, k6);
-               c = _mm_aesenc_si128(c, k7);
-               c = _mm_aesenc_si128(c, k8);
-               c = _mm_aesenc_si128(c, k9);
-               c = _mm_aesenc_si128(c, k10);
-               c = _mm_aesenc_si128(c, k11);
-               c = _mm_aesenc_si128(c, k12);
-               c = _mm_aesenc_si128(c, k13);
-
-               c = _mm_aesenclast_si128(c, k14);
+               c = _mm_xor_si128(c, ks[0]);
+
+               c = _mm_aesenc_si128(c, ks[1]);
+               c = _mm_aesenc_si128(c, ks[2]);
+               c = _mm_aesenc_si128(c, ks[3]);
+               c = _mm_aesenc_si128(c, ks[4]);
+               c = _mm_aesenc_si128(c, ks[5]);
+               c = _mm_aesenc_si128(c, ks[6]);
+               c = _mm_aesenc_si128(c, ks[7]);
+               c = _mm_aesenc_si128(c, ks[8]);
+               c = _mm_aesenc_si128(c, ks[9]);
+               c = _mm_aesenc_si128(c, ks[10]);
+               c = _mm_aesenc_si128(c, ks[11]);
+               c = _mm_aesenc_si128(c, ks[12]);
+               c = _mm_aesenc_si128(c, ks[13]);
+
+               c = _mm_aesenclast_si128(c, ks[14]);
 
                state = increment_be(state);
        }
index a35445f..d6a87e6 100644 (file)
@@ -67,8 +67,7 @@ struct private_mac_t {
 METHOD(mac_t, get_mac, bool,
        private_mac_t *this, chunk_t data, u_int8_t *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i t, l, *bi;
+       __m128i *ks, t, l, *bi;
        u_int blocks, rem, i;
 
        if (!this->k)
@@ -76,18 +75,7 @@ METHOD(mac_t, get_mac, bool,
                return FALSE;
        }
 
-       k0 = this->k->schedule[0];
-       k1 = this->k->schedule[1];
-       k2 = this->k->schedule[2];
-       k3 = this->k->schedule[3];
-       k4 = this->k->schedule[4];
-       k5 = this->k->schedule[5];
-       k6 = this->k->schedule[6];
-       k7 = this->k->schedule[7];
-       k8 = this->k->schedule[8];
-       k9 = this->k->schedule[9];
-       k10 = this->k->schedule[10];
-
+       ks = this->k->schedule;
        t = this->t;
 
        if (this->rem_size + data.len > AES_BLOCK_SIZE)
@@ -105,17 +93,17 @@ METHOD(mac_t, get_mac, bool,
 
                t = _mm_xor_si128(t, _mm_loadu_si128((__m128i*)this->rem));
 
-               t = _mm_xor_si128(t, k0);
-               t = _mm_aesenc_si128(t, k1);
-               t = _mm_aesenc_si128(t, k2);
-               t = _mm_aesenc_si128(t, k3);
-               t = _mm_aesenc_si128(t, k4);
-               t = _mm_aesenc_si128(t, k5);
-               t = _mm_aesenc_si128(t, k6);
-               t = _mm_aesenc_si128(t, k7);
-               t = _mm_aesenc_si128(t, k8);
-               t = _mm_aesenc_si128(t, k9);
-               t = _mm_aesenclast_si128(t, k10);
+               t = _mm_xor_si128(t, ks[0]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               t = _mm_aesenclast_si128(t, ks[10]);
 
                /* process blocks M_2 ... M_n-1 */
                bi = (__m128i*)data.ptr;
@@ -132,17 +120,17 @@ METHOD(mac_t, get_mac, bool,
                {
                        t = _mm_xor_si128(t, _mm_loadu_si128(bi + i));
 
-                       t = _mm_xor_si128(t, k0);
-                       t = _mm_aesenc_si128(t, k1);
-                       t = _mm_aesenc_si128(t, k2);
-                       t = _mm_aesenc_si128(t, k3);
-                       t = _mm_aesenc_si128(t, k4);
-                       t = _mm_aesenc_si128(t, k5);
-                       t = _mm_aesenc_si128(t, k6);
-                       t = _mm_aesenc_si128(t, k7);
-                       t = _mm_aesenc_si128(t, k8);
-                       t = _mm_aesenc_si128(t, k9);
-                       t = _mm_aesenclast_si128(t, k10);
+                       t = _mm_xor_si128(t, ks[0]);
+                       t = _mm_aesenc_si128(t, ks[1]);
+                       t = _mm_aesenc_si128(t, ks[2]);
+                       t = _mm_aesenc_si128(t, ks[3]);
+                       t = _mm_aesenc_si128(t, ks[4]);
+                       t = _mm_aesenc_si128(t, ks[5]);
+                       t = _mm_aesenc_si128(t, ks[6]);
+                       t = _mm_aesenc_si128(t, ks[7]);
+                       t = _mm_aesenc_si128(t, ks[8]);
+                       t = _mm_aesenc_si128(t, ks[9]);
+                       t = _mm_aesenclast_si128(t, ks[10]);
                }
 
                /* store remaining bytes of block M_n */
@@ -188,17 +176,17 @@ METHOD(mac_t, get_mac, bool,
                 */
                t = _mm_xor_si128(l, t);
 
-               t = _mm_xor_si128(t, k0);
-               t = _mm_aesenc_si128(t, k1);
-               t = _mm_aesenc_si128(t, k2);
-               t = _mm_aesenc_si128(t, k3);
-               t = _mm_aesenc_si128(t, k4);
-               t = _mm_aesenc_si128(t, k5);
-               t = _mm_aesenc_si128(t, k6);
-               t = _mm_aesenc_si128(t, k7);
-               t = _mm_aesenc_si128(t, k8);
-               t = _mm_aesenc_si128(t, k9);
-               t = _mm_aesenclast_si128(t, k10);
+               t = _mm_xor_si128(t, ks[0]);
+               t = _mm_aesenc_si128(t, ks[1]);
+               t = _mm_aesenc_si128(t, ks[2]);
+               t = _mm_aesenc_si128(t, ks[3]);
+               t = _mm_aesenc_si128(t, ks[4]);
+               t = _mm_aesenc_si128(t, ks[5]);
+               t = _mm_aesenc_si128(t, ks[6]);
+               t = _mm_aesenc_si128(t, ks[7]);
+               t = _mm_aesenc_si128(t, ks[8]);
+               t = _mm_aesenc_si128(t, ks[9]);
+               t = _mm_aesenclast_si128(t, ks[10]);
 
                _mm_storeu_si128((__m128i*)out, t);
 
index e6f9b84..9898138 100644 (file)
@@ -87,10 +87,9 @@ static inline __m128i increment_be(__m128i x)
 static void encrypt_ctr128(private_aesni_ctr_t *this,
                                                   size_t len, u_char *in, u_char *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
        __m128i t1, t2, t3, t4;
        __m128i d1, d2, d3, d4;
-       __m128i state, b, *bi, *bo;
+       __m128i *ks, state, b, *bi, *bo;
        u_int i, blocks, pblocks, rem;
 
        state = _mm_load_si128((__m128i*)&this->state);
@@ -100,17 +99,7 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
+       ks = this->key->schedule;
 
        for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
        {
@@ -119,56 +108,56 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
                d3 = _mm_loadu_si128(bi + i + 2);
                d4 = _mm_loadu_si128(bi + i + 3);
 
-               t1 = _mm_xor_si128(state, k0);
+               t1 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
-               t2 = _mm_xor_si128(state, k0);
+               t2 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
-               t3 = _mm_xor_si128(state, k0);
+               t3 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
-               t4 = _mm_xor_si128(state, k0);
+               t4 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t2 = _mm_aesenc_si128(t2, k1);
-               t3 = _mm_aesenc_si128(t3, k1);
-               t4 = _mm_aesenc_si128(t4, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t2 = _mm_aesenc_si128(t2, k2);
-               t3 = _mm_aesenc_si128(t3, k2);
-               t4 = _mm_aesenc_si128(t4, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t2 = _mm_aesenc_si128(t2, k3);
-               t3 = _mm_aesenc_si128(t3, k3);
-               t4 = _mm_aesenc_si128(t4, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t2 = _mm_aesenc_si128(t2, k4);
-               t3 = _mm_aesenc_si128(t3, k4);
-               t4 = _mm_aesenc_si128(t4, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t2 = _mm_aesenc_si128(t2, k5);
-               t3 = _mm_aesenc_si128(t3, k5);
-               t4 = _mm_aesenc_si128(t4, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t2 = _mm_aesenc_si128(t2, k6);
-               t3 = _mm_aesenc_si128(t3, k6);
-               t4 = _mm_aesenc_si128(t4, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t2 = _mm_aesenc_si128(t2, k7);
-               t3 = _mm_aesenc_si128(t3, k7);
-               t4 = _mm_aesenc_si128(t4, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t2 = _mm_aesenc_si128(t2, k8);
-               t3 = _mm_aesenc_si128(t3, k8);
-               t4 = _mm_aesenc_si128(t4, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t2 = _mm_aesenc_si128(t2, k9);
-               t3 = _mm_aesenc_si128(t3, k9);
-               t4 = _mm_aesenc_si128(t4, k9);
-
-               t1 = _mm_aesenclast_si128(t1, k10);
-               t2 = _mm_aesenclast_si128(t2, k10);
-               t3 = _mm_aesenclast_si128(t3, k10);
-               t4 = _mm_aesenclast_si128(t4, k10);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
+               t2 = _mm_aesenclast_si128(t2, ks[10]);
+               t3 = _mm_aesenclast_si128(t3, ks[10]);
+               t4 = _mm_aesenclast_si128(t4, ks[10]);
                t1 = _mm_xor_si128(t1, d1);
                t2 = _mm_xor_si128(t2, d2);
                t3 = _mm_xor_si128(t3, d3);
@@ -183,20 +172,20 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               t1 = _mm_xor_si128(state, k0);
+               t1 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-
-               t1 = _mm_aesenclast_si128(t1, k10);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
        }
@@ -207,19 +196,19 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
                memcpy(&b, bi + blocks, rem);
 
                d1 = _mm_loadu_si128(&b);
-               t1 = _mm_xor_si128(state, k0);
-
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-
-               t1 = _mm_aesenclast_si128(t1, k10);
+               t1 = _mm_xor_si128(state, ks[0]);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(&b, t1);
 
@@ -233,10 +222,9 @@ static void encrypt_ctr128(private_aesni_ctr_t *this,
 static void encrypt_ctr192(private_aesni_ctr_t *this,
                                                   size_t len, u_char *in, u_char *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
        __m128i t1, t2, t3, t4;
        __m128i d1, d2, d3, d4;
-       __m128i state, b, *bi, *bo;
+       __m128i *ks, state, b, *bi, *bo;
        u_int i, blocks, pblocks, rem;
 
        state = _mm_load_si128((__m128i*)&this->state);
@@ -246,19 +234,7 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
+       ks = this->key->schedule;
 
        for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
        {
@@ -267,64 +243,64 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
                d3 = _mm_loadu_si128(bi + i + 2);
                d4 = _mm_loadu_si128(bi + i + 3);
 
-               t1 = _mm_xor_si128(state, k0);
+               t1 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
-               t2 = _mm_xor_si128(state, k0);
+               t2 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
-               t3 = _mm_xor_si128(state, k0);
+               t3 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
-               t4 = _mm_xor_si128(state, k0);
+               t4 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t2 = _mm_aesenc_si128(t2, k1);
-               t3 = _mm_aesenc_si128(t3, k1);
-               t4 = _mm_aesenc_si128(t4, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t2 = _mm_aesenc_si128(t2, k2);
-               t3 = _mm_aesenc_si128(t3, k2);
-               t4 = _mm_aesenc_si128(t4, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t2 = _mm_aesenc_si128(t2, k3);
-               t3 = _mm_aesenc_si128(t3, k3);
-               t4 = _mm_aesenc_si128(t4, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t2 = _mm_aesenc_si128(t2, k4);
-               t3 = _mm_aesenc_si128(t3, k4);
-               t4 = _mm_aesenc_si128(t4, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t2 = _mm_aesenc_si128(t2, k5);
-               t3 = _mm_aesenc_si128(t3, k5);
-               t4 = _mm_aesenc_si128(t4, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t2 = _mm_aesenc_si128(t2, k6);
-               t3 = _mm_aesenc_si128(t3, k6);
-               t4 = _mm_aesenc_si128(t4, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t2 = _mm_aesenc_si128(t2, k7);
-               t3 = _mm_aesenc_si128(t3, k7);
-               t4 = _mm_aesenc_si128(t4, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t2 = _mm_aesenc_si128(t2, k8);
-               t3 = _mm_aesenc_si128(t3, k8);
-               t4 = _mm_aesenc_si128(t4, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t2 = _mm_aesenc_si128(t2, k9);
-               t3 = _mm_aesenc_si128(t3, k9);
-               t4 = _mm_aesenc_si128(t4, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t2 = _mm_aesenc_si128(t2, k10);
-               t3 = _mm_aesenc_si128(t3, k10);
-               t4 = _mm_aesenc_si128(t4, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t2 = _mm_aesenc_si128(t2, k11);
-               t3 = _mm_aesenc_si128(t3, k11);
-               t4 = _mm_aesenc_si128(t4, k11);
-
-               t1 = _mm_aesenclast_si128(t1, k12);
-               t2 = _mm_aesenclast_si128(t2, k12);
-               t3 = _mm_aesenclast_si128(t3, k12);
-               t4 = _mm_aesenclast_si128(t4, k12);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t2 = _mm_aesenc_si128(t2, ks[10]);
+               t3 = _mm_aesenc_si128(t3, ks[10]);
+               t4 = _mm_aesenc_si128(t4, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t2 = _mm_aesenc_si128(t2, ks[11]);
+               t3 = _mm_aesenc_si128(t3, ks[11]);
+               t4 = _mm_aesenc_si128(t4, ks[11]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
+               t2 = _mm_aesenclast_si128(t2, ks[12]);
+               t3 = _mm_aesenclast_si128(t3, ks[12]);
+               t4 = _mm_aesenclast_si128(t4, ks[12]);
                t1 = _mm_xor_si128(t1, d1);
                t2 = _mm_xor_si128(t2, d2);
                t3 = _mm_xor_si128(t3, d3);
@@ -339,22 +315,22 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               t1 = _mm_xor_si128(state, k0);
+               t1 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-
-               t1 = _mm_aesenclast_si128(t1, k12);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
        }
@@ -365,21 +341,21 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
                memcpy(&b, bi + blocks, rem);
 
                d1 = _mm_loadu_si128(&b);
-               t1 = _mm_xor_si128(state, k0);
-
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-
-               t1 = _mm_aesenclast_si128(t1, k12);
+               t1 = _mm_xor_si128(state, ks[0]);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(&b, t1);
 
@@ -393,10 +369,9 @@ static void encrypt_ctr192(private_aesni_ctr_t *this,
 static void encrypt_ctr256(private_aesni_ctr_t *this,
                                                   size_t len, u_char *in, u_char *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
        __m128i t1, t2, t3, t4;
        __m128i d1, d2, d3, d4;
-       __m128i state, b, *bi, *bo;
+       __m128i *ks, state, b, *bi, *bo;
        u_int i, blocks, pblocks, rem;
 
        state = _mm_load_si128((__m128i*)&this->state);
@@ -406,21 +381,7 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
-       k13 = this->key->schedule[13];
-       k14 = this->key->schedule[14];
+       ks = this->key->schedule;
 
        for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
        {
@@ -429,72 +390,72 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
                d3 = _mm_loadu_si128(bi + i + 2);
                d4 = _mm_loadu_si128(bi + i + 3);
 
-               t1 = _mm_xor_si128(state, k0);
+               t1 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
-               t2 = _mm_xor_si128(state, k0);
+               t2 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
-               t3 = _mm_xor_si128(state, k0);
+               t3 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
-               t4 = _mm_xor_si128(state, k0);
+               t4 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t2 = _mm_aesenc_si128(t2, k1);
-               t3 = _mm_aesenc_si128(t3, k1);
-               t4 = _mm_aesenc_si128(t4, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t2 = _mm_aesenc_si128(t2, k2);
-               t3 = _mm_aesenc_si128(t3, k2);
-               t4 = _mm_aesenc_si128(t4, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t2 = _mm_aesenc_si128(t2, k3);
-               t3 = _mm_aesenc_si128(t3, k3);
-               t4 = _mm_aesenc_si128(t4, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t2 = _mm_aesenc_si128(t2, k4);
-               t3 = _mm_aesenc_si128(t3, k4);
-               t4 = _mm_aesenc_si128(t4, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t2 = _mm_aesenc_si128(t2, k5);
-               t3 = _mm_aesenc_si128(t3, k5);
-               t4 = _mm_aesenc_si128(t4, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t2 = _mm_aesenc_si128(t2, k6);
-               t3 = _mm_aesenc_si128(t3, k6);
-               t4 = _mm_aesenc_si128(t4, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t2 = _mm_aesenc_si128(t2, k7);
-               t3 = _mm_aesenc_si128(t3, k7);
-               t4 = _mm_aesenc_si128(t4, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t2 = _mm_aesenc_si128(t2, k8);
-               t3 = _mm_aesenc_si128(t3, k8);
-               t4 = _mm_aesenc_si128(t4, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t2 = _mm_aesenc_si128(t2, k9);
-               t3 = _mm_aesenc_si128(t3, k9);
-               t4 = _mm_aesenc_si128(t4, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t2 = _mm_aesenc_si128(t2, k10);
-               t3 = _mm_aesenc_si128(t3, k10);
-               t4 = _mm_aesenc_si128(t4, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t2 = _mm_aesenc_si128(t2, k11);
-               t3 = _mm_aesenc_si128(t3, k11);
-               t4 = _mm_aesenc_si128(t4, k11);
-               t1 = _mm_aesenc_si128(t1, k12);
-               t2 = _mm_aesenc_si128(t2, k12);
-               t3 = _mm_aesenc_si128(t3, k12);
-               t4 = _mm_aesenc_si128(t4, k12);
-               t1 = _mm_aesenc_si128(t1, k13);
-               t2 = _mm_aesenc_si128(t2, k13);
-               t3 = _mm_aesenc_si128(t3, k13);
-               t4 = _mm_aesenc_si128(t4, k13);
-
-               t1 = _mm_aesenclast_si128(t1, k14);
-               t2 = _mm_aesenclast_si128(t2, k14);
-               t3 = _mm_aesenclast_si128(t3, k14);
-               t4 = _mm_aesenclast_si128(t4, k14);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t2 = _mm_aesenc_si128(t2, ks[10]);
+               t3 = _mm_aesenc_si128(t3, ks[10]);
+               t4 = _mm_aesenc_si128(t4, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t2 = _mm_aesenc_si128(t2, ks[11]);
+               t3 = _mm_aesenc_si128(t3, ks[11]);
+               t4 = _mm_aesenc_si128(t4, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t2 = _mm_aesenc_si128(t2, ks[12]);
+               t3 = _mm_aesenc_si128(t3, ks[12]);
+               t4 = _mm_aesenc_si128(t4, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+               t2 = _mm_aesenc_si128(t2, ks[13]);
+               t3 = _mm_aesenc_si128(t3, ks[13]);
+               t4 = _mm_aesenc_si128(t4, ks[13]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
+               t2 = _mm_aesenclast_si128(t2, ks[14]);
+               t3 = _mm_aesenclast_si128(t3, ks[14]);
+               t4 = _mm_aesenclast_si128(t4, ks[14]);
                t1 = _mm_xor_si128(t1, d1);
                t2 = _mm_xor_si128(t2, d2);
                t3 = _mm_xor_si128(t3, d3);
@@ -509,24 +470,24 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               t1 = _mm_xor_si128(state, k0);
+               t1 = _mm_xor_si128(state, ks[0]);
                state = increment_be(state);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t1 = _mm_aesenc_si128(t1, k12);
-               t1 = _mm_aesenc_si128(t1, k13);
-
-               t1 = _mm_aesenclast_si128(t1, k14);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
        }
@@ -537,23 +498,23 @@ static void encrypt_ctr256(private_aesni_ctr_t *this,
                memcpy(&b, bi + blocks, rem);
 
                d1 = _mm_loadu_si128(&b);
-               t1 = _mm_xor_si128(state, k0);
-
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t1 = _mm_aesenc_si128(t1, k12);
-               t1 = _mm_aesenc_si128(t1, k13);
-
-               t1 = _mm_aesenclast_si128(t1, k14);
+               t1 = _mm_xor_si128(state, ks[0]);
+
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(&b, t1);
 
index 6296ad2..53c0b14 100644 (file)
@@ -327,15 +327,16 @@ static __m128i icv_tailer(private_aesni_gcm_t *this, __m128i y,
 static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j,
                                          u_char *icv)
 {
-       __m128i t, b;
+       __m128i *ks, t, b;
        u_int round;
 
-       t = _mm_xor_si128(j, this->key->schedule[0]);
+       ks = this->key->schedule;
+       t = _mm_xor_si128(j, ks[0]);
        for (round = 1; round < this->key->rounds; round++)
        {
-               t = _mm_aesenc_si128(t, this->key->schedule[round]);
+               t = _mm_aesenc_si128(t, ks[round]);
        }
-       t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+       t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
 
        t = _mm_xor_si128(y, t);
 
@@ -375,18 +376,19 @@ static inline __m128i create_j(private_aesni_gcm_t *this, u_char *iv)
 static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
                                                           void *in, void *out, __m128i cb, __m128i y)
 {
-       __m128i t, b;
+       __m128i *ks, t, b;
        u_int round;
 
        memset(&b, 0, sizeof(b));
        memcpy(&b, in, rem);
 
-       t = _mm_xor_si128(cb, this->key->schedule[0]);
+       ks = this->key->schedule;
+       t = _mm_xor_si128(cb, ks[0]);
        for (round = 1; round < this->key->rounds; round++)
        {
-               t = _mm_aesenc_si128(t, this->key->schedule[round]);
+               t = _mm_aesenc_si128(t, ks[round]);
        }
-       t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+       t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
        b = _mm_xor_si128(t, b);
 
        memcpy(out, &b, rem);
@@ -401,7 +403,7 @@ static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
 static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
                                                           void *in, void *out, __m128i cb, __m128i y)
 {
-       __m128i t, b;
+       __m128i *ks, t, b;
        u_int round;
 
        memset(&b, 0, sizeof(b));
@@ -409,12 +411,13 @@ static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
 
        y = ghash(this->h, y, b);
 
-       t = _mm_xor_si128(cb, this->key->schedule[0]);
+       ks = this->key->schedule;
+       t = _mm_xor_si128(cb, ks[0]);
        for (round = 1; round < this->key->rounds; round++)
        {
-               t = _mm_aesenc_si128(t, this->key->schedule[round]);
+               t = _mm_aesenc_si128(t, ks[round]);
        }
-       t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+       t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
        b = _mm_xor_si128(t, b);
 
        memcpy(out, &b, rem);
@@ -429,9 +432,8 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
-       __m128i y, j, cb, *bi, *bo;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i *ks, y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
        j = create_j(this, iv);
@@ -443,22 +445,7 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       h1 = this->hhhh;
-       h2 = this->hhh;
-       h3 = this->hh;
-       h4 = this->h;
-
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
+       ks = this->key->schedule;
 
        for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
        {
@@ -467,56 +454,56 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
                d3 = _mm_loadu_si128(bi + i + 2);
                d4 = _mm_loadu_si128(bi + i + 3);
 
-               t1 = _mm_xor_si128(cb, k0);
+               t1 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t2 = _mm_xor_si128(cb, k0);
+               t2 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t3 = _mm_xor_si128(cb, k0);
+               t3 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t4 = _mm_xor_si128(cb, k0);
+               t4 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t2 = _mm_aesenc_si128(t2, k1);
-               t3 = _mm_aesenc_si128(t3, k1);
-               t4 = _mm_aesenc_si128(t4, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t2 = _mm_aesenc_si128(t2, k2);
-               t3 = _mm_aesenc_si128(t3, k2);
-               t4 = _mm_aesenc_si128(t4, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t2 = _mm_aesenc_si128(t2, k3);
-               t3 = _mm_aesenc_si128(t3, k3);
-               t4 = _mm_aesenc_si128(t4, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t2 = _mm_aesenc_si128(t2, k4);
-               t3 = _mm_aesenc_si128(t3, k4);
-               t4 = _mm_aesenc_si128(t4, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t2 = _mm_aesenc_si128(t2, k5);
-               t3 = _mm_aesenc_si128(t3, k5);
-               t4 = _mm_aesenc_si128(t4, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t2 = _mm_aesenc_si128(t2, k6);
-               t3 = _mm_aesenc_si128(t3, k6);
-               t4 = _mm_aesenc_si128(t4, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t2 = _mm_aesenc_si128(t2, k7);
-               t3 = _mm_aesenc_si128(t3, k7);
-               t4 = _mm_aesenc_si128(t4, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t2 = _mm_aesenc_si128(t2, k8);
-               t3 = _mm_aesenc_si128(t3, k8);
-               t4 = _mm_aesenc_si128(t4, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t2 = _mm_aesenc_si128(t2, k9);
-               t3 = _mm_aesenc_si128(t3, k9);
-               t4 = _mm_aesenc_si128(t4, k9);
-
-               t1 = _mm_aesenclast_si128(t1, k10);
-               t2 = _mm_aesenclast_si128(t2, k10);
-               t3 = _mm_aesenclast_si128(t3, k10);
-               t4 = _mm_aesenclast_si128(t4, k10);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
+               t2 = _mm_aesenclast_si128(t2, ks[10]);
+               t3 = _mm_aesenclast_si128(t3, ks[10]);
+               t4 = _mm_aesenclast_si128(t4, ks[10]);
 
                t1 = _mm_xor_si128(t1, d1);
                t2 = _mm_xor_si128(t2, d2);
@@ -524,7 +511,7 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
                t4 = _mm_xor_si128(t4, d4);
 
                y = _mm_xor_si128(y, t1);
-               y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+               y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
 
                _mm_storeu_si128(bo + i + 0, t1);
                _mm_storeu_si128(bo + i + 1, t2);
@@ -536,22 +523,22 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               t1 = _mm_xor_si128(cb, k0);
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenclast_si128(t1, k10);
+               t1 = _mm_xor_si128(cb, ks[0]);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
 
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
 
-               y = ghash(h4, y, t1);
+               y = ghash(this->h, y, t1);
 
                cb = increment_be(cb);
        }
@@ -571,9 +558,8 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
-       __m128i y, j, cb, *bi, *bo;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i *ks, y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
        j = create_j(this, iv);
@@ -585,22 +571,7 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       h1 = this->hhhh;
-       h2 = this->hhh;
-       h3 = this->hh;
-       h4 = this->h;
-
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
+       ks = this->key->schedule;
 
        for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
        {
@@ -610,58 +581,58 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
                d4 = _mm_loadu_si128(bi + i + 3);
 
                y = _mm_xor_si128(y, d1);
-               y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+               y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
 
-               t1 = _mm_xor_si128(cb, k0);
+               t1 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t2 = _mm_xor_si128(cb, k0);
+               t2 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t3 = _mm_xor_si128(cb, k0);
+               t3 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t4 = _mm_xor_si128(cb, k0);
+               t4 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t2 = _mm_aesenc_si128(t2, k1);
-               t3 = _mm_aesenc_si128(t3, k1);
-               t4 = _mm_aesenc_si128(t4, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t2 = _mm_aesenc_si128(t2, k2);
-               t3 = _mm_aesenc_si128(t3, k2);
-               t4 = _mm_aesenc_si128(t4, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t2 = _mm_aesenc_si128(t2, k3);
-               t3 = _mm_aesenc_si128(t3, k3);
-               t4 = _mm_aesenc_si128(t4, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t2 = _mm_aesenc_si128(t2, k4);
-               t3 = _mm_aesenc_si128(t3, k4);
-               t4 = _mm_aesenc_si128(t4, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t2 = _mm_aesenc_si128(t2, k5);
-               t3 = _mm_aesenc_si128(t3, k5);
-               t4 = _mm_aesenc_si128(t4, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t2 = _mm_aesenc_si128(t2, k6);
-               t3 = _mm_aesenc_si128(t3, k6);
-               t4 = _mm_aesenc_si128(t4, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t2 = _mm_aesenc_si128(t2, k7);
-               t3 = _mm_aesenc_si128(t3, k7);
-               t4 = _mm_aesenc_si128(t4, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t2 = _mm_aesenc_si128(t2, k8);
-               t3 = _mm_aesenc_si128(t3, k8);
-               t4 = _mm_aesenc_si128(t4, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t2 = _mm_aesenc_si128(t2, k9);
-               t3 = _mm_aesenc_si128(t3, k9);
-               t4 = _mm_aesenc_si128(t4, k9);
-
-               t1 = _mm_aesenclast_si128(t1, k10);
-               t2 = _mm_aesenclast_si128(t2, k10);
-               t3 = _mm_aesenclast_si128(t3, k10);
-               t4 = _mm_aesenclast_si128(t4, k10);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
+               t2 = _mm_aesenclast_si128(t2, ks[10]);
+               t3 = _mm_aesenclast_si128(t3, ks[10]);
+               t4 = _mm_aesenclast_si128(t4, ks[10]);
 
                t1 = _mm_xor_si128(t1, d1);
                t2 = _mm_xor_si128(t2, d2);
@@ -678,19 +649,19 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               y = ghash(h4, y, d1);
+               y = ghash(this->h, y, d1);
 
-               t1 = _mm_xor_si128(cb, k0);
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenclast_si128(t1, k10);
+               t1 = _mm_xor_si128(cb, ks[0]);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenclast_si128(t1, ks[10]);
 
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
@@ -713,9 +684,8 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
-       __m128i y, j, cb, *bi, *bo;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i *ks, y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
        j = create_j(this, iv);
@@ -727,24 +697,7 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       h1 = this->hhhh;
-       h2 = this->hhh;
-       h3 = this->hh;
-       h4 = this->h;
-
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
+       ks = this->key->schedule;
 
        for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
        {
@@ -753,64 +706,64 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
                d3 = _mm_loadu_si128(bi + i + 2);
                d4 = _mm_loadu_si128(bi + i + 3);
 
-               t1 = _mm_xor_si128(cb, k0);
+               t1 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t2 = _mm_xor_si128(cb, k0);
+               t2 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t3 = _mm_xor_si128(cb, k0);
+               t3 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t4 = _mm_xor_si128(cb, k0);
+               t4 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t2 = _mm_aesenc_si128(t2, k1);
-               t3 = _mm_aesenc_si128(t3, k1);
-               t4 = _mm_aesenc_si128(t4, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t2 = _mm_aesenc_si128(t2, k2);
-               t3 = _mm_aesenc_si128(t3, k2);
-               t4 = _mm_aesenc_si128(t4, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t2 = _mm_aesenc_si128(t2, k3);
-               t3 = _mm_aesenc_si128(t3, k3);
-               t4 = _mm_aesenc_si128(t4, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t2 = _mm_aesenc_si128(t2, k4);
-               t3 = _mm_aesenc_si128(t3, k4);
-               t4 = _mm_aesenc_si128(t4, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t2 = _mm_aesenc_si128(t2, k5);
-               t3 = _mm_aesenc_si128(t3, k5);
-               t4 = _mm_aesenc_si128(t4, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t2 = _mm_aesenc_si128(t2, k6);
-               t3 = _mm_aesenc_si128(t3, k6);
-               t4 = _mm_aesenc_si128(t4, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t2 = _mm_aesenc_si128(t2, k7);
-               t3 = _mm_aesenc_si128(t3, k7);
-               t4 = _mm_aesenc_si128(t4, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t2 = _mm_aesenc_si128(t2, k8);
-               t3 = _mm_aesenc_si128(t3, k8);
-               t4 = _mm_aesenc_si128(t4, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t2 = _mm_aesenc_si128(t2, k9);
-               t3 = _mm_aesenc_si128(t3, k9);
-               t4 = _mm_aesenc_si128(t4, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t2 = _mm_aesenc_si128(t2, k10);
-               t3 = _mm_aesenc_si128(t3, k10);
-               t4 = _mm_aesenc_si128(t4, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t2 = _mm_aesenc_si128(t2, k11);
-               t3 = _mm_aesenc_si128(t3, k11);
-               t4 = _mm_aesenc_si128(t4, k11);
-
-               t1 = _mm_aesenclast_si128(t1, k12);
-               t2 = _mm_aesenclast_si128(t2, k12);
-               t3 = _mm_aesenclast_si128(t3, k12);
-               t4 = _mm_aesenclast_si128(t4, k12);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t2 = _mm_aesenc_si128(t2, ks[10]);
+               t3 = _mm_aesenc_si128(t3, ks[10]);
+               t4 = _mm_aesenc_si128(t4, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t2 = _mm_aesenc_si128(t2, ks[11]);
+               t3 = _mm_aesenc_si128(t3, ks[11]);
+               t4 = _mm_aesenc_si128(t4, ks[11]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
+               t2 = _mm_aesenclast_si128(t2, ks[12]);
+               t3 = _mm_aesenclast_si128(t3, ks[12]);
+               t4 = _mm_aesenclast_si128(t4, ks[12]);
 
                t1 = _mm_xor_si128(t1, d1);
                t2 = _mm_xor_si128(t2, d2);
@@ -818,7 +771,7 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
                t4 = _mm_xor_si128(t4, d4);
 
                y = _mm_xor_si128(y, t1);
-               y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+               y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
 
                _mm_storeu_si128(bo + i + 0, t1);
                _mm_storeu_si128(bo + i + 1, t2);
@@ -830,24 +783,24 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               t1 = _mm_xor_si128(cb, k0);
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t1 = _mm_aesenclast_si128(t1, k12);
+               t1 = _mm_xor_si128(cb, ks[0]);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
 
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
 
-               y = ghash(h4, y, t1);
+               y = ghash(this->h, y, t1);
 
                cb = increment_be(cb);
        }
@@ -867,9 +820,8 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
-       __m128i y, j, cb, *bi, *bo;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i *ks, y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
        j = create_j(this, iv);
@@ -881,24 +833,7 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       h1 = this->hhhh;
-       h2 = this->hhh;
-       h3 = this->hh;
-       h4 = this->h;
-
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
+       ks = this->key->schedule;
 
        for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
        {
@@ -908,66 +843,66 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
                d4 = _mm_loadu_si128(bi + i + 3);
 
                y = _mm_xor_si128(y, d1);
-               y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+               y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
 
-               t1 = _mm_xor_si128(cb, k0);
+               t1 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t2 = _mm_xor_si128(cb, k0);
+               t2 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t3 = _mm_xor_si128(cb, k0);
+               t3 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t4 = _mm_xor_si128(cb, k0);
+               t4 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t2 = _mm_aesenc_si128(t2, k1);
-               t3 = _mm_aesenc_si128(t3, k1);
-               t4 = _mm_aesenc_si128(t4, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t2 = _mm_aesenc_si128(t2, k2);
-               t3 = _mm_aesenc_si128(t3, k2);
-               t4 = _mm_aesenc_si128(t4, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t2 = _mm_aesenc_si128(t2, k3);
-               t3 = _mm_aesenc_si128(t3, k3);
-               t4 = _mm_aesenc_si128(t4, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t2 = _mm_aesenc_si128(t2, k4);
-               t3 = _mm_aesenc_si128(t3, k4);
-               t4 = _mm_aesenc_si128(t4, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t2 = _mm_aesenc_si128(t2, k5);
-               t3 = _mm_aesenc_si128(t3, k5);
-               t4 = _mm_aesenc_si128(t4, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t2 = _mm_aesenc_si128(t2, k6);
-               t3 = _mm_aesenc_si128(t3, k6);
-               t4 = _mm_aesenc_si128(t4, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t2 = _mm_aesenc_si128(t2, k7);
-               t3 = _mm_aesenc_si128(t3, k7);
-               t4 = _mm_aesenc_si128(t4, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t2 = _mm_aesenc_si128(t2, k8);
-               t3 = _mm_aesenc_si128(t3, k8);
-               t4 = _mm_aesenc_si128(t4, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t2 = _mm_aesenc_si128(t2, k9);
-               t3 = _mm_aesenc_si128(t3, k9);
-               t4 = _mm_aesenc_si128(t4, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t2 = _mm_aesenc_si128(t2, k10);
-               t3 = _mm_aesenc_si128(t3, k10);
-               t4 = _mm_aesenc_si128(t4, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t2 = _mm_aesenc_si128(t2, k11);
-               t3 = _mm_aesenc_si128(t3, k11);
-               t4 = _mm_aesenc_si128(t4, k11);
-
-               t1 = _mm_aesenclast_si128(t1, k12);
-               t2 = _mm_aesenclast_si128(t2, k12);
-               t3 = _mm_aesenclast_si128(t3, k12);
-               t4 = _mm_aesenclast_si128(t4, k12);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t2 = _mm_aesenc_si128(t2, ks[10]);
+               t3 = _mm_aesenc_si128(t3, ks[10]);
+               t4 = _mm_aesenc_si128(t4, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t2 = _mm_aesenc_si128(t2, ks[11]);
+               t3 = _mm_aesenc_si128(t3, ks[11]);
+               t4 = _mm_aesenc_si128(t4, ks[11]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
+               t2 = _mm_aesenclast_si128(t2, ks[12]);
+               t3 = _mm_aesenclast_si128(t3, ks[12]);
+               t4 = _mm_aesenclast_si128(t4, ks[12]);
 
                t1 = _mm_xor_si128(t1, d1);
                t2 = _mm_xor_si128(t2, d2);
@@ -984,21 +919,21 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               y = ghash(h4, y, d1);
-
-               t1 = _mm_xor_si128(cb, k0);
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t1 = _mm_aesenclast_si128(t1, k12);
+               y = ghash(this->h, y, d1);
+
+               t1 = _mm_xor_si128(cb, ks[0]);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t1 = _mm_aesenclast_si128(t1, ks[12]);
 
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
@@ -1021,9 +956,8 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
-       __m128i y, j, cb, *bi, *bo;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i *ks, y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
        j = create_j(this, iv);
@@ -1035,26 +969,7 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       h1 = this->hhhh;
-       h2 = this->hhh;
-       h3 = this->hh;
-       h4 = this->h;
-
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
-       k13 = this->key->schedule[13];
-       k14 = this->key->schedule[14];
+       ks = this->key->schedule;
 
        for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
        {
@@ -1063,72 +978,72 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
                d3 = _mm_loadu_si128(bi + i + 2);
                d4 = _mm_loadu_si128(bi + i + 3);
 
-               t1 = _mm_xor_si128(cb, k0);
+               t1 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t2 = _mm_xor_si128(cb, k0);
+               t2 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t3 = _mm_xor_si128(cb, k0);
+               t3 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t4 = _mm_xor_si128(cb, k0);
+               t4 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t2 = _mm_aesenc_si128(t2, k1);
-               t3 = _mm_aesenc_si128(t3, k1);
-               t4 = _mm_aesenc_si128(t4, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t2 = _mm_aesenc_si128(t2, k2);
-               t3 = _mm_aesenc_si128(t3, k2);
-               t4 = _mm_aesenc_si128(t4, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t2 = _mm_aesenc_si128(t2, k3);
-               t3 = _mm_aesenc_si128(t3, k3);
-               t4 = _mm_aesenc_si128(t4, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t2 = _mm_aesenc_si128(t2, k4);
-               t3 = _mm_aesenc_si128(t3, k4);
-               t4 = _mm_aesenc_si128(t4, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t2 = _mm_aesenc_si128(t2, k5);
-               t3 = _mm_aesenc_si128(t3, k5);
-               t4 = _mm_aesenc_si128(t4, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t2 = _mm_aesenc_si128(t2, k6);
-               t3 = _mm_aesenc_si128(t3, k6);
-               t4 = _mm_aesenc_si128(t4, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t2 = _mm_aesenc_si128(t2, k7);
-               t3 = _mm_aesenc_si128(t3, k7);
-               t4 = _mm_aesenc_si128(t4, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t2 = _mm_aesenc_si128(t2, k8);
-               t3 = _mm_aesenc_si128(t3, k8);
-               t4 = _mm_aesenc_si128(t4, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t2 = _mm_aesenc_si128(t2, k9);
-               t3 = _mm_aesenc_si128(t3, k9);
-               t4 = _mm_aesenc_si128(t4, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t2 = _mm_aesenc_si128(t2, k10);
-               t3 = _mm_aesenc_si128(t3, k10);
-               t4 = _mm_aesenc_si128(t4, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t2 = _mm_aesenc_si128(t2, k11);
-               t3 = _mm_aesenc_si128(t3, k11);
-               t4 = _mm_aesenc_si128(t4, k11);
-               t1 = _mm_aesenc_si128(t1, k12);
-               t2 = _mm_aesenc_si128(t2, k12);
-               t3 = _mm_aesenc_si128(t3, k12);
-               t4 = _mm_aesenc_si128(t4, k12);
-               t1 = _mm_aesenc_si128(t1, k13);
-               t2 = _mm_aesenc_si128(t2, k13);
-               t3 = _mm_aesenc_si128(t3, k13);
-               t4 = _mm_aesenc_si128(t4, k13);
-
-               t1 = _mm_aesenclast_si128(t1, k14);
-               t2 = _mm_aesenclast_si128(t2, k14);
-               t3 = _mm_aesenclast_si128(t3, k14);
-               t4 = _mm_aesenclast_si128(t4, k14);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t2 = _mm_aesenc_si128(t2, ks[10]);
+               t3 = _mm_aesenc_si128(t3, ks[10]);
+               t4 = _mm_aesenc_si128(t4, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t2 = _mm_aesenc_si128(t2, ks[11]);
+               t3 = _mm_aesenc_si128(t3, ks[11]);
+               t4 = _mm_aesenc_si128(t4, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t2 = _mm_aesenc_si128(t2, ks[12]);
+               t3 = _mm_aesenc_si128(t3, ks[12]);
+               t4 = _mm_aesenc_si128(t4, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+               t2 = _mm_aesenc_si128(t2, ks[13]);
+               t3 = _mm_aesenc_si128(t3, ks[13]);
+               t4 = _mm_aesenc_si128(t4, ks[13]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
+               t2 = _mm_aesenclast_si128(t2, ks[14]);
+               t3 = _mm_aesenclast_si128(t3, ks[14]);
+               t4 = _mm_aesenclast_si128(t4, ks[14]);
 
                t1 = _mm_xor_si128(t1, d1);
                t2 = _mm_xor_si128(t2, d2);
@@ -1136,7 +1051,7 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
                t4 = _mm_xor_si128(t4, d4);
 
                y = _mm_xor_si128(y, t1);
-               y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+               y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
 
                _mm_storeu_si128(bo + i + 0, t1);
                _mm_storeu_si128(bo + i + 1, t2);
@@ -1148,21 +1063,21 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               t1 = _mm_xor_si128(cb, k0);
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t1 = _mm_aesenc_si128(t1, k12);
-               t1 = _mm_aesenc_si128(t1, k13);
-               t1 = _mm_aesenclast_si128(t1, k14);
+               t1 = _mm_xor_si128(cb, ks[0]);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
 
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
@@ -1187,9 +1102,8 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
                                                   size_t len, u_char *in, u_char *out, u_char *iv,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
-       __m128i y, j, cb, *bi, *bo;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i *ks, y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
        j = create_j(this, iv);
@@ -1201,26 +1115,7 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
-       h1 = this->hhhh;
-       h2 = this->hhh;
-       h3 = this->hh;
-       h4 = this->h;
-
-       k0 = this->key->schedule[0];
-       k1 = this->key->schedule[1];
-       k2 = this->key->schedule[2];
-       k3 = this->key->schedule[3];
-       k4 = this->key->schedule[4];
-       k5 = this->key->schedule[5];
-       k6 = this->key->schedule[6];
-       k7 = this->key->schedule[7];
-       k8 = this->key->schedule[8];
-       k9 = this->key->schedule[9];
-       k10 = this->key->schedule[10];
-       k11 = this->key->schedule[11];
-       k12 = this->key->schedule[12];
-       k13 = this->key->schedule[13];
-       k14 = this->key->schedule[14];
+       ks = this->key->schedule;
 
        for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
        {
@@ -1230,74 +1125,74 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
                d4 = _mm_loadu_si128(bi + i + 3);
 
                y = _mm_xor_si128(y, d1);
-               y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
+               y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);
 
-               t1 = _mm_xor_si128(cb, k0);
+               t1 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t2 = _mm_xor_si128(cb, k0);
+               t2 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t3 = _mm_xor_si128(cb, k0);
+               t3 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
-               t4 = _mm_xor_si128(cb, k0);
+               t4 = _mm_xor_si128(cb, ks[0]);
                cb = increment_be(cb);
 
-               t1 = _mm_aesenc_si128(t1, k1);
-               t2 = _mm_aesenc_si128(t2, k1);
-               t3 = _mm_aesenc_si128(t3, k1);
-               t4 = _mm_aesenc_si128(t4, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t2 = _mm_aesenc_si128(t2, k2);
-               t3 = _mm_aesenc_si128(t3, k2);
-               t4 = _mm_aesenc_si128(t4, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t2 = _mm_aesenc_si128(t2, k3);
-               t3 = _mm_aesenc_si128(t3, k3);
-               t4 = _mm_aesenc_si128(t4, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t2 = _mm_aesenc_si128(t2, k4);
-               t3 = _mm_aesenc_si128(t3, k4);
-               t4 = _mm_aesenc_si128(t4, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t2 = _mm_aesenc_si128(t2, k5);
-               t3 = _mm_aesenc_si128(t3, k5);
-               t4 = _mm_aesenc_si128(t4, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t2 = _mm_aesenc_si128(t2, k6);
-               t3 = _mm_aesenc_si128(t3, k6);
-               t4 = _mm_aesenc_si128(t4, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t2 = _mm_aesenc_si128(t2, k7);
-               t3 = _mm_aesenc_si128(t3, k7);
-               t4 = _mm_aesenc_si128(t4, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t2 = _mm_aesenc_si128(t2, k8);
-               t3 = _mm_aesenc_si128(t3, k8);
-               t4 = _mm_aesenc_si128(t4, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t2 = _mm_aesenc_si128(t2, k9);
-               t3 = _mm_aesenc_si128(t3, k9);
-               t4 = _mm_aesenc_si128(t4, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t2 = _mm_aesenc_si128(t2, k10);
-               t3 = _mm_aesenc_si128(t3, k10);
-               t4 = _mm_aesenc_si128(t4, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t2 = _mm_aesenc_si128(t2, k11);
-               t3 = _mm_aesenc_si128(t3, k11);
-               t4 = _mm_aesenc_si128(t4, k11);
-               t1 = _mm_aesenc_si128(t1, k12);
-               t2 = _mm_aesenc_si128(t2, k12);
-               t3 = _mm_aesenc_si128(t3, k12);
-               t4 = _mm_aesenc_si128(t4, k12);
-               t1 = _mm_aesenc_si128(t1, k13);
-               t2 = _mm_aesenc_si128(t2, k13);
-               t3 = _mm_aesenc_si128(t3, k13);
-               t4 = _mm_aesenc_si128(t4, k13);
-
-               t1 = _mm_aesenclast_si128(t1, k14);
-               t2 = _mm_aesenclast_si128(t2, k14);
-               t3 = _mm_aesenclast_si128(t3, k14);
-               t4 = _mm_aesenclast_si128(t4, k14);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t2 = _mm_aesenc_si128(t2, ks[1]);
+               t3 = _mm_aesenc_si128(t3, ks[1]);
+               t4 = _mm_aesenc_si128(t4, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t2 = _mm_aesenc_si128(t2, ks[2]);
+               t3 = _mm_aesenc_si128(t3, ks[2]);
+               t4 = _mm_aesenc_si128(t4, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t2 = _mm_aesenc_si128(t2, ks[3]);
+               t3 = _mm_aesenc_si128(t3, ks[3]);
+               t4 = _mm_aesenc_si128(t4, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t2 = _mm_aesenc_si128(t2, ks[4]);
+               t3 = _mm_aesenc_si128(t3, ks[4]);
+               t4 = _mm_aesenc_si128(t4, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t2 = _mm_aesenc_si128(t2, ks[5]);
+               t3 = _mm_aesenc_si128(t3, ks[5]);
+               t4 = _mm_aesenc_si128(t4, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t2 = _mm_aesenc_si128(t2, ks[6]);
+               t3 = _mm_aesenc_si128(t3, ks[6]);
+               t4 = _mm_aesenc_si128(t4, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t2 = _mm_aesenc_si128(t2, ks[7]);
+               t3 = _mm_aesenc_si128(t3, ks[7]);
+               t4 = _mm_aesenc_si128(t4, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t2 = _mm_aesenc_si128(t2, ks[8]);
+               t3 = _mm_aesenc_si128(t3, ks[8]);
+               t4 = _mm_aesenc_si128(t4, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t2 = _mm_aesenc_si128(t2, ks[9]);
+               t3 = _mm_aesenc_si128(t3, ks[9]);
+               t4 = _mm_aesenc_si128(t4, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t2 = _mm_aesenc_si128(t2, ks[10]);
+               t3 = _mm_aesenc_si128(t3, ks[10]);
+               t4 = _mm_aesenc_si128(t4, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t2 = _mm_aesenc_si128(t2, ks[11]);
+               t3 = _mm_aesenc_si128(t3, ks[11]);
+               t4 = _mm_aesenc_si128(t4, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t2 = _mm_aesenc_si128(t2, ks[12]);
+               t3 = _mm_aesenc_si128(t3, ks[12]);
+               t4 = _mm_aesenc_si128(t4, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+               t2 = _mm_aesenc_si128(t2, ks[13]);
+               t3 = _mm_aesenc_si128(t3, ks[13]);
+               t4 = _mm_aesenc_si128(t4, ks[13]);
+
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
+               t2 = _mm_aesenclast_si128(t2, ks[14]);
+               t3 = _mm_aesenclast_si128(t3, ks[14]);
+               t4 = _mm_aesenclast_si128(t4, ks[14]);
 
                t1 = _mm_xor_si128(t1, d1);
                t2 = _mm_xor_si128(t2, d2);
@@ -1314,23 +1209,23 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               y = ghash(h4, y, d1);
-
-               t1 = _mm_xor_si128(cb, k0);
-               t1 = _mm_aesenc_si128(t1, k1);
-               t1 = _mm_aesenc_si128(t1, k2);
-               t1 = _mm_aesenc_si128(t1, k3);
-               t1 = _mm_aesenc_si128(t1, k4);
-               t1 = _mm_aesenc_si128(t1, k5);
-               t1 = _mm_aesenc_si128(t1, k6);
-               t1 = _mm_aesenc_si128(t1, k7);
-               t1 = _mm_aesenc_si128(t1, k8);
-               t1 = _mm_aesenc_si128(t1, k9);
-               t1 = _mm_aesenc_si128(t1, k10);
-               t1 = _mm_aesenc_si128(t1, k11);
-               t1 = _mm_aesenc_si128(t1, k12);
-               t1 = _mm_aesenc_si128(t1, k13);
-               t1 = _mm_aesenclast_si128(t1, k14);
+               y = ghash(this->h, y, d1);
+
+               t1 = _mm_xor_si128(cb, ks[0]);
+               t1 = _mm_aesenc_si128(t1, ks[1]);
+               t1 = _mm_aesenc_si128(t1, ks[2]);
+               t1 = _mm_aesenc_si128(t1, ks[3]);
+               t1 = _mm_aesenc_si128(t1, ks[4]);
+               t1 = _mm_aesenc_si128(t1, ks[5]);
+               t1 = _mm_aesenc_si128(t1, ks[6]);
+               t1 = _mm_aesenc_si128(t1, ks[7]);
+               t1 = _mm_aesenc_si128(t1, ks[8]);
+               t1 = _mm_aesenc_si128(t1, ks[9]);
+               t1 = _mm_aesenc_si128(t1, ks[10]);
+               t1 = _mm_aesenc_si128(t1, ks[11]);
+               t1 = _mm_aesenc_si128(t1, ks[12]);
+               t1 = _mm_aesenc_si128(t1, ks[13]);
+               t1 = _mm_aesenclast_si128(t1, ks[14]);
 
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
@@ -1423,7 +1318,7 @@ METHOD(aead_t, set_key, bool,
        private_aesni_gcm_t *this, chunk_t key)
 {
        u_int round;
-       __m128i h;
+       __m128i *ks, h;
 
        if (key.len != this->key_size + SALT_SIZE)
        {
@@ -1436,12 +1331,13 @@ METHOD(aead_t, set_key, bool,
        DESTROY_IF(this->key);
        this->key = aesni_key_create(TRUE, key);
 
-       h = _mm_xor_si128(_mm_setzero_si128(), this->key->schedule[0]);
+       ks = this->key->schedule;
+       h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
        for (round = 1; round < this->key->rounds; round++)
        {
-               h = _mm_aesenc_si128(h, this->key->schedule[round]);
+               h = _mm_aesenc_si128(h, ks[round]);
        }
-       h = _mm_aesenclast_si128(h, this->key->schedule[this->key->rounds]);
+       h = _mm_aesenclast_si128(h, ks[this->key->rounds]);
 
        this->h = h;
        h = swap128(h);
index b2e8cd5..24a75ce 100644 (file)
@@ -72,8 +72,7 @@ struct private_aesni_mac_t {
 METHOD(mac_t, get_mac, bool,
        private_aesni_mac_t *this, chunk_t data, u_int8_t *out)
 {
-       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i e, *bi;
+       __m128i *ks, e, *bi;
        u_int blocks, rem, i;
 
        if (!this->k1)
@@ -81,17 +80,7 @@ METHOD(mac_t, get_mac, bool,
                return FALSE;
        }
 
-       k0 = this->k1->schedule[0];
-       k1 = this->k1->schedule[1];
-       k2 = this->k1->schedule[2];
-       k3 = this->k1->schedule[3];
-       k4 = this->k1->schedule[4];
-       k5 = this->k1->schedule[5];
-       k6 = this->k1->schedule[6];
-       k7 = this->k1->schedule[7];
-       k8 = this->k1->schedule[8];
-       k9 = this->k1->schedule[9];
-       k10 = this->k1->schedule[10];
+       ks = this->k1->schedule;
 
        e = this->e;
 
@@ -114,17 +103,17 @@ METHOD(mac_t, get_mac, bool,
 
                e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
 
-               e = _mm_xor_si128(e, k0);
-               e = _mm_aesenc_si128(e, k1);
-               e = _mm_aesenc_si128(e, k2);
-               e = _mm_aesenc_si128(e, k3);
-               e = _mm_aesenc_si128(e, k4);
-               e = _mm_aesenc_si128(e, k5);
-               e = _mm_aesenc_si128(e, k6);
-               e = _mm_aesenc_si128(e, k7);
-               e = _mm_aesenc_si128(e, k8);
-               e = _mm_aesenc_si128(e, k9);
-               e = _mm_aesenclast_si128(e, k10);
+               e = _mm_xor_si128(e, ks[0]);
+               e = _mm_aesenc_si128(e, ks[1]);
+               e = _mm_aesenc_si128(e, ks[2]);
+               e = _mm_aesenc_si128(e, ks[3]);
+               e = _mm_aesenc_si128(e, ks[4]);
+               e = _mm_aesenc_si128(e, ks[5]);
+               e = _mm_aesenc_si128(e, ks[6]);
+               e = _mm_aesenc_si128(e, ks[7]);
+               e = _mm_aesenc_si128(e, ks[8]);
+               e = _mm_aesenc_si128(e, ks[9]);
+               e = _mm_aesenclast_si128(e, ks[10]);
 
                bi = (__m128i*)data.ptr;
                rem = data.len % AES_BLOCK_SIZE;
@@ -140,17 +129,17 @@ METHOD(mac_t, get_mac, bool,
                {
                        e = _mm_xor_si128(e, _mm_loadu_si128(bi + i));
 
-                       e = _mm_xor_si128(e, k0);
-                       e = _mm_aesenc_si128(e, k1);
-                       e = _mm_aesenc_si128(e, k2);
-                       e = _mm_aesenc_si128(e, k3);
-                       e = _mm_aesenc_si128(e, k4);
-                       e = _mm_aesenc_si128(e, k5);
-                       e = _mm_aesenc_si128(e, k6);
-                       e = _mm_aesenc_si128(e, k7);
-                       e = _mm_aesenc_si128(e, k8);
-                       e = _mm_aesenc_si128(e, k9);
-                       e = _mm_aesenclast_si128(e, k10);
+                       e = _mm_xor_si128(e, ks[0]);
+                       e = _mm_aesenc_si128(e, ks[1]);
+                       e = _mm_aesenc_si128(e, ks[2]);
+                       e = _mm_aesenc_si128(e, ks[3]);
+                       e = _mm_aesenc_si128(e, ks[4]);
+                       e = _mm_aesenc_si128(e, ks[5]);
+                       e = _mm_aesenc_si128(e, ks[6]);
+                       e = _mm_aesenc_si128(e, ks[7]);
+                       e = _mm_aesenc_si128(e, ks[8]);
+                       e = _mm_aesenc_si128(e, ks[9]);
+                       e = _mm_aesenclast_si128(e, ks[10]);
                }
 
                /* store remaining bytes of block M[n] */
@@ -196,17 +185,17 @@ METHOD(mac_t, get_mac, bool,
                }
                e = _mm_xor_si128(e, _mm_loadu_si128((__m128i*)this->rem));
 
-               e = _mm_xor_si128(e, k0);
-               e = _mm_aesenc_si128(e, k1);
-               e = _mm_aesenc_si128(e, k2);
-               e = _mm_aesenc_si128(e, k3);
-               e = _mm_aesenc_si128(e, k4);
-               e = _mm_aesenc_si128(e, k5);
-               e = _mm_aesenc_si128(e, k6);
-               e = _mm_aesenc_si128(e, k7);
-               e = _mm_aesenc_si128(e, k8);
-               e = _mm_aesenc_si128(e, k9);
-               e = _mm_aesenclast_si128(e, k10);
+               e = _mm_xor_si128(e, ks[0]);
+               e = _mm_aesenc_si128(e, ks[1]);
+               e = _mm_aesenc_si128(e, ks[2]);
+               e = _mm_aesenc_si128(e, ks[3]);
+               e = _mm_aesenc_si128(e, ks[4]);
+               e = _mm_aesenc_si128(e, ks[5]);
+               e = _mm_aesenc_si128(e, ks[6]);
+               e = _mm_aesenc_si128(e, ks[7]);
+               e = _mm_aesenc_si128(e, ks[8]);
+               e = _mm_aesenc_si128(e, ks[9]);
+               e = _mm_aesenclast_si128(e, ks[10]);
                _mm_storeu_si128((__m128i*)out, e);
 
                /* (2) Define E[0] = 0x00000000000000000000000000000000 */