aesni: Calculate GHASH for 4 blocks of encryption data in parallel
author Martin Willi <martin@revosec.ch>
Tue, 31 Mar 2015 08:00:33 +0000 (10:00 +0200)
committer Martin Willi <martin@revosec.ch>
Wed, 15 Apr 2015 09:35:28 +0000 (11:35 +0200)
Increases performance by another ~30%.

src/libstrongswan/plugins/aesni/aesni_gcm.c

index 4e6ea8f..cdfd2b0 100644 (file)
@@ -86,6 +86,21 @@ struct private_aesni_gcm_t {
         * GHASH subkey H, big-endian
         */
        __m128i h;
+
+       /**
+        * GHASH key H^2, big-endian
+        */
+       __m128i hh;
+
+       /**
+        * GHASH key H^3, big-endian
+        */
+       __m128i hhh;
+
+       /**
+        * GHASH key H^4, big-endian
+        */
+       __m128i hhhh;
 };
 
 /**
@@ -100,7 +115,7 @@ static inline __m128i swap128(__m128i x)
 /**
  * Multiply two blocks in GF128
  */
-static inline __m128i mult_block(__m128i h, __m128i y)
+static __m128i mult_block(__m128i h, __m128i y)
 {
        __m128i t1, t2, t3, t4, t5, t6;
 
@@ -152,6 +167,96 @@ static inline __m128i mult_block(__m128i h, __m128i y)
 }
 
 /**
+ * Multiply four consecutive blocks by their respective GHASH key, XOR
+ */
+static inline __m128i mult4xor(__m128i h1, __m128i h2, __m128i h3, __m128i h4,
+                                                          __m128i d1, __m128i d2, __m128i d3, __m128i d4)
+{
+       __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+
+       d1 = swap128(d1);
+       d2 = swap128(d2);
+       d3 = swap128(d3);
+       d4 = swap128(d4);
+
+       t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
+       t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
+       t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
+       t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
+       t8 = _mm_xor_si128(t0, t1);
+       t8 = _mm_xor_si128(t8, t2);
+       t8 = _mm_xor_si128(t8, t3);
+
+       t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
+       t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
+       t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
+       t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
+       t9 = _mm_xor_si128(t4, t5);
+       t9 = _mm_xor_si128(t9, t6);
+       t9 = _mm_xor_si128(t9, t7);
+
+       t0 = _mm_shuffle_epi32(h1, 78);
+       t4 = _mm_shuffle_epi32(d1, 78);
+       t0 = _mm_xor_si128(t0, h1);
+       t4 = _mm_xor_si128(t4, d1);
+       t1 = _mm_shuffle_epi32(h2, 78);
+       t5 = _mm_shuffle_epi32(d2, 78);
+       t1 = _mm_xor_si128(t1, h2);
+       t5 = _mm_xor_si128(t5, d2);
+       t2 = _mm_shuffle_epi32(h3, 78);
+       t6 = _mm_shuffle_epi32(d3, 78);
+       t2 = _mm_xor_si128(t2, h3);
+       t6 = _mm_xor_si128(t6, d3);
+       t3 = _mm_shuffle_epi32(h4, 78);
+       t7 = _mm_shuffle_epi32(d4, 78);
+       t3 = _mm_xor_si128(t3, h4);
+       t7 = _mm_xor_si128(t7, d4);
+
+       t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
+       t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
+       t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
+       t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
+       t0 = _mm_xor_si128(t0, t8);
+       t0 = _mm_xor_si128(t0, t9);
+       t0 = _mm_xor_si128(t1, t0);
+       t0 = _mm_xor_si128(t2, t0);
+
+       t0 = _mm_xor_si128(t3, t0);
+       t4 = _mm_slli_si128(t0, 8);
+       t0 = _mm_srli_si128(t0, 8);
+       t3 = _mm_xor_si128(t4, t8);
+       t6 = _mm_xor_si128(t0, t9);
+       t7 = _mm_srli_epi32(t3, 31);
+       t8 = _mm_srli_epi32(t6, 31);
+       t3 = _mm_slli_epi32(t3, 1);
+       t6 = _mm_slli_epi32(t6, 1);
+       t9 = _mm_srli_si128(t7, 12);
+       t8 = _mm_slli_si128(t8, 4);
+       t7 = _mm_slli_si128(t7, 4);
+       t3 = _mm_or_si128(t3, t7);
+       t6 = _mm_or_si128(t6, t8);
+       t6 = _mm_or_si128(t6, t9);
+       t7 = _mm_slli_epi32(t3, 31);
+       t8 = _mm_slli_epi32(t3, 30);
+       t9 = _mm_slli_epi32(t3, 25);
+       t7 = _mm_xor_si128(t7, t8);
+       t7 = _mm_xor_si128(t7, t9);
+       t8 = _mm_srli_si128(t7, 4);
+       t7 = _mm_slli_si128(t7, 12);
+       t3 = _mm_xor_si128(t3, t7);
+       t2 = _mm_srli_epi32(t3, 1);
+       t4 = _mm_srli_epi32(t3, 2);
+       t5 = _mm_srli_epi32(t3, 7);
+       t2 = _mm_xor_si128(t2, t4);
+       t2 = _mm_xor_si128(t2, t5);
+       t2 = _mm_xor_si128(t2, t8);
+       t3 = _mm_xor_si128(t3, t2);
+       t6 = _mm_xor_si128(t6, t3);
+
+       return swap128(t6);
+}
+
+/**
  * GHASH on a single block
  */
 static __m128i ghash(__m128i h, __m128i y, __m128i x)
@@ -309,7 +414,7 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
        __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
        __m128i y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
@@ -322,6 +427,11 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
+       h1 = this->hhhh;
+       h2 = this->hhh;
+       h3 = this->hh;
+       h4 = this->h;
+
        k0 = this->key->schedule[0];
        k1 = this->key->schedule[1];
        k2 = this->key->schedule[2];
@@ -396,15 +506,14 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
                t2 = _mm_xor_si128(t2, d2);
                t3 = _mm_xor_si128(t3, d3);
                t4 = _mm_xor_si128(t4, d4);
+
+               y = _mm_xor_si128(y, t1);
+               y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+
                _mm_storeu_si128(bo + i + 0, t1);
                _mm_storeu_si128(bo + i + 1, t2);
                _mm_storeu_si128(bo + i + 2, t3);
                _mm_storeu_si128(bo + i + 3, t4);
-
-               y = ghash(this->h, y, t1);
-               y = ghash(this->h, y, t2);
-               y = ghash(this->h, y, t3);
-               y = ghash(this->h, y, t4);
        }
 
        for (i = pblocks; i < blocks; i++)
@@ -426,7 +535,7 @@ static void encrypt_gcm128(private_aesni_gcm_t *this,
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
 
-               y = ghash(this->h, y, t1);
+               y = ghash(h4, y, t1);
 
                cb = increment_be(cb);
        }
@@ -447,7 +556,7 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
        __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
        __m128i y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
@@ -460,6 +569,11 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
+       h1 = this->hhhh;
+       h2 = this->hhh;
+       h3 = this->hh;
+       h4 = this->h;
+
        k0 = this->key->schedule[0];
        k1 = this->key->schedule[1];
        k2 = this->key->schedule[2];
@@ -479,10 +593,8 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
                d3 = _mm_loadu_si128(bi + i + 2);
                d4 = _mm_loadu_si128(bi + i + 3);
 
-               y = ghash(this->h, y, d1);
-               y = ghash(this->h, y, d2);
-               y = ghash(this->h, y, d3);
-               y = ghash(this->h, y, d4);
+               y = _mm_xor_si128(y, d1);
+               y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
 
                t1 = _mm_xor_si128(cb, k0);
                cb = increment_be(cb);
@@ -539,6 +651,7 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
                t2 = _mm_xor_si128(t2, d2);
                t3 = _mm_xor_si128(t3, d3);
                t4 = _mm_xor_si128(t4, d4);
+
                _mm_storeu_si128(bo + i + 0, t1);
                _mm_storeu_si128(bo + i + 1, t2);
                _mm_storeu_si128(bo + i + 2, t3);
@@ -549,7 +662,7 @@ static void decrypt_gcm128(private_aesni_gcm_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               y = ghash(this->h, y, d1);
+               y = ghash(h4, y, d1);
 
                t1 = _mm_xor_si128(cb, k0);
                t1 = _mm_aesenc_si128(t1, k1);
@@ -585,7 +698,7 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
        __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
        __m128i y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
@@ -598,6 +711,11 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
+       h1 = this->hhhh;
+       h2 = this->hhh;
+       h3 = this->hh;
+       h4 = this->h;
+
        k0 = this->key->schedule[0];
        k1 = this->key->schedule[1];
        k2 = this->key->schedule[2];
@@ -682,15 +800,14 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
                t2 = _mm_xor_si128(t2, d2);
                t3 = _mm_xor_si128(t3, d3);
                t4 = _mm_xor_si128(t4, d4);
+
+               y = _mm_xor_si128(y, t1);
+               y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+
                _mm_storeu_si128(bo + i + 0, t1);
                _mm_storeu_si128(bo + i + 1, t2);
                _mm_storeu_si128(bo + i + 2, t3);
                _mm_storeu_si128(bo + i + 3, t4);
-
-               y = ghash(this->h, y, t1);
-               y = ghash(this->h, y, t2);
-               y = ghash(this->h, y, t3);
-               y = ghash(this->h, y, t4);
        }
 
        for (i = pblocks; i < blocks; i++)
@@ -714,7 +831,7 @@ static void encrypt_gcm192(private_aesni_gcm_t *this,
                t1 = _mm_xor_si128(t1, d1);
                _mm_storeu_si128(bo + i, t1);
 
-               y = ghash(this->h, y, t1);
+               y = ghash(h4, y, t1);
 
                cb = increment_be(cb);
        }
@@ -735,7 +852,7 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
        __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
        __m128i y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
@@ -748,6 +865,11 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
+       h1 = this->hhhh;
+       h2 = this->hhh;
+       h3 = this->hh;
+       h4 = this->h;
+
        k0 = this->key->schedule[0];
        k1 = this->key->schedule[1];
        k2 = this->key->schedule[2];
@@ -769,10 +891,8 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
                d3 = _mm_loadu_si128(bi + i + 2);
                d4 = _mm_loadu_si128(bi + i + 3);
 
-               y = ghash(this->h, y, d1);
-               y = ghash(this->h, y, d2);
-               y = ghash(this->h, y, d3);
-               y = ghash(this->h, y, d4);
+               y = _mm_xor_si128(y, d1);
+               y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
 
                t1 = _mm_xor_si128(cb, k0);
                cb = increment_be(cb);
@@ -837,6 +957,7 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
                t2 = _mm_xor_si128(t2, d2);
                t3 = _mm_xor_si128(t3, d3);
                t4 = _mm_xor_si128(t4, d4);
+
                _mm_storeu_si128(bo + i + 0, t1);
                _mm_storeu_si128(bo + i + 1, t2);
                _mm_storeu_si128(bo + i + 2, t3);
@@ -847,7 +968,7 @@ static void decrypt_gcm192(private_aesni_gcm_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               y = ghash(this->h, y, d1);
+               y = ghash(h4, y, d1);
 
                t1 = _mm_xor_si128(cb, k0);
                t1 = _mm_aesenc_si128(t1, k1);
@@ -885,7 +1006,7 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
        __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
        __m128i y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
@@ -898,6 +1019,11 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
+       h1 = this->hhhh;
+       h2 = this->hhh;
+       h3 = this->hh;
+       h4 = this->h;
+
        k0 = this->key->schedule[0];
        k1 = this->key->schedule[1];
        k2 = this->key->schedule[2];
@@ -992,15 +1118,14 @@ static void encrypt_gcm256(private_aesni_gcm_t *this,
                t2 = _mm_xor_si128(t2, d2);
                t3 = _mm_xor_si128(t3, d3);
                t4 = _mm_xor_si128(t4, d4);
+
+               y = _mm_xor_si128(y, t1);
+               y = mult4xor(h1, h2, h3, h4, y, t2, t3, t4);
+
                _mm_storeu_si128(bo + i + 0, t1);
                _mm_storeu_si128(bo + i + 1, t2);
                _mm_storeu_si128(bo + i + 2, t3);
                _mm_storeu_si128(bo + i + 3, t4);
-
-               y = ghash(this->h, y, t1);
-               y = ghash(this->h, y, t2);
-               y = ghash(this->h, y, t3);
-               y = ghash(this->h, y, t4);
        }
 
        for (i = pblocks; i < blocks; i++)
@@ -1047,7 +1172,7 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
                                                   size_t alen, u_char *assoc, u_char *icv)
 {
        __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
-       __m128i d1, d2, d3, d4, t1, t2, t3, t4;
+       __m128i d1, d2, d3, d4, t1, t2, t3, t4, h1, h2, h3, h4;
        __m128i y, j, cb, *bi, *bo;
        u_int blocks, pblocks, rem, i;
 
@@ -1060,6 +1185,11 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
+       h1 = this->hhhh;
+       h2 = this->hhh;
+       h3 = this->hh;
+       h4 = this->h;
+
        k0 = this->key->schedule[0];
        k1 = this->key->schedule[1];
        k2 = this->key->schedule[2];
@@ -1083,10 +1213,8 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
                d3 = _mm_loadu_si128(bi + i + 2);
                d4 = _mm_loadu_si128(bi + i + 3);
 
-               y = ghash(this->h, y, d1);
-               y = ghash(this->h, y, d2);
-               y = ghash(this->h, y, d3);
-               y = ghash(this->h, y, d4);
+               y = _mm_xor_si128(y, d1);
+               y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
 
                t1 = _mm_xor_si128(cb, k0);
                cb = increment_be(cb);
@@ -1159,6 +1287,7 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
                t2 = _mm_xor_si128(t2, d2);
                t3 = _mm_xor_si128(t3, d3);
                t4 = _mm_xor_si128(t4, d4);
+
                _mm_storeu_si128(bo + i + 0, t1);
                _mm_storeu_si128(bo + i + 1, t2);
                _mm_storeu_si128(bo + i + 2, t3);
@@ -1169,7 +1298,7 @@ static void decrypt_gcm256(private_aesni_gcm_t *this,
        {
                d1 = _mm_loadu_si128(bi + i);
 
-               y = ghash(this->h, y, d1);
+               y = ghash(h4, y, d1);
 
                t1 = _mm_xor_si128(cb, k0);
                t1 = _mm_aesenc_si128(t1, k1);
@@ -1298,7 +1427,15 @@ METHOD(aead_t, set_key, bool,
        }
        h = _mm_aesenclast_si128(h, this->key->schedule[this->key->rounds]);
 
-       this->h = swap128(h);
+       this->h = h;
+       h = swap128(h);
+       this->hh = mult_block(h, this->h);
+       this->hhh = mult_block(h, this->hh);
+       this->hhhh = mult_block(h, this->hhh);
+       this->h = swap128(this->h);
+       this->hh = swap128(this->hh);
+       this->hhh = swap128(this->hhh);
+       this->hhhh = swap128(this->hhhh);
 
        return TRUE;
 }
@@ -1308,6 +1445,9 @@ METHOD(aead_t, destroy, void,
 {
        DESTROY_IF(this->key);
        memwipe(&this->h, sizeof(this->h));
+       memwipe(&this->hh, sizeof(this->hh));
+       memwipe(&this->hhh, sizeof(this->hhh));
+       memwipe(&this->hhhh, sizeof(this->hhhh));
        this->iv_gen->destroy(this->iv_gen);
        free(this);
 }