aesni: Partially use separate code paths for different key sizes in CCM
author Martin Willi <martin@revosec.ch>
Thu, 26 Mar 2015 18:07:21 +0000 (19:07 +0100)
committer Martin Willi <martin@revosec.ch>
Wed, 15 Apr 2015 09:35:28 +0000 (11:35 +0200)
Due to the serial nature of the CBC mac, this brings only a marginal speedup.

src/libstrongswan/plugins/aesni/aesni_ccm.c

index aeec7bf..20f2e4f 100644 (file)
@@ -322,14 +322,15 @@ static __m128i decrypt_ccm_rem(aesni_key_t *key, u_int rem, __m128i state,
 }
 
 /**
- * Generic CCM encryption/ICV generation
+ * AES-128 CCM encryption/ICV generation
  */
-static void encrypt_ccm(private_aesni_ccm_t *this,
-                                               size_t len, u_char *in, u_char *out, u_char *iv,
-                                               size_t alen, u_char *assoc, u_char *icv)
+static void encrypt_ccm128(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
 {
+       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
        __m128i d, t, c, b, state, *bi, *bo;
-       u_int round, blocks, rem, i;
+       u_int blocks, rem, i;
 
        c = icv_header(this, len, iv, alen, assoc);
        build_ctr(this, 1, iv, &b);
@@ -339,20 +340,47 @@ static void encrypt_ccm(private_aesni_ccm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
+       k0 = this->key->schedule[0];
+       k1 = this->key->schedule[1];
+       k2 = this->key->schedule[2];
+       k3 = this->key->schedule[3];
+       k4 = this->key->schedule[4];
+       k5 = this->key->schedule[5];
+       k6 = this->key->schedule[6];
+       k7 = this->key->schedule[7];
+       k8 = this->key->schedule[8];
+       k9 = this->key->schedule[9];
+       k10 = this->key->schedule[10];
+
        for (i = 0; i < blocks; i++)
        {
                d = _mm_loadu_si128(bi + i);
 
                c = _mm_xor_si128(d, c);
-               c = _mm_xor_si128(c, this->key->schedule[0]);
-               t = _mm_xor_si128(state, this->key->schedule[0]);
-               for (round = 1; round < this->key->rounds; round++)
-               {
-                       c = _mm_aesenc_si128(c, this->key->schedule[round]);
-                       t = _mm_aesenc_si128(t, this->key->schedule[round]);
-               }
-               c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
-               t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+               c = _mm_xor_si128(c, k0);
+               t = _mm_xor_si128(state, k0);
+
+               c = _mm_aesenc_si128(c, k1);
+               t = _mm_aesenc_si128(t, k1);
+               c = _mm_aesenc_si128(c, k2);
+               t = _mm_aesenc_si128(t, k2);
+               c = _mm_aesenc_si128(c, k3);
+               t = _mm_aesenc_si128(t, k3);
+               c = _mm_aesenc_si128(c, k4);
+               t = _mm_aesenc_si128(t, k4);
+               c = _mm_aesenc_si128(c, k5);
+               t = _mm_aesenc_si128(t, k5);
+               c = _mm_aesenc_si128(c, k6);
+               t = _mm_aesenc_si128(t, k6);
+               c = _mm_aesenc_si128(c, k7);
+               t = _mm_aesenc_si128(t, k7);
+               c = _mm_aesenc_si128(c, k8);
+               t = _mm_aesenc_si128(t, k8);
+               c = _mm_aesenc_si128(c, k9);
+               t = _mm_aesenc_si128(t, k9);
+
+               c = _mm_aesenclast_si128(c, k10);
+               t = _mm_aesenclast_si128(t, k10);
 
                t = _mm_xor_si128(t, d);
                _mm_storeu_si128(bo + i, t);
@@ -368,14 +396,15 @@ static void encrypt_ccm(private_aesni_ccm_t *this,
 }
 
 /**
- * Generic CCM decryption/ICV generation
+ * AES-128 CCM decryption/ICV generation
  */
-static void decrypt_ccm(private_aesni_ccm_t *this,
-                                               size_t len, u_char *in, u_char *out, u_char *iv,
-                                               size_t alen, u_char *assoc, u_char *icv)
+static void decrypt_ccm128(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
 {
+       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10;
        __m128i d, t, c, b, state, *bi, *bo;
-       u_int round, blocks, rem, i;
+       u_int blocks, rem, i;
 
        c = icv_header(this, len, iv, alen, assoc);
        build_ctr(this, 1, iv, &b);
@@ -385,26 +414,388 @@ static void decrypt_ccm(private_aesni_ccm_t *this,
        bi = (__m128i*)in;
        bo = (__m128i*)out;
 
+       k0 = this->key->schedule[0];
+       k1 = this->key->schedule[1];
+       k2 = this->key->schedule[2];
+       k3 = this->key->schedule[3];
+       k4 = this->key->schedule[4];
+       k5 = this->key->schedule[5];
+       k6 = this->key->schedule[6];
+       k7 = this->key->schedule[7];
+       k8 = this->key->schedule[8];
+       k9 = this->key->schedule[9];
+       k10 = this->key->schedule[10];
+
        for (i = 0; i < blocks; i++)
        {
                d = _mm_loadu_si128(bi + i);
 
-               t = _mm_xor_si128(state, this->key->schedule[0]);
-               for (round = 1; round < this->key->rounds; round++)
-               {
-                       t = _mm_aesenc_si128(t, this->key->schedule[round]);
-               }
-               t = _mm_aesenclast_si128(t, this->key->schedule[this->key->rounds]);
+               t = _mm_xor_si128(state, k0);
+
+               t = _mm_aesenc_si128(t, k1);
+               t = _mm_aesenc_si128(t, k2);
+               t = _mm_aesenc_si128(t, k3);
+               t = _mm_aesenc_si128(t, k4);
+               t = _mm_aesenc_si128(t, k5);
+               t = _mm_aesenc_si128(t, k6);
+               t = _mm_aesenc_si128(t, k7);
+               t = _mm_aesenc_si128(t, k8);
+               t = _mm_aesenc_si128(t, k9);
+
+               t = _mm_aesenclast_si128(t, k10);
                t = _mm_xor_si128(t, d);
                _mm_storeu_si128(bo + i, t);
 
                c = _mm_xor_si128(t, c);
-               c = _mm_xor_si128(c, this->key->schedule[0]);
-               for (round = 1; round < this->key->rounds; round++)
-               {
-                       c = _mm_aesenc_si128(c, this->key->schedule[round]);
-               }
-               c = _mm_aesenclast_si128(c, this->key->schedule[this->key->rounds]);
+               c = _mm_xor_si128(c, k0);
+
+               c = _mm_aesenc_si128(c, k1);
+               c = _mm_aesenc_si128(c, k2);
+               c = _mm_aesenc_si128(c, k3);
+               c = _mm_aesenc_si128(c, k4);
+               c = _mm_aesenc_si128(c, k5);
+               c = _mm_aesenc_si128(c, k6);
+               c = _mm_aesenc_si128(c, k7);
+               c = _mm_aesenc_si128(c, k8);
+               c = _mm_aesenc_si128(c, k9);
+
+               c = _mm_aesenclast_si128(c, k10);
+
+               state = increment_be(state);
+       }
+
+       if (rem)
+       {
+               c = decrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c);
+       }
+       crypt_icv(this, iv, c, icv);
+}
+
+/**
+ * AES-192 CCM encryption/ICV generation: XORs each input block with an AES-encrypted counter block and folds the input into the MAC value
+ */
+static void encrypt_ccm192(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv, /* len: byte count of the data at in */
+                                                  size_t alen, u_char *assoc, u_char *icv) /* alen/assoc only go to icv_header(), icv only to crypt_icv() */
+{
+       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; /* local copies of the 13 round keys used below */
+       __m128i d, t, c, b, state, *bi, *bo; /* d: input block, t: counter chain, c: MAC chain, state: counter block */
+       u_int blocks, rem, i;
+
+       c = icv_header(this, len, iv, alen, assoc); /* initial MAC value; presumably covers the CCM header and associated data -- confirm in icv_header() */
+       build_ctr(this, 1, iv, &b); /* presumably builds the counter block for counter value 1 -- confirm in build_ctr() */
+       state = _mm_load_si128(&b); /* running counter block */
+       blocks = len / AES_BLOCK_SIZE; /* number of full AES blocks */
+       rem = len % AES_BLOCK_SIZE; /* bytes left over in a trailing partial block */
+       bi = (__m128i*)in; /* block-wise views; only accessed with unaligned load/store below */
+       bo = (__m128i*)out;
+
+       k0 = this->key->schedule[0]; /* read the key schedule once, outside the per-block loop */
+       k1 = this->key->schedule[1];
+       k2 = this->key->schedule[2];
+       k3 = this->key->schedule[3];
+       k4 = this->key->schedule[4];
+       k5 = this->key->schedule[5];
+       k6 = this->key->schedule[6];
+       k7 = this->key->schedule[7];
+       k8 = this->key->schedule[8];
+       k9 = this->key->schedule[9];
+       k10 = this->key->schedule[10];
+       k11 = this->key->schedule[11];
+       k12 = this->key->schedule[12];
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i); /* unaligned load of input block i */
+
+               c = _mm_xor_si128(d, c); /* CBC-MAC step: mix the input block into the chaining value */
+               c = _mm_xor_si128(c, k0); /* round 0 (initial key XOR) of the MAC encryption */
+               t = _mm_xor_si128(state, k0); /* round 0 of the counter block encryption */
+
+               c = _mm_aesenc_si128(c, k1); /* rounds 1..11: the c and t chains do not depend on each other, so they are interleaved */
+               t = _mm_aesenc_si128(t, k1);
+               c = _mm_aesenc_si128(c, k2);
+               t = _mm_aesenc_si128(t, k2);
+               c = _mm_aesenc_si128(c, k3);
+               t = _mm_aesenc_si128(t, k3);
+               c = _mm_aesenc_si128(c, k4);
+               t = _mm_aesenc_si128(t, k4);
+               c = _mm_aesenc_si128(c, k5);
+               t = _mm_aesenc_si128(t, k5);
+               c = _mm_aesenc_si128(c, k6);
+               t = _mm_aesenc_si128(t, k6);
+               c = _mm_aesenc_si128(c, k7);
+               t = _mm_aesenc_si128(t, k7);
+               c = _mm_aesenc_si128(c, k8);
+               t = _mm_aesenc_si128(t, k8);
+               c = _mm_aesenc_si128(c, k9);
+               t = _mm_aesenc_si128(t, k9);
+               c = _mm_aesenc_si128(c, k10);
+               t = _mm_aesenc_si128(t, k10);
+               c = _mm_aesenc_si128(c, k11);
+               t = _mm_aesenc_si128(t, k11);
+
+               c = _mm_aesenclast_si128(c, k12); /* final round 12 for both chains */
+               t = _mm_aesenclast_si128(t, k12);
+
+               t = _mm_xor_si128(t, d); /* output block = encrypted counter XOR input block */
+               _mm_storeu_si128(bo + i, t); /* unaligned store of output block i */
+
+               state = increment_be(state); /* next counter block; presumably a big-endian increment -- see increment_be() */
+       }
+
+       if (rem)
+       {
+               c = encrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c); /* helper handles the partial block and returns the updated MAC value */
+       }
+       crypt_icv(this, iv, c, icv); /* final MAC value is handed over together with icv; presumably writes the ICV there -- see crypt_icv() */
+}
+
+/**
+ * AES-192 CCM decryption/ICV generation: XORs each input block with an AES-encrypted counter block, then folds the result into the MAC value
+ */
+static void decrypt_ccm192(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv, /* len: byte count of the data at in */
+                                                  size_t alen, u_char *assoc, u_char *icv) /* alen/assoc only go to icv_header(), icv only to crypt_icv() */
+{
+       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12; /* local copies of the 13 round keys used below */
+       __m128i d, t, c, b, state, *bi, *bo; /* d: input block, t: counter chain/output block, c: MAC chain, state: counter block */
+       u_int blocks, rem, i;
+
+       c = icv_header(this, len, iv, alen, assoc); /* initial MAC value; presumably covers the CCM header and associated data -- confirm in icv_header() */
+       build_ctr(this, 1, iv, &b); /* presumably builds the counter block for counter value 1 -- confirm in build_ctr() */
+       state = _mm_load_si128(&b); /* running counter block */
+       blocks = len / AES_BLOCK_SIZE; /* number of full AES blocks */
+       rem = len % AES_BLOCK_SIZE; /* bytes left over in a trailing partial block */
+       bi = (__m128i*)in; /* block-wise views; only accessed with unaligned load/store below */
+       bo = (__m128i*)out;
+
+       k0 = this->key->schedule[0]; /* read the key schedule once, outside the per-block loop */
+       k1 = this->key->schedule[1];
+       k2 = this->key->schedule[2];
+       k3 = this->key->schedule[3];
+       k4 = this->key->schedule[4];
+       k5 = this->key->schedule[5];
+       k6 = this->key->schedule[6];
+       k7 = this->key->schedule[7];
+       k8 = this->key->schedule[8];
+       k9 = this->key->schedule[9];
+       k10 = this->key->schedule[10];
+       k11 = this->key->schedule[11];
+       k12 = this->key->schedule[12];
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i); /* unaligned load of input block i */
+
+               t = _mm_xor_si128(state, k0); /* round 0 (initial key XOR) of the counter block encryption */
+
+               t = _mm_aesenc_si128(t, k1); /* rounds 1..11 of the counter chain; it runs first because the MAC below needs its result */
+               t = _mm_aesenc_si128(t, k2);
+               t = _mm_aesenc_si128(t, k3);
+               t = _mm_aesenc_si128(t, k4);
+               t = _mm_aesenc_si128(t, k5);
+               t = _mm_aesenc_si128(t, k6);
+               t = _mm_aesenc_si128(t, k7);
+               t = _mm_aesenc_si128(t, k8);
+               t = _mm_aesenc_si128(t, k9);
+               t = _mm_aesenc_si128(t, k10);
+               t = _mm_aesenc_si128(t, k11);
+
+               t = _mm_aesenclast_si128(t, k12); /* final round 12 */
+               t = _mm_xor_si128(t, d); /* output block = encrypted counter XOR input block */
+               _mm_storeu_si128(bo + i, t); /* unaligned store of output block i */
+
+               c = _mm_xor_si128(t, c); /* CBC-MAC step: mix the just-produced output block into the chaining value */
+               c = _mm_xor_si128(c, k0); /* round 0 of the MAC encryption */
+
+               c = _mm_aesenc_si128(c, k1); /* rounds 1..11 of the MAC chain */
+               c = _mm_aesenc_si128(c, k2);
+               c = _mm_aesenc_si128(c, k3);
+               c = _mm_aesenc_si128(c, k4);
+               c = _mm_aesenc_si128(c, k5);
+               c = _mm_aesenc_si128(c, k6);
+               c = _mm_aesenc_si128(c, k7);
+               c = _mm_aesenc_si128(c, k8);
+               c = _mm_aesenc_si128(c, k9);
+               c = _mm_aesenc_si128(c, k10);
+               c = _mm_aesenc_si128(c, k11);
+
+               c = _mm_aesenclast_si128(c, k12); /* final round 12 */
+
+               state = increment_be(state); /* next counter block; presumably a big-endian increment -- see increment_be() */
+       }
+
+       if (rem)
+       {
+               c = decrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c); /* helper handles the partial block and returns the updated MAC value */
+       }
+       crypt_icv(this, iv, c, icv); /* final MAC value is handed over together with icv; presumably writes the ICV there -- see crypt_icv() */
+}
+
+/**
+ * AES-256 CCM encryption/ICV generation: XORs each input block with an AES-encrypted counter block and folds the input into the MAC value
+ */
+static void encrypt_ccm256(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv, /* len: byte count of the data at in */
+                                                  size_t alen, u_char *assoc, u_char *icv) /* alen/assoc only go to icv_header(), icv only to crypt_icv() */
+{
+       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14; /* local copies of the 15 round keys used below */
+       __m128i d, t, c, b, state, *bi, *bo; /* d: input block, t: counter chain, c: MAC chain, state: counter block */
+       u_int blocks, rem, i;
+
+       c = icv_header(this, len, iv, alen, assoc); /* initial MAC value; presumably covers the CCM header and associated data -- confirm in icv_header() */
+       build_ctr(this, 1, iv, &b); /* presumably builds the counter block for counter value 1 -- confirm in build_ctr() */
+       state = _mm_load_si128(&b); /* running counter block */
+       blocks = len / AES_BLOCK_SIZE; /* number of full AES blocks */
+       rem = len % AES_BLOCK_SIZE; /* bytes left over in a trailing partial block */
+       bi = (__m128i*)in; /* block-wise views; only accessed with unaligned load/store below */
+       bo = (__m128i*)out;
+
+       k0 = this->key->schedule[0]; /* read the key schedule once, outside the per-block loop */
+       k1 = this->key->schedule[1];
+       k2 = this->key->schedule[2];
+       k3 = this->key->schedule[3];
+       k4 = this->key->schedule[4];
+       k5 = this->key->schedule[5];
+       k6 = this->key->schedule[6];
+       k7 = this->key->schedule[7];
+       k8 = this->key->schedule[8];
+       k9 = this->key->schedule[9];
+       k10 = this->key->schedule[10];
+       k11 = this->key->schedule[11];
+       k12 = this->key->schedule[12];
+       k13 = this->key->schedule[13];
+       k14 = this->key->schedule[14];
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i); /* unaligned load of input block i */
+
+               c = _mm_xor_si128(d, c); /* CBC-MAC step: mix the input block into the chaining value */
+               c = _mm_xor_si128(c, k0); /* round 0 (initial key XOR) of the MAC encryption */
+               t = _mm_xor_si128(state, k0); /* round 0 of the counter block encryption */
+
+               c = _mm_aesenc_si128(c, k1); /* rounds 1..13: the c and t chains do not depend on each other, so they are interleaved */
+               t = _mm_aesenc_si128(t, k1);
+               c = _mm_aesenc_si128(c, k2);
+               t = _mm_aesenc_si128(t, k2);
+               c = _mm_aesenc_si128(c, k3);
+               t = _mm_aesenc_si128(t, k3);
+               c = _mm_aesenc_si128(c, k4);
+               t = _mm_aesenc_si128(t, k4);
+               c = _mm_aesenc_si128(c, k5);
+               t = _mm_aesenc_si128(t, k5);
+               c = _mm_aesenc_si128(c, k6);
+               t = _mm_aesenc_si128(t, k6);
+               c = _mm_aesenc_si128(c, k7);
+               t = _mm_aesenc_si128(t, k7);
+               c = _mm_aesenc_si128(c, k8);
+               t = _mm_aesenc_si128(t, k8);
+               c = _mm_aesenc_si128(c, k9);
+               t = _mm_aesenc_si128(t, k9);
+               c = _mm_aesenc_si128(c, k10);
+               t = _mm_aesenc_si128(t, k10);
+               c = _mm_aesenc_si128(c, k11);
+               t = _mm_aesenc_si128(t, k11);
+               c = _mm_aesenc_si128(c, k12);
+               t = _mm_aesenc_si128(t, k12);
+               c = _mm_aesenc_si128(c, k13);
+               t = _mm_aesenc_si128(t, k13);
+
+               c = _mm_aesenclast_si128(c, k14); /* final round 14 for both chains */
+               t = _mm_aesenclast_si128(t, k14);
+
+               t = _mm_xor_si128(t, d); /* output block = encrypted counter XOR input block */
+               _mm_storeu_si128(bo + i, t); /* unaligned store of output block i */
+
+               state = increment_be(state); /* next counter block; presumably a big-endian increment -- see increment_be() */
+       }
+
+       if (rem)
+       {
+               c = encrypt_ccm_rem(this->key, rem, state, bi + blocks, bo + blocks, c); /* helper handles the partial block and returns the updated MAC value */
+       }
+       crypt_icv(this, iv, c, icv); /* final MAC value is handed over together with icv; presumably writes the ICV there -- see crypt_icv() */
+}
+
+/**
+ * AES-256 CCM decryption/ICV generation
+ */
+static void decrypt_ccm256(private_aesni_ccm_t *this,
+                                                  size_t len, u_char *in, u_char *out, u_char *iv,
+                                                  size_t alen, u_char *assoc, u_char *icv)
+{
+       __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14;
+       __m128i d, t, c, b, state, *bi, *bo;
+       u_int blocks, rem, i;
+
+       c = icv_header(this, len, iv, alen, assoc);
+       build_ctr(this, 1, iv, &b);
+       state = _mm_load_si128(&b);
+       blocks = len / AES_BLOCK_SIZE;
+       rem = len % AES_BLOCK_SIZE;
+       bi = (__m128i*)in;
+       bo = (__m128i*)out;
+
+       k0 = this->key->schedule[0];
+       k1 = this->key->schedule[1];
+       k2 = this->key->schedule[2];
+       k3 = this->key->schedule[3];
+       k4 = this->key->schedule[4];
+       k5 = this->key->schedule[5];
+       k6 = this->key->schedule[6];
+       k7 = this->key->schedule[7];
+       k8 = this->key->schedule[8];
+       k9 = this->key->schedule[9];
+       k10 = this->key->schedule[10];
+       k11 = this->key->schedule[11];
+       k12 = this->key->schedule[12];
+       k13 = this->key->schedule[13];
+       k14 = this->key->schedule[14];
+
+       for (i = 0; i < blocks; i++)
+       {
+               d = _mm_loadu_si128(bi + i);
+
+               t = _mm_xor_si128(state, k0);
+
+               t = _mm_aesenc_si128(t, k1);
+               t = _mm_aesenc_si128(t, k2);
+               t = _mm_aesenc_si128(t, k3);
+               t = _mm_aesenc_si128(t, k4);
+               t = _mm_aesenc_si128(t, k5);
+               t = _mm_aesenc_si128(t, k6);
+               t = _mm_aesenc_si128(t, k7);
+               t = _mm_aesenc_si128(t, k8);
+               t = _mm_aesenc_si128(t, k9);
+               t = _mm_aesenc_si128(t, k10);
+               t = _mm_aesenc_si128(t, k11);
+               t = _mm_aesenc_si128(t, k12);
+               t = _mm_aesenc_si128(t, k13);
+
+               t = _mm_aesenclast_si128(t, k14);
+               t = _mm_xor_si128(t, d);
+               _mm_storeu_si128(bo + i, t);
+
+               c = _mm_xor_si128(t, c);
+               c = _mm_xor_si128(c, k0);
+
+               c = _mm_aesenc_si128(c, k1);
+               c = _mm_aesenc_si128(c, k2);
+               c = _mm_aesenc_si128(c, k3);
+               c = _mm_aesenc_si128(c, k4);
+               c = _mm_aesenc_si128(c, k5);
+               c = _mm_aesenc_si128(c, k6);
+               c = _mm_aesenc_si128(c, k7);
+               c = _mm_aesenc_si128(c, k8);
+               c = _mm_aesenc_si128(c, k9);
+               c = _mm_aesenc_si128(c, k10);
+               c = _mm_aesenc_si128(c, k11);
+               c = _mm_aesenc_si128(c, k12);
+               c = _mm_aesenc_si128(c, k13);
+
+               c = _mm_aesenclast_si128(c, k14);
 
                state = increment_be(state);
        }
@@ -575,9 +966,23 @@ aesni_ccm_t *aesni_ccm_create(encryption_algorithm_t algo,
                .key_size = key_size,
                .iv_gen = iv_gen_seq_create(),
                .icv_size = icv_size,
-               .encrypt = encrypt_ccm,
-               .decrypt = decrypt_ccm,
        );
 
+       switch (key_size)
+       {
+               case 16:
+                       this->encrypt = encrypt_ccm128;
+                       this->decrypt = decrypt_ccm128;
+                       break;
+               case 24:
+                       this->encrypt = encrypt_ccm192;
+                       this->decrypt = decrypt_ccm192;
+                       break;
+               case 32:
+                       this->encrypt = encrypt_ccm256;
+                       this->decrypt = decrypt_ccm256;
+                       break;
+       }
+
        return &this->public;
 }